#### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
##Imports needed for Lasso regression
from sklearn.linear_model import Lasso
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

#### Helper functions

In [32]:
def calculate_rmse(y_test, predicted):
    y_test_arr = np.array(y_test)
    predicted_arr = np.array(predicted)
    errors_arr = predicted_arr - y_test_arr
    return np.sqrt(sum(np.square(errors))/len(errors_arr))

#### Loading Training Data and Data Manipulation

In [4]:
##Loading the training data
train_df = pd.read_csv('../data/train.csv')

In [5]:
ncols = len(train_df.columns)
cols = train_df.columns

In [6]:
##Preprocessing the training data
#Removing the zero columns
nonzero_cols = cols[(train_df != 0).any(axis=0)]

In [7]:
train = train_df[nonzero_cols]

In [8]:
##Creating more features
#Counting the length of the smiles column --> len_smiles
train['len_smiles'] = train['smiles'].str.len()
elements = ['nH', 'n', 'c', 'SiH2', '=', 'CC', 'ncc', 'C1', 'C', 'H', 'cc', 'ccc']
for elem in elements:
    col_name = 'count_' + elem
    train[col_name] = train['smiles'].str.count(elem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
##Splitting the training set into test and training set
train_cols = train.columns
train_cols = train_cols.difference(['smiles', 'gap'])
X = train[train_cols]
y = train['gap']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1)
print X_train.shape, X_test.shape

(900000, 44) (100000, 44)


In [54]:
##Model parameters
clf = Lasso(alpha = 10e-5, normalize=True)

In [55]:
clf.fit(X_train, y_train)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [56]:
##Testing the error
pred_values = clf.predict(X_test)

In [57]:
##Calculating the error
y_test_arr = np.array(y_test)
errors = y_test_arr - pred_values
test_error = np.sqrt(sum(np.square(errors))/len(errors))
print test_error

0.315887793595


In [27]:
##Using gridsearchCV to find right alpha
lasso = GridSearchCV(Lasso(), {'alpha': np.logspace(-7, 0, 10)})
lasso.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-07,   5.99484e-07,   3.59381e-06,   2.15443e-05,
         1.29155e-04,   7.74264e-04,   4.64159e-03,   2.78256e-02,
         1.66810e-01,   1.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [28]:
pred_values = lasso.predict(X_test)

In [33]:
calculate_rmse(y_test, pred_values)

0.23536556329126096

#### Load and predict on test dataset

In [34]:
##Loading the test data
test_df = pd.read_csv('../data/test.csv')

In [37]:
##Removing non zero columns
test_nonzero_cols = test_df.columns[(train_df != 0).any(axis=0)]
test = test_df[test_nonzero_cols]

In [38]:
##Creating same features as training dataset
test['len_smiles'] = test['smiles'].str.len()
elements = ['nH', 'n', 'c', 'SiH2', '=', 'CC', 'ncc', 'C1', 'C', 'H', 'cc', 'ccc']
for elem in elements:
    col_name = 'count_' + elem
    test[col_name] = test['smiles'].str.count(elem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [41]:
pred_test = lasso.predict(test[test.columns[2:]])

In [50]:
out_df = pd.DataFrame({'Id':range(1,len(pred_test)+1), 'Prediction': pred_test})
out_df = out_df.set_index('Id')
out_df.Prediction = out_df.Prediction.astype(float)

In [53]:
#Saving the values to csv
out_df.to_csv('../output/lasso_test.csv')

In [58]:
type(lasso)

sklearn.grid_search.GridSearchCV