In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
##Imports needed for Lasso regression
from sklearn.linear_model import Lasso
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

In [2]:
##Loading the training data
train_df = pd.read_csv('../data/train.csv')

In [4]:
ncols = len(train_df.columns)
cols = train_df.columns

In [5]:
##Preprocessing the training data
nonzero_cols = cols[(train_df != 0).any(axis=0)]

In [7]:
train_df['smiles'].head()

0    c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...
1    C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...
2    [nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...
3    [nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...
4       c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1
Name: smiles, dtype: object

In [68]:
train = train_df[nonzero_cols]

In [69]:
##Creating more features
#Counting the length of the smiles column --> len_smiles
train['len_smiles'] = train['smiles'].str.len()
elements = ['nH', 'n', 'c', 'SiH2', '=', 'CC', 'ncc', 'C1', 'C', 'H', 'cc', 'ccc']
for elem in elements:
    col_name = 'count_' + elem
    train[col_name] = train['smiles'].str.count(elem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [67]:
train.smiles[2]

'[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-23)c2ccccc12'

In [None]:
##Loading the test data
test_df = pd.read_csv('../data/test.csv')

In [70]:
##Splitting the training set into test and training set
train_cols = train.columns
train_cols = train_cols.difference(['smiles', 'gap'])
X = train[train_cols]
y = train['gap']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1)
print X_train.shape, X_test.shape

(900000, 44) (100000, 44)


In [71]:
##Model parameters
clf = Lasso(alpha = 10e-5)

In [72]:
clf.fit(X_train, y_train)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [73]:
##Testing the error
pred_values = clf.predict(X_test)

In [74]:
##Calculating the error
y_test_arr = np.array(y_test)
errors = y_test_arr - pred_values
test_error = np.sqrt(sum(np.square(errors))/len(errors))
print test_error

0.235461006709


In [75]:
##Using gridsearchCV to find right alpha
lasso = GridSearchCV(Lasso(), {'alpha': np.logspace(-10, -8, 5)})
lasso.fit(X_train, y_train)



GridSearchCV(cv=None, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-10,   3.16228e-10,   1.00000e-09,   3.16228e-09,
         1.00000e-08])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

#### Function for calculating rmse on test and training sets

In [None]:
def calculate_rmse(y_test, predicted):
    y_test_arr = np.array(y_test)
    predicted_arr = np.array(predicted)
    errors_arr = predicted_arr - y_test_arr
    return np.sqrt(sum(np.square(errors))/len(error_arr))