This Notebook is for testing ridge regression against a dataset and validate using the performance against standard RF Regressor.

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from harness import RMSE

#### Exploratory data analysis

In [63]:
np.array(train_df[['smiles','gap']].head(40))

array([['c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2nsnc12', 1.19],
       ['C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[SiH2]C=c12', 1.6],
       ['[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-23)c2ccccc12', 1.49],
       ['[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13)c1=C[SiH2]C=c21',
        1.36],
       ['c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1', 1.98],
       ['C1=Cc2cnc3cc4cc(-c5scc6[nH]ccc56)c5ccccc5c4cc3c2[SiH2]1', 1.81],
       ['c1ncc(s1)-c1cnc2c(c1)oc1c2ccc2ccccc12', 2.91],
       ['c1sc(-c2ccc3c(c2)sc2c3c3=CCC=c3c3cccnc23)c2[se]ccc12', 2.17],
       ['c1ccc(o1)-c1cc2cc3cc4c5c[nH]cc5ccc4cc3cc2o1', 2.19],
       ['[nH]1ccc2c3c[nH]cc3c3cc(-c4cncs4)c4=CCC=c4c3c12', 1.71],
       ['[nH]1c(cc2c3cocc3c3c(ccc4ccc5=CCC=c5c34)c12)-c1cccs1', 2.08],
       ['c1cc2oc3c(sc4cc([se]c34)-c3cncc4nsnc34)c2o1', 1.42],
       ['[nH]1c(cc2cnc3cc4ccoc4cc3c12)-c1ccccc1', 2.96],
       ['[nH]1ccc2ccc3c4ncc(cc4[nH]c3c12)-c1scc2occc12', 2.59],
       ['c1sc(-c2sc(-c3sc(-c4ncncn4)c4nccnc34

#### Training dataset operations - Prepping for training

In [60]:
##Loading training set
train_df = pd.read_csv('../data/train_new_features.csv')

In [64]:
def elem_counts(df):
    df['len_smiles'] = df['smiles'].str.len()
    elements = ['nH', 'n', 'c', 'c1', 'Si', 'SiH2', '=', '-', 'CC', 'ncc', 'C1', 'C', 'H', 'cc', 'ccc', 'cccc', 'cc1',\
           '(C1)', '(c1)', '(o1)', '(s1)', 'nc', 'c12', 'c2', 'c1cc', '(cc1)', 'c2C', 'cc3', 'oc', 'ncc', 'C1=C',\
                'C=c', 'C=C', 'ccn', 'c3', '[se]', '=CCC=', 'c21', 'c1c', 'cn', 'c4c', 'c3c', 'coc',\
               'ccccc', '[SiH2]C', 'cc4']
    for elem in elements:
        col_name = 'count_' + elem
        df[col_name] = df['smiles'].str.count(elem)
    return df

In [65]:
train_df = elem_counts(train_df)

In [66]:
train_df.columns

Index([u'Unnamed: 0', u'feat_001', u'feat_005', u'feat_006', u'feat_007',
       u'feat_025', u'feat_037', u'feat_044', u'feat_068', u'feat_069',
       u'feat_072', u'feat_087', u'feat_090', u'feat_102', u'feat_119',
       u'feat_123', u'feat_126', u'feat_132', u'feat_173', u'feat_176',
       u'feat_187', u'feat_196', u'feat_199', u'feat_200', u'feat_208',
       u'feat_218', u'feat_225', u'feat_226', u'feat_243', u'feat_248',
       u'feat_251', u'feat_252', u'gap', u'smiles', u'MolLogP', u'NOCount',
       u'NumHAcceptors', u'NumHDonors', u'NumHeteroAtoms',
       u'NumRadicalElectrons', u'NumSaturatedRings', u'NumValenceElectrons',
       u'RingCount', u'TPSA', u'len_smiles', u'count_nH', u'count_n',
       u'count_c', u'count_c1', u'count_Si', u'count_SiH2', u'count_=',
       u'count_-', u'count_CC', u'count_ncc', u'count_C1', u'count_C',
       u'count_H', u'count_cc', u'count_ccc', u'count_cccc', u'count_cc1',
       u'count_(C1)', u'count_(c1)', u'count_(o1)', u'count_(s1)',

In [67]:
train_cols = train_df.columns
train_cols = train_cols.difference(['smiles', 'gap', 'Unnamed: 0'])
X = train_df[train_cols]
y = train_df['gap']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1)
print X_train.shape, X_test.shape

(900000, 87) (100000, 87)


In [68]:
RF = RandomForestRegressor(n_estimators=10)
RF.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [69]:
#Checking RMSE for training test set
pred = RF.predict(X_test)
print(RMSE(y_test, pred))

RMSE =  0.0943146098186
0.0943146098186


#### To produce the submission file (including the test set preprocessing)

In [70]:
##Load training set
test_df = pd.read_csv('../data/test_new_features.csv')

In [71]:
test_df = elem_counts(test_df)

In [72]:
test_df.columns

Index([u'Unnamed: 0', u'smiles', u'feat_001', u'feat_005', u'feat_006',
       u'feat_007', u'feat_025', u'feat_037', u'feat_044', u'feat_068',
       u'feat_069', u'feat_072', u'feat_087', u'feat_090', u'feat_102',
       u'feat_119', u'feat_123', u'feat_126', u'feat_132', u'feat_173',
       u'feat_176', u'feat_187', u'feat_196', u'feat_199', u'feat_200',
       u'feat_208', u'feat_218', u'feat_225', u'feat_226', u'feat_243',
       u'feat_248', u'feat_251', u'feat_252', u'smiles_mol', u'MolLogP',
       u'NOCount', u'NumHAcceptors', u'NumHDonors', u'NumHeteroAtoms',
       u'NumValenceElectrons', u'RingCount', u'TPSA', u'NumRadicalElectrons',
       u'NumSaturatedRings', u'len_smiles', u'count_nH', u'count_n',
       u'count_c', u'count_c1', u'count_Si', u'count_SiH2', u'count_=',
       u'count_-', u'count_CC', u'count_ncc', u'count_C1', u'count_C',
       u'count_H', u'count_cc', u'count_ccc', u'count_cccc', u'count_cc1',
       u'count_(C1)', u'count_(c1)', u'count_(o1)', u'count

In [73]:
test_cols = test_df.columns
test_cols = test_cols.difference(['smiles', 'smiles_mol', 'Unnamed: 0'])
test_data = test_df[train_cols]
test_data.shape

(824230, 87)

In [74]:
##Predicting through Random Forests
test_pred = RF.predict(test_data)

In [75]:
#Saving to file
out_df = pd.DataFrame({'Id':np.array(test_df.index), 'Prediction': test_pred})
out_df['Id'] = out_df['Id'] + 1
out_df = out_df.set_index('Id')
out_df.Prediction = out_df.Prediction.astype(float)

In [76]:
out_df.to_csv('../data/abhi_trial_2.csv')