In [1]:
import sys
sys.version

'2.7.11 |Anaconda 2.4.1 (x86_64)| (default, Dec  6 2015, 18:57:58) \n[GCC 4.2.1 (Apple Inc. build 5577)]'

In [2]:
import time
import numpy as np
import math
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline
import matplotlib.pyplot as plt

<div class="alert alert-danger">
<strong>BASELINES</strong>
</div>

<div class="alert alert-info">
<strong>LOAD DATA</strong>
</div>

In [3]:
# read the training and test tables from the local data directory
df_train, df_test = [pd.read_csv(csv_path)
                     for csv_path in ('data/train.csv', 'data/test.csv')]

In [4]:
# preview the first rows of the training frame
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,1,0,0,0,0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1.98


In [5]:
# preview the first rows of the test frame
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [6]:
# keep the regression target as a plain array before it leaves the frame
Y_train = df_train['gap'].values

# number of training rows; used later as the row offset at which the test
# examples start once train and test are stacked
test_idx = len(df_train)

# strip the non-feature columns: 'gap' (target) from train, 'Id' from test
df_train = df_train.drop('gap', axis=1)
df_test = df_test.drop('Id', axis=1)

In [7]:
# dataframe sizes after dropping 'gap' from train and 'Id' from test
df_train.shape, df_test.shape

((1000000, 257), (824230, 257))

<div class="alert alert-info">
<strong>FEATURE ENGINEERING</strong>
</div>

In [8]:
# stack train on top of test so feature engineering is applied to both at once
# (row labels are not reset, so labels from the two frames repeat)
df_all = pd.concat([df_train, df_test], axis=0)

# keep the 'smiles' strings in their own frame for separate processing
df_smiles = pd.DataFrame({'smiles': df_all['smiles'].values},
                         index=df_all.index.values)

# the remaining columns of df_all are the feat_* columns only
df_all = df_all.drop('smiles', axis=1)

df_all.shape, df_smiles.shape

((1824230, 256), (1824230, 1))

In [9]:
# drop columns that are all zeros (detected as a column total of 0)
df_all_sum = df_all.sum(axis=0).to_frame('SUM')
zero_mask = df_all_sum['SUM'] == 0
df_all_zeros = df_all_sum.index[zero_mask.values].values
df_all = df_all.drop(df_all_zeros, axis=1)
df_all.shape

(1824230, 31)

In [10]:
# number of feature columns left after removing the all-zero ones
num_features = len(df_all.columns)
num_features

31

In [11]:
# split the stacked frame back into training and test matrices
lcols = list(df_all.columns)  # feature names, in matrix column order
vals = df_all.values
# rows before test_idx came from the training frame, the rest from the test frame
X_train, X_test = vals[:test_idx], vals[test_idx:]

print('Train features: %s' % (X_train.shape,))
print('Train gap: %s' % (Y_train.shape,))
print('Test features: %s' % (X_test.shape,))

Train features: (1000000, 31)
Train gap: (1000000,)
Test features: (824230, 31)


In [12]:
# release the per-split frames by rebinding their names
df_train = df_test = None
# df_all is deliberately kept: later feature engineering builds on it

<div class="alert alert-info">
<strong>COMMON FUNCTIONS</strong>
</div>

In [13]:
def write_to_file(filename, predictions):
    """Write predictions to `filename` as a two-column CSV.

    The file starts with the header ``Id,Prediction`` and is followed by one
    line per prediction. Ids are 1-based positions in `predictions`, and each
    value is written using its ``str()`` form. An existing file is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        # stream the rows so the whole file is never held in memory
        out.writelines(
            str(row_id) + "," + str(value) + "\n"
            for row_id, value in enumerate(predictions, 1)
        )

<div class="alert alert-info">
<strong>BASELINE PREDICTORS</strong>
</div>

**Linear Regression**

In [14]:
# linear regression - training & validation split
# (fit on the first 800000 training rows, validate on the remaining rows)
X_fit, Y_fit = X_train[:800000], Y_train[:800000]
X_val, Y_val = X_train[800000:], Y_train[800000:]

LR = LinearRegression().fit(X_fit, Y_fit)
LR_rmse_train = math.sqrt(mean_squared_error(Y_fit, LR.predict(X_fit)))
LR_rmse_val = math.sqrt(mean_squared_error(Y_val, LR.predict(X_val)))
print('Linear regression RMSE - training set = %0.5f' % LR_rmse_train)
print('Linear regression RMSE - validation set = %0.5f' % LR_rmse_val)
# hard-coded reference score, printed for side-by-side comparison
print('Baseline linear regression RMSE (leaderboard) = 0.29892')

Linear regression RMSE - training set = 0.29875
Linear regression RMSE - validation set = 0.29962
Baseline linear regression RMSE (leaderboard) = 0.29892


In [15]:
# linear regression - fit on every training row, then predict the test rows

start = time.time()

LR = LinearRegression().fit(X_train, Y_train)
LR_pred = LR.predict(X_test)
# this RMSE is measured on the same rows the model was fit on
LR_train_pred = LR.predict(X_train)
LR_rmse = math.sqrt(mean_squared_error(Y_train, LR_train_pred))
print('New linear regression RMSE = %0.5f' % LR_rmse)
print('Baseline linear regression RMSE (leaderboard) = 0.29892')
# write_to_file('data/LR.csv', LR_pred)

# elapsed wall-clock time covers fitting, both predict calls and the prints
timer = time.time() - start
print('\n%d seconds to run baseline linear regression' % timer)

New linear regression RMSE = 0.29892
Baseline linear regression RMSE (leaderboard) = 0.29892

3 seconds to run baseline linear regression


In [16]:
# tabulate the fitted coefficient of every feature column, largest first
# NOTE(review): the sort is on the signed value, so large negative
# coefficients end up at the bottom of the table rather than near the top.
LR_coeff = pd.DataFrame({'features': lcols, 'coefficients': LR.coef_})
sort_keys = ['coefficients', 'features']
LR_coeff.sort_values(by=sort_keys, ascending=False)

Unnamed: 0,coefficients,features
21,270354800000.0,feat_199
24,3663788000.0,feat_218
27,103197500.0,feat_243
6,0.3267645,feat_044
3,0.2643649,feat_007
5,0.2248903,feat_037
23,0.1897814,feat_208
1,0.1875479,feat_005
26,0.1413031,feat_226
17,0.0390553,feat_173


**Random Forest**

In [17]:
# random forest regressor - training & validation split
# (fit on the first 800000 training rows, validate on the remaining rows)
# random_state pins the forest's random sampling so the scores printed below
# are reproducible after a kernel restart.
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train[:800000], Y_train[:800000])
RF_rmse_train = math.sqrt(mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000])))
RF_rmse_val = math.sqrt(mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:])))
print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
# hard-coded reference score, printed for side-by-side comparison
print('Baseline random forest RMSE (leaderboard) = 0.27188')

Random forest RMSE - training set = 0.27161
Random forest RMSE - validation set = 0.27338
Baseline random forest RMSE (leaderboard) = 0.27188


In [18]:
# random forest regressor - fit on every training row, then predict the test rows

start = time.time()

# random_state pins the forest's random sampling so RF_pred and the score
# below are reproducible after a kernel restart.
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)
# this RMSE is measured on the same rows the model was fit on
RF_rmse = math.sqrt(mean_squared_error(Y_train, RF.predict(X_train)))
print('New random forest RMSE = %0.5f' % RF_rmse)
print('Baseline random forest RMSE (leaderboard) = 0.27188')
# write_to_file('data/RF.csv', RF_pred)

# elapsed wall-clock time covers fitting, both predict calls and the prints
timer = time.time() - start
print('\n%d seconds to run baseline random forest' % timer)

New random forest RMSE = 0.27188
Baseline random forest RMSE (leaderboard) = 0.27188

29 seconds to run baseline random forest


In [19]:
# tabulate the forest's importance score for every feature column, largest first
# NOTE: the column is labelled 'coefficients' but holds RF.feature_importances_
RF_feat = pd.DataFrame({'features': lcols, 'coefficients': RF.feature_importances_})
sort_keys = ['coefficients', 'features']
RF_feat.sort_values(by=sort_keys, ascending=False)

Unnamed: 0,coefficients,features
4,0.3176567,feat_025
13,0.3036892,feat_119
7,0.07824871,feat_068
25,0.03745894,feat_225
29,0.02998368,feat_251
23,0.02261285,feat_208
9,0.02207508,feat_072
11,0.02026754,feat_090
17,0.01949376,feat_173
15,0.01929272,feat_126


<div class="alert alert-danger">
<strong>SEARCH FOR IMPROVED PREDICTORS</strong>
</div>

<div class="alert alert-info">
<strong>ADD SUM OF BINARY VARS</strong>
</div>

In [21]:
# sum binary features (row-wise total of the feature columns)
# An existing 'sum_feat' column is left out of the total so the cell can be
# re-run safely; otherwise a second run would add the previous total into
# the new one.
if 'sum_feat' in df_all.columns:
    feature_frame = df_all.drop('sum_feat', axis=1)
else:
    feature_frame = df_all
sum_feat = feature_frame.sum(axis=1)
df_all['sum_feat'] = sum_feat

In [22]:
# rebuild the training / test matrices from df_all, which now carries 'sum_feat'
vals = df_all.values
# rows before test_idx came from the training frame, the rest from the test frame
X_train, X_test = vals[:test_idx], vals[test_idx:]

In [23]:
# linear regression - training & validation split
# (fit on the first 800000 training rows, validate on the remaining rows)
X_fit, Y_fit = X_train[:800000], Y_train[:800000]
X_val, Y_val = X_train[800000:], Y_train[800000:]

LR = LinearRegression().fit(X_fit, Y_fit)
LR_rmse_train = math.sqrt(mean_squared_error(Y_fit, LR.predict(X_fit)))
LR_rmse_val = math.sqrt(mean_squared_error(Y_val, LR.predict(X_val)))
print('Linear regression RMSE - training set = %0.5f' % LR_rmse_train)
print('Linear regression RMSE - validation set = %0.5f' % LR_rmse_val)
# hard-coded reference score, printed for side-by-side comparison
print('Baseline linear regression RMSE (leaderboard) = 0.29892')

Linear regression RMSE - training set = 0.29875
Linear regression RMSE - validation set = 0.29962
Baseline linear regression RMSE (leaderboard) = 0.29892


In [24]:
# linear regression - fit on every training row, then predict the test rows

start = time.time()

LR = LinearRegression().fit(X_train, Y_train)
LR_pred = LR.predict(X_test)
# this RMSE is measured on the same rows the model was fit on
LR_train_pred = LR.predict(X_train)
LR_rmse = math.sqrt(mean_squared_error(Y_train, LR_train_pred))
print('New linear regression RMSE = %0.5f' % LR_rmse)
print('Baseline linear regression RMSE (leaderboard) = 0.29892')
# write_to_file('data/LR.csv', LR_pred)

# elapsed wall-clock time covers fitting, both predict calls and the prints;
# the message keeps the 'baseline' wording used by the earlier timing cell
timer = time.time() - start
print('\n%d seconds to run baseline linear regression' % timer)

New linear regression RMSE = 0.29893
Baseline linear regression RMSE (leaderboard) = 0.29892

3 seconds to run baseline linear regression


In [25]:
# random forest regressor - training & validation split
# (fit on the first 800000 training rows, validate on the remaining rows)
# random_state pins the forest's random sampling so the scores printed below
# are reproducible after a kernel restart.
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train[:800000], Y_train[:800000])
RF_rmse_train = math.sqrt(mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000])))
RF_rmse_val = math.sqrt(mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:])))
print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
# hard-coded reference score, printed for side-by-side comparison
print('Baseline random forest RMSE (leaderboard) = 0.27188')

Random forest RMSE - training set = 0.27161
Random forest RMSE - validation set = 0.27337
Baseline random forest RMSE (leaderboard) = 0.27188


In [26]:
# random forest regressor - fit on every training row, then predict the test rows

start = time.time()

# random_state pins the forest's random sampling so RF_pred and the score
# below are reproducible after a kernel restart.
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)
# this RMSE is measured on the same rows the model was fit on
RF_rmse = math.sqrt(mean_squared_error(Y_train, RF.predict(X_train)))
print('New random forest RMSE = %0.5f' % RF_rmse)
print('Baseline random forest RMSE (leaderboard) = 0.27188')
# write_to_file('data/RF.csv', RF_pred)

# elapsed wall-clock time covers fitting, both predict calls and the prints
timer = time.time() - start
print('\n%d seconds to run baseline random forest' % timer)

New random forest RMSE = 0.27188
Baseline random forest RMSE (leaderboard) = 0.27188

33 seconds to run baseline random forest


<div class="alert alert-info">
<strong>ADD INTERACTIONS</strong>
</div>

In [27]:
# work on an independent copy so columns added to df_new do not touch df_all
df_new = df_all.copy(deep=True)
(df_all.shape, df_new.shape)

((1824230, 32), (1824230, 32))

In [33]:
# identify features to interact: every column except the 'sum_feat' total
# The names are read from df_all (the frame df_new was copied from) instead of
# df_new, so re-running this cell after interaction columns have been added to
# df_new does not pick those products up as inputs.
key_feat = [col for col in df_all.columns if col != 'sum_feat']

In [34]:
# add pairwise interaction (product) columns for the key features
# Each feature is paired with itself and with every feature listed before it,
# so each unordered pair is produced exactly once.
# NOTE(review): the self-pairs are squares; if the inputs are strictly 0/1
# these duplicate the original columns - confirm whether they are wanted.
for i, ft1 in enumerate(key_feat):
    for ft2 in key_feat[:i + 1]:
        # new column name is built from the last three characters of each name
        newft = 'feat_' + ft1[-3:] + '_' + ft2[-3:]
        df_new[newft] = df_new[ft1] * df_new[ft2]

In [35]:
# shape of df_new after the interaction columns were added (columns = feature count)
df_new.shape

(1824230, 528)

In [36]:
# rebuild the training / test matrices from df_new (base + interaction columns)
vals = df_new.values
# rows before test_idx came from the training frame, the rest from the test frame
X_train, X_test = vals[:test_idx], vals[test_idx:]

In [37]:
# linear regression - training & validation split
# (fit on the first 800000 training rows, validate on the remaining rows)
X_fit, Y_fit = X_train[:800000], Y_train[:800000]
X_val, Y_val = X_train[800000:], Y_train[800000:]

LR = LinearRegression().fit(X_fit, Y_fit)
LR_rmse_train = math.sqrt(mean_squared_error(Y_fit, LR.predict(X_fit)))
LR_rmse_val = math.sqrt(mean_squared_error(Y_val, LR.predict(X_val)))
print('Linear regression RMSE - training set = %0.5f' % LR_rmse_train)
print('Linear regression RMSE - validation set = %0.5f' % LR_rmse_val)
# hard-coded reference score, printed for side-by-side comparison
print('Baseline linear regression RMSE (leaderboard) = 0.29892')

Linear regression RMSE - training set = 0.27986
Linear regression RMSE - validation set = 0.28072
Baseline linear regression RMSE (leaderboard) = 0.29892


In [38]:
# linear regression - fit on every training row, then predict the test rows

start = time.time()

LR = LinearRegression().fit(X_train, Y_train)
LR_pred = LR.predict(X_test)
# this RMSE is measured on the same rows the model was fit on
LR_train_pred = LR.predict(X_train)
LR_rmse = math.sqrt(mean_squared_error(Y_train, LR_train_pred))
print('New linear regression RMSE = %0.5f' % LR_rmse)
print('Baseline linear regression RMSE (leaderboard) = 0.29892')
# write_to_file('data/LR.csv', LR_pred)

# elapsed wall-clock time covers fitting, both predict calls and the prints;
# the message keeps the 'baseline' wording used by the earlier timing cell
timer = time.time() - start
print('\n%d seconds to run baseline linear regression' % timer)

New linear regression RMSE = 0.28001
Baseline linear regression RMSE (leaderboard) = 0.29892

240 seconds to run baseline linear regression


In [39]:
# random forest regressor - training & validation split
# (fit on the first 800000 training rows, validate on the remaining rows)
# random_state pins the forest's random sampling so the scores printed below
# are reproducible after a kernel restart.
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train[:800000], Y_train[:800000])
RF_rmse_train = math.sqrt(mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000])))
RF_rmse_val = math.sqrt(mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:])))
print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
# hard-coded reference score, printed for side-by-side comparison
print('Baseline random forest RMSE (leaderboard) = 0.27188')

Random forest RMSE - training set = 0.27161
Random forest RMSE - validation set = 0.27338
Baseline random forest RMSE (leaderboard) = 0.27188


In [40]:
# random forest regressor - fit on every training row, then predict the test rows

start = time.time()

# random_state pins the forest's random sampling so RF_pred and the score
# below are reproducible after a kernel restart.
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)
# this RMSE is measured on the same rows the model was fit on
RF_rmse = math.sqrt(mean_squared_error(Y_train, RF.predict(X_train)))
print('New random forest RMSE = %0.5f' % RF_rmse)
print('Baseline random forest RMSE (leaderboard) = 0.27188')
# write_to_file('data/RF.csv', RF_pred)

# elapsed wall-clock time covers fitting, both predict calls and the prints
timer = time.time() - start
print('\n%d seconds to run baseline random forest' % timer)

New random forest RMSE = 0.27188
Baseline random forest RMSE (leaderboard) = 0.27188

502 seconds to run baseline random forest
