In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from collections import Counter

In [3]:
"""
Read in train and test as Pandas DataFrames
"""
# Load both CSV files in one pass; the first name becomes the training
# frame, the second the test frame.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,1,0,0,0,0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,...,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1.98


In [4]:
# Preview the first five test rows.
df_test.head(n=5)

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [4]:
# Keep the full-data target vector before 'gap' is dropped from df_train in a
# later cell.
Y_train = df_train.gap.values

# Work on a reproducible 100000-row subsample of the training data.
df_sample = df_train.sample(100000, random_state=42)

# Boolean row mask: True (~80%) marks rows for the training part of the sample,
# False the held-out part. A dedicated seeded generator is used so the split is
# the same on every run, like the sample itself.
msk = np.random.RandomState(42).rand(len(df_sample)) < 0.8

# Drop every feature column that is zero for all sampled rows. Columns are
# found by name so that the last feature (feat_256) is checked as well; the
# previous range(1, 256) loop stopped at feat_255.
feature_cols = [col for col in df_sample.columns if col.startswith('feat_')]
zero_cols = [col for col in feature_cols if df_sample[col].sum() == 0]
df_sample = df_sample.drop(zero_cols, axis=1)

In [48]:
# Split the sample with the boolean row mask `msk`: True rows form the training
# part, False rows the held-out part. In each part the 'gap' target is pulled
# out first and then removed from the feature frame.
sample_train_rows = df_sample[msk]
Y_sample_train = sample_train_rows.gap.values
df_sample_train = sample_train_rows.drop(['gap'], axis=1)

sample_test_rows = df_sample[~msk]
Y_sample_test = sample_test_rows.gap.values
df_sample_test = sample_test_rows.drop(['gap'], axis=1)

# Single-argument print(...) emits the same text under Python 2 and Python 3
# (the bare `print a, b` statement is a syntax error on Python 3).
print("Training Sample Shape %s" % (df_sample_train.shape,))
print("Y Training Sample Shape %s" % (Y_sample_train.shape,))
print("Test Sample Shape %s" % (df_sample_test.shape,))
print("Y Test Sample Shape %s" % (Y_sample_test.shape,))

Training Sample Shape (80035, 54)
Y Training Sample Shape (80035,)
Test Sample Shape (19965, 54)
Y Test Sample Shape (19965,)


In [19]:
# Number of training rows; once train and test are stacked, test rows start at
# this position.
test_idx = len(df_train.index)
# Remove the 'Id' column from the test frame ...
df_test = df_test.drop('Id', axis=1)
# ... and the 'gap' target column from the training frame.
df_train = df_train.drop('gap', axis=1)

In [6]:
# Stack the training rows on top of the test rows so feature engineering can be
# applied to both at once.
df_all = pd.concat([df_train, df_test], axis=0)
df_all.head(n=5)

Unnamed: 0,Id,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap,smiles
0,,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1.19,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...
1,,1,0,0,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,1.6,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...
2,,1,0,0,0,1,1,1,0,0,...,0,0,0,1,0,0,0,0,1.49,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...
3,,1,0,0,0,1,1,1,0,0,...,0,0,0,1,0,0,0,0,1.36,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...
4,,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1.98,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1


In [45]:
# Character-count features: one 'smile_<char>' column per character, holding how
# often that character occurs in each row's SMILES string.
# NOTE(review): the character set is taken from the first 50 sampled strings
# only, so characters that first appear later get no column - confirm intended.
# NOTE(review): in file order df_sample_train / df_sample_test were already
# sliced from df_sample, so these columns reach them only if the split cell is
# re-run afterwards - confirm the intended cell order.
smiles = df_sample.smiles
smileydict = smiles.map(lambda text: dict(Counter(text)))
smile_alphabet = list(set(''.join(smiles.iloc[:50])))
for smile in smile_alphabet:
    # dict.get supplies 0 for characters absent from a given string
    df_sample['smile_' + smile] = smileydict.map(lambda counts: counts.get(smile, 0))

In [8]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
# NOTE(review): the two statements below are commented out, so this cell adds
# no column to df_all; the string literal above is then the cell's only
# expression and is echoed as the cell output.
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

In [50]:
# Full-data variant (disabled): drop 'smiles' from df_all and slice the value
# matrix into train / test parts at test_idx.
# df_all = df_all.drop(['smiles'], axis=1)
# vals = df_all.values
# X_train = vals[:test_idx]
# X_test = vals[test_idx:]
# print "Train features:", X_train.shape
# print "Train gap:", Y_train.shape
# print "Test features:", X_test.shape

# Sample variant (active): remove the 'smiles' string column from both sample
# splits.
df_sample_train = df_sample_train.drop('smiles', axis=1)
df_sample_test = df_sample_test.drop('smiles', axis=1)

In [51]:
# Linear regression baseline on the sample: fit on the training split, then
# evaluate on the held-out split.
LR = LinearRegression().fit(df_sample_train, Y_sample_train)

# Coefficients (disabled)
#print('Coefficients: \n', LR.coef_)

# Root-mean-squared error on the held-out split
lr_sample_residuals = LR.predict(df_sample_test) - Y_sample_test
lr_sample_rmse = np.sqrt(np.mean(lr_sample_residuals ** 2))
print("RMSE: %.5f" % lr_sample_rmse)

# Estimator's own score on the held-out split (1 is a perfect prediction)
lr_sample_score = LR.score(df_sample_test, Y_sample_test)
print('Variance score: %.2f' % lr_sample_score)

RMSE: 0.21757
Variance score: 0.72


In [31]:
# Sample Polynomial Interpolation Testing
# For each polynomial degree: expand the features, fit a ridge regression on
# the training split and record the RMSE on the held-out split.
degreearray = []
scorearray = []

# NOTE(review): the recorded run of this cell was interrupted after degree 2;
# degree 3 may be impractically slow on this sample - confirm before re-running.
for degree in range(2, 4):
    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    model.fit(df_sample_train, Y_sample_train)
    score = np.sqrt(np.mean((model.predict(df_sample_test) - Y_sample_test) ** 2))
    # Record degree and score together, only after the fit has finished, so the
    # two lists stay the same length even if a later iteration is interrupted.
    degreearray.append(degree)
    scorearray.append(score)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print("Degree %d RMSE: %f" % (degree, score))

Degree 2 RMSE: 0.282621


KeyboardInterrupt: 

In [38]:
# Original baseline: linear regression fitted on the full training set.
#
# The feature matrices are built here because no other active cell defines
# X_train / X_test (the only definitions are commented out), which made this
# cell fail with a NameError on a fresh kernel.
# df_all holds the training rows followed by the test rows; test_idx is the
# position where the test rows start.
full_feature_values = df_all.drop(['smiles'], axis=1).values
X_train = full_feature_values[:test_idx]
X_test = full_feature_values[test_idx:]

LR = LinearRegression()
LR.fit(X_train, Y_train)

# Predictions for the test rows; this is the array written out as
# "sample1.csv" at the end of the notebook.
LR_pred = LR.predict(X_test)

In [29]:
# Random forest with default settings, fitted on the sample's training split
# and used to predict the held-out split.
# random_state is fixed so repeated runs build the same forest and report the
# same error, matching the seeded sampling used to build df_sample.
RF = RandomForestRegressor(random_state=42)
RF.fit(df_sample_train, Y_sample_train)
RF_pred = RF.predict(df_sample_test)

In [52]:
# Held-out RMSE of a random forest for several forest sizes.
# After the loop, RF and RF_pred belong to the last (largest) forest.
num_trees = [3, 5, 10, 20]
for trees in num_trees:
    # Fixed random_state so the comparison between sizes is reproducible.
    RF = RandomForestRegressor(n_estimators=trees, random_state=42)
    RF.fit(df_sample_train, Y_sample_train)
    RF_pred = RF.predict(df_sample_test)
    # Parenthesised single-argument print works on both Python 2 and Python 3.
    print(np.sqrt(np.mean((RF_pred - Y_sample_test) ** 2)))

0.188096061759
0.177614466818
0.171624280843
0.168089577593


In [30]:
# RMSE of the most recent RF_pred against the held-out sample targets.
np.sqrt(np.mean(np.square(RF_pred - Y_sample_test)))

0.27786663313133098

In [40]:
def write_to_file(filename, predictions):
    """Write predictions to `filename` as a two-column CSV.

    The file starts with the header line ``Id,Prediction``. Each following
    line holds a 1-based row number and ``str()`` of the matching prediction,
    separated by a comma.

    Parameters
    ----------
    filename : path opened in text write mode (an existing file is overwritten)
    predictions : iterable of values to write, in order
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        # Number the rows from 1 and stream them out one line at a time.
        out.writelines(
            str(row_id) + "," + str(value) + "\n"
            for row_id, value in enumerate(predictions, 1)
        )

In [48]:
# Write one submission file per model.
# NOTE(review): confirm that LR_pred and RF_pred hold predictions for the
# df_test rows before submitting; in file order RF_pred was last assigned from
# df_sample_test (the held-out part of the training sample).
for out_name, model_pred in (("sample1.csv", LR_pred), ("sample2.csv", RF_pred)):
    write_to_file(out_name, model_pred)