In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import sklearn.model_selection as ms
from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem.Fingerprints import FingerprintMols

In [2]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.98


In [4]:
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# def make_fold(k, data):
def convert_fp(smile):
    m = Chem.MolFromSmiles(smile)
    print(m)
    # http://www.rdkit.org/docs/GettingStartedInPython.html#fingerprinting-and-molecular-similarity
    return FingerprintMols.FingerprintMol(m)
print (len(df_train['smiles'].head().apply(convert_fp)))



<rdkit.Chem.rdchem.Mol object at 0x00000138422706C0>


KeyError: 'bitsPerHash'

In [9]:
%%capture
#store gap values
Y_train = df_train.gap.values
#row where testing examples start
test_idx = df_train.shape[0]
#delete 'Id' column
# df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column

df_train = df_train.drop(['gap'], axis=1)
# code from docs about fingerprinting
# ms = Chem.MolFromSmiles(df_train['smiles'])
# fps = [FingerprintMols.FingerprintMol(x) for x in ms]

# split training set into validation set to check rmse
df_train = df_train.drop(['smiles'], axis=1)
X_train, X_validate, y_train, y_validate = ms.train_test_split(df_train, Y_train, test_size=0.2, random_state=42)

In [10]:
# #DataFrame with all train and test examples so we can more easily apply feature engineering on
# df_all = pd.concat((df_train, df_test), axis=0)
# df_all.head()

In [11]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

In [12]:
# #Drop the 'smiles' column
# df_all = df_all.drop(['smiles'], axis=1)
# vals = df_all.values
# X_train = vals[:test_idx]
# X_test = vals[test_idx:]
# print "Train features:", X_train.shape
# print "Train gap:", Y_train.shape
# print "Test features:", X_test.shape
X_train.head()

Unnamed: 0,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
566853,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
382311,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
241519,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
719220,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
905718,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
LR = LinearRegression()
LR.fit(X_train, y_train)
LR_pred = LR.predict(X_validate)

In [14]:
RF = RandomForestRegressor()
RF.fit(X_train, y_train)
RF_pred = RF.predict(X_validate)

In [22]:
for i in np.arange(30, 210, 10):
    MLP = MLPRegressor(hidden_layer_sizes=(i,))
    MLP.fit(X_train, y_train)
    MLP_pred = MLP.predict(X_validate)
    print ("layer size: %d" % i)for i in np.arange(30, 210, 10):
    MLP = MLPRegressor(hidden_layer_sizes=(i,))
    MLP.fit(X_train, y_train)
    MLP_pred = MLP.predict(X_validate)
    print ("layer size: %d" % i)
    print (rmse(MLP_pred, y_validate))
    print (rmse(MLP_pred, y_validate))

layer size: 30
0.27868018645
layer size: 40
0.278063942675
layer size: 50
0.277013552636
layer size: 60
0.276017448522
layer size: 70
0.276049560085
layer size: 80
0.275505090495
layer size: 90
0.276229289181
layer size: 100
0.27613835309
layer size: 110
0.276061288217
layer size: 120
0.275038242309
layer size: 130
0.282767715154
layer size: 140
0.275184978214
layer size: 150
0.275588051196
layer size: 160
0.27496848515
layer size: 170
0.27504779328
layer size: 180
0.275710338253
layer size: 190
0.275023196674
layer size: 200
0.27505688205


In [1]:
for i in np.arange(210, 400, 10):
    MLP = MLPRegressor(hidden_layer_sizes=(i,), early_stopping = True)
    MLP.fit(X_train, y_train)
    MLP_pred = MLP.predict(X_validate)
    print ("layer size: %d" % i)
    print (rmse(MLP_pred, y_validate))

NameError: name 'np' is not defined

In [16]:
clf = BayesianRidge(compute_score=True)
clf.fit(X_train, y_train)
ridge_pred = clf.predict(X_validate)

In [17]:
def rmse(predictions, actual):
    return (np.sum((predictions - actual)**2) / len(predictions))**.5

In [18]:
print(rmse(LR_pred, y_validate))
print(rmse(RF_pred, y_validate))
print(rmse(MLP_pred, y_validate))
print(rmse(ridge_pred, y_validate))


0.29962897606
0.272792190702
0.27734905536
0.299629749179


In [19]:
# def write_to_file(filename, predictions):
#     with open(filename, "w") as f:
#         f.write("Id,Prediction\n")
#         for i,p in enumerate(predictions):
#             f.write(str(i+1) + "," + str(p) + "\n")

In [20]:
# write_to_file("sample1.csv", LR_pred)
# write_to_file("sample2.csv", RF_pred)