In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn import linear_model
from sklearn import neighbors
from sklearn.neighbors import KNeighborsRegressor
from collections import Counter

In [4]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [5]:
# Store gap values
Y_train = df_train.gap.values

# Take a sample with training and validation sets
df_sample = df_train.sample(100000)
msk = np.random.rand(len(df_sample)) < 0.8

# Remove 0 columns (columns with no data)
zero_cols = []
for i in range(1,257):
    if df_sample['feat_%03d' % i].sum() == 0:
        zero_cols.append('feat_%03d' % i)
df_sample = df_sample.drop(zero_cols, axis=1)

In [6]:
# Add smile character counts
smiles = df_sample.smiles
smileydict = smiles.map(lambda x: dict(Counter(x)))
smile_alphabet=list(set(''.join(smiles.iloc[0:50])))
for smile in smile_alphabet:
    smilechar = smile
    if smile == '=':
        smilechar = 'equal'
    df_sample['smile_'+smilechar] = smileydict.map(lambda x: x[smile] if smile in x.keys() else 0)

In [7]:
# Feature Engineering
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)

# Add length of smile
df_sample['smile_length'] = df_sample.smiles.map(lambda x: len(x))

# Add number of C's divided by length
df_sample['smile_percentc'] = (df_sample.smile_c / df_sample.smile_length)
df_sample['smile_percentC'] = (df_sample.smile_C / df_sample.smile_length)

# Count specific molecules
# [nH]
df_sample['smile_nh'] = df_sample.smiles.map(lambda x: x.count('[nH]'))
df_sample['smile_si'] = df_sample.smiles.map(lambda x: x.count('Si'))
df_sample['smile_sih2'] = df_sample.smiles.map(lambda x: x.count('[SiH2]'))
df_sample['smile_se'] = df_sample.smiles.map(lambda x: x.count('[se]'))
df_sample['smile_CdoubleC'] = df_sample.smiles.map(lambda x: x.count('C=C'))
df_sample['smile_doubleC'] = df_sample.smiles.map(lambda x: x.count('CC'))
df_sample['smile_doublec'] = df_sample.smiles.map(lambda x: x.count('cc'))
df_sample['smile_triplec'] = df_sample.smiles.map(lambda x: x.count('ccc'))
df_sample['smile_quadc'] = df_sample.smiles.map(lambda x: x.count('cccc'))
df_sample['smile_C1equalCc2'] = df_sample.smiles.map(lambda x: x.count('C1=Cc2'))
df_sample['smile_C1'] = df_sample.smiles.map(lambda x: x.count('C1'))
df_sample['smile_c1'] = df_sample.smiles.map(lambda x: x.count('c1'))
df_sample['smile_equalCCCequal'] = df_sample.smiles.map(lambda x: x.count('=CCC='))
df_sample['smile_parenC1'] = df_sample.smiles.map(lambda x: x.count('(C1)'))
df_sample['smile_parenc1'] = df_sample.smiles.map(lambda x: x.count('(c1)'))
df_sample['smile_parencc1'] = df_sample.smiles.map(lambda x: x.count('(cc1)'))
df_sample['smile_pareno1'] = df_sample.smiles.map(lambda x: x.count('(o1)'))
df_sample['smile_pareno1'] = df_sample.smiles.map(lambda x: x.count('(o1)'))
df_sample['smile_percent_aromatic'] = (df_sample.smile_c + df_sample.smile_o + df_sample.smile_n + df_sample.smile_s / df_sample.smile_length)

# Start
df_sample['smile_start_C1'] = df_sample.smiles.map(lambda x: x.startswith('C1'))
df_sample['smile_start_C1equal'] = df_sample.smiles.map(lambda x: x.startswith('C1='))
df_sample['smile_start_c1'] = df_sample.smiles.map(lambda x: x.startswith('c1'))
df_sample['smile_start_c1sc'] = df_sample.smiles.map(lambda x: x.startswith('c1sc'))
df_sample['smile_start_c1ccc'] = df_sample.smiles.map(lambda x: x.startswith('c1ccc'))
df_sample['smile_start_nH'] = df_sample.smiles.map(lambda x: x.startswith('[nH]'))

# End
df_sample['smile_end_c1ccc'] = df_sample.smiles.map(lambda x: x.endswith('c1ccc'))
df_sample['smile_end_o1'] = df_sample.smiles.map(lambda x: x.endswith('o1'))
df_sample['smile_end_ccsc12'] = df_sample.smiles.map(lambda x: x.endswith('ccsc12'))


#df_sample['smile_percent_bond'] = df_sample.smile_equal / df_sample.smile_length

df_sample = df_sample.drop('smile_length', axis=1)

In [8]:
# Create the training sample
df_sample_train = df_sample[msk]
Y_sample_train = df_sample_train.gap.values
df_sample_train = df_sample_train.drop(['gap'], axis=1)

# Create the testing sample
df_sample_test = df_sample[~msk]
Y_sample_test = df_sample_test.gap.values
df_sample_test = df_sample_test.drop(['gap'], axis=1)

print "Training Sample Shape", df_sample_train.shape
print "Y Training Sample Shape", Y_sample_train.shape
print "Test Sample Shape", df_sample_test.shape
print "Y Test Sample Shape", Y_sample_test.shape

Training Sample Shape (80226, 82)
Y Training Sample Shape (80226,)
Test Sample Shape (19774, 82)
Y Test Sample Shape (19774,)


In [57]:
#row where testing examples start
test_idx = df_train.shape[0]
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)

In [58]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = pd.concat((df_train, df_test), axis=0)
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [9]:
#Drop the 'smiles' column
#df_all = df_all.drop(['smiles'], axis=1)
#vals = df_all.values
#X_train = vals[:test_idx]
#X_test = vals[test_idx:]
#print "Train features:", X_train.shape
#print "Train gap:", Y_train.shape
#print "Test features:", X_test.shape

df_sample_train = df_sample_train.drop(['smiles'], axis=1)
df_sample_test = df_sample_test.drop(['smiles'], axis=1)

In [10]:
# Sample Linear Regression Testing
LR = LinearRegression()

# Train the model using the training sets
LR.fit(df_sample_train, Y_sample_train)

# The coefficients
#print('Coefficients: \n', LR.coef_)
# RMSE
print("RMSE: %.3f" % np.sqrt(np.mean((LR.predict(df_sample_test) - Y_sample_test) ** 2)))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.3f' % LR.score(df_sample_test, Y_sample_test))

RMSE: 0.208
Variance score: 0.741


In [11]:
# Sample Polynomial Interpolation Testing
Poly = make_pipeline(PolynomialFeatures(degree), Ridge())
Poly.fit(df_sample_train, Y_sample_train)

print "RMSE: %.3f" % np.sqrt(np.mean((Poly.predict(df_sample_test) - Y_sample_test) ** 2))

KeyboardInterrupt: 

In [12]:
# Sample Ridge Testing
Ridge = linear_model.Ridge(alpha = .001)

Ridge.fit(df_sample_train, Y_sample_train)

print("RMSE: %.3f" % np.sqrt(np.mean((Ridge.predict(df_sample_test) - Y_sample_test) ** 2)))
print('Variance score: %.3f' % Ridge.score(df_sample_test, Y_sample_test))

RMSE: 0.209
Variance score: 0.741


In [13]:
# Sample Lasso Testing
Lasso = linear_model.Lasso(alpha = .001)

Lasso.fit(df_sample_train, Y_sample_train)

print("RMSE: %.3f" % np.sqrt(np.mean((Lasso.predict(df_sample_test) - Y_sample_test) ** 2)))
print('Variance score: %.3f' % Lasso.score(df_sample_test, Y_sample_test))

RMSE: 0.213
Variance score: 0.729




In [14]:
# KNN
KNN = neighbors.KNeighborsRegressor(n_neighbors=12)

KNN.fit(df_sample_train, Y_sample_train)

#print "N_neighbors: %d" % 12
print("RMSE: %.3f" % np.sqrt(np.mean((KNN.predict(df_sample_test) - Y_sample_test) ** 2)))
#print('Variance score: %.3f' % KNN.score(df_sample_test, Y_sample_test))

RMSE: 0.186


In [17]:
# Random Forest Testing
RF = RandomForestRegressor(n_estimators=40, min_samples_split=8)
RF.fit(df_sample_train, Y_sample_train)

print("RMSE: %.3f" % np.sqrt(np.mean((RF.predict(df_sample_test) - Y_sample_test) ** 2)))

RMSE: 0.151


In [18]:
# Building an Ensemble
lr_predictions = LR.predict(df_sample_train)
ridge_predictions = Ridge.predict(df_sample_train)
lasso_predictions = Lasso.predict(df_sample_train)
knn_predictions = KNN.predict(df_sample_train)
rf_predictions = RF.predict(df_sample_train)


dfensemble=pd.DataFrame.from_dict({'lr':lr_predictions,
                                   'ridge':ridge_predictions,
                                   'lasso':lasso_predictions, 
                                   'knn':knn_predictions,
                                   'rf':rf_predictions,
                                   'y':Y_sample_train})

est = LinearRegression()
est.fit(dfensemble[['lr', 'ridge', 'lasso', 'knn', 'rf']].values, dfensemble['y'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
# Testing the Ensemble
lr_predictions_test = LR.predict(df_sample_test)
ridge_predictions_test = Ridge.predict(df_sample_test)
lasso_predictions_test = Lasso.predict(df_sample_test)
knn_predictions_test = KNN.predict(df_sample_test)
rf_predictions_test = RF.predict(df_sample_test)

dfensembletest=pd.DataFrame.from_dict({'lr':lr_predictions_test,
                                   'ridge':ridge_predictions_test,
                                   'lasso':lasso_predictions_test,
                                   'knn':knn_predictions_test,
                                   'rf':rf_predictions_test,
                                   'y':Y_sample_test})

epreds = est.predict(dfensembletest[['lr', 'ridge', 'lasso', 'knn', 'rf']].values)

print("RMSE: %.3f" % np.sqrt(np.mean((epreds - Y_sample_test) ** 2)))
#print('Variance score: %.3f' % est.score(df_sample_test, Y_sample_test))

RMSE: 0.156


In [None]:
# Original

LR = LinearRegression()
LR.fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

In [None]:
RF = RandomForestRegressor()
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

In [None]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [None]:
write_to_file("sample1.csv", LR_pred)
write_to_file("sample2.csv", RF_pred)