In [1]:
import sys
sys.version

'2.7.11 |Anaconda 2.4.1 (x86_64)| (default, Dec  6 2015, 18:57:58) \n[GCC 4.2.1 (Apple Inc. build 5577)]'

In [2]:
import time
import numpy as np
import math
import pandas as pd
import itertools

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline
import matplotlib.pyplot as plt

<div class="alert alert-info">
<strong>LOAD DATA & CREATE FEATURES</strong>
</div>

In [3]:
# read the full training and test tables from their CSV files
df_train, df_test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [4]:
# # smaller dataframe for testing purposes
# df_train = df_train[:100]
# df_test = df_test[:100]

In [5]:
# keep the 'gap' column (the regression target) as a plain array
Y_train = df_train['gap'].values

# number of training rows, i.e. the position where the test rows begin once
# the two frames are stacked
test_idx = len(df_train)

# 'gap' is the target, so it is removed from the training features
df_train = df_train.drop('gap', axis=1)

# 'Id' is removed from the test frame
df_test = df_test.drop('Id', axis=1)

In [6]:
# (rows, columns) of each frame after 'gap' / 'Id' were dropped above;
# the column counts need to match for the frames to be stacked cleanly
df_train.shape, df_test.shape

((1000000, 257), (824230, 257))

In [7]:
# Stack the train and test rows into one frame so feature engineering is
# applied to both in exactly the same way. ignore_index=True gives every row
# a unique 0..n-1 label; without it the test rows would reuse the training
# rows' labels, leaving later label-based alignment ambiguous.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

# pull the 'smiles' strings into their own frame (same row labels as df_all)
# so they can be processed separately
df_smiles = pd.DataFrame(df_all.smiles.values, columns=['smiles'], index=df_all.index.values)

# df_all keeps every other column
df_all = df_all.drop(['smiles'], axis=1)

df_all.shape, df_smiles.shape

((1824230, 256), (1824230, 1))

In [8]:
# drop columns that are all zeros

# per-column totals, kept for inspection
df_all_sum = pd.DataFrame(df_all.sum(axis=0), index=df_all.columns, columns=['SUM'])

# A zero total does not prove a column is all zeros (positive and negative
# entries could cancel), so test each column for any non-zero value instead.
has_nonzero = df_all.any(axis=0)
df_all_zeros = has_nonzero[~has_nonzero].index.values

# reassign instead of mutating in place
df_all = df_all.drop(df_all_zeros, axis=1)
df_all.shape

(1824230, 31)

In [9]:
# count of provided feature columns still present in df_all
num_orig_features = len(df_all.columns)
num_orig_features

31

In [10]:
def split_smiles(smile, chars):
    """Return every contiguous substring of length `chars` found in `smile`.

    The substrings are produced left to right with a step of one, so
    neighbouring substrings overlap. The result is an empty list when
    `smile` is shorter than `chars`.
    """
    last_start = len(smile) - chars
    return [smile[start:start + chars] for start in range(last_start + 1)]

In [11]:
# build, for every SMILES string, the list of its overlapping i-character substrings

i = 2 # max=63

# Only the 'smiles' column is transformed. Applying split_smiles to the whole
# frame would also touch any list / count columns added by an earlier run of
# the notebook (and fail on them), and would copy the full frame first.
df_tmp = df_smiles[['smiles']].applymap(lambda x: split_smiles(x, i))
df_smiles['SEQ_' + str(i) + '_CHARS'] = df_tmp.smiles.copy()

In [12]:
# sorted list of the distinct substrings found in any row;
# chain.from_iterable walks the per-row lists lazily instead of unpacking
# every row as a separate function argument
unique_feat = sorted(set(itertools.chain.from_iterable(df_smiles['SEQ_' + str(i) + '_CHARS'].values)))
len(unique_feat)

159

In [13]:
# turn every distinct substring into a count feature

i = 2 # max=63

# The source column is the same for every feature, so look it up once.
# Series.map returns a new Series, so no defensive copy is needed.
seq_col = 'SEQ_' + str(i) + '_CHARS'
seq_lists = df_smiles[seq_col]

for f in unique_feat:
    # how many times substring f occurs in each row's substring list
    df_smiles[seq_col + '_' + f] = seq_lists.map(lambda x: x.count(f))

In [14]:
# columns: 'smiles' + the substring-list column + one count column per distinct substring
df_smiles.shape

(1824230, 161)

In [15]:
# df_smiles.to_csv('data/smiles_features_2char.csv') 

In [16]:
i = 2 # max=63

# place the substring-count features side by side with the original features,
# then discard the two helper columns that are not numeric features:
# the raw 'smiles' string and the per-row substring list
helper_cols = ['smiles', 'SEQ_' + str(i) + '_CHARS']
df_enh = pd.concat([df_smiles, df_all], axis=1)
df_enh = df_enh.drop(helper_cols, axis=1)

# resulting (rows, columns)
df_enh.shape

(1824230, 190)

In [17]:
# df_enh.to_csv('data/enhanced_features_2char.csv')

In [18]:
# undo the stacking: rows before test_idx are training rows, the rest are test rows
lcols = df_enh.columns.tolist()
vals = df_enh.values
X_train, X_test = vals[:test_idx], vals[test_idx:]

# report the shapes of the resulting arrays
print('Train features: %s' % (X_train.shape,))
print('Train gap: %s' % (Y_train.shape,))
print('Test features: %s' % (X_test.shape,))

Train features: (1000000, 190)
Train gap: (1000000,)
Test features: (824230, 190)


<div class="alert alert-info">
<strong>PREDICTORS</strong>
</div>

In [19]:
def write_to_file(filename, predictions):
    """Write `predictions` to `filename` as a two-column CSV.

    The first line is the header ``Id,Prediction``. Each following line holds
    a 1-based row number and the ``str()`` of the corresponding prediction.
    An existing file at `filename` is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        for row_id, value in enumerate(predictions, 1):
            out.write("%d,%s\n" % (row_id, str(value)))

**Linear Regression**

In [20]:
# # linear regression - training & validation split
# LR = LinearRegression()
# LR.fit(X_train[:800000], Y_train[:800000])
# LR_rmse_train = math.sqrt(mean_squared_error(Y_train[:800000], LR.predict(X_train[:800000])))
# LR_rmse_val = math.sqrt(mean_squared_error(Y_train[800000:], LR.predict(X_train[800000:])))
# print 'Linear regression RMSE - training set = %0.5f' % LR_rmse_train
# print 'Linear regression RMSE - validation set = %0.5f' % LR_rmse_val
# print 'Baseline linear regression RMSE = 0.29892'

In [21]:
# # linear regression - training & test split
# LR = LinearRegression()
# LR.fit(X_train, Y_train)
# LR_pred = LR.predict(X_test)
# LR_rmse = math.sqrt(mean_squared_error(Y_train, LR.predict(X_train)))
# print 'New linear regression RMSE = %0.5f' % LR_rmse
# print 'Baseline linear regression RMSE = 0.29892'
# write_to_file('data/LR_2char.csv', LR_pred)

**Random Forest**

In [22]:
# random forest regressor - training & validation split

# Fit on the first four fifths of the training rows and validate on the rest.
# The split point is derived from the data size (800000 for the full
# 1,000,000-row set) so the cell also works on a reduced training set.
n_fit = X_train.shape[0] * 4 // 5

# fixed seed so the forest, and therefore the scores below, are reproducible
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train[:n_fit], Y_train[:n_fit])

# RMSE on the rows used for fitting, and on the held-out rows
RF_rmse_train = math.sqrt(mean_squared_error(Y_train[:n_fit], RF.predict(X_train[:n_fit])))
RF_rmse_val = math.sqrt(mean_squared_error(Y_train[n_fit:], RF.predict(X_train[n_fit:])))
print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.05110
Random forest RMSE - validation set = 0.10020
Baseline random forest RMSE = 0.27188


In [23]:
# random forest regressor - fit on all training rows, then predict the test set

# fixed seed so the written predictions can be regenerated exactly
RF = RandomForestRegressor(random_state=0)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

# NOTE: this RMSE is computed on the same rows the model was fitted on, so it
# is an in-sample figure and will look better than performance on unseen data.
RF_rmse = math.sqrt(mean_squared_error(Y_train, RF.predict(X_train)))
print('New random forest RMSE = %0.5f' % RF_rmse)
print('Baseline random forest RMSE = 0.27188')

# save the test-set predictions in the Id,Prediction CSV format
write_to_file('data/RF_2char.csv', RF_pred)

New random forest RMSE = 0.05146
Baseline random forest RMSE = 0.27188
