# Generate Tanimoto Similarity Coefficients

- Used RDKit to calculate the Tanimoto similarity coefficient between each molecule and 200 representative molecules that were evenly spaced across the full range of HOMO-LUMO gaps, from min to max.
- Also added chlorophyll to the list of representative molecules out of curiosity, given its role in photosynthesis

## Table of Contents

1. [Load Data](#section1)
2. [Baseline Models](#section2)
3. [Generate Similarity Coefficients](#section3)
4. [Random Forest - 200 similarities, 3 bond types, original 256-bits](#section4)
5. [Random Forest - 3 char sequences, 3 bond types](#section5)
6. [Random Forest - 200 similarities, 3 bond types, 3-char sequence](#section6)

In [3]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import scipy as sp
import math
from scipy.stats import pearsonr
import time

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

%matplotlib inline 
import matplotlib.pyplot as plt 
pd.options.display.mpl_style = 'default'

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")




 <a id = "section1"></a>
## Load All Data

In [4]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [5]:
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,...,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.98


In [6]:
df_train.shape

(1000000, 258)

In [7]:
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,...,feat_207,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [8]:
df_test.shape

(824230, 258)

In [9]:
# Targets: the gap column of the training set
Y_train = df_train['gap'].values

# Number of training rows == position at which test rows start in the stacked frame
test_idx = len(df_train)

# Strip the columns the two frames do not share ('gap' from train, 'Id' from test)
df_train2 = df_train.drop('gap', axis=1)
df_test2 = df_test.drop('Id', axis=1)

# Stack train on top of test so feature engineering can be applied to both at once
df_all = pd.concat([df_train2, df_test2], axis=0)

# Keep the SMILES strings; the column is dropped from df_all in a later cell
all_smiles = df_all['smiles']

In [10]:
df_all.shape

(1824230, 257)

In [11]:
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,...,feat_207,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [12]:
# drop smiles
df_all = df_all.drop(['smiles'], axis=1)

In [13]:
# reset index
df_all = df_all.reset_index().drop(['index'], axis=1)

In [14]:
df_all.head()

Unnamed: 0,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,feat_050,...,feat_207,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
3,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


### Utility Functions

In [109]:
def get_pred_rmse(model, dataframe, columns, model_name, track_dict, ytrain, test_idx=None, mask=None, train_size=None,
                  parameters=None, score_func='mean_squared_error', n_folds=5):
    """
    Function
    --------
    get_pred_rmse

    Fit `model` on the selected columns, print and record its RMSE, and
    optionally return predictions for the rows held back for prediction.

    Inputs
    ------
    model : an estimator exposing fit(X, y) and predict(X)
    dataframe : DataFrame whose first rows are training examples, optionally
        followed by the rows to predict on
    columns : column label(s) used to select the features (dataframe[columns])
    model_name : key under which the result is stored in track_dict
    track_dict : dict updated in place; track_dict[model_name] becomes
        [fitted model, training RMSE] with the held-out RMSE appended when a
        held-out set was created via mask or train_size
    ytrain : target values for the training rows
    test_idx : row position where the prediction rows start; when falsy, every
        row is treated as training data and nothing is predicted
    mask : boolean sequence over the training rows; True rows are used for
        fitting, False rows are held out
    train_size : forwarded to create_train_test_split (suggested: 0.8)
    parameters : parameter grid; when given, cv_optimize picks the estimator
    score_func, n_folds : forwarded to cv_optimize

    Returns
    -------
    The predictions for dataframe rows from test_idx onward when test_idx is
    given, otherwise None.

    Raises
    ------
    ValueError if both mask and train_size are supplied.
    """
    # `mask` may be a NumPy array, whose truth value is ambiguous, so test identity.
    if mask is not None and train_size:
        raise ValueError('Only one of mask or train_size should have arguments, not both.')

    vals = dataframe[columns].values

    # Separate prediction data out
    x_pred = None
    if test_idx:
        x_train = vals[:test_idx]
        x_pred = vals[test_idx:]
    else:
        x_train = vals

    # Separate data further into training and held-out sets
    x_test = None
    ytest = None
    if train_size:
        # NOTE(review): assumes create_train_test_split returns
        # (x_train, y_train, x_test, y_test) in that order -- confirm against its definition.
        x_train, ytrain, x_test, ytest = create_train_test_split(x_train, ytrain, train_size)
    elif mask is not None:
        mask = np.asarray(mask, dtype=bool)
        ytrain = np.asarray(ytrain)
        # Features and targets must be split with the same side of the mask;
        # take the held-out side first, before x_train/ytrain are overwritten.
        x_test, ytest = x_train[~mask], ytrain[~mask]
        x_train, ytrain = x_train[mask], ytrain[mask]

    # run cross-validation to find best hyper parameters
    if parameters:
        model = cv_optimize(model, parameters, x_train, ytrain, n_folds=n_folds, score_func=score_func)

    # Fit on entire training set
    model = model.fit(x_train, ytrain)

    rmse_train = math.sqrt(mean_squared_error(ytrain, model.predict(x_train)))
    print('Training set RMSE = %0.5f' % rmse_train)

    # Keep track of model
    track_dict[model_name] = [model, rmse_train]

    # Held-out error, whenever a held-out set exists (train_size or mask)
    if x_test is not None:
        rmse_val = math.sqrt(mean_squared_error(ytest, model.predict(x_test)))
        print('Test set RMSE = %0.5f' % rmse_val)

        track_dict[model_name].append(rmse_val)

    # Create predictions on prediction data
    if x_pred is not None:
        return model.predict(x_pred)
    return None

In [110]:
# TODO check this works: scoring function 'mean_squared_error' particularly
def cv_optimize(clf, parameters, X, y, n_folds=5, score_func=None):
    """
    Function
    --------
    cv_optimize

    Run a cross-validated grid search over `parameters` and hand back the
    winning estimator.

    Inputs
    ------
    clf : a scikit-learn estimator instance
    parameters : the parameter grid dictionary given to GridSearchCV as param_grid
    X : a samples-features matrix in the scikit-learn style
    y : the target values matching the rows of X
    n_folds : given to GridSearchCV as cv (default 5)
    score_func : given to GridSearchCV as scoring when truthy; otherwise
        GridSearchCV is built without a scoring argument (default None)

    Returns
    -------
    The best_estimator_ attribute of the GridSearchCV after it has been fit
    on X and y.

    Notes
    -----
    Prints the best parameters, the best score and the grid scores of the
    search on a single line prefixed with "BEST".
    """
    search_kwargs = {'param_grid': parameters, 'cv': n_folds}
    if score_func:
        search_kwargs['scoring'] = score_func

    searcher = GridSearchCV(clf, **search_kwargs)
    searcher.fit(X, y)

    print("BEST %s %s %s" % (searcher.best_params_, searcher.best_score_, searcher.grid_scores_))

    return searcher.best_estimator_

In [None]:
def write_to_file(filename, predictions):
    """
    Write predictions to `filename` as a two-column CSV submission file.

    The file starts with the header "Id,Prediction" followed by one line per
    prediction. Ids are 1-based and follow the order of `predictions`.

    Inputs
    ------
    filename : path of the file to create (overwritten if it exists)
    predictions : iterable of predicted values, in test-set row order
    """
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        # Ids start at 1, hence the enumerate offset
        for i, p in enumerate(predictions, 1):
            f.write("%d,%s\n" % (i, p))

 <a id = "section2"></a>
## Baseline Models

In [15]:
# baseline scores
scores = {'leaderboard_RF': 0.27207, 'leaderboard_LR': 0.29846}

In [16]:
LR = LinearRegression()

# get_pred_rmse takes the feature columns as its third argument; use every column of df_all
LR_pred = get_pred_rmse(LR, df_all, df_all.columns, 'LR_baseline', scores, Y_train, test_idx=test_idx)

Training set RMSE = 0.29893
Your RMSE - LR baseline rmse = 0.00047
Your RMSE - RF baseline rmse = 0.02686
Train features: (1000000, 256)
Train gap: (1000000,)
Test features: (824230, 256)


In [17]:
RF = RandomForestRegressor()

# get_pred_rmse takes the feature columns as its third argument; use every column of df_all
RF_pred = get_pred_rmse(RF, df_all, df_all.columns, 'RF_baseline', scores, Y_train, test_idx=test_idx)

Training set RMSE = 0.27188
Your RMSE - LR baseline rmse = -0.02658
Your RMSE - RF baseline rmse = -0.00019
Train features: (1000000, 256)
Train gap: (1000000,)
Test features: (824230, 256)


In [18]:
scores

{'LR_baseline': 0.29893086588711254,
 'RF_baseline': 0.27187609725038553,
 'leaderboard_LR': 0.29846,
 'leaderboard_RF': 0.27207}

 <a id = "section3"></a>
## Generate Similarity Coefficients

In [19]:
from rdkit import Chem, RDConfig
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs

In [20]:
df_train_sorted = df_train.sort_values(['gap'], ascending=True).reset_index()

In [None]:
df_200 = df_cumul.copy()

In [21]:
# 200 row positions evenly spaced over the gap-sorted training set, one per representative molecule
test_array = np.linspace(0, df_train_sorted.shape[0]-1, 200, dtype=int)

In [22]:
test_array

array([     0,   5025,  10050,  15075,  20100,  25125,  30150,  35175,
        40200,  45226,  50251,  55276,  60301,  65326,  70351,  75376,
        80401,  85427,  90452,  95477, 100502, 105527, 110552, 115577,
       120602, 125628, 130653, 135678, 140703, 145728, 150753, 155778,
       160803, 165828, 170854, 175879, 180904, 185929, 190954, 195979,
       201004, 206029, 211055, 216080, 221105, 226130, 231155, 236180,
       241205, 246230, 251256, 256281, 261306, 266331, 271356, 276381,
       281406, 286431, 291456, 296482, 301507, 306532, 311557, 316582,
       321607, 326632, 331657, 336683, 341708, 346733, 351758, 356783,
       361808, 366833, 371858, 376884, 381909, 386934, 391959, 396984,
       402009, 407034, 412059, 417085, 422110, 427135, 432160, 437185,
       442210, 447235, 452260, 457285, 462311, 467336, 472361, 477386,
       482411, 487436, 492461, 497486, 502512, 507537, 512562, 517587,
       522612, 527637, 532662, 537687, 542713, 547738, 552763, 557788,
      

In [23]:
# Fingerprint the molecule found at each sampled row of the gap-sorted frame.
# Column position 1 is read as the SMILES string (reset_index() put the old index at position 0).
smiles_fps = [
    FingerprintMols.FingerprintMol(Chem.MolFromSmiles(df_train_sorted.iat[row_num, 1]))
    for row_num in test_array
]

In [24]:
# add chlorophyll as #201
# Raw string: the SMILES contains a backslash (bond-direction marker) that must not be read as an escape
chpyl = r'CCC1=C(C2=NC1=CC3=C(C4=C([N-]3)C(=C5[C@H]([C@@H](C(=N5)C=C6C(=C(C(=C2)[N-]6)C=C)C)C)CCC(=O)OC/C=C(\C)/CCCC(C)CCCC(C)CCCC(C)C)[C@H](C4=O)C(=O)OC)C)C.[Mg+2]'

c_fp = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(chpyl))

smiles_fps.append(c_fp)

In [25]:
def smiles_to_simil(all_smiles, sample_fps, start, nexts):
    """
    Compute Tanimoto similarities between a slice of molecules and the
    sample fingerprints.

    Inputs
    ------
    all_smiles : Series of SMILES strings; only all_smiles[start:nexts] is used
    sample_fps : fingerprints every molecule in the slice is compared against
    start, nexts : bounds of the slice (start inclusive, nexts exclusive)

    Returns
    -------
    DataFrame with one row per molecule in the slice, holding the values
    returned by DataStructs.BulkTanimotoSimilarity for that molecule against
    sample_fps, each rounded to 5 decimal places.
    """
    rows = []
    for smi in all_smiles[start:nexts].astype(str):
        fingerprint = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smi))
        similarities = DataStructs.BulkTanimotoSimilarity(fingerprint, sample_fps)
        rows.append([round(value, 5) for value in similarities])

    return pd.DataFrame(np.vstack(rows))

In [26]:
# go in chunks of 20,000
chunk = 20000

# size of the full thing
sizeofdf = df_all.shape[0]

# Collect the per-chunk frames and concatenate once at the end; concatenating
# inside the loop would re-copy the accumulated frame on every iteration.
partial_dfs = []
for start in range(0, sizeofdf, chunk):
    # The last chunk is clipped to the end of the data
    nexts = min(start + chunk, sizeofdf)
    print('%d %d' % (start, nexts))

    partial_dfs.append(smiles_to_simil(all_smiles, smiles_fps, start, nexts))

df_cumul = pd.concat(partial_dfs, axis=0)

0 20000
20000 40000
40000 60000
60000 80000
80000 100000
100000 120000
120000 140000
140000 160000
160000 180000
180000 200000
200000 220000
220000 240000
240000 260000
260000 280000
280000 300000
300000 320000
320000 340000
340000 360000
360000 380000
380000 400000
400000 420000
420000 440000
440000 460000
460000 480000
480000 500000
500000 520000
520000 540000
540000 560000
560000 580000
580000 600000
600000 620000
620000 640000
640000 660000
660000 680000
680000 700000
700000 720000
720000 740000
740000 760000
760000 780000
780000 800000
800000 820000
820000 840000
840000 860000
860000 880000
880000 900000
900000 920000
920000 940000
940000 960000
960000 980000
980000 1000000
1000000 1020000
1020000 1040000
1040000 1060000
1060000 1080000
1080000 1100000
1100000 1120000
1120000 1140000
1140000 1160000
1160000 1180000
1180000 1200000
1200000 1220000
1220000 1240000
1240000 1260000
1260000 1280000
1280000 1300000
1300000 1320000
1320000 1340000
1340000 1360000
1360000 1380000
1380000 

In [27]:
df_cumul.shape

(1824230, 201)

In [28]:
# reset and drop the old index
df_cumul = df_cumul.reset_index().drop(['index'], axis=1)

In [29]:
# Check
df_cumul.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200
0,0.411624,0.587703,0.467847,0.651353,0.592127,0.66091,0.730488,0.701887,0.645665,0.475722,0.714634,0.446686,0.546675,0.44572,0.557108,0.556492,0.538506,0.560263,0.512687,0.451477,0.533406,0.522026,0.533573,0.528627,0.582575,0.530795,0.458458,0.602496,0.592633,0.546092,0.473304,0.51768,0.563321,0.562144,0.498866,0.482173,0.545045,0.509456,0.554217,0.458333,0.534464,0.493889,0.374921,0.573443,0.552204,0.482799,0.569775,0.4967,0.474315,0.6,...,0.556075,0.449559,0.473716,0.426683,0.478755,0.450391,0.553057,0.565365,0.566118,0.520846,0.433727,0.498804,0.49866,0.35873,0.467236,0.439181,0.325535,0.328239,0.487948,0.504991,0.555877,0.52359,0.428481,0.446795,0.411032,0.435866,0.477488,0.489518,0.403425,0.441919,0.520556,0.337938,0.387964,0.417959,0.532399,0.331891,0.377551,0.555689,0.395692,0.544617,0.479167,0.345904,0.336518,0.500986,0.415643,0.508386,0.395604,0.411802,0.380802,0.54015
1,0.485714,0.464612,0.620898,0.478311,0.560066,0.553778,0.521142,0.484524,0.525247,0.463614,0.520222,0.486306,0.445226,0.523543,0.532555,0.619305,0.554827,0.463793,0.482019,0.4625,0.551391,0.518018,0.431034,0.486696,0.418465,0.553613,0.43932,0.513529,0.535831,0.613833,0.552014,0.554846,0.479396,0.594272,0.533333,0.490941,0.540587,0.530788,0.578348,0.3575,0.594059,0.561048,0.392622,0.49399,0.519814,0.477612,0.420108,0.526467,0.471621,0.482938,...,0.51547,0.394375,0.454821,0.40353,0.459016,0.43014,0.508621,0.508949,0.462303,0.437207,0.385378,0.452055,0.382653,0.345455,0.48607,0.468098,0.273271,0.324417,0.373838,0.45736,0.546045,0.474286,0.404215,0.39321,0.448903,0.419554,0.464976,0.490385,0.326859,0.384135,0.497658,0.351974,0.369863,0.366089,0.47889,0.45537,0.364644,0.472643,0.380176,0.47407,0.500538,0.322325,0.363938,0.506958,0.346447,0.504792,0.361328,0.386247,0.361996,0.546806
2,0.433919,0.551133,0.492705,0.583239,0.637812,0.659609,0.648412,0.638952,0.62574,0.466097,0.643172,0.49415,0.537195,0.452696,0.554861,0.626659,0.600562,0.539779,0.554505,0.472603,0.600431,0.579344,0.514143,0.56928,0.512478,0.594076,0.465769,0.609086,0.658023,0.595801,0.51098,0.559295,0.558101,0.606178,0.52546,0.518498,0.576216,0.600705,0.656577,0.433155,0.583511,0.544816,0.373396,0.576327,0.555434,0.5,0.511278,0.557889,0.557377,0.598632,...,0.577308,0.432464,0.500284,0.534659,0.619485,0.469852,0.572448,0.611629,0.528428,0.604415,0.440372,0.523322,0.494724,0.460518,0.513499,0.489253,0.304479,0.340123,0.448968,0.519444,0.558038,0.568409,0.438938,0.443353,0.441243,0.45465,0.519088,0.506173,0.370142,0.43207,0.583475,0.346432,0.396181,0.423331,0.603784,0.397598,0.391408,0.529542,0.423571,0.547425,0.485279,0.351417,0.342268,0.501961,0.40679,0.503043,0.394139,0.425089,0.38809,0.602207
3,0.436246,0.520471,0.51955,0.514863,0.606989,0.592473,0.571429,0.545253,0.582251,0.475,0.576965,0.579365,0.481566,0.478079,0.547479,0.556893,0.555054,0.518331,0.579871,0.499698,0.56614,0.555008,0.49801,0.553297,0.51083,0.692354,0.469352,0.541111,0.609307,0.585891,0.509413,0.549095,0.528077,0.587042,0.509151,0.50783,0.548299,0.556805,0.628492,0.379988,0.574863,0.513957,0.432467,0.533111,0.574943,0.550296,0.469078,0.54415,0.483547,0.514254,...,0.532141,0.422754,0.47907,0.443463,0.485714,0.456407,0.551821,0.545109,0.503123,0.473864,0.418106,0.474315,0.395054,0.368519,0.5,0.454441,0.364238,0.313924,0.425725,0.500853,0.533001,0.505525,0.429268,0.422143,0.433742,0.543671,0.520402,0.506224,0.350699,0.39539,0.524535,0.349372,0.382389,0.392157,0.526554,0.382043,0.374079,0.524217,0.404192,0.531632,0.480612,0.352015,0.338174,0.500986,0.378954,0.521739,0.376,0.425387,0.372951,0.583914
4,0.478577,0.409257,0.432915,0.435514,0.454545,0.477535,0.486423,0.478315,0.493499,0.396644,0.497596,0.398467,0.423522,0.44936,0.480528,0.478865,0.488945,0.46833,0.405983,0.514141,0.455399,0.479419,0.439228,0.421053,0.443003,0.442055,0.390164,0.457388,0.443697,0.494428,0.397959,0.429329,0.422291,0.503106,0.405079,0.437774,0.424207,0.466842,0.520531,0.350211,0.472156,0.4375,0.343434,0.459158,0.463354,0.372222,0.436019,0.427467,0.37318,0.483002,...,0.451393,0.484621,0.416227,0.404925,0.423561,0.430917,0.437311,0.437133,0.459144,0.393152,0.545171,0.431565,0.46148,0.357196,0.486486,0.383075,0.294163,0.331483,0.353549,0.494337,0.440657,0.399038,0.392704,0.419972,0.457183,0.392906,0.414444,0.539663,0.388755,0.363889,0.489664,0.35801,0.367876,0.561199,0.490944,0.372476,0.36803,0.498684,0.54726,0.458642,0.461187,0.453488,0.353083,0.510204,0.333809,0.517523,0.353201,0.511905,0.370457,0.442188


In [35]:
bonds = pd.read_csv("bond_type_counts.csv")

In [36]:
bonds.head()

Unnamed: 0.1,Unnamed: 0,Single,Double,Aromatic
0,0,3,0,29
1,1,9,5,16
2,2,7,1,25
3,3,8,4,21
4,4,1,0,34


In [37]:
bonds = bonds.drop(['Unnamed: 0'], axis=1)

In [39]:
bonds.head()

Unnamed: 0,Single,Double,Aromatic
0,3,0,29
1,9,5,16
2,7,1,25
3,8,4,21
4,1,0,34


In [38]:
bonds.shape

(1824230, 3)

In [40]:
df_all.head()

Unnamed: 0,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,feat_050,...,feat_207,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
3,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [None]:
df_all.shape

In [116]:
df_all = pd.concat((df_all, df_cumul, bonds), axis=1)

In [117]:
df_all.head()

Unnamed: 0,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,feat_050,...,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,Single,Double,Aromatic
0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.4267,0.4788,0.4504,0.5531,0.5654,0.5661,0.5208,0.4337,0.4988,0.4987,0.3587,0.4672,0.4392,0.3255,0.3282,0.4879,0.505,0.5559,0.5236,0.4285,0.4468,0.411,0.4359,0.4775,0.4895,0.4034,0.4419,0.5206,0.3379,0.388,0.418,0.5324,0.3319,0.3776,0.5557,0.3957,0.5446,0.4792,0.3459,0.3365,0.501,0.4156,0.5084,0.3956,0.4118,0.3808,0.5401,3,0,29
1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.4035,0.459,0.4301,0.5086,0.5089,0.4623,0.4372,0.3854,0.4521,0.3827,0.3455,0.4861,0.4681,0.2733,0.3244,0.3738,0.4574,0.546,0.4743,0.4042,0.3932,0.4489,0.4196,0.465,0.4904,0.3269,0.3841,0.4977,0.352,0.3699,0.3661,0.4789,0.4554,0.3646,0.4726,0.3802,0.4741,0.5005,0.3223,0.3639,0.507,0.3464,0.5048,0.3613,0.3862,0.362,0.5468,9,5,16
2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.5347,0.6195,0.4699,0.5724,0.6116,0.5284,0.6044,0.4404,0.5233,0.4947,0.4605,0.5135,0.4893,0.3045,0.3401,0.449,0.5194,0.558,0.5684,0.4389,0.4434,0.4412,0.4546,0.5191,0.5062,0.3701,0.4321,0.5835,0.3464,0.3962,0.4233,0.6038,0.3976,0.3914,0.5295,0.4236,0.5474,0.4853,0.3514,0.3423,0.502,0.4068,0.503,0.3941,0.4251,0.3881,0.6022,7,1,25
3,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.4435,0.4857,0.4564,0.5518,0.5451,0.5031,0.4739,0.4181,0.4743,0.3951,0.3685,0.5,0.4544,0.3642,0.3139,0.4257,0.5009,0.533,0.5055,0.4293,0.4221,0.4337,0.5437,0.5204,0.5062,0.3507,0.3954,0.5245,0.3494,0.3824,0.3922,0.5266,0.382,0.3741,0.5242,0.4042,0.5316,0.4806,0.352,0.3382,0.501,0.379,0.5217,0.376,0.4254,0.373,0.5839,8,4,21
4,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.4049,0.4236,0.4309,0.4373,0.4371,0.4591,0.3932,0.5452,0.4316,0.4615,0.3572,0.4865,0.3831,0.2942,0.3315,0.3535,0.4943,0.4407,0.399,0.3927,0.42,0.4572,0.3929,0.4144,0.5397,0.3888,0.3639,0.4897,0.358,0.3679,0.5612,0.4909,0.3725,0.368,0.4987,0.5473,0.4586,0.4612,0.4535,0.3531,0.5102,0.3338,0.5175,0.3532,0.5119,0.3705,0.4422,1,0,34


In [43]:
df_all.shape

(1824230, 460)

 <a id = "section4"></a>
## Random Forest - 200 similarities, 3 bond types, original 256-bits

In [47]:
RF = RandomForestRegressor()

# get_pred_rmse takes the feature columns as its third argument; use every column of df_all
RF_pred = get_pred_rmse(RF, df_all, df_all.columns, 'RF_200similarities+three_bondtypes+orig256', scores, Y_train, test_idx=test_idx)

Training set RMSE = 0.05415
Your RMSE - LR baseline rmse = -0.24431
Your RMSE - RF baseline rmse = -0.21792
Train features: (1000000, 460)
Train gap: (1000000,)
Test features: (824230, 460)


In [48]:
scores

{'LR_baseline': 0.29893086588711254,
 'RF_200similarities+three_bondtypes+orig256': 0.05414656667044845,
 'RF_baseline': 0.27187609725038553,
 'RF_similarities_only': 0.07106175728158828,
 'leaderboard_LR': 0.29846,
 'leaderboard_RF': 0.27207}

In [33]:
write_to_file('rf_similarities_200_pred.csv', RF_pred)

In [34]:
df_cumul.to_csv('similarities_200.csv')

In [136]:
# RMSE of RF's predictions on rows 800000-999999 of df_all against the matching entries of Y_train.
# NOTE(review): presumably RF was refit on the first 800000 rows in a cell not shown here -- confirm,
# otherwise these rows are not a held-out validation set.
RF_rmse_val = math.sqrt(mean_squared_error(Y_train[800000:], RF.predict(df_all[800000:1000000])))
# NOTE(review): RF_rmse_train is not assigned anywhere in the visible cells; it must come from an earlier run.
print 'Random forest RMSE - training set = %0.5f' % RF_rmse_train
print 'Random forest RMSE - validation set = %0.5f' % RF_rmse_val
# Baseline value is hard-coded rather than read from `scores`
print 'Baseline random forest RMSE = 0.27188'

Random forest RMSE - training set = 0.05537
Random forest RMSE - validation set = 0.12621
Baseline random forest RMSE = 0.27188


 <a id = "section5"></a>
## Random Forest - 3 char sequences, 3 bond types

In [16]:
# HERE NOW
# get 3-char sequences from Gioia's file
char_3 = pd.read_csv('/Users/amylee/Copy/CS181/practical1/enhaced_features_3char.csv')

In [17]:
char_3.head()

Unnamed: 0.1,Unnamed: 0,SEQ_3_CHARS_(-[,SEQ_3_CHARS_(-c,SEQ_3_CHARS_(-n,SEQ_3_CHARS_(=C,SEQ_3_CHARS_(C1,SEQ_3_CHARS_(C2,SEQ_3_CHARS_(C3,SEQ_3_CHARS_(C4,SEQ_3_CHARS_(C5,SEQ_3_CHARS_(CC,SEQ_3_CHARS_(Cc,SEQ_3_CHARS_([S,SEQ_3_CHARS_([n,SEQ_3_CHARS_([o,SEQ_3_CHARS_([s,SEQ_3_CHARS_(c-,SEQ_3_CHARS_(c1,SEQ_3_CHARS_(c2,SEQ_3_CHARS_(c3,SEQ_3_CHARS_(c4,SEQ_3_CHARS_(c5,SEQ_3_CHARS_(cc,SEQ_3_CHARS_(cn,SEQ_3_CHARS_(n1,SEQ_3_CHARS_(nc,SEQ_3_CHARS_(o1,SEQ_3_CHARS_(o2,SEQ_3_CHARS_(o3,SEQ_3_CHARS_(oc,SEQ_3_CHARS_(s1,SEQ_3_CHARS_(s2,SEQ_3_CHARS_(s3,SEQ_3_CHARS_(sc,SEQ_3_CHARS_)-[,SEQ_3_CHARS_)-c,SEQ_3_CHARS_)C1,SEQ_3_CHARS_)C2,SEQ_3_CHARS_)C3,SEQ_3_CHARS_)C=,SEQ_3_CHARS_)[n,SEQ_3_CHARS_)[o,SEQ_3_CHARS_)[s,SEQ_3_CHARS_)c-,SEQ_3_CHARS_)c1,SEQ_3_CHARS_)c2,SEQ_3_CHARS_)c3,SEQ_3_CHARS_)c4,SEQ_3_CHARS_)c5,SEQ_3_CHARS_)cc,...,SEQ_3_CHARS_occ,SEQ_3_CHARS_s1),SEQ_3_CHARS_s2),SEQ_3_CHARS_s3),SEQ_3_CHARS_s4),SEQ_3_CHARS_s5),SEQ_3_CHARS_s]-,SEQ_3_CHARS_s]1,SEQ_3_CHARS_s]c,SEQ_3_CHARS_sc(,SEQ_3_CHARS_sc-,SEQ_3_CHARS_sc1,SEQ_3_CHARS_sc2,SEQ_3_CHARS_sc3,SEQ_3_CHARS_sc4,SEQ_3_CHARS_sc5,SEQ_3_CHARS_scc,SEQ_3_CHARS_se],SEQ_3_CHARS_snc,feat_001,feat_005,feat_006,feat_007,feat_025,feat_037,feat_044,feat_068,feat_069,feat_072,feat_087,feat_090,feat_102,feat_119,feat_123,feat_126,feat_132,feat_173,feat_176,feat_187,feat_196,feat_199,feat_200,feat_208,feat_218,feat_225,feat_226,feat_243,feat_248,feat_251,feat_252
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,0,0,1,1,0,1,1,0,0,1,1,1,0,1,0,0,0,0,1,0,1,0,1,1,0
2,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,0,1
3,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,1,0,0,1,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,1,1,1,0,1
4,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0


In [18]:
# (rows, columns) of the table exactly as read from the CSV.
char_3.shape

(1824230, 594)

In [19]:
# 'Unnamed: 0' appears to just repeat the row number (see the preview above),
# so remove it before the table is used as model input.
char_3 = char_3.drop(labels=['Unnamed: 0'], axis=1)

In [20]:
# Sanity check: with 'Unnamed: 0' dropped, 593 feature columns should remain.
char_3.shape

(1824230, 593)

In [21]:
# Load the bond-type counts (one row per molecule is assumed -- the row count
# should match char_3).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
bonds = pd.read_csv('/Users/amylee/Copy/CS181/practical1/bond_type_counts.csv')

In [22]:
# (rows, columns) of the bond-count table as read from the CSV.
bonds.shape

(1824230, 4)

In [23]:
# Preview the first rows to see which columns were read from the CSV.
bonds.head()

Unnamed: 0.1,Unnamed: 0,Single,Double,Aromatic
0,0,3,0,29
1,1,9,5,16
2,2,7,1,25
3,3,8,4,21
4,4,1,0,34


In [24]:
# 'Unnamed: 0' appears to just repeat the row number (see the preview above),
# so only the bond-count columns are kept.
bonds = bonds.drop(labels=['Unnamed: 0'], axis=1)

In [25]:
# Place the bond-count columns side by side with the 3-char sequence features
# (column-wise concat; pandas lines the rows up on the index).
df_all = pd.concat([bonds, char_3], axis=1)

In [26]:
# (rows, columns) of the combined bonds + 3-char feature table.
df_all.shape

(1824230, 596)

In [27]:
# Preview the combined table to confirm the columns were joined as expected.
df_all.head()

Unnamed: 0,Single,Double,Aromatic,SEQ_3_CHARS_(-[,SEQ_3_CHARS_(-c,SEQ_3_CHARS_(-n,SEQ_3_CHARS_(=C,SEQ_3_CHARS_(C1,SEQ_3_CHARS_(C2,SEQ_3_CHARS_(C3,SEQ_3_CHARS_(C4,SEQ_3_CHARS_(C5,SEQ_3_CHARS_(CC,SEQ_3_CHARS_(Cc,SEQ_3_CHARS_([S,SEQ_3_CHARS_([n,SEQ_3_CHARS_([o,SEQ_3_CHARS_([s,SEQ_3_CHARS_(c-,SEQ_3_CHARS_(c1,SEQ_3_CHARS_(c2,SEQ_3_CHARS_(c3,SEQ_3_CHARS_(c4,SEQ_3_CHARS_(c5,SEQ_3_CHARS_(cc,SEQ_3_CHARS_(cn,SEQ_3_CHARS_(n1,SEQ_3_CHARS_(nc,SEQ_3_CHARS_(o1,SEQ_3_CHARS_(o2,SEQ_3_CHARS_(o3,SEQ_3_CHARS_(oc,SEQ_3_CHARS_(s1,SEQ_3_CHARS_(s2,SEQ_3_CHARS_(s3,SEQ_3_CHARS_(sc,SEQ_3_CHARS_)-[,SEQ_3_CHARS_)-c,SEQ_3_CHARS_)C1,SEQ_3_CHARS_)C2,SEQ_3_CHARS_)C3,SEQ_3_CHARS_)C=,SEQ_3_CHARS_)[n,SEQ_3_CHARS_)[o,SEQ_3_CHARS_)[s,SEQ_3_CHARS_)c-,SEQ_3_CHARS_)c1,SEQ_3_CHARS_)c2,SEQ_3_CHARS_)c3,SEQ_3_CHARS_)c4,...,SEQ_3_CHARS_occ,SEQ_3_CHARS_s1),SEQ_3_CHARS_s2),SEQ_3_CHARS_s3),SEQ_3_CHARS_s4),SEQ_3_CHARS_s5),SEQ_3_CHARS_s]-,SEQ_3_CHARS_s]1,SEQ_3_CHARS_s]c,SEQ_3_CHARS_sc(,SEQ_3_CHARS_sc-,SEQ_3_CHARS_sc1,SEQ_3_CHARS_sc2,SEQ_3_CHARS_sc3,SEQ_3_CHARS_sc4,SEQ_3_CHARS_sc5,SEQ_3_CHARS_scc,SEQ_3_CHARS_se],SEQ_3_CHARS_snc,feat_001,feat_005,feat_006,feat_007,feat_025,feat_037,feat_044,feat_068,feat_069,feat_072,feat_087,feat_090,feat_102,feat_119,feat_123,feat_126,feat_132,feat_173,feat_176,feat_187,feat_196,feat_199,feat_200,feat_208,feat_218,feat_225,feat_226,feat_243,feat_248,feat_251,feat_252
0,3,0,29,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,9,5,16,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,0,0,1,1,0,1,1,0,0,1,1,1,0,1,0,0,0,0,1,0,1,0,1,1,0
2,7,1,25,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,0,1
3,8,4,21,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,1,0,0,1,1,1,0,1,1,1,1,0,0,0,0,0,0,1,0,1,1,1,0,1
4,1,0,34,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0


In [28]:
# Rebind the name now that its columns are in df_all -- presumably so the
# large standalone frame can be garbage-collected.
char_3 = None

In [29]:
# Random forest regressor - training & validation split:
# fit on the first 800,000 rows, validate on rows 800,000 up to 1,000,000.
RF = RandomForestRegressor(min_samples_leaf=1, max_features=0.6, n_estimators=100)
RF.fit(df_all[:800000], Y_train[:800000])

# RMSE = square root of the mean squared error, for each portion in turn.
RF_mse_train = mean_squared_error(Y_train[:800000], RF.predict(df_all[:800000]))
RF_mse_val = mean_squared_error(Y_train[800000:], RF.predict(df_all[800000:1000000]))
RF_rmse_train = math.sqrt(RF_mse_train)
RF_rmse_val = math.sqrt(RF_mse_val)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02399
Random forest RMSE - validation set = 0.05817
Baseline random forest RMSE = 0.27188


 <a id = "section6"></a>
## Random Forest - 200 similarities, 3 bond types, 3-char sequence

In [29]:
# Load the precomputed similarity features.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
simil = pd.read_csv('/Users/amylee/Copy/CS181/practical1/similarities_200_4sigfigs.csv')

In [30]:
# Preview the first rows to see which columns were read from the CSV.
simil.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,...,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200
0,0,0.4116,0.5877,0.4678,0.6514,0.5921,0.6609,0.7305,0.7019,0.6457,0.4757,0.7146,0.4467,0.5467,0.4457,0.5571,0.5565,0.5385,0.5603,0.5127,0.4515,0.5334,0.522,0.5336,0.5286,0.5826,0.5308,0.4585,0.6025,0.5926,0.5461,0.4733,0.5177,0.5633,0.5621,0.4989,0.4822,0.545,0.5095,0.5542,0.4583,0.5345,0.4939,0.3749,0.5734,0.5522,0.4828,0.5698,0.4967,0.4743,...,0.5561,0.4496,0.4737,0.4267,0.4788,0.4504,0.5531,0.5654,0.5661,0.5208,0.4337,0.4988,0.4987,0.3587,0.4672,0.4392,0.3255,0.3282,0.4879,0.505,0.5559,0.5236,0.4285,0.4468,0.411,0.4359,0.4775,0.4895,0.4034,0.4419,0.5206,0.3379,0.388,0.418,0.5324,0.3319,0.3776,0.5557,0.3957,0.5446,0.4792,0.3459,0.3365,0.501,0.4156,0.5084,0.3956,0.4118,0.3808,0.5401
1,1,0.4857,0.4646,0.6209,0.4783,0.5601,0.5538,0.5211,0.4845,0.5252,0.4636,0.5202,0.4863,0.4452,0.5235,0.5326,0.6193,0.5548,0.4638,0.482,0.4625,0.5514,0.518,0.431,0.4867,0.4185,0.5536,0.4393,0.5135,0.5358,0.6138,0.552,0.5548,0.4794,0.5943,0.5333,0.4909,0.5406,0.5308,0.5783,0.3575,0.5941,0.561,0.3926,0.494,0.5198,0.4776,0.4201,0.5265,0.4716,...,0.5155,0.3944,0.4548,0.4035,0.459,0.4301,0.5086,0.5089,0.4623,0.4372,0.3854,0.4521,0.3827,0.3455,0.4861,0.4681,0.2733,0.3244,0.3738,0.4574,0.546,0.4743,0.4042,0.3932,0.4489,0.4196,0.465,0.4904,0.3269,0.3841,0.4977,0.352,0.3699,0.3661,0.4789,0.4554,0.3646,0.4726,0.3802,0.4741,0.5005,0.3223,0.3639,0.507,0.3464,0.5048,0.3613,0.3862,0.362,0.5468
2,2,0.4339,0.5511,0.4927,0.5832,0.6378,0.6596,0.6484,0.639,0.6257,0.4661,0.6432,0.4942,0.5372,0.4527,0.5549,0.6267,0.6006,0.5398,0.5545,0.4726,0.6004,0.5793,0.5141,0.5693,0.5125,0.5941,0.4658,0.6091,0.658,0.5958,0.511,0.5593,0.5581,0.6062,0.5255,0.5185,0.5762,0.6007,0.6566,0.4332,0.5835,0.5448,0.3734,0.5763,0.5554,0.5,0.5113,0.5579,0.5574,...,0.5773,0.4325,0.5003,0.5347,0.6195,0.4699,0.5724,0.6116,0.5284,0.6044,0.4404,0.5233,0.4947,0.4605,0.5135,0.4893,0.3045,0.3401,0.449,0.5194,0.558,0.5684,0.4389,0.4434,0.4412,0.4546,0.5191,0.5062,0.3701,0.4321,0.5835,0.3464,0.3962,0.4233,0.6038,0.3976,0.3914,0.5295,0.4236,0.5474,0.4853,0.3514,0.3423,0.502,0.4068,0.503,0.3941,0.4251,0.3881,0.6022
3,3,0.4362,0.5205,0.5195,0.5149,0.607,0.5925,0.5714,0.5453,0.5823,0.475,0.577,0.5794,0.4816,0.4781,0.5475,0.5569,0.5551,0.5183,0.5799,0.4997,0.5661,0.555,0.498,0.5533,0.5108,0.6924,0.4694,0.5411,0.6093,0.5859,0.5094,0.5491,0.5281,0.587,0.5092,0.5078,0.5483,0.5568,0.6285,0.38,0.5749,0.514,0.4325,0.5331,0.5749,0.5503,0.4691,0.5442,0.4835,...,0.5321,0.4228,0.4791,0.4435,0.4857,0.4564,0.5518,0.5451,0.5031,0.4739,0.4181,0.4743,0.3951,0.3685,0.5,0.4544,0.3642,0.3139,0.4257,0.5009,0.533,0.5055,0.4293,0.4221,0.4337,0.5437,0.5204,0.5062,0.3507,0.3954,0.5245,0.3494,0.3824,0.3922,0.5266,0.382,0.3741,0.5242,0.4042,0.5316,0.4806,0.352,0.3382,0.501,0.379,0.5217,0.376,0.4254,0.373,0.5839
4,4,0.4786,0.4093,0.4329,0.4355,0.4545,0.4775,0.4864,0.4783,0.4935,0.3966,0.4976,0.3985,0.4235,0.4494,0.4805,0.4789,0.4889,0.4683,0.406,0.5141,0.4554,0.4794,0.4392,0.4211,0.443,0.4421,0.3902,0.4574,0.4437,0.4944,0.398,0.4293,0.4223,0.5031,0.4051,0.4378,0.4242,0.4668,0.5205,0.3502,0.4722,0.4375,0.3434,0.4592,0.4634,0.3722,0.436,0.4275,0.3732,...,0.4514,0.4846,0.4162,0.4049,0.4236,0.4309,0.4373,0.4371,0.4591,0.3932,0.5452,0.4316,0.4615,0.3572,0.4865,0.3831,0.2942,0.3315,0.3535,0.4943,0.4407,0.399,0.3927,0.42,0.4572,0.3929,0.4144,0.5397,0.3888,0.3639,0.4897,0.358,0.3679,0.5612,0.4909,0.3725,0.368,0.4987,0.5473,0.4586,0.4612,0.4535,0.3531,0.5102,0.3338,0.5175,0.3532,0.5119,0.3705,0.4422


In [31]:
# 'Unnamed: 0' appears to just repeat the row number (see the preview above),
# so only the similarity columns are kept.
simil = simil.drop(labels=['Unnamed: 0'], axis=1)

In [32]:
# (rows, columns) of the similarity table once 'Unnamed: 0' is gone.
simil.shape

(1824230, 201)

In [33]:
# (rows, columns) of df_all before the similarity columns are appended;
# the row count should match simil's for the column-wise concat that follows.
df_all.shape

(1824230, 596)

In [34]:
# Append the similarity columns to the existing bonds + 3-char features
# (column-wise concat; pandas lines the rows up on the index).
df_all = pd.concat([df_all, simil], axis=1)

In [35]:
# Rebind the name now that its columns are in df_all -- presumably so the
# large standalone frame can be garbage-collected.
simil = None

In [36]:
# Random forest regressor - training & validation split:
# fit on the first 800,000 rows, validate on rows 800,000 up to 1,000,000.
# NOTE(review): RF2 is built with the library's default hyperparameters, whereas
# RF above sets n_estimators=100 and max_features=0.6, so the two validation
# scores are not a like-for-like comparison of the feature sets.
RF2 = RandomForestRegressor()
RF2.fit(df_all[:800000], Y_train[:800000])

# RMSE = square root of the mean squared error, for each portion in turn.
RF2_mse_train = mean_squared_error(Y_train[:800000], RF2.predict(df_all[:800000]))
RF2_mse_val = mean_squared_error(Y_train[800000:], RF2.predict(df_all[800000:1000000]))
RF2_rmse_train = math.sqrt(RF2_mse_train)
RF2_rmse_val = math.sqrt(RF2_mse_val)

print('Random forest RMSE - training set = %0.5f' % RF2_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF2_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.04321
Random forest RMSE - validation set = 0.09847
Baseline random forest RMSE = 0.27188


In [39]:
# Size of the slice from row 1,000,000 onward, i.e. the rows that are not
# used for fitting/validation above (presumably the test molecules).
df_all[1000000:].shape

(824230, 797)

In [None]:
# TODO tune further? (RF3 currently uses the library's default hyperparameters.)
# Random forest regressor - training & test split:
# fit on the first 1,000,000 rows (Y_train must hold exactly that many targets
# for fit() to accept it), then predict every remaining row for the submission.
RF3 = RandomForestRegressor()
RF3.fit(df_all[:1000000], Y_train)
RF3_pred = RF3.predict(df_all[1000000:])

# No rows are held out in this cell, so the RMSE below is measured on the very
# data the model was fitted on. Label it "training set", as the earlier cells
# do, so it is not mistaken for a validation score when read next to the
# baseline figure.
RF3_rmse = math.sqrt(mean_squared_error(Y_train, RF3.predict(df_all[:1000000])))
print('New random forest RMSE - training set = %0.5f' % RF3_rmse)
print('Baseline random forest RMSE = 0.27188')

# write_to_file is defined elsewhere in this notebook.
write_to_file('RF_3char_bondtypes_similarities.csv', RF3_pred)