# Generate Bondtype Counts

- For each molecule, generate the total count of each bond type (single, double, aromatic).

## Table of Contents

1. [Load All Data](#section1)
2. [Baseline Models](#section2)
3. [Generate Bondtypes with RDKit](#section3)

In [1]:
import math
import time

import numpy as np
import pandas as pd
import scipy as sp
from pandas import DataFrame, Series
from scipy.stats import pearsonr

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Notebook display settings: render DataFrames as HTML and allow wide / long previews.
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Draw matplotlib figures inline in the notebook output.
%matplotlib inline 
import matplotlib.pyplot as plt 
# NOTE(review): the 'display.mpl_style' option may not exist in newer pandas releases -- confirm against the installed version.
pd.options.display.mpl_style = 'default'

# Seaborn styling applied globally to every plot drawn after this cell.
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")




 <a id = "section1"></a>
## Load All Data

In [2]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [3]:
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,...,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.98


In [4]:
df_train.shape

(1000000, 258)

In [5]:
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,...,feat_207,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [6]:
df_test.shape

(824230, 258)

In [7]:
# Regression target: the `gap` column, which only the training set carries.
Y_train = df_train['gap'].values

# Number of training rows; once train is stacked on top of test this is the
# position of the first test row.
test_idx = len(df_train)

# Remove the columns the two sets do not share ('gap' in train, 'Id' in test)
# so that both frames end up with the same layout.
df_train2 = df_train.drop('gap', axis=1)
df_test2 = df_test.drop('Id', axis=1)

# One frame holding train followed by test, so feature engineering is applied to both at once.
df_all = pd.concat([df_train2, df_test2], axis=0)

# Keep a handle on the SMILES strings for the RDKit feature generation further down.
all_smiles = df_all['smiles']

In [8]:
df_all.shape

(1824230, 257)

In [9]:
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,...,feat_207,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [10]:
# The SMILES strings are not a numeric feature; they remain available in `all_smiles`.
df_all = df_all.drop('smiles', axis=1)

In [11]:
# Replace the stacked train/test index with a fresh 0..n-1 index, discarding the old one.
df_all = df_all.reset_index(drop=True)

In [12]:
df_all.head()

Unnamed: 0,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,feat_050,...,feat_207,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
3,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


### Utility Functions

In [14]:
def get_pred_rmse(model, dataframe, model_name, track_dict, ytrain, test_idx=None, train_size=None, parameters=None,
                  score_func='mean_squared_error', n_folds=5):
    """
    Fit `model` on the training rows of `dataframe`, print a timing / RMSE
    report, record the results in `track_dict`, and optionally predict on the
    test rows.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`
    dataframe : DataFrame of features only. `dataframe.values` is handed
        straight to the model, so it must not contain the 'smiles', 'gap' or
        'Id' columns. When `test_idx` is given it must hold the training rows
        followed by the test rows.
    model_name : key under which the results are stored in `track_dict`
    track_dict : dict, updated in place with `[fitted_model, rmse_train]`,
        plus `rmse_val` appended when `train_size` is given
    ytrain : numpy array of targets, one per training row
    test_idx : row position where the test rows start. Rows before it are the
        training rows. If omitted (or 0), every row is treated as training data
        and no test predictions are made.
    train_size : if given, passed to `train_test_split` to hold out part of the
        training rows as a validation set (suggested: 0.8)
    parameters : if given, a parameter grid that is searched with
        `cv_optimize` before the final fit
    score_func, n_folds : forwarded to `cv_optimize`

    Returns
    -------
    Predictions for the test rows when `test_idx` is given, otherwise None.

    Note that the "Training set RMSE" is computed on the same rows the model
    was fitted on; pass `train_size` to also get a held-out validation RMSE.
    """
    vals = dataframe.values

    # A falsy test_idx (None or 0) means there are no held-out rows to predict on.
    if test_idx:
        X_train = vals[:test_idx]
        X_pred = vals[test_idx:]
    else:
        X_train = vals
        X_pred = None

    # Split X_train further into a training and a validation set
    if train_size:
        n_rows = X_train.shape[0]
        # random_state sets the seed, for reproducibility
        itrain, itest = train_test_split(np.arange(n_rows), train_size=train_size, random_state=123)
        print('itrain size %s' % len(itrain))
        print('itest size %s' % len(itest))
        # True marks rows kept for training, False marks validation rows
        mask = np.ones(n_rows, dtype=bool)
        print('mask size %s' % mask.size)
        mask[itest] = False

        # Be careful about order here: pull out the validation rows before
        # X_train / ytrain are overwritten with their training subsets.
        X_val = X_train[~mask]
        X_train = X_train[mask]

        yval = ytrain[~mask]
        ytrain = ytrain[mask]

    if parameters:
        # run cross-validation to find best parameter
        val_start = time.time()
        model = cv_optimize(model, parameters, X_train, ytrain, n_folds=n_folds, score_func=score_func)
        val_time = time.time() - val_start
        print('Time to crossvalidate: %s' % val_time)

    train_start = time.time()
    # Fit on the training rows (the training subset only when train_size is given)
    model = model.fit(X_train, ytrain)
    train_time = time.time() - train_start
    print('Time to train model: %s' % train_time)

    pred_start = time.time()
    rmse_train = math.sqrt(mean_squared_error(ytrain, model.predict(X_train)))
    pred_time = time.time() - pred_start
    print('Training set RMSE = %0.5f' % rmse_train)
    print('Time to predict on training data: %s' % pred_time)

    track_dict[model_name] = [model, rmse_train]

    if train_size:
        val_pred_start = time.time()
        rmse_val = math.sqrt(mean_squared_error(yval, model.predict(X_val)))
        val_pred_time = time.time() - val_pred_start
        print('Test/val set RMSE = %0.5f' % rmse_val)
        track_dict[model_name].append(rmse_val)
        print('Time to predict on val data %s' % val_pred_time)

    # Shapes are wrapped in a 1-tuple so that % formats the tuple as a whole
    print('Train features: %s' % (X_train.shape,))
    print('Train gap: %s' % (ytrain.shape,))

    if X_pred is None:
        return None

    test_pred_start = time.time()
    print('Test features: %s' % (X_pred.shape,))
    pred = model.predict(X_pred)
    test_pred_time = time.time() - test_pred_start
    print('Time to predict on held data %s' % test_pred_time)
    return pred

In [None]:
def cv_optimize(clf, parameters, X, y, n_folds=5, score_func=None):
    """
    Function
    --------
    cv_optimize

    Grid-search `parameters` for `clf` with cross-validation and return the
    best estimator found.

    Inputs
    ------
    clf : an instance of a scikit-learn estimator
    parameters: a parameter grid dictionary that is passed to GridSearchCV
    X: a samples-features matrix in the scikit-learn style
    y: the response vector, one value per row of X
    n_folds: the number of cross-validation folds (default 5)
    score_func: value for GridSearchCV's `scoring` argument (default python
        None, which leaves GridSearchCV on its own default scoring)

    Returns
    -------
    The best estimator from the GridSearchCV, after the GridSearchCV has been used to
    fit the model.

    Notes
    -----
    Prints the best parameters, the best score and the full grid of scores.
    get_pred_rmse calls this when it is given a parameter grid.
    """
    # A falsy score_func is passed as None, i.e. the same as omitting `scoring`.
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, scoring=score_func or None)
    gs.fit(X, y)

    print("BEST %s %s %s" % (gs.best_params_, gs.best_score_, gs.grid_scores_))

    return gs.best_estimator_

In [None]:
def write_to_file(filename, predictions):
    """Write `predictions` to `filename` as CSV with an "Id,Prediction" header.

    Ids are assigned in iteration order, starting at 1.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            ",".join((str(row_id), str(value))) + "\n"
            for row_id, value in enumerate(predictions, 1)
        )

 <a id = "section2"></a>
 
## Baseline Models

In [15]:
# Baseline scores keyed by name; get_pred_rmse adds one entry per fitted model.
scores = dict(leaderboard_LR=0.29846, leaderboard_RF=0.27207)

In [16]:
# Baseline 1: linear regression on the provided features only.
LR = LinearRegression()

LR_pred = get_pred_rmse(
    model=LR,
    dataframe=df_all,
    model_name='LR_baseline',
    track_dict=scores,
    ytrain=Y_train,
    test_idx=test_idx,
)

Training set RMSE = 0.29893
Your RMSE - LR baseline rmse = 0.00047
Your RMSE - RF baseline rmse = 0.02686
Train features: (1000000, 256)
Train gap: (1000000,)
Test features: (824230, 256)


In [17]:
# Baseline 2: random forest on the provided features only.
# Seeded so that re-running the notebook reproduces the same forest and RMSE
# (123 is the seed get_pred_rmse already uses for its train/validation split).
RF = RandomForestRegressor(random_state=123)

RF_pred = get_pred_rmse(RF, df_all, 'RF_baseline', scores, Y_train, test_idx=test_idx)

Training set RMSE = 0.27188
Your RMSE - LR baseline rmse = -0.02658
Your RMSE - RF baseline rmse = -0.00019
Train features: (1000000, 256)
Train gap: (1000000,)
Test features: (824230, 256)


In [18]:
scores

{'LR_baseline': 0.29893086588711254,
 'RF_baseline': 0.27187727953055246,
 'leaderboard_LR': 0.29846,
 'leaderboard_RF': 0.27207}

 <a id = "section3"></a>
## Generate Bondtypes with RDKit

In [23]:
from rdkit import Chem, RDConfig
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
#from rdkit import DataStructs

In [24]:
def make_bond_type_dict(bond_list):
    """Tally the bonds in `bond_list` by the string form of their bond type.

    Returns a dict mapping `str(bond.GetBondType())` to the number of bonds
    that have that type; an empty `bond_list` gives an empty dict.
    """
    tally = {}
    for current in bond_list:
        key = str(current.GetBondType())
        # .get() supplies the starting count of 0 the first time a type is seen
        tally[key] = tally.get(key, 0) + 1
    return tally

In [25]:
def smiles_to_bondtypecounts(all_smiles, start, nexts):
    """
    Count the single, double and aromatic bonds of each molecule in a slice of
    SMILES strings.

    Parameters
    ----------
    all_smiles : pandas Series of SMILES strings
    start, nexts : positional bounds of the slice to process (`nexts` is
        exclusive)

    Returns
    -------
    DataFrame with columns 'Single', 'Double', 'Aromatic' and one row per
    SMILES string in the slice, indexed 0..n-1. An empty slice gives an empty
    frame with the same three columns.

    Raises
    ------
    ValueError if RDKit cannot parse one of the SMILES strings.
    """
    def count_bonds(smiles):
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; name the
        # offending string instead of failing later on `None.GetBonds()`.
        if mol is None:
            raise ValueError('RDKit could not parse SMILES: %r' % smiles)
        counts = make_bond_type_dict(mol.GetBonds())
        return [counts.get('SINGLE', 0), counts.get('DOUBLE', 0), counts.get('AROMATIC', 0)]

    # .iloc makes the positional slicing explicit regardless of the Series' index
    rows = [count_bonds(smiles) for smiles in all_smiles.iloc[start:nexts].astype(str)]

    return pd.DataFrame(rows, columns=['Single', 'Double', 'Aromatic'])

In [26]:
# go in chunks of 20,000 so progress is visible while RDKit parses the molecules
chunk = 20000

# size of the full thing
sizeofdf = df_all.shape[0]

# Collect the per-chunk frames and concatenate once at the end; concatenating
# inside the loop would re-copy everything accumulated so far on every pass.
bondtype_chunks = []
for start in range(0, sizeofdf, chunk):
    # The last chunk is clipped to the end of the data, so it is never empty
    nexts = min(start + chunk, sizeofdf)
    if nexts < sizeofdf:
        print('%d %d' % (start, nexts))
    else:
        print('At the end! %d %d' % (start, nexts))
    bondtype_chunks.append(smiles_to_bondtypecounts(all_smiles, start, nexts))

df_all_bondtypes = pd.concat(bondtype_chunks, axis=0)

0 20000
20000 40000
40000 60000
60000 80000
80000 100000
100000 120000
120000 140000
140000 160000
160000 180000
180000 200000
200000 220000
220000 240000
240000 260000
260000 280000
280000 300000
300000 320000
320000 340000
340000 360000
360000 380000
380000 400000
400000 420000
420000 440000
440000 460000
460000 480000
480000 500000
500000 520000
520000 540000
540000 560000
560000 580000
580000 600000
600000 620000
620000 640000
640000 660000
660000 680000
680000 700000
700000 720000
720000 740000
740000 760000
760000 780000
780000 800000
800000 820000
820000 840000
840000 860000
860000 880000
880000 900000
900000 920000
920000 940000
940000 960000
960000 980000
980000 1000000
1000000 1020000
1020000 1040000
1040000 1060000
1060000 1080000
1080000 1100000
1100000 1120000
1120000 1140000
1140000 1160000
1160000 1180000
1180000 1200000
1200000 1220000
1220000 1240000
1240000 1260000
1260000 1280000
1280000 1300000
1300000 1320000
1320000 1340000
1340000 1360000
1360000 1380000
1380000 

In [27]:
df_all_bondtypes.shape

(1824230, 3)

In [28]:
# Each chunk was numbered from 0, so give the stacked result a fresh 0..n-1 index
# and throw the old one away.
dftouse_bondtypes = df_all_bondtypes.reset_index(drop=True)

In [29]:
# Check
dftouse_bondtypes.head()

Unnamed: 0,Single,Double,Aromatic
0,3,0,29
1,9,5,16
2,7,1,25
3,8,4,21
4,1,0,34


In [30]:
# combine back together
# Re-running this cell must not append a second copy of the bond-count columns,
# so any copies already present in df_all are left out before concatenating.
bond_cols = list(dftouse_bondtypes.columns)
feature_cols = [col for col in df_all.columns if col not in bond_cols]

# concat(axis=1) aligns rows on the index; both frames were reset to 0..n-1
# above, so equal length means every row lines up one-to-one.
assert len(df_all) == len(dftouse_bondtypes), 'row count mismatch between features and bond counts'
df_all = pd.concat((df_all[feature_cols], dftouse_bondtypes), axis=1)

In [31]:
df_all.head()

Unnamed: 0,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,feat_050,...,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,Single,Double,Aromatic
0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,29
1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,9,5,16
2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,7,1,25
3,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,8,4,21
4,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,34


In [32]:
df_all.shape

(1824230, 259)

In [33]:
# Random forest on the original features plus the three bond-count columns.
# Seeded so that re-running the notebook reproduces the same forest and RMSE
# (123 is the seed get_pred_rmse already uses for its train/validation split).
RF = RandomForestRegressor(random_state=123)

RF_pred = get_pred_rmse(RF, df_all, 'RF_bondtypes_added', scores, Y_train, test_idx=test_idx)

Training set RMSE = 0.18393
Your RMSE - LR baseline rmse = -0.11453
Your RMSE - RF baseline rmse = -0.08814
Train features: (1000000, 259)
Train gap: (1000000,)
Test features: (824230, 259)


In [37]:
scores

{'LR_baseline': 0.29893086588711254,
 'RF_baseline': 0.27187727953055246,
 'RF_bondtypes_added': 0.18393403380474108,
 'leaderboard_LR': 0.29846,
 'leaderboard_RF': 0.27207}

In [36]:
# Save the test-row predictions as CSV; write_to_file numbers the rows 1..n in order.
write_to_file('rf_bond_types_pred.csv', RF_pred)

In [48]:
# Save only the three bond-count columns (all rows) to CSV.
df_all.loc[:, ['Single', 'Double', 'Aromatic']].to_csv('bond_type_counts.csv')