# Exploratory Data Analysis & RDKit

Key items in this notebook: 
- Exploring functions of RDKit
- Generate a 2048-bit fingerprint sample set of data
- Exploratory data analysis
- Unsuccessful attempts at adding features through summing the 256 bit-binary fingerprints

## Table of Contents

1. [Load Data](#section1)
2. [Utility Functions](#section2)
3. [Baseline Models](#section3)
4. [Exploratory Data Analysis](#section4)
5. [Sums of 256-Bit Fingerprint](#section5)
6. [Experimenting with RDKit](#section6)
7. [Generate 2048-bit Fingerprints 50K sample set](#section7)

In [1]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import scipy as sp
import math
from scipy.stats import pearsonr

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

%matplotlib inline 
import matplotlib.pyplot as plt 
pd.options.display.mpl_style = 'default'

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")




 <a id = "section1"></a>
## Load Data

In [4]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [5]:
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,...,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.98


In [6]:
df_train.shape

(1000000, 258)

In [7]:
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,...,feat_207,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [8]:
df_test.shape

(824230, 258)

In [7]:
#store gap values
Y_train = df_train.gap.values

In [8]:
#row where testing examples start
test_idx = df_train.shape[0]

In [9]:
#delete 'Id' column
df_test2 = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train2 = df_train.drop(['gap'], axis=1)

In [10]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = pd.concat((df_train2, df_test2), axis=0)
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,...,feat_207,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [11]:
df_all_ns = df_all.drop(['smiles'], axis=1)

 <a id = "section2"></a>
## Utility Functions

In [109]:
def get_pred_rmse(model, dataframe, model_name, track_dict, ytrain, test_idx=None, train_size=None, parameters=None,
                  score_func='mean_squared_error', n_folds=5):
    """
    returns predictions from model after training in dataframe
    updates track_dict with rmse score
    prints out report

    dataframe passed in:
        - should not have 'smiles', 'gap', or 'id' columns
        - should have 'smiles' column
        - should have all test and training data

    Suggested train size: 0.8
    """
    vals = dataframe.values

    if test_idx:
        X_train = vals[:test_idx]
        X_pred = vals[test_idx:]
    else:
        X_train = vals

    #X_train = X_train.drop(['gap'], axis=1)
    #X_train = X_train.drop(['smiles'], axis=1)

    # Split X_train into futher validation and training set
    if train_size:
        # random_state sets the seed, for reproducibility
        itrain, itest = train_test_split(xrange(X_train.shape[0]), train_size=train_size, random_state=123)
        print 'itrain size', len(itrain)
        print 'itest size', len(itest)
        mask = np.ones(X_train.shape[0], dtype='int')
        print 'mask size', mask.size
        mask[itrain] = 1
        mask[itest] = 0
        mask = (mask == 1)

        # Be careful about order here
        X_val = X_train[~mask]
        X_train = X_train[mask]

        yval = ytrain[~mask]
        ytrain = ytrain[mask]

    if parameters:
        # run cross-validation to find best parameter
        val_start = time.time()
        model = cv_optimize(model, parameters, X_train, ytrain, n_folds=n_folds, score_func=score_func)
        val_time = time.time() - val_start
        print 'Time to crossvalidate:', val_time

        
    train_start = time.time()
    # Fit on entire training set
    model = model.fit(X_train, ytrain)
    train_time = time.time() - train_start
    print 'Time to train model:', train_time

    pred_start = time.time()
    rmse_train = math.sqrt(mean_squared_error(ytrain, model.predict(X_train)))
    #     rmse_train = math.sqrt(mean_squared_error(ytrain[mask], model.predict(X_train)))
    pred_time = time.time() - pred_start
    print 'Training set RMSE = %0.5f' % rmse_train
    print 'Time to predict on training data:', pred_time
    
    track_dict[model_name] = [model, rmse_train]

    if train_size:
        val_pred_start = time.time()
        rmse_val = math.sqrt(mean_squared_error(yval, model.predict(X_val)))
        val_pred_time = time.time() - val_pred_start
        print 'Test/val set RMSE = %0.5f' % rmse_val
        track_dict[model_name].append(rmse_val)
        print 'Time to predict on val data', val_pred_time

    #     print 'Your RMSE - LR baseline rmse = %0.5f' % (rmse_train - track_dict['leaderboard_LR'])
    #     print 'Your RMSE - RF baseline rmse = %0.5f'% (rmse_train - track_dict['leaderboard_RF'])
    print 'Train features:', X_train.shape
    print 'Train gap:', ytrain.shape

    if test_idx:
        test_pred_start = time.time()
        print 'Test features:', X_pred.shape
        pred = model.predict(X_pred)
        test_pred_time = time.time() - test_pred_start
        print 'Time to predict on held data', test_pred_time
        return pred      




In [None]:
def cv_optimize(clf, parameters, X, y, n_folds=5, score_func=None):
    """
    Function
    --------
    cv_optimize

    Inputs
    ------
    clf : an instance of a scikit-learn classifier
    parameters: a parameter grid dictionary that is passed to GridSearchCV
    X: a samples-features matrix in the scikit-learn style
    y: the response vectors of 1s and 0s (+ives and -ives)
    n_folds: the number of cross-validation folds (default 5)
    score_func: a score function we might want to pass (default python None)

    Returns
    -------
    The best estimator from the GridSearchCV, after the GridSearchCV has been used to
    fit the model.

    Notes
    -----
    see do_classify and the code below for an example of how this is used
    """

    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(X, y)

    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
    best = gs.best_estimator_

    return best

In [44]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

 <a id = "section3"></a>
## Baseline Models

In [101]:
# baseline scores
scores = {'leaderboard_RF': 0.27207, 'leaderboard_LR': 0.29846}

In [None]:
LR = LinearRegression()

LR_pred = get_pred_rmse(LR, df_all_ns, 'LR_baseline', scores, Y_train)

In [26]:
print LR.coef_

[  2.18886402e-02   7.17168571e+11  -3.76681794e+11   1.88876977e+10
   1.89383753e-01   2.93634004e+10   2.47401816e-01   2.56366137e+10
   1.42883165e+10   9.13186371e+10   2.30033671e+11   4.59711248e+09
  -1.26567481e+11   4.79836615e+09   5.08976837e+10   1.66432017e+10
   1.25810485e+11  -1.51103263e+11   5.02720764e+09  -6.05748576e+10
   1.70101180e+10  -7.06651670e+10   2.75472494e+10   6.65348103e+08
  -6.08062744e-02  -9.86931120e+10  -1.11359883e+10   2.53568352e+09
   2.11803990e+09  -4.66476557e+10   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   2.21496582e-01   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   3.21601868e-01
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00

In [47]:
RF = RandomForestRegressor()

RF_pred = get_pred_rmse(RF, df_all_ns, 'RF_baseline', scores, Y_train)

Training set RMSE = 0.27188
Your RMSE - LR baseline rmse = -0.02658
Your RMSE - RF baseline rmse = -0.00019
Train features: (1000000, 256)
Test features: (824230, 256)
Train gap: (1000000,)


In [28]:
scores

{'LR_baseline': 0.29893086588711254,
 'RF_baseline': 0.2718746710900172,
 'leaderboard_LR': 0.29846,
 'leaderboard_RF': 0.27207}

In [29]:
min(scores.iteritems(), key=lambda x: x[1])

('RF_baseline', 0.2718746710900172)

In [None]:
write_to_file("predictions/samplebaselineLR.csv", LR_pred)
write_to_file("predictions/samplebaselineRF.csv", RF_pred)

In [52]:
# Look at provided files
sample1 = pd.read_csv("predictions/sample1.csv") # this is Linear Regression
sample2 = pd.read_csv("predictions/sample2.csv") # This is Random Forest

print sample1.shape
print sample2.shape

(824230, 2)
(824230, 2)


 <a id = "section4"></a>
## Exploratory Data Analysis

#### Is there any missing data?

In [30]:
missing_perc_train = df_train.isnull().sum()/df_train.shape[0]*100
missing_perc_test = df_test.isnull().sum()/df_test.shape[0]*100

In [31]:
sum(missing_perc_train)

0.0

In [32]:
sum(missing_perc_test)

0.0

Good, No missing data

#### Are there any features with all 0's?

In [36]:
df_all_ns.shape

(1824230, 256)

In [37]:
df_all_ns.sum(axis=0)

feat_001    1170681
feat_002          0
feat_003          0
feat_004          0
feat_005    1824202
feat_006     870838
feat_007    1780030
feat_008          0
feat_009          0
feat_010          0
feat_011          0
feat_012          0
feat_013          0
feat_014          0
feat_015          0
feat_016          0
feat_017          0
feat_018          0
feat_019          0
feat_020          0
feat_021          0
feat_022          0
feat_023          0
feat_024          0
feat_025     703691
feat_026          0
feat_027          0
feat_028          0
feat_029          0
feat_030          0
feat_031          0
feat_032          0
feat_033          0
feat_034          0
feat_035          0
feat_036          0
feat_037     653356
feat_038          0
feat_039          0
feat_040          0
feat_041          0
feat_042          0
feat_043          0
feat_044       9115
feat_045          0
feat_046          0
feat_047          0
feat_048          0
feat_049          0
feat_050          0


Interesting, lots of columns with all 0's

#### Is there correlation between the sum of binary features and the gap? 

In [38]:
summ = df_all_ns.sum(axis=1)

In [39]:
pearsonr(summ[:test_idx], Y_train)

(-0.39117080696498591, 0.0)

There's a negative correlation with the sum of the 256 binary features

 <a id = "section5"></a>
## Feature Engineering

In [136]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

#### Sum across features per observation

In [40]:
df_all_summ = df_all_ns.copy()

In [41]:
# summ = df_all_ns.sum(axis=1)
df_all_summ['summ'] = summ

In [48]:
RF_sum = RandomForestRegressor()

RF_sum_pred = get_pred_rmse(RF_sum, df_all_summ, 'RF_with_summ', scores, Y_train)

Training set RMSE = 0.27188
Your RMSE - LR baseline rmse = -0.02658
Your RMSE - RF baseline rmse = -0.00019
Train features: (1000000, 257)
Test features: (824230, 257)
Train gap: (1000000,)


In [49]:
write_to_file("predictions/RF_sum_pred.csv", RF_sum_pred)

In [51]:
pd.read_csv("predictions/RF_sum_pred.csv").shape

(824230, 2)

#### Try using just the sum to predict

In [61]:
df_all_summ.head()

Unnamed: 0,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,feat_050,...,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,summ
0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,10
1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,17
2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,19
3,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,18
4,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,10


In [62]:
df_sum_only = pd.DataFrame(df_all_summ.summ)

In [64]:
RF_sum_only = RandomForestRegressor()

RF_sum_only_pred = get_pred_rmse(RF_sum_only, df_sum_only, 'RF_with_summ_ONLY', scores, Y_train)

Training set RMSE = 0.36515
Your RMSE - LR baseline rmse = 0.06669
Your RMSE - RF baseline rmse = 0.09308
Train features: (1000000, 1)
Test features: (824230, 1)
Train gap: (1000000,)


 <a id = "section6"></a>
## Experimenting with RDkit

In [9]:
from rdkit import Chem, RDConfig
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs

In [118]:
df_all.smiles.ix[0]

0    c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...
0    c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...
Name: smiles, dtype: object

In [119]:
df_all.smiles

0         c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...
1         C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...
2         [nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...
3         [nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...
4            c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1
5         C1=Cc2cnc3cc4cc(-c5scc6[nH]ccc56)c5ccccc5c4cc3...
6                     c1ncc(s1)-c1cnc2c(c1)oc1c2ccc2ccccc12
7         c1sc(-c2ccc3c(c2)sc2c3c3=CCC=c3c3cccnc23)c2[se...
8               c1ccc(o1)-c1cc2cc3cc4c5c[nH]cc5ccc4cc3cc2o1
9           [nH]1ccc2c3c[nH]cc3c3cc(-c4cncs4)c4=CCC=c4c3c12
10        [nH]1c(cc2c3cocc3c3c(ccc4ccc5=CCC=c5c34)c12)-c...
11              c1cc2oc3c(sc4cc([se]c34)-c3cncc4nsnc34)c2o1
12                   [nH]1c(cc2cnc3cc4ccoc4cc3c12)-c1ccccc1
13            [nH]1ccc2ccc3c4ncc(cc4[nH]c3c12)-c1scc2occc12
14        c1sc(-c2sc(-c3sc(-c4ncncn4)c4nccnc34)c3cc[nH]c...
15           c1cc2ncc(cc2s1)-c1cc2c(ccc3ccccc23)c2c[nH]cc12
16              c1ccc(-c2cc3oc4ccc5c[nH]

In [130]:
type(df_all.smiles)

pandas.core.series.Series

In [133]:
test_smile = df_all.smiles.values[0]

In [142]:
test_smile

'c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2nsnc12'

In [144]:
len(test_smile)

51

### Read a single molecule

In [134]:
m = Chem.MolFromSmiles(test_smile) 

In [135]:
m

<rdkit.Chem.rdchem.Mol at 0x140195c90>

In [136]:
m is None

False

In [139]:
Chem.MolToMolBlock(m)

'\n     RDKit          \n\n 27 32  0  0  0  0  0  0  0  0999 V2000\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 S   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    0.000

In [141]:
AllChem.Compute2DCoords(m)

0

### Looping over atoms

In [145]:
len(m.GetAtoms())

27

In [143]:
for atom in m.GetAtoms():
    print(atom.GetAtomicNum())

6
6
6
6
8
6
6
6
6
16
6
6
7
6
6
16
6
6
34
6
6
6
6
7
16
7
6


### Looping over bonds

In [146]:
# https://en.wikipedia.org/wiki/Aromaticity
print(m.GetBonds()[0].GetBondType())

AROMATIC


In [148]:
len(m.GetBonds())

32

In [147]:
for bond in m.GetBonds(): 
    print bond.GetBondType()

AROMATIC
AROMATIC
AROMATIC
AROMATIC
SINGLE
AROMATIC
AROMATIC
AROMATIC
AROMATIC
SINGLE
AROMATIC
AROMATIC
AROMATIC
SINGLE
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC
AROMATIC


### Request individual bonds or atoms

In [155]:
m.GetAtomWithIdx(0).GetSymbol()

'C'

In [149]:
m.GetAtomWithIdx(0).GetExplicitValence()

3

In [154]:
m.GetBondWithIdx(0).GetBeginAtomIdx()

0

In [162]:
m.GetBondWithIdx(25).GetEndAtomIdx()

26

In [163]:
m.GetBondBetweenAtoms(0,1).GetBondType()

rdkit.Chem.rdchem.BondType.AROMATIC

### Molecular Fingerprinting & Similarity 

The default set of parameters used by the fingerprinter is: - minimum path size: 1 bond - maximum path size: 7 bonds - fingerprint size: 2048 bits - number of bits set per hash: 2 - minimum fingerprint size: 64 bits - target on-bit density 0.3

In [None]:
m_fp = FingerprintMols.FingerprintMol(m)

In [171]:
len(FingerprintMols.FingerprintMol(m))

2048

2048 / 256 features given = 8

In [179]:
m_fp1 = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(df_all.smiles.values[1]))

In [180]:
DataStructs.FingerprintSimilarity(m_fp, m_fp1)

0.4904601571268238

In [196]:
# generate fingeprints: Morgan fingerprint with radius 2
mf = AllChem.GetMorganFingerprintAsBitVect(m, 4)

In [197]:
len(mf)

2048

In [202]:
DataStructs.FingerprintSimilarity(mf, m_fp)

0.043134435657800146

 <a id = "section7"></a>
## Generate 2048-bit Fingerprints 50K sample set

In [10]:
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,...,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.98


In [13]:
smiles_50k = df_train[:50000].smiles.astype(str).apply(lambda x: Chem.MolFromSmiles(x))

In [14]:
fps_50k = smiles_50k.apply(lambda x: FingerprintMols.FingerprintMol(x))

In [15]:
fps_50k

0        [1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, ...
1        [1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, ...
2        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, ...
3        [1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, ...
4        [1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, ...
5        [1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, ...
6        [1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, ...
7        [1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, ...
8        [1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, ...
9        [1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, ...
10       [1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, ...
11       [1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, ...
12       [1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
13       [1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, ...
14       [1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, ...
15       [1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, ...
16       [1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, .

In [43]:
mask = fps_50k.apply(lambda x: len(x))==2048

In [45]:
sum(mask)

47837

In [46]:
only2048 = fps_50k[mask]

In [47]:
len(only2048)

47837

In [29]:
train_2048_50k = pd.DataFrame(np.vstack(only2048))

In [48]:
df_train[:50000][mask].shape

(47837, 258)

In [49]:
filtered = df_train[:50000][mask]

In [52]:
filtered

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,feat_010,feat_011,feat_012,feat_013,feat_014,feat_015,feat_016,feat_017,feat_018,feat_019,feat_020,feat_021,feat_022,feat_023,feat_024,feat_025,feat_026,feat_027,feat_028,feat_029,feat_030,feat_031,feat_032,feat_033,feat_034,feat_035,feat_036,feat_037,feat_038,feat_039,feat_040,feat_041,feat_042,feat_043,feat_044,feat_045,feat_046,feat_047,feat_048,feat_049,...,feat_208,feat_209,feat_210,feat_211,feat_212,feat_213,feat_214,feat_215,feat_216,feat_217,feat_218,feat_219,feat_220,feat_221,feat_222,feat_223,feat_224,feat_225,feat_226,feat_227,feat_228,feat_229,feat_230,feat_231,feat_232,feat_233,feat_234,feat_235,feat_236,feat_237,feat_238,feat_239,feat_240,feat_241,feat_242,feat_243,feat_244,feat_245,feat_246,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1.60
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.98
5,C1=Cc2cnc3cc4cc(-c5scc6[nH]ccc56)c5ccccc5c4cc3...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1.81
6,c1ncc(s1)-c1cnc2c(c1)oc1c2ccc2ccccc12,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2.91
7,c1sc(-c2ccc3c(c2)sc2c3c3=CCC=c3c3cccnc23)c2[se...,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2.17
9,[nH]1ccc2c3c[nH]cc3c3cc(-c4cncs4)c4=CCC=c4c3c12,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1.71
10,[nH]1c(cc2c3cocc3c3c(ccc4ccc5=CCC=c5c34)c12)-c...,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,2.08


In [58]:
train_2048_50k['smiles'] = filtered.smiles.values

In [59]:
train_2048_50k['gap'] = filtered.gap.values

In [41]:
del train_2048_50k['smiles']

In [60]:
train_2048_50k

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,smiles,gap
0,1,1,0,0,0,1,0,0,0,0,1,1,1,0,0,0,1,1,1,0,1,1,1,1,1,1,0,1,1,0,0,0,1,1,0,0,1,1,0,1,0,1,1,0,0,1,0,0,0,1,...,0,1,0,0,0,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,0,0,1,1,0,1,1,0,1,1,1,1,1,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,1.19
1,1,0,1,0,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,0,0,1,1,1,1,1,1,0,0,0,0,1,0,1,0,1,1,1,1,0,1,1,...,1,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,1,0,1,1,1,0,0,1,1,1,0,0,1,0,1,0,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.60
2,1,1,1,1,1,1,1,0,0,0,1,1,0,1,1,0,1,1,0,1,1,0,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,1,0,0,1,1,0,0,0,1,...,1,1,1,1,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,1,0,1,1,1,0,1,1,0,1,1,1,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.49
3,1,1,0,0,1,1,1,0,0,0,1,1,1,0,0,0,1,1,0,0,1,1,0,1,0,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,0,...,1,1,0,1,1,1,0,1,1,1,0,0,0,0,1,1,0,0,1,1,1,0,1,1,0,1,0,0,1,1,0,1,1,1,1,0,1,0,1,0,1,1,0,1,1,0,1,0,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.36
4,1,1,1,0,0,1,0,1,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,1,1,0,0,1,1,0,1,1,1,1,0,1,0,0,1,1,0,0,0,1,1,1,1,0,0,...,1,1,1,1,0,1,1,1,1,1,0,1,0,0,1,1,0,0,1,1,0,1,1,0,0,1,1,0,1,0,1,1,1,1,0,0,0,0,1,1,0,1,0,1,0,1,1,1,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,1.98
5,1,1,1,1,0,1,1,0,0,1,0,1,1,0,1,1,1,0,0,1,0,1,1,0,1,1,1,1,1,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,...,1,0,1,1,1,1,0,1,1,0,0,0,1,0,0,1,1,1,0,1,1,1,1,0,0,1,0,1,1,1,0,0,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,0,C1=Cc2cnc3cc4cc(-c5scc6[nH]ccc56)c5ccccc5c4cc3...,1.81
6,1,1,0,0,0,1,0,0,0,1,1,1,0,1,1,0,0,1,1,1,0,0,0,1,1,1,0,0,1,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,...,0,1,1,1,0,1,1,1,0,0,0,1,0,0,0,1,1,1,1,0,0,1,1,0,0,1,0,1,1,0,1,0,0,1,0,0,0,0,1,1,1,1,0,1,0,1,0,0,c1ncc(s1)-c1cnc2c(c1)oc1c2ccc2ccccc12,2.91
7,1,1,0,1,0,1,0,1,0,0,0,1,0,0,1,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,1,0,1,...,1,1,0,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,0,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,1,1,1,1,0,1,1,1,1,1,c1sc(-c2ccc3c(c2)sc2c3c3=CCC=c3c3cccnc23)c2[se...,2.17
8,1,1,1,0,0,1,0,1,0,0,1,1,1,1,0,1,1,1,1,0,1,0,0,1,0,1,0,0,1,1,1,0,1,0,1,0,0,1,0,1,1,1,0,1,1,1,1,1,0,1,...,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,1,1,1,0,1,0,1,1,0,0,1,1,0,0,0,0,1,1,0,1,0,0,1,1,1,1,1,0,1,0,1,0,0,[nH]1ccc2c3c[nH]cc3c3cc(-c4cncs4)c4=CCC=c4c3c12,1.71
9,1,1,1,0,0,1,0,1,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,0,1,0,1,0,1,0,0,1,1,1,1,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1,0,1,0,0,0,0,1,1,1,0,1,0,0,0,0,[nH]1c(cc2c3cocc3c3c(ccc4ccc5=CCC=c5c34)c12)-c...,2.08


In [61]:
train_2048_50k.shape

(47837, 2050)

In [62]:
train_2048_50k.to_csv('train_2048_50k.csv')

In [67]:
sum(train_2048_50k.isnull().sum())

0