In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from collections import Counter

In [2]:
# Read the training and test tables from CSV files in the working directory
df_train = pd.read_csv(filepath_or_buffer="train.csv")
df_test = pd.read_csv(filepath_or_buffer="test.csv")

In [3]:
# Keep the 'gap' target aside as a NumPy array before it is removed from the frame
Y_train = df_train['gap'].values
# Number of training rows = row position where the testing examples start
test_idx = len(df_train)
# Remove the 'Id' column from the test frame
df_test = df_test.drop('Id', axis=1)
# Remove the 'gap' column from the training frame
df_train = df_train.drop('gap', axis=1)

In [4]:
# Stack the train and test examples into one DataFrame so feature engineering
# can be applied to both in a single pass.
# ignore_index=True gives the stacked frame a fresh 0..N-1 row index; without
# it the train and test row labels would both be kept and collide, making any
# label-based lookup or alignment on df_all ambiguous.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


## Feature Engineering

In [5]:
# Feature engineering on a supplied DataFrame: a private n-gram helper followed
# by the public makeFeatures entry point.
def _add_ngram_flags(df, smiles, corpus, n, top_k, suffix):
    """Add one 0/1 presence column per frequent character n-gram.

    The ``top_k`` most common length-``n`` character windows of ``corpus`` are
    selected; for each one a column named ``<ngram><suffix>`` is written into
    ``df`` in place, holding 1 where the n-gram occurs in the row's SMILES
    string and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new columns (modified in place).
    smiles : pandas.Series
        The SMILES strings, indexed like ``df``.
    corpus : str
        Text from which the n-gram vocabulary is counted.
    n : int
        Length of the n-grams in characters.
    top_k : int
        Number of most common n-grams to turn into columns.
    suffix : str
        Text appended to the n-gram to form the column name.
    """
    # Sliding character windows of width n over the corpus.
    windows = zip(*[corpus[offset:] for offset in range(n)])
    for gram, _count in Counter(windows).most_common(top_k):
        text = ''.join(gram)
        df[text + suffix] = smiles.map(lambda s, text=text: 1 if text in s else 0)


def makeFeatures(df):
    """Build the model features from the fingerprint columns and SMILES strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``smiles`` column of strings. Columns named
        ``feat_001`` .. ``feat_256`` are inspected when present.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) containing ``smiles``, the
        ``feat_*`` columns that are not all zero, and the engineered columns:
        per-character counts, C/c fractions, substring / prefix / suffix
        flags, the aromatic fraction and the n-gram presence flags.

    Notes
    -----
    * The per-character count columns cover only the characters seen in the
      first 50 rows.
    * The n-gram vocabulary is counted on the concatenation of the first 3000
      SMILES strings, so a few counted windows span two adjacent molecules.
    """
    ##########################
    ### DROP EMPTY COLUMNS ###
    ##########################
    # Remove 0 columns (columns with no data); absent columns are skipped
    # instead of raising a KeyError.
    zero_cols = []
    for i in range(1, 257):
        name = 'feat_%03d' % i
        if name in df.columns and df[name].sum() == 0:
            zero_cols.append(name)
    df = df.drop(zero_cols, axis=1)

    smiles = df.smiles

    def count_of(ch):
        # Occurrences of a single character in every SMILES string.
        return smiles.map(lambda s: s.count(ch))

    ##############################
    ### SMILE CHARACTER COUNTS ###
    ##############################
    # Sorted so the column order is the same on every run (set iteration
    # order is not stable between interpreter sessions).
    smile_alphabet = sorted(set(''.join(smiles.iloc[0:50])))
    for ch in smile_alphabet:
        label = 'equal' if ch == '=' else ch
        df['smile_' + label] = count_of(ch)

    ###########################
    ### FEATURE ENGINEERING ###
    ###########################
    # Add length of smile (temporary; dropped again before returning)
    df['smile_length'] = smiles.map(len)

    # Add number of C's divided by length. The counts are taken straight from
    # the strings so this works even when 'c' / 'C' is missing from the
    # 50-row alphabet above.
    df['smile_percentc'] = count_of('c') / df.smile_length
    df['smile_percentC'] = count_of('C') / df.smile_length

    # Flags for specific substrings, in output column order:
    # general patterns, "important molecules", then parenthesised groups.
    contains_flags = [
        ('smile_nh', '[nH]'),
        ('smile_nh1', '[nH]1'),
        ('smile_si', 'Si'),
        ('smile_sih2', '[SiH2]'),
        ('smile_se', '[se]'),
        ('smile_CdoubleC', 'C=C'),
        ('smile_doubleC', 'CC'),
        ('smile_doublec', 'cc'),
        ('smile_triplec', 'ccc'),
        ('smile_quadc', 'cccc'),
        ('smile_quintc', 'ccccc'),
        ('smile_C1equalCc2', 'C1=Cc2'),
        ('smile_C1', 'C1'),
        ('smile_c1', 'c1'),
        ('smile_equalCCCequal', '=CCC='),
        ('smile_equalCCequal', '=CC='),
        ('smile_equalCequal', '=C='),
        ('smile_C1equalCCequalC', 'C1=CC=C'),
        ('smile_c1cccs1', 'c1ccc(s1)'),
        # Important molecules
        ('smile_c1ccccc1', 'c1ccccc1'),
        ('smile_n1ccccc1', 'n1ccccc1'),
        ('smile_o1cccc1', 'o1cccc1'),
        ('smile_c1ccccc1-c2ccccc2', 'c1ccccc1-c2ccccc2'),
        ('smile_n1c[nH]cc1', 'n1c[nH]cc1'),
        # Parentheses molecules
        ('smile_parenC1', '(C1)'),
        ('smile_parenc1', '(c1)'),
        ('smile_parencc1', '(cc1)'),
        ('smile_pareno1', '(o1)'),
        ('smile_parens1', '(s1)'),
        ('smile_parenccc4mol', '(ccc4=C[SiH2]C=c34)'),
        ('smile_parenccinnermol', '(cc(-c3ccco3)c3=CCC=c13)'),
        ('smile_parennegc3cco3', '(-c3ccco3)'),
        ('smile_parenncc3c12', '(ncc3c12)'),
        ('smile_parenccc34', '(ccc34)'),
        ('smile_parencc4ccc3c2cn1', '(cc4ccc3c2cn1)'),
    ]
    for column, fragment in contains_flags:
        df[column] = smiles.map(lambda s, fragment=fragment: fragment in s)

    # Special: fraction of characters that are c, o, n or s.
    # The sum is parenthesised so the whole count is divided by the length;
    # previously only the 's' count was divided.
    df['smile_percent_aromatic'] = (
        (count_of('c') + count_of('o') + count_of('n') + count_of('s'))
        / df.smile_length
    )

    # Start
    start_flags = [
        ('smile_start_C1', 'C1'),
        ('smile_start_C1equal', 'C1='),
        ('smile_start_c1', 'c1'),
        ('smile_start_cc1', 'cc1'),
        ('smile_start_c1sc', 'c1sc'),
        ('smile_start_c1ccc', 'c1ccc'),
        ('smile_start_nH', '[nH]'),
        ('smile_start_C1equalCCequalC', 'C1=CC=C'),
    ]
    for column, prefix in start_flags:
        df[column] = smiles.map(lambda s, prefix=prefix: s.startswith(prefix))

    # End
    end_flags = [
        ('smile_end_c1ccc', 'c1ccc'),
        ('smile_end_o1', 'o1'),
        ('smile_end_ccsc12', 'ccsc12'),
    ]
    for column, ending in end_flags:
        df[column] = smiles.map(lambda s, ending=ending: s.endswith(ending))

    ###############
    ### N-GRAMS ###
    ###############
    # Vocabulary source: the first 3000 SMILES strings glued together.
    joint = ''.join(smiles.iloc[:3000])
    _add_ngram_flags(df, smiles, joint, 2, 200, "_bigram")
    _add_ngram_flags(df, smiles, joint, 3, 200, "_trigram")
    _add_ngram_flags(df, smiles, joint, 4, 200, "_quadgram")
    _add_ngram_flags(df, smiles, joint, 5, 100, "_quintgram")
    _add_ngram_flags(df, smiles, joint, 6, 100, "_sixgram")

    ################################
    ### DROP UNNECESSARY COLUMNS ###
    ################################
    df = df.drop('smile_length', axis=1)

    return df

In [6]:
# Make the features
# print() with a single parenthesised argument produces the same output on
# Python 2 and Python 3, matching the print(...) calls used in later cells.
print("Making Features")
df_all = makeFeatures(df_all)
df_all.shape

Making Features


(1824230, 858)

In [7]:
# Split the train and test sets
# The raw SMILES text is not a model input, so it is removed before the frame
# is turned into an array.
df_all = df_all.drop(['smiles'], axis=1)
vals = df_all.values
# Rows before test_idx are the training examples, the rest are test examples.
X_train = vals[:test_idx]
X_test = vals[test_idx:]
# Single-argument print(...) with %-formatting prints the same text on
# Python 2 and Python 3 (the shape tuple is wrapped so it formats as one value).
print("Train features: %s" % (X_train.shape,))
print("Train gap: %s" % (Y_train.shape,))
print("Test features: %s" % (X_test.shape,))

Train features: (1000000, 857)
Train gap: (1000000,)
Test features: (824230, 857)


## Splitting the data into Train, Validation & Test sets

In [8]:
# Two-stage hold-out split of the X_train set:
#   1) 20% is set aside as a temporary test set (random_state=37);
#   2) 20% of what remains becomes the validation set (random_state=42).
X_tmp_trainvalidate, X_tmp_test, Y_tmp_trainvalidate, Y_tmp_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.20,
    random_state=37,
)
X_tmp_train, X_tmp_validate, Y_tmp_train, Y_tmp_validate = train_test_split(
    X_tmp_trainvalidate,
    Y_tmp_trainvalidate,
    test_size=0.20,
    random_state=42,
)

## Validating the Data
Here we record the validation-set RMSE obtained for each feature set and hyperparameter setting we tried.

### Basic RF with SMILE Character Counts
 Validation RMSE for n = 10: 0.144<br/>
 Validation RMSE for n = 15: 0.142<br/>
 Validation RMSE for n = 20: 0.142<br/>
 Validation RMSE for n = 30: 0.141<br/>
 Validation RMSE for n = 40: 0.141
 
 The best N for RF was: 40 with RMSE: .141

### Basic RF with Empty Columns Dropped
Validation RMSE for n = 10: 0.1436

### Basic RF with Feature Engineering
Validation RMSE for n = 10: 0.1293<br/>
Validation RMSE for n = 30: 0.1263<br/>
Validation RMSE for n = 40: 0.1260

### Basic RF with Feature Engineering and min_samples_split testing
Validation RMSE for n = 10, min_samples_split = 2: 0.1296 <br/>
Validation RMSE for n = 10, min_samples_split = 5: 0.1267<br/>
Validation RMSE for n = 10, min_samples_split = 8: 0.1259<br/>
Validation RMSE for n = 10, min_samples_split = 12: 0.1255<br/>
Validation RMSE for n = 10, min_samples_split = 15: 0.1264<br/>

Validation RMSE for n = 30, min_samples_split = 12: 0.1233<br/>

Validation RMSE for n = 40, min_samples_split = 12: 0.1230<br/>

### Basic RF with Bigrams and min_samples_split testing
50 top bigrams: Validation RMSE for n = 40, min_samples_split = 12: 0.1140<br/>
100 top bigrams: Validation RMSE for n = 40, min_samples_split = 12: 0.1021<br/>
200 top bigrams: Validation RMSE for n = 40, min_samples_split = 12: 0.1011

### Basic RF with Bigrams, Trigrams and min_samples_split testing
100 top bigrams + 100 top trigrams: Validation RMSE for n = 40, min_samples_split = 12: 0.0864

### Basic RF with Bigrams, Trigrams, Quadgrams and min_samples_split testing
100 top bigrams + 100 top trigrams + 100 top quadgrams: Validation RMSE for n = 40, min_samples_split = 12: 0.0819<br/>
200 top bigrams + 100 top trigrams + 100 top quadgrams: Validation RMSE for n = 40, min_samples_split = 12: 0.0809

### Basic RF with Quints
200 top bigrams + 200 top trigrams + 100 top quadgrams + 100 top quintgrams: Validation RMSE for n = 40, min_samples_split = 12: 0.0778<br/>
200 top bigrams + 200 top trigrams + 100 top quadgrams + 100 top quintgrams: Validation RMSE for n = 50, min_samples_split = 12: 0.0777

In [12]:
# Validate and tune the Random Forest Regressor
# The incumbent RMSE starts at +inf so the first evaluated configuration always
# replaces it; with a finite sentinel such as 1.0, a grid whose RMSEs are all
# larger would report the never-evaluated defaults below as "best".
bestRFValidateRMSE = np.inf
bestRFN = 10
bestRFS = 2

# Grid over number of trees (n) and min_samples_split (s)
for n in [50]:
    for s in [12]:
        RF = RandomForestRegressor(n_estimators=n, min_samples_split=s)
        RF.fit(X_tmp_train, Y_tmp_train)
        # Root-mean-squared error on the validation split
        rmse = np.sqrt(np.mean((RF.predict(X_tmp_validate) - Y_tmp_validate) ** 2))
        print("Validation RMSE for n = %d, min_samples_split = %d: %.4f" % (n, s, rmse))
        if rmse < bestRFValidateRMSE:
            bestRFValidateRMSE = rmse
            bestRFN = n
            bestRFS = s

# print(...) call form, consistent with the loop above and valid on Python 2 and 3
print("The best N for RF was: %d, with min_samples_split: %d, with RMSE: %.4f" % (bestRFN, bestRFS, bestRFValidateRMSE))

Validation RMSE for n = 50, min_samples_split = 12: 0.0769
The best N for RF was: 50, with min_samples_split: 12, with RMSE: 0.0769


## Testing the Data

Once all feature engineering and model tuning have been completed, evaluate the tuned model on the withheld test set.

In [15]:
# Test the final settings for the Random Forest Regressor
# print(...) call form so the cell runs on both Python 2 and Python 3
print("BEST RF N: %d" % bestRFN)
# Refit on the training split with the selected hyperparameters, then report
# the RMSE on the temporary test split.
RF = RandomForestRegressor(n_estimators=bestRFN, min_samples_split=bestRFS)
RF.fit(X_tmp_train, Y_tmp_train)
print("Test RMSE: %.4f" % np.sqrt(np.mean((RF.predict(X_tmp_test) - Y_tmp_test) ** 2)))

BEST RF N: 50


KeyboardInterrupt: 

## Prediction Creation

Create the prediction.csv to upload to Kaggle.com.

In [9]:
# Train the forest (50 trees, min_samples_split=12) on the full X_train set,
# then predict the gap for every row of X_test
RF = RandomForestRegressor(n_estimators=50, min_samples_split=12).fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

In [10]:
def write_to_file(filename, predictions):
    """Write predictions to ``filename`` as a two-column CSV.

    The file starts with the header ``Id,Prediction``; each following line
    holds a 1-based running id and the ``str()`` form of one prediction, in
    iteration order. An existing file is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            ",".join((str(row_id), str(value))) + "\n"
            for row_id, value in enumerate(predictions, 1)
        )

In [11]:
# Save the test-set predictions to prediction.csv
write_to_file(filename="prediction.csv", predictions=RF_pred)