In [1]:
import sys
sys.version

'2.7.11 |Anaconda 2.4.1 (x86_64)| (default, Dec  6 2015, 18:57:58) \n[GCC 4.2.1 (Apple Inc. build 5577)]'

In [2]:
import time
import numpy as np
import math
import pandas as pd
import itertools

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline
import matplotlib.pyplot as plt

<div class="alert alert-info">
<strong>LOAD DATA</strong>
</div>

In [3]:
# Read the target table and discard its 'Unnamed: 0' column
# (presumably a row index saved by an earlier export - TODO confirm).
df_Y = pd.read_csv('data/train_Y.csv').drop('Unnamed: 0', axis=1)

# Training targets: the GAP column as a plain array.
Y_train = df_Y['GAP'].values

# Number of labelled rows; the split cell further down uses this as the row
# offset at which the unlabelled rows start in the combined feature table.
test_idx = Y_train.shape[0]
test_idx

1000000

In [4]:
# Read the 3-char feature table and discard its 'Unnamed: 0' column
# (presumably a saved row index - TODO confirm).
df_enh = pd.read_csv('data/enhaced_features_3char.csv').drop('Unnamed: 0', axis=1)
df_enh.shape

(1824230, 593)

In [5]:
# Read the three bond-type count features and discard the 'Unnamed: 0' column
# (presumably a saved row index - TODO confirm).
df_bondtypes = pd.read_csv('data/bond_type_counts.csv').drop('Unnamed: 0', axis=1)
df_bondtypes.shape

(1824230, 3)

In [6]:
# Put the bond-type counts next to the 3-char features, column-wise.
# NOTE(review): pd.concat(axis=1) pairs rows by index label; both frames are
# expected to carry the same default row index - confirm.
df_comb = pd.concat(
    [df_enh, df_bondtypes],
    axis=1,
)
df_comb.shape

(1824230, 596)

In [7]:
# Cut the combined feature matrix back into its two parts: the first
# `test_idx` rows are the training features, everything after is test.
lcols = df_comb.columns.values.tolist()
vals = df_comb.values
X_train, X_test = vals[:test_idx], vals[test_idx:]

# Report the resulting shapes (one "label: shape" line each).
print('Train features: %s' % (X_train.shape,))
print('Train gap: %s' % (Y_train.shape,))
print('Test features: %s' % (X_test.shape,))

Train features: (1000000, 596)
Train gap: (1000000,)
Test features: (824230, 596)


In [8]:
def write_to_file(filename, predictions):
    """Write predictions to `filename` as a two-column CSV.

    The file is opened in write mode (an existing file is replaced). It starts
    with the header line ``Id,Prediction``; every following line holds a
    1-based row id and the ``str()`` of the matching prediction.

    filename    -- path of the file to create
    predictions -- iterable of values, written in iteration order
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            "%d,%s\n" % (row_id, str(value))
            for row_id, value in enumerate(predictions, 1)
        )

<div class="alert alert-info">
<strong>PREDICTORS</strong>
</div>

**Default parameters**

In [None]:
# Random forest regressor with library-default settings: fit on the first
# 800000 training rows and score on the remaining rows, which are not used
# for fitting.
RF = RandomForestRegressor()
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

**Modified parameters**

<a href='http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html'>http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html</a>

In [None]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=20,  # default = 10
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

<div class="alert alert-success">
<strong>improved</strong>
</div>

In [None]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=5,  # default = 10
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

<div class="alert alert-danger">
<strong>did not improve</strong>
</div>

In [None]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=100,  # default = 10
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

<div class="alert alert-success">
<strong>improved</strong>
</div>

In [None]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=150,
    max_features=0.5,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

In [None]:
# Random forest regressor: fit on every labelled row, then predict the
# test rows and write those predictions to a CSV file.
RF = RandomForestRegressor(
    n_estimators=150,
    max_features=0.5,
    min_samples_leaf=1,
)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

# This RMSE is measured on the very rows used for fitting, so it is an
# in-sample figure, not a held-out estimate.
RF_rmse = math.sqrt(
    mean_squared_error(Y_train, RF.predict(X_train))
)

print('New random forest RMSE = %0.5f' % RF_rmse)
print('Baseline random forest RMSE = 0.27188')
write_to_file('data/RF_3char_bondtypes_tuned.csv', RF_pred)

<div class="alert alert-success">
<strong>improved</strong>
</div>

In [None]:
# Random forest regressor: fit on every labelled row, then predict the
# test rows and write those predictions to a CSV file.
RF = RandomForestRegressor(
    n_estimators=100,
    max_features=0.6,
    min_samples_leaf=1,
)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

# This RMSE is measured on the very rows used for fitting, so it is an
# in-sample figure, not a held-out estimate.
RF_rmse = math.sqrt(
    mean_squared_error(Y_train, RF.predict(X_train))
)

print('New random forest RMSE = %0.5f' % RF_rmse)
print('Baseline random forest RMSE = 0.27188')
write_to_file('data/RF_3char_bondtypes_tuned2.csv', RF_pred)

<div class="alert alert-success">
<strong>improved</strong>
</div>

In [9]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=10,
    max_features=1.0,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.03073
Random forest RMSE - validation set = 0.06686
Baseline random forest RMSE = 0.27188


In [10]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=20,
    max_features=1.0,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02774
Random forest RMSE - validation set = 0.06407
Baseline random forest RMSE = 0.27188


In [11]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=50,
    max_features=1.0,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02592
Random forest RMSE - validation set = 0.06259
Baseline random forest RMSE = 0.27188


In [12]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=100,
    max_features=1.0,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02524
Random forest RMSE - validation set = 0.06196
Baseline random forest RMSE = 0.27188


In [13]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=100,
    max_features=0.6,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02398
Random forest RMSE - validation set = 0.05814
Baseline random forest RMSE = 0.27188


In [14]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=150,
    max_features=1.0,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02504
Random forest RMSE - validation set = 0.06184
Baseline random forest RMSE = 0.27188


In [15]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=150,
    max_features=0.7,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02374
Random forest RMSE - validation set = 0.05798
Baseline random forest RMSE = 0.27188


In [17]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=150,
    max_features=0.6,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

In [None]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=150,
    max_features=0.5,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

In [None]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=150,
    max_features=0.3,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

In [None]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=150,
    max_features='sqrt',
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

In [18]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=10,
    max_features=0.8,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02995
Random forest RMSE - validation set = 0.06376
Baseline random forest RMSE = 0.27188


In [19]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=20,
    max_features=0.8,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02685
Random forest RMSE - validation set = 0.06079
Baseline random forest RMSE = 0.27188


In [22]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=50,
    max_features=0.8,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02467
Random forest RMSE - validation set = 0.05880
Baseline random forest RMSE = 0.27188


In [23]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=100,
    max_features=0.8,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02411
Random forest RMSE - validation set = 0.05865
Baseline random forest RMSE = 0.27188


In [9]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=150,
    max_features=0.6,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02372
Random forest RMSE - validation set = 0.05798
Baseline random forest RMSE = 0.27188


In [10]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=200,
    max_features=0.6,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02358
Random forest RMSE - validation set = 0.05778
Baseline random forest RMSE = 0.27188


In [9]:
# Random forest regressor: fit on every labelled row, then predict the
# test rows and write those predictions to a CSV file.
RF = RandomForestRegressor(
    n_estimators=200,
    max_features=0.6,
    min_samples_leaf=1,
)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

# This RMSE is measured on the very rows used for fitting, so it is an
# in-sample figure, not a held-out estimate.
RF_rmse = math.sqrt(
    mean_squared_error(Y_train, RF.predict(X_train))
)

print('New random forest RMSE = %0.5f' % RF_rmse)
print('Baseline random forest RMSE = 0.27188')
# NOTE(review): the n_estimators=150 full-data cell in this notebook writes to
# this same path, so whichever of the two runs last replaces the other's
# predictions - confirm that is intended.
write_to_file('data/RF_3char_bondtypes_f1_6.csv', RF_pred)

New random forest RMSE = 0.02288
Baseline random forest RMSE = 0.27188


In [9]:
# Random forest regressor: fit on every labelled row, then predict the
# test rows and write those predictions to a CSV file.
RF = RandomForestRegressor(
    n_estimators=150,
    max_features=0.6,
    min_samples_leaf=1,
)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

# This RMSE is measured on the very rows used for fitting, so it is an
# in-sample figure, not a held-out estimate.
RF_rmse = math.sqrt(
    mean_squared_error(Y_train, RF.predict(X_train))
)

print('New random forest RMSE = %0.5f' % RF_rmse)
print('Baseline random forest RMSE = 0.27188')
# NOTE(review): the n_estimators=200 full-data cell in this notebook writes to
# this same path, so whichever of the two runs last replaces the other's
# predictions - confirm that is intended.
write_to_file('data/RF_3char_bondtypes_f1_6.csv', RF_pred)

New random forest RMSE = 0.02298
Baseline random forest RMSE = 0.27188


In [10]:
# Random forest regressor: fit on the first 800000 training rows and score
# on the remaining rows, which are not used for fitting.
RF = RandomForestRegressor(
    n_estimators=200,
    max_features=0.7,
    min_samples_leaf=1,
)
RF.fit(X_train[:800000], Y_train[:800000])

# In-sample error (the same rows the forest was fitted on)
RF_rmse_train = math.sqrt(
    mean_squared_error(Y_train[:800000], RF.predict(X_train[:800000]))
)
# Held-out error (rows 800000 onward)
RF_rmse_val = math.sqrt(
    mean_squared_error(Y_train[800000:], RF.predict(X_train[800000:]))
)

print('Random forest RMSE - training set = %0.5f' % RF_rmse_train)
print('Random forest RMSE - validation set = %0.5f' % RF_rmse_val)
print('Baseline random forest RMSE = 0.27188')

Random forest RMSE - training set = 0.02359
Random forest RMSE - validation set = 0.05783
Baseline random forest RMSE = 0.27188
