In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

##Seaborn for fancy plots. 
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (8,8)

# 3950 Assignment 1: Part 2

For this assignment we want to use some sort of tree based model to classify the data below. We have a very small training set, so overfitting is a very real concern. 

Some specifics for this assignment:
<ul>
<li>Please paste in any outside functions you may use before submitting. E.g. if you're importing any functions from a util file, paste them in here for this. The reason for this is that it makes it massively easier for me when downloading a submission from everyone. Please put the blocks with those functions before they're called, so I can hit Run All to run the entire workbook. 
<li>
</ul>

In [13]:
df = pd.read_csv("train.csv")
df = df.drop(columns={"id"})
df["target"] = df["target"].astype("int32")
df.sample(10)

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
183,0,-0.079,1.641,-0.209,1.656,0.694,-0.763,-1.388,0.921,0.28,...,-0.641,0.684,1.669,0.009,-0.558,-0.805,-0.35,-0.127,-1.719,1.557
114,0,1.207,0.407,-0.087,0.229,-0.682,1.03,0.053,0.628,0.388,...,0.154,0.28,1.208,0.87,-0.774,-1.211,-1.056,-0.076,0.765,-0.335
23,0,1.256,1.212,-0.433,-1.691,0.046,-0.293,0.78,1.082,-0.404,...,0.446,-0.081,-1.027,0.034,0.891,0.322,0.144,1.069,1.391,-0.459
20,0,-1.624,1.721,-0.175,1.126,0.051,0.335,-0.57,0.849,0.527,...,-0.139,1.108,0.154,-0.579,-0.279,0.359,-1.959,0.463,-0.653,-1.498
5,0,0.739,0.211,-0.836,-1.43,-0.291,-0.989,0.091,0.107,-2.313,...,-1.501,-1.96,0.671,0.091,-1.467,-1.011,-0.118,-0.257,-0.337,-1.064
215,0,-0.765,-0.215,0.276,-0.022,-0.505,-0.116,-0.137,-0.141,0.798,...,-1.631,-0.835,-2.228,-0.583,-0.396,-0.464,0.366,2.329,0.946,0.401
74,1,-1.908,-0.374,1.005,0.687,-0.925,-0.605,2.307,0.892,-0.648,...,0.556,0.843,-0.832,-0.046,-1.669,0.718,1.842,0.224,1.099,0.664
94,0,-1.479,0.635,-2.967,-0.046,-0.324,1.198,-0.188,1.555,0.375,...,1.18,0.494,0.99,-0.038,-1.525,0.465,0.374,1.004,-0.319,0.326
178,0,-2.133,-0.204,-0.28,-1.912,-1.01,0.118,-0.934,-0.365,-0.836,...,1.95,-0.712,-0.047,1.682,1.945,0.7,0.089,0.084,-0.222,-1.396
44,1,1.157,-0.013,-0.106,-0.379,0.62,-0.442,-0.022,0.536,0.003,...,0.073,0.032,-0.063,-0.227,-1.921,-0.389,-0.872,0.885,1.297,0.033


In [11]:
#Check for missing
df.isna().sum().sort_values(ascending=False)

id     0
205    0
203    0
202    0
201    0
      ..
98     0
97     0
96     0
95     0
299    0
Length: 302, dtype: int64

Create a trial run to see what a default forrest looks like. 

In [22]:

y_trial = np.array(df["target"]).reshape(-1,1)
X_trial = np.array(df.drop(columns={"target"}))
X_trainT, X_testT, y_trainT, y_testT = train_test_split(X_trial, y_trial.ravel(), test_size=.3)

trial_forrest = RandomForestClassifier()
trial_pipe = [('scale', StandardScaler()),('forest', trial_forrest) ]
pipe = Pipeline(trial_pipe)
# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set
pipe.fit(X_trainT, y_trainT)
print("Score:", pipe.score(X_testT, y_testT))
trial_depths = [estimator.tree_.max_depth for estimator in trial_forrest.estimators_]
print("Avg Depth:", np.mean(trial_depths))

Score: 0.7066666666666667
Avg Depth: 8.74


Create model using grid search to tune HPs. The training set is very small, so calculation of many options should be pretty fast. 

I'm going to scale the data, but I suspect that will not be a massive impact. 

In [23]:
#Create Pipeline with Scaling. 
scaler = StandardScaler()
estimator = RandomForestClassifier(n_jobs=-1)
pipe = Pipeline(steps=[("scaler", scaler), ("forrest", estimator)])

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_trial, y_trial.ravel(), test_size=.3)

rf_para = {'forrest__min_samples_split':[3,4,5,6,7,8,9,10,11,12],
            'forrest__criterion':["gini","entropy"],
            'forrest__max_depth':[5,6,7,8,9],
            'forrest__n_estimators':[75,100,125,150,175],
            'forrest__max_samples':[.4, .5, .6, .7]}

#rf_para = {'forrest__max_depth':[3,4,5,6,7,8,9]}
 
clf = GridSearchCV(pipe, param_grid=rf_para, cv=10, n_jobs=-1) 
clf.fit(X_train, y_train.ravel())
best = clf.best_estimator_
best.score(X_test, y_test)

KeyboardInterrupt: 

In [None]:
#Please leave this as is at the end of your file. 
# best should be your final trained model. 
test_df = pd.read_csv("test.csv.zip")
test_y = np.array(test_df["target"]).reshape(-1,1)
test_X = np.array(test_df.drop(columns={"target"}))
print(best.score(test_X, test_y.ravel()))