In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

##Seaborn for fancy plots. 
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (8,8)

# 3950 Assignment 1: Part 2

For this assignment we want to use some sort of tree based model to classify the data below. We have a very small training set, so overfitting is a very real concern. 

Some specifics for this assignment:
<ul>
<li>Please paste in any outside functions you may use before submitting. E.g. if you're importing any functions from a util file, paste them in here for this. The reason for this is that it makes it massively easier for me when downloading a submission from everyone. Please put the blocks with those functions before they're called, so I can hit Run All to run the entire workbook. 
<li>
</ul>

In [90]:
df = pd.read_csv("train.csv")
df = df.drop(columns={"id"})
df["target"] = df["target"].astype("int32")
df.sample(10)

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
7,0,0.668,-0.077,0.014,-1.523,1.207,0.81,-0.879,0.965,-0.181,...,-0.923,-0.643,0.559,-0.271,-0.848,-0.437,0.1,-1.654,-0.237,-1.219
77,0,-1.687,0.399,-1.021,-1.213,-0.742,-0.941,0.853,2.404,-1.456,...,-0.354,0.39,-0.054,1.907,-0.542,-2.135,0.568,-0.874,0.026,-0.627
188,1,0.514,3.138,1.419,-0.496,1.464,0.023,1.712,-1.317,0.044,...,-0.846,-1.254,-0.688,-0.455,-0.348,0.494,-0.332,0.241,-1.48,1.178
142,0,0.081,0.312,0.015,0.688,-0.145,0.894,-0.153,-1.351,-0.183,...,0.307,-0.62,1.04,0.224,-0.678,-0.596,0.203,1.138,1.896,0.952
192,0,-0.41,-0.366,0.125,0.823,0.277,0.26,-1.615,-2.295,-0.462,...,-0.484,-0.556,-0.374,-0.021,-0.488,-0.294,1.144,0.941,-0.531,-1.061
180,0,0.868,0.288,0.379,-0.726,-0.383,-0.458,-0.441,0.325,0.043,...,1.318,-1.535,0.46,0.978,-2.015,2.338,0.48,-0.617,-0.674,1.072
228,0,0.832,2.206,-1.86,0.766,-0.365,1.692,0.345,-0.383,-0.604,...,0.084,-0.228,-0.452,0.446,1.41,-1.919,-0.3,0.638,-0.042,1.204
179,1,-0.61,-0.7,0.609,-1.425,0.612,0.041,-0.445,0.135,0.68,...,0.025,0.99,-1.121,-1.526,0.361,0.841,0.071,-0.196,0.027,1.43
41,0,-0.576,-3.041,0.315,-0.092,-0.103,1.379,0.188,1.259,0.009,...,-0.895,-1.087,-1.105,0.003,-0.993,0.404,2.353,1.491,-0.967,0.249
248,1,-0.451,-0.204,-0.762,0.261,0.022,-1.487,-1.122,0.141,0.369,...,0.729,0.411,2.366,-0.021,0.16,0.045,0.208,-2.117,-0.546,-0.093


In [91]:
#Check for missing
df.isna().sum().sort_values(ascending=False)

target    0
206       0
204       0
203       0
202       0
         ..
99        0
98        0
97        0
96        0
299       0
Length: 301, dtype: int64

Create a trial run to see what a default forrest looks like. 

In [92]:

y_trial = np.array(df["target"]).reshape(-1,1)
X_trial = np.array(df.drop(columns={"target"}))
X_trainT, X_testT, y_trainT, y_testT = train_test_split(X_trial, y_trial.ravel(), test_size=.3)

trial_forrest = RandomForestClassifier()
trial_pipe = [('scale', StandardScaler()),('forest', trial_forrest) ]
pipe = Pipeline(trial_pipe)
# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set
pipe.fit(X_trainT, y_trainT)
print("Score:", pipe.score(X_testT, y_testT))
trial_depths = [estimator.tree_.max_depth for estimator in trial_forrest.estimators_]
print("Avg Depth:", np.mean(trial_depths))

Score: 0.7733333333333333
Avg Depth: 8.56


Create model using grid search to tune HPs. The training set is very small, so calculation of many options should be pretty fast. 

I'm going to scale the data, but I suspect that will not be a massive impact. 

In [93]:
#Create Pipeline with Scaling. 
scaler = StandardScaler()
estimator = RandomForestClassifier(n_jobs=-1, verbose=0)
pipe = Pipeline(steps=[("scaler", scaler), ("forrest", estimator)])

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_trial, y_trial.ravel(), test_size=.3)

rf_para = {'forrest__min_samples_split':[3,4,5,6,7,8,9,10],
#            'forrest__criterion':["gini","entropy"],
            'forrest__max_depth':[5,6,7,8,9],
            'forrest__n_estimators':[100,150,175],
            'forrest__max_samples':[.4, .5, .6, .7],
            'forrest_max_features':[100,120,140,160,180,200]}

#rf_para = {'forrest__max_depth':[3,4,5,6,7,8,9]}
 
clf = GridSearchCV(pipe, param_grid=rf_para, cv=10, n_jobs=-1) 
print("post grid")
clf.fit(X_train, y_train.ravel())
print("post fit")
best = clf.best_estimator_
print("post best")
print(best.score(X_test, y_test))
print("post score")

post grid
post fit
post best
post score


In [102]:
print(best.score(X_test, y_test))

0.72


In [95]:
#Load Test Data
test_df = pd.read_csv("test.csv.zip")
test_df["id"] = test_df["id"].astype("int32")

In [96]:
sol = pd.read_csv("overfit_sol.csv")
sol = sol[249:19999]
sol.rename(columns={"case_id":"id", "Target_Practice":"target"}, inplace=True)
sol["id"] = sol["id"].astype("int32")

In [97]:
full_test = test_df.assign(sol_id=pd.Series(sol["id"]).values)
full_test = test_df.assign(target=pd.Series(sol["target"]).values)
full_test.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,291,292,293,294,295,296,297,298,299,target
0,250,-0.677,1.721,-0.745,-0.838,0.149,-1.138,0.242,0.504,-1.829,...,-0.403,0.759,-0.6,0.951,-0.349,0.446,-0.819,-0.277,1.297,1
1,251,-0.731,-0.251,0.059,0.054,1.149,2.462,0.836,0.719,-2.269,...,1.114,0.657,0.76,0.899,-1.612,-1.701,1.107,-0.314,-0.641,1
2,252,1.119,1.036,1.22,1.518,0.265,-0.088,0.245,-0.533,-0.921,...,-0.736,0.367,0.154,0.83,-1.352,0.914,0.377,0.588,-0.912,1
3,253,-0.933,0.212,-0.053,0.57,-1.54,-1.108,0.462,1.022,-0.215,...,-0.958,0.762,-0.213,-2.171,0.83,1.435,0.125,2.782,0.619,0
4,254,-0.208,-0.556,2.641,0.853,-0.384,0.312,0.514,0.481,-1.929,...,0.213,0.568,-0.935,-0.015,0.267,0.739,1.34,-0.178,1.01,0


In [101]:
#Please leave this as is at the end of your file. 
# best should be your final trained model. 
test_y = np.array(full_test["target"]).reshape(-1,1)
test_X = np.array(full_test.drop(columns={"id","target"}))
print(cross_val_score(best, test_X, test_y.ravel(), cv=5, scoring='roc_auc'))
print(best.score(test_X,test_y))

[0.50691139 0.50825135 0.49917602 0.50575812 0.49927857]
0.5017721518987341
