In [19]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
import pandas as pd

In [6]:
from time import time
from datetime import timedelta

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV

In [13]:
# Load the processed features and labels. Both files are read with the first
# row as the header and the first column as the row index.
csv_options = {'header': 0, 'index_col': 0}
X = pd.read_csv('../data/processed/X_train', **csv_options)
y = pd.read_csv('../data/processed/y_train', **csv_options)

# Splitting Data into Test and Train

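- Explain: stratify — passing `stratify=y[["target"]]` keeps the class proportions of `target` the same in the train and test splits.
- Explain: transform + data leakage — the scaler is fitted on the training split only and then applied to the test split, so no test-set statistics leak into training.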

In [15]:
# Hold out 20% of the rows as a test set, stratified on the `target` column,
# with a fixed seed so the split is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y[["target"]],
)
X_train, X_test, y_train, y_test = split_parts

# Shapes in the order: train features, train labels, test features, test labels.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(189, 13) (189, 1) (48, 13) (48, 1)


In [10]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to the test split, so test statistics never influence training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Flatten the single-column label frames to 1-D arrays for scikit-learn.
# Wrapping in pd.DataFrame first makes this safe to re-run: on a second
# execution y_train / y_test are already ndarrays and have no `.values`.
y_train = pd.DataFrame(y_train).to_numpy().ravel()
y_test = pd.DataFrame(y_test).to_numpy().ravel()

# Tracking Metrics
- Here we set up a function to output a pretty confusion matrix, and a dataframe to keep track of performance metrics for each model.

In [21]:
def confusion(true, pred):
    """
    Function for pretty printing confusion matrices

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels. Entries are paired with `true` by position, so
        both inputs should have the same length.

    Returns
    -------
    pandas.DataFrame
        Square table of counts with rows named 'target' (true labels) and
        columns named 'predicted'. Both axes carry the same sorted set of
        labels: every label seen in either input. Combinations that never
        occur are 0.
    """
    # Drop any existing index so the two inputs are aligned purely by position.
    true = pd.Series(true).reset_index(drop=True)
    pred = pd.Series(pred).reset_index(drop=True)

    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true, pred)

    # A class that is never predicted (or only ever predicted) is missing from
    # one axis of the crosstab. Reindex both axes to the union of labels so the
    # matrix is always square instead of raising KeyError or dropping a class.
    labels = list(cm.index.union(cm.columns))
    cm = cm.reindex(index=labels, columns=labels, fill_value=0)
    return cm.rename_axis(index='target', columns='predicted')

In [22]:
# Empty scoreboard: one row per model, one column per cross-validated metric.
# NOTE(review): the only row is labelled 'Logistic Regression' although this
# notebook trains random forests; confirm the intended label.
metric_columns = ['F-score (cv)', 'Precision (cv)', 'Recall (cv)', 'Accuracy (cv)']
results_df = pd.DataFrame(columns=metric_columns, index=['Logistic Regression'])

# Random Forest - Version 1
This is a simple random forest with untuned parameters.

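- Explain: parameters used (`n_estimators=20`, `max_depth=4`, `random_state=42`)
- Output: feature importances (`rf.feature_importances_`; a random forest has no coefficients)
- Explain: Do the feature importances make sense?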

In [None]:
# Baseline forest: 20 trees of depth at most 4, with a fixed seed.
started_at = time()

rf = RandomForestClassifier(max_depth=4, n_estimators=20, random_state=42).fit(X_train, y_train)
rf_predicted = rf.predict(X_test)

# Wall-clock time spent fitting and predicting.
elapsed = timedelta(seconds=time() - started_at)
print(elapsed)

In [None]:
# 5-fold cross-validation of the baseline forest on the training split.
cv_metrics = ['f1_macro', 'precision_macro', 'recall_macro', 'accuracy']
cross_val_results = pd.DataFrame(cross_validate(rf, X_train, y_train, cv=5, scoring=cv_metrics))

# Store the mean of each metric across folds, in the same order as the
# results_df columns (F-score, precision, recall, accuracy).
# NOTE(review): the row label says 'Logistic Regression' but these are random
# forest scores; the label is kept because results_df is created with it.
results_df.loc['Logistic Regression', :] = cross_val_results[
    [f'test_{metric}' for metric in cv_metrics]
].mean().values

# rf_predicted holds predictions for X_test, so it must be compared with
# y_test (not y_train, which has a different length). The matrix is printed
# because only the last expression of a cell is displayed automatically.
print(confusion(y_test, rf_predicted))
results_df

In [None]:
init_time = time()

# Seed the estimator so repeated searches pick the same winner.
model = RandomForestClassifier(random_state=42)

# Hyperparameter grid. 'auto' is left out of max_features: for classifiers it
# was an alias of 'sqrt' (deprecated in scikit-learn 1.1, removed in 1.3), so
# it either duplicated 'sqrt' or failed and was silently scored 0 because of
# error_score=0.
n_estimators = [10, 40, 100, 200, 500, 1000]
max_features = ['sqrt', 'log2']
max_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
criterion = ['gini', 'entropy']

# define grid search
grid = dict(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, criterion=criterion)
cv = KFold(n_splits=10, random_state=42, shuffle=True)

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

# Fit once. `fit` returns the fitted search object itself, so both names below
# refer to the same fitted search; calling `fit` a second time would only
# repeat the whole search.
grid_result = grid_search.fit(X_train, y_train)
model_10CV = grid_result

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print(timedelta(seconds=(time()-init_time)))

In [None]:
# Top five parameter combinations by mean cross-validated score.
# The search tunes a random forest with a single scoring metric, so the
# available columns are the four `param_*` entries of the grid and the
# un-suffixed `*_test_score` columns.
(
    pd.DataFrame(model_10CV.cv_results_)
    .loc[:, ['param_n_estimators', 'param_max_features', 'param_max_depth', 'param_criterion',
             'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
    .head()
)

In [None]:
model = 'Random Forest Classfier'

# Manual sweep over forest size and tree depth: fit on the training split and
# report confusion matrix, accuracy and the per-class report on the test split.
# NOTE(review): every score here is computed on X_test / y_test, so choosing
# hyperparameters from this output would tune on the test set.
estimator_counts = (10, 20, 30, 40, 50, 100)
depth_limits = (2, 3, 4, 5, 6, 7, 8)
sweep = [(count, limit) for count in estimator_counts for limit in depth_limits]

for n_trees, depth in sweep:
    rf = RandomForestClassifier(n_estimators=n_trees, random_state=42, max_depth=depth).fit(X_train, y_train)
    rf_predicted = rf.predict(X_test)
    rf_conf_matrix = confusion_matrix(y_test, rf_predicted)
    rf_acc_score = accuracy_score(y_test, rf_predicted)
    print("confusion matrix", rf_conf_matrix, "\n", sep="\n")
    print(f"Accuracy of Random Forest with {n_trees} estimators and {depth} depth:{rf_acc_score*100} \n")
    print(classification_report(y_test, rf_predicted))

In [None]:
# Second candidate: 20 trees, depth at most 5.
rf2 = RandomForestClassifier(n_estimators=20, random_state=42, max_depth=5)
rf2.fit(X_train, y_train)

# Cross-validate rf2 itself (not `rf`, which is whatever model an earlier cell
# fitted last) and only on the training split, so the held-out test rows stay
# out of model assessment.
scores = cross_val_score(rf2, X_train, y_train, cv=5)
scores