In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score,recall_score,accuracy_score,f1_score,confusion_matrix, roc_auc_score

In [None]:
# Inputs: the four feature/label CSV pairs produced by wavelet_features.py.
feature_files = [
    'features_wavelet_1.csv',
    'features_wavelet_2.csv',
    'features_wavelet_3.csv',
    'features_wavelet_4.csv',
]
label_files = [
    'labels_wavelet_1.csv',
    'labels_wavelet_2.csv',
    'labels_wavelet_3.csv',
    'labels_wavelet_4.csv',
]

# Feature files are parsed with their header row; label files are parsed with
# header=None, so their columns are named by integer position (0, 1, ...).
df_f1, df_f2, df_f3, df_f4 = [pd.read_csv(path) for path in feature_files]
df_l1, df_l2, df_l3, df_l4 = [pd.read_csv(path, header=None) for path in label_files]

# Stack the four parts under a fresh 0..n-1 index, then discard the
# 'Unnamed: 0' feature column and label column 0
# (presumably the row index saved alongside the data -- TODO confirm).
df_f = pd.concat([df_f1, df_f2, df_f3, df_f4], ignore_index=True).drop(columns=['Unnamed: 0'])
df_l = pd.concat([df_l1, df_l2, df_l3, df_l4], ignore_index=True).drop(columns=[0])

In [None]:
# Treat +inf / -inf as missing, then fill every missing entry with the mean of
# its column.
# NOTE(review): the column means are taken over all rows of df_f, i.e. before
# the train/test split performed later in this notebook, so held-out rows
# contribute to the imputed values -- confirm this is acceptable.
df_f = df_f.replace([np.inf, -np.inf], np.nan)
df_f = df_f.fillna(df_f.mean())

In [None]:
# Feature matrix (the cleaned feature frame itself) and a flat 1-D label vector.
X = df_f
y = df_l.to_numpy().ravel()

# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions alike in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=11,
    stratify=y,
)

In [None]:
# Fit a random forest on the training split and report its scores on the
# held-out split. RandomForestClassifier is already imported in the notebook's
# import cell, so it is not imported again here.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=25,
    min_samples_split=5,
    min_samples_leaf=5,
    random_state=33,  # fixed seed so the fitted forest is reproducible
)
rf.fit(X_train, y_train)

# Hard class predictions feed the threshold-based metrics below.
y_pred = rf.predict(X_test)
# ROC AUC is computed from scores rather than labels: column 1 of predict_proba
# is the probability of the class listed second in rf.classes_.
y_pred_proba = rf.predict_proba(X_test)

# Label every number so the cell output can be read without the code.
print('confusion matrix:')
print(confusion_matrix(y_test, y_pred))
print('precision:', precision_score(y_test, y_pred))
print('recall   :', recall_score(y_test, y_pred))
print('f1       :', f1_score(y_test, y_pred))
print('accuracy :', accuracy_score(y_test, y_pred))
print('roc auc  :', roc_auc_score(y_test, y_pred_proba[:, 1]))

In [None]:
# Exhaustive grid search (5-fold CV, scored by ROC AUC) for the forest's
# depth / split / leaf hyperparameters.
# This was skipped during submission as it took too long: the grid holds
# 25 * 18 * 18 = 8100 candidates, i.e. 40500 forest fits across the 5 folds.
params = {
    'max_depth': np.arange(5, 30, 1),
    'min_samples_split': np.arange(2, 20, 1),
    'min_samples_leaf': np.arange(2, 20, 1),
}
clf = RandomForestClassifier(random_state=33, n_estimators=100)
# n_jobs=-1 spreads the candidate/fold fits over all available cores. The
# candidates and their scores are unchanged (fixed forest seed, deterministic
# fold assignment); only the wall-clock time drops.
clf_gs = GridSearchCV(clf, params, scoring='roc_auc', cv=5, n_jobs=-1)
clf_gs.fit(X_train, y_train)