# RandomForest Binary Classification Experimental Notebook

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from numpy import mean
import pandas as pd
import numpy as np
import os 

In [None]:
# Shared min-max scaler used by the data-loading cells; the target range
# is written out explicitly (it equals the library default).
scaler = MinMaxScaler(feature_range=(0, 1))

## Load Train Data

In [None]:
# Read the training CSV and remember its column names, because the scaler
# returns a plain array without labels.
train_data = pd.read_csv('./DATA/train.csv')
columns = list(train_data.columns)

# Fit the scaler on the training table (every column, including the
# target) and rebuild a labelled DataFrame from the scaled array.
train_data = scaler.fit_transform(train_data)
train_df = pd.DataFrame(data=train_data, columns=columns)

# Separate the feature matrix from the 'DMIndicator' target column.
X_train = train_df.drop(columns=['DMIndicator'])
y_train = train_df['DMIndicator']

## Randomized Hyperparameter Search

In [None]:
# Randomized hyperparameter search for the random forest: 100 sampled
# parameter combinations, each scored with 10-fold cross-validation on
# the training data.
rfc = RandomForestClassifier()
# number of trees in random forest: 10 values evenly spaced over [10, 500]
n_estimators = [int(x) for x in np.linspace(start=10, stop=500, num=10)]
# number of features at every split.
# 'auto' was only an alias of 'sqrt' for classifiers (so the previous
# ['auto', 'sqrt'] list held a single distinct setting) and it is
# deprecated / rejected by newer scikit-learn releases; search over the
# two supported string options instead.
max_features = ['sqrt', 'log2']
# max depth: 11 values evenly spaced over [10, 500], plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 500, num=11)]
max_depth.append(None)
# create random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
}
# random search of parameters; random_state fixes which combinations are
# sampled, n_jobs=-1 uses all available cores
rfc_random = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=random_grid,
    n_iter=100,
    cv=10,
    verbose=2,
    random_state=123,
    n_jobs=-1,
)
# Fit the model
rfc_random.fit(X_train, y_train)
# print results
print(rfc_random.best_params_)

In [None]:
# Final model with hard-coded hyperparameters (presumably taken from the
# randomized search output -- confirm against the printed best_params_).
# max_features='sqrt' is what the deprecated 'auto' alias meant for a
# classifier, so the model is unchanged but also runs on scikit-learn
# releases that no longer accept 'auto'.
# class_weight='balanced' reweights classes inversely to their frequency.
rfc = RandomForestClassifier(n_estimators=227,
                             max_depth=10,
                             max_features='sqrt',
                             class_weight='balanced')
rfc.fit(X_train, y_train)

## Load Test Data

In [None]:
# Read the held-out test CSV and remember its column names, because the
# scaler returns a plain array without labels.
test_data = pd.read_csv('./DATA/test.csv')
columns = test_data.columns.tolist()
# Apply the scaling already fitted on the training data. Calling
# fit_transform here would re-learn min/max from the test set, so the
# same raw value would map to different scaled values than the model saw
# during training. Scaled test values may fall slightly outside [0, 1].
# NOTE(review): assumes test.csv has the same columns, in the same order,
# as train.csv -- confirm.
test_data = scaler.transform(test_data)
test_df = pd.DataFrame(test_data, columns=columns)

# Separate the 'DMIndicator' target column from the feature matrix.
y_test = test_df['DMIndicator']
X_test = test_df.drop('DMIndicator', axis=1)

## Evaluate Trained Model on Test Data

In [None]:
# Predict a class label for every row of the test feature matrix using
# the fitted forest.
rfc_predict = rfc.predict(X_test)

In [None]:
# Bare expression: in a notebook this displays the predicted labels.
rfc_predict

In [None]:
# Imported here so this cell runs on its own; the file's top-level import
# cell does not bring in roc_auc_score.
from sklearn.metrics import roc_auc_score

# Hard-label metrics of the trained model on the held-out test set.
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
# ROC AUC of the already-fitted model on the test set, computed from the
# predicted probability of the second (larger-valued) class.
print("=== Test AUC Score (trained model) ===")
rfc_test_proba = rfc.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, rfc_test_proba))
print('\n')
print("=== Mean AUC Score ===")
# NOTE: cross_val_score clones `rfc` and refits it on folds of the TEST
# data, so this number does not measure the model trained above; it is
# kept for continuity with earlier runs.
rfc_cv_score = cross_val_score(rfc, X_test, y_test, cv=10, scoring='roc_auc')
print(rfc_cv_score.mean())