# The RFC Model
## Setup

In [None]:
import importlib
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Working directory for every relative path used below.
# TODO: replace "." with the path to the project root before running.
default_path = "."
os.chdir(default_path)

df = pd.read_csv("data/processed/full_df_filled_coded.csv")

# The helper module's name starts with a digit, so it is not a valid identifier
# for a plain `from ... import ...` statement; load it by name instead.
_data_preprocessing = importlib.import_module("1_data_preprocessing")
test_train_manual = _data_preprocessing.test_train_manual
select_features = _data_preprocessing.select_features


## Create train and test sets

In [None]:
# Split the full frame into test/train features and targets using the project helper.
# NOTE(review): 'Leavers' is presumably the target column and 'id' the row identifier;
# the helper is unpacked test-first here. Confirm both against test_train_manual's definition.
X_test, X_train, y_test, y_train = test_train_manual(df, 'Leavers', 'id')
# select_features runs a RFC model with all features and selects the most important features.
# It is unpacked into the reduced train/test feature frames plus the feature importances.
imp_X_train, imp_X_test, feature_importances = select_features(X_test, X_train, y_test, y_train)

## Model with reduced features

In [None]:
# Fit a 400-tree forest on the reduced (important-features-only) training set,
# then report per-class scores on the reduced test set.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=400,
).fit(imp_X_train, y_train)

print(
    classification_report(
        y_test,
        rf.predict(imp_X_test),
    )
)

## Model with down sampling
We have a very unbalanced data set, so we'll try downsampling the majority class.

In [None]:
# Class balance of the full data set: rows with Leavers == 0 (stay)
# versus rows with Leavers == 1 (leave), as a share of those two groups.
count_stay = int((df["Leavers"] == 0).sum())
count_leav = int((df["Leavers"] == 1).sum())
perc_stay = count_stay / (count_stay + count_leav)
print("percentage of stay is", perc_stay * 100)
perc_leav = count_leav / (count_stay + count_leav)
print("percentage of leave ", perc_leav * 100)


We'll only downsample the training set. To do that, we first combine the y_train with the X training data. 

In [None]:
# Combine the training features with their target into one frame.
# assign() returns a new DataFrame, so imp_X_train itself is never modified
# (no temporary 'Leavers' column to add and drop again, and the cell can be
# re-run safely). The index is reset so row labels equal row positions, which
# the positional .iloc lookup used for undersampling relies on.
df_train = imp_X_train.assign(Leavers=y_train).reset_index(drop=True)
print("length of training df",len(df_train))

Next we keep every leaver and draw only a subset of the non-leavers. We'll try a range of ratios and choose the best-performing one. For example, a ratio of 4 means there are 4 non-leavers for each leaver, so we sample (ratio × number of leavers) rows from the non-leaver indices (`stay_ind`).

In [None]:
# Row labels of df_train split by class, materialised as NumPy arrays:
# leavers (Leavers == 1) and non-leavers (Leavers == 0).
leav_ind = np.array(df_train.index[df_train["Leavers"] == 1])
stay_ind = np.array(df_train.index[df_train["Leavers"] == 0])

In [None]:
def undersample(stay_ind, leav_ind, times, train_df=None, random_state=None):
    """Build an undersampled training set with `times` stayers per leaver.

    All rows listed in `leav_ind` are kept, and `times * len(leav_ind)` rows are
    drawn without replacement from `stay_ind`.

    Parameters
    ----------
    stay_ind : array of row positions of the non-leavers (Leavers == 0).
    leav_ind : array of row positions of the leavers (Leavers == 1).
    times : int, number of non-leavers to keep for each leaver.
    train_df : DataFrame containing the features and a 'Leavers' column.
        Defaults to the notebook-level `df_train`.
    random_state : optional seed for the draw. When None, NumPy's global
        random state is used (so `np.random.seed` still controls it).

    Returns
    -------
    (under_X_train, under_y_train) : the sampled feature frame (every column
        except 'Leavers') and the matching 'Leavers' series.

    Raises
    ------
    ValueError (from NumPy) if more stayers are requested than are available.
    """
    if train_df is None:
        train_df = df_train

    # Size the draw from the leavers actually present in the training data.
    # Using a count taken on the full data set would request too many stayers
    # and silently produce a different ratio than `times`.
    n_stay = times * len(leav_ind)

    if random_state is None:
        draw = np.random.choice
    else:
        draw = np.random.RandomState(random_state).choice
    stay_ind_undersample = np.array(draw(stay_ind, n_stay, replace=False))

    # Positions of all leavers followed by the sampled stayers
    selected = np.concatenate([leav_ind, stay_ind_undersample])
    undersample_df = train_df.iloc[selected, :]

    n_rows = len(undersample_df['Leavers'])
    print("the stay proportion is :",len(undersample_df[undersample_df.Leavers==0])/n_rows)
    print("the leav proportion is :",len(undersample_df[undersample_df.Leavers==1])/n_rows)
    print("total number of record in resampled df is:",n_rows)

    # Separate the features from the target again
    feature_cols = [col for col in undersample_df.columns.values.tolist() if col != 'Leavers']
    under_X_train = undersample_df[feature_cols]
    under_y_train = undersample_df['Leavers']
    return under_X_train, under_y_train

We train the model on the undersampled data and evaluate it on the full test set. We repeat this for ratios 4 through 7 (`range(4, 8)`).

In [None]:
# Compare stayer:leaver ratios 4 through 7 on the untouched test set.
# undersample() draws with np.random.choice, i.e. from NumPy's global random
# state, so seed it once here; together with the classifier's random_state this
# makes the comparison repeatable on a fresh run.
np.random.seed(42)
for ratio in range(4, 8):
    print("the undersample data for {} proportion".format(ratio))
    print()
    under_X_train, under_y_train = undersample(stay_ind, leav_ind, ratio)
    print("------------------------------------------------------------")
    print()
    print("the model classification for {} proportion".format(ratio))
    print()

    # Same seed as the other forests in this notebook
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(under_X_train, under_y_train)
    print(classification_report(y_test, clf.predict(imp_X_test)))

The best performance is with a ratio of 6.

In [None]:
# Draw a fresh 6:1 undersample, refit the existing `rf` estimator on it,
# and score that refit model on the reduced test set.
under_X_train, under_y_train = undersample(stay_ind, leav_ind, 6)

print(
    classification_report(
        y_test,
        rf.fit(under_X_train, under_y_train).predict(imp_X_test),
    )
)

## Improve Parameters

In [None]:
#FIND THE BEST PARAMETERS
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV

# NOTE: this rebinds `rf` to a fresh, unfitted estimator; it must be fitted
# (or searched over) before it can be used for prediction.
rf = RandomForestClassifier(random_state = 42)
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

# Number of trees in random forest: 3 evenly spaced values from 100 to 2000
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 3)]
# Number of features to consider at every split.
# 'auto' is no longer accepted by current scikit-learn releases and meant the
# same as 'sqrt' for classifiers, so 'log2' is searched as the alternative.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 120, num = 3)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
## Forest with the tuned hyper-parameters (update these values if you ran the
## parameter search), fitted on the undersampled training data and evaluated
## on the reduced test set.
rf_imp_para = RandomForestClassifier(
    n_estimators=400,
    max_features='sqrt',
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
).fit(under_X_train, under_y_train)
predictions = rf_imp_para.predict(imp_X_test)
print(classification_report(y_test, predictions))


## Model Performance 

In [None]:
# Train and Test Accuracy
# Train accuracy is measured on the full (not undersampled) training features.
print("Train Accuracy :: ", accuracy_score(y_train, rf_imp_para.predict(imp_X_train)))
test_accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy  :: ", test_accuracy)
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

# The confusion matrix holds integer counts, so annotate with an integer
# format rather than three decimal places.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(test_accuracy)
ax.set_title(all_sample_title, size=15);

In [None]:
#cross validation

from sklearn import model_selection

# shuffle=True is required for random_state to be accepted by KFold, and it
# matters here: the undersampled frame lists all leavers first and the sampled
# stayers after them, so unshuffled folds would have very skewed class mixes.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
modelCV = rf_imp_para
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, under_X_train, under_y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))
print(results.std())

In [None]:
#PRECISION RECALL CURVE

# precision-recall curve and f1
# Evaluate the fitted, tuned forest. `rf` cannot be used here because the
# parameter-search cell rebinds it to an unfitted estimator.
rf_imp = rf_imp_para
# predicted probability of the positive outcome (second column) only
probs = rf_imp.predict_proba(imp_X_test)[:, 1]
# predict class values
yhat = rf_imp.predict(imp_X_test)
# calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, probs)
# calculate F1 score
f1 = f1_score(y_test, yhat)
# calculate precision-recall AUC; store it under its own name so the imported
# `auc` function is not overwritten and this cell can be re-run
pr_auc = metrics.auc(recall, precision)
# calculate average precision score
ap = average_precision_score(y_test, probs)
print('f1=%.3f auc=%.3f ap=%.3f' % (f1, pr_auc, ap))

# A no-skill classifier's precision equals the share of positives in the test
# set, which is far from 0.5 on imbalanced data.
no_skill = float(np.mean(np.asarray(y_test) == 1))

fig, ax = plt.subplots()
# plot no skill
ax.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No skill')
# plot the precision-recall curve for the model
ax.plot(recall, precision, marker='.', label='Random forest')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-recall curve')
ax.legend()
# show the plot
plt.show()

# Export the model
I wanted to export one of the trees because it gives more tangible material for presenting the results, even though a single tree is not very informative.

In [None]:
#________________EXPORTING THE MODEL
from sklearn.tree import export_graphviz
import pydot

# Pick a single tree of the tuned forest to visualise
estimator=rf_imp_para.estimators_[10]
# Export as dot file.
# feature_names is passed as a plain list of column names; NOTE(review): some
# scikit-learn releases appear to reject a pandas Index here — confirm for the
# installed version.
export_graphviz(estimator, out_file='tree.dot',
                feature_names = list(under_X_train.columns),
                class_names = True,
                rounded = True, proportion = False,
                precision = 2, filled = True)
# Convert to png (requires Graphviz). Add the Graphviz directory to PATH only
# once, so re-running this cell does not keep growing the variable.
graphviz_bin = 'C:/Appl/release/bin'
if graphviz_bin not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + graphviz_bin
(graph,) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree_5.png')
