## Part 4 : Machine Learning

###  Import  Necessary Libraries

In [40]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
#import os
import re
import seaborn as sns
%matplotlib inline
from statistics import mode
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.metrics import roc_curve,auc,confusion_matrix,accuracy_score,classification_report 
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from operator import itemgetter
#Advanced optimization
from scipy import optimize as op

#### Reading the clean data : Numeric and Categorical

In [2]:
df = pd.read_pickle('categ_data.pkl')

In [3]:
num_df=pd.read_pickle('numeric_data.pkl')

In [4]:
df.head()

Unnamed: 0,grade,sub_grade,emp_title,home_ownership,addr_state,newPurpose,verification,defaulter
0,C,C4,Ryder,RENT,GA,car,Verified,Yes
1,C,C1,AIR RESOURCES BOARD,RENT,CA,other,Verified,No
2,B,B5,University Medical Group,RENT,OR,other,Verified,No
3,A,A4,Veolia Transportaton,RENT,AZ,personal,Verified,No
4,C,C5,Southern Star Photography,RENT,NC,credit_card,Not Verified,No


In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
grade,C,C,B,A,C
home_ownership,RENT,RENT,RENT,RENT,RENT
addr_state,GA,CA,OR,AZ,NC
newPurpose,car,other,other,personal,credit_card
verification,Verified,Verified,Verified,Verified,Not Verified
defaulter,Yes,No,No,No,No


In [8]:
predictors=df.copy()
predictors.drop(['defaulter'],inplace=True,axis=1)

In [5]:
df.drop(columns=['emp_title','sub_grade'],inplace=True,axis=1)

#### Function for NaNs in check

In [14]:
def nanSummary(dataset):
    summary={}
    
    nanS=dataset.isnull().sum().sort_values(ascending=False)
    for key in nanS.keys():
        if nanS[key] > 0:
            summary[key]=nanS[key]
    if summary == {}:
        summary='Data is now Clean'
    
    return summary        

## Making Dummy Categorical Variables using : One-Hot- Encoding

In [9]:
dummy=pd.get_dummies(predictors)

In [10]:
X_pred=pd.concat([num_df,dummy],axis=1)

In [15]:
nanSummary(X_pred)

'Data is now Clean'

#### Encoding Response Labels : '0' : Non Defaulter , '1' : Yes Defaulter

In [18]:
lb = LabelEncoder()
df["y"] = lb.fit_transform(df["defaulter"])
df[["defaulter", "y"]].head(10)
response=df['y']

In [20]:
df=pd.concat([X_pred,response],axis=1)

In [22]:
nanSummary(df)

'Data is now Clean'

### It can be seen from above that this dataset suffers from a severe class imbalance problem and it this problem is not resolved then it can lead to falsified predictions

In [23]:
count=df['y'].value_counts()
count

0    33969
1     5883
Name: y, dtype: int64

### To solve this class imbalance problem the strategy used in this algorithm is 'Down Sampling the Majority Class' . 

### In this strategy , we take the following steps:

#### First, we'll separate observations from each class into different DataFrames.
#### Next, we'll resample the majority class without replacement, setting the number of samples to match that of the minority class.
#### Finally, we'll combine the down-sampled majority class DataFrame with the original minority class DataFrame.

In [24]:
cntClass0= count[0]
cntClass1= count[1]

In [25]:
df_majority = df[df.y==0]
df_minority = df[df.y==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=cntClass1,     # to match minority class
                                 random_state=123) # reproducible results

In [26]:
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [27]:
df_downsampled.y.value_counts()

1    5883
0    5883
Name: y, dtype: int64

In [28]:
nanSummary(df_downsampled)

'Data is now Clean'

In [29]:
X = df_downsampled.drop('y',axis=1)
y = df_downsampled['y']
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.30)

# Hyperparameter tuning of a Random Forest 

### Grid Search Method is used for searching optimal hyperparameters

### We initially start with arbitrary parameters and search for best parameters by assigning top scoring parameters to the model. Thus we rank the model by its accuracy and select these parameters for our final model 

In [100]:
def hyperParamSummary(grid_scores, n_top):
    BestScores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(BestScores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.4f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [30]:
RFmodel = RandomForestClassifier(min_samples_split = 3, 
                             max_leaf_nodes = 75, 
                             n_estimators = 100,
                             max_depth = 6,
                             min_samples_leaf = 5,
                             oob_score =True, n_jobs = -1)

In [31]:
RFmodel.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=75,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [32]:
pred = RFmodel.predict(X_validation)

In [33]:
print( accuracy_score(y_validation, pred) )

0.916312659303


In [34]:
print( np.unique( pred ) )

[0 1]


In [39]:
 print(confusion_matrix(y_validation, pred))

[[1054   90]
 [ 107 1103]]


### Investigating the Metrics : Recall, Precision and f1 score for the model 

In [35]:
print(classification_report(y_validation,pred))

             precision    recall  f1-score   support

          0       0.91      0.92      0.91      1144
          1       0.92      0.91      0.92      1210

avg / total       0.92      0.92      0.92      2354



In [38]:
fpr, tpr, thresholds = roc_curve(y_validation, pred, pos_label=1)
auc(fpr, tpr)   

0.91644945963127789

# SMOTE Approach (Synthetic Minority oversampling approach)

In [45]:
from collections import Counter
from sklearn.datasets import make_classification
import imblearn
from imblearn import over_sampling
from imblearn.over_sampling import SMOTE 

In [86]:
sm = SMOTE(ratio='auto', random_state=40, k_neighbors=5,
           m=None, m_neighbors=10, out_step=0.5, 
           kind='regular', n_jobs=1)

In [87]:
X_data=df.drop(['y'],axis=1)
y_data=df['y']

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.30)

In [89]:
X_train.shape

(27896, 91)

In [90]:
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [91]:
y_train_res_df=pd.DataFrame(y_train_res)

In [92]:
y_train_res_df[0].value_counts()

1    23805
0    23805
Name: 0, dtype: int64

# Hyperparameter tuning of a Random Forest 

### Grid Search Method is used for searching optimal hyperparameters

### We initially start with arbitrary parameters and search for best parameters by assigning top scoring parameters to the model. Thus we rank the model by its accuracy and select these parameters for our final model 

In [101]:
RF_model = RandomForestClassifier(random_state = 84)

param_grid = {"n_estimators": [100, 200],
              "max_depth": [5, 6],
              "min_samples_split": [5, 10],
              "min_samples_leaf": [3, 5],
              "max_leaf_nodes": [50, 75]}

In [111]:
grid_search = GridSearchCV(RF_model, param_grid=param_grid)
grid_search.fit(X_train_res, y_train_res)

hyperParamSummary(grid_search.grid_scores_, 4)

Model with rank: 1
Mean validation score: 0.9591)
Parameters: {'min_samples_leaf': 3, 'n_estimators': 200, 'max_leaf_nodes': 75, 'min_samples_split': 10, 'max_depth': 6}

Model with rank: 2
Mean validation score: 0.9590)
Parameters: {'min_samples_leaf': 5, 'n_estimators': 200, 'max_leaf_nodes': 75, 'min_samples_split': 5, 'max_depth': 6}

Model with rank: 3
Mean validation score: 0.9590)
Parameters: {'min_samples_leaf': 5, 'n_estimators': 200, 'max_leaf_nodes': 75, 'min_samples_split': 10, 'max_depth': 6}

Model with rank: 4
Mean validation score: 0.9590)
Parameters: {'min_samples_leaf': 3, 'n_estimators': 200, 'max_leaf_nodes': 75, 'min_samples_split': 5, 'max_depth': 6}



### Using Hyperparameters from Summary for our Final Model

In [122]:
RFmodel = RandomForestClassifier(min_samples_split = 3, 
                             max_leaf_nodes = 75, 
                             n_estimators = 200,
                             max_depth = 6,
                             min_samples_leaf = 10,
                             oob_score =True, n_jobs = -1)

In [123]:
RFmodel.fit(X_train_res,y_train_res)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=75,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [124]:
pred = RFmodel.predict(X_test)

In [125]:
print( accuracy_score(y_test, pred) )

0.953830712613


In [126]:
print( np.unique( pred ) )

[0 1]


In [127]:
print(confusion_matrix(y_test, pred))

[[9975  189]
 [ 363 1429]]


### Investigating the Metrics : Recall, Precision and f1 score for the model 

In [128]:
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

          0       0.96      0.98      0.97     10164
          1       0.88      0.80      0.84      1792

avg / total       0.95      0.95      0.95     11956

