# Random Forest

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [31]:
df=pd.read_pickle('gmsc_clean.pkl')
df=df.sample(frac=0.1,random_state=4).reset_index(drop=True)
df.head()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberRealEstateLoansOrLines,NumberOfDependents
0,132394,0,1.0,76,0.31934,2000.0,6,0,0.0
1,93855,0,0.266782,44,0.154811,3668.0,7,0,2.0
2,106376,0,0.479971,74,0.28883,10500.0,11,1,0.0
3,7391,0,0.460477,42,0.204466,6000.0,10,0,1.0
4,84921,0,0.392186,39,0.355366,4583.0,5,1,1.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 9 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   ID                                    150000 non-null  int64  
 1   SeriousDlqin2yrs                      150000 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 3   age                                   150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         150000 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberRealEstateLoansOrLines          150000 non-null  int64  
 8   NumberOfDependents                    150000 non-null  float64
dtypes: float64(4), int64(5)
memory usage: 10.3 MB


In [33]:
df['SeriousDlqin2yrs'].value_counts()

0    13983
1     1017
Name: SeriousDlqin2yrs, dtype: int64

In [34]:
df['SeriousDlqin2yrs'].value_counts(normalize=True)

0    0.9322
1    0.0678
Name: SeriousDlqin2yrs, dtype: float64

In [32]:
y=df['SeriousDlqin2yrs']
X=df.drop(['ID','SeriousDlqin2yrs'],axis=1)

In [39]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score

In [21]:
rfc = RandomForestClassifier(n_estimators=100,random_state=4,n_jobs=-1)

In [22]:
rfc.fit(X_train,y_train)
y_train_pred=rfc.predict(X_train)
y_train_prob=rfc.predict_proba(X_train)[:,1]

print('Train')
print('Confusion Matrix:\n',confusion_matrix(y_train,y_train_pred))
print('\nAccuracy Score:',accuracy_score(y_train,y_train_pred))
print('AUC:',roc_auc_score(y_train,y_train_prob))

Train
Confusion Matrix:
 [[98011     0]
 [   10  6979]]

Accuracy Score: 0.9999047619047619
AUC: 0.9999999576641643


In [23]:
y_test_pred=rfc.predict(X_test)
y_test_prob=rfc.predict_proba(X_test)[:,1]

print('Test')
print('Confusion Matrix:\n',confusion_matrix(y_test,y_test_pred))
print('\nAccuracy Score:',accuracy_score(y_test,y_test_pred))
print('AUC:',roc_auc_score(y_test,y_test_prob))

Test
Confusion Matrix:
 [[41872    91]
 [ 3006    31]]

Accuracy Score: 0.9311777777777778
AUC: 0.7257853283437655


### Random Forest Tuning

In [1]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

In [27]:
rfc = RandomForestClassifier()

param_dist = {'n_estimators':sp_randint(25,250),
             'max_features':sp_randint(1,7),
              'max_depth':sp_randint(2,15),
              'min_samples_leaf':sp_randint(1,20),
             'min_samples_split':sp_randint(2,50),
             'criterion':['gini','entropy']}

In [35]:
rsearch=RandomizedSearchCV(rfc,param_distributions=param_dist,scoring='roc_auc',
                           n_iter=10,cv=3,n_jobs=-1,random_state=4)
rsearch.fit(X,y)
rsearch.best_params_

{'max_depth': 9,
 'max_features': 2,
 'min_samples_leaf': 9,
 'min_samples_split': 38,
 'n_estimators': 203}

In [40]:
rfc = RandomForestClassifier(**rsearch.best_params_,random_state=4,n_jobs=-1)
rfc.fit(X_train,y_train)
y_train_pred=rfc.predict(X_train)
y_train_prob=rfc.predict_proba(X_train)[:,1]

print('Train')
print('Confusion Matrix:\n',confusion_matrix(y_train,y_train_pred))
print('\nAccuracy Score:',accuracy_score(y_train,y_train_pred))
print('AUC:',roc_auc_score(y_train,y_train_prob))

Train
Confusion Matrix:
 [[9807    0]
 [ 693    0]]

Accuracy Score: 0.934
AUC: 0.8913436245953835


In [41]:
y_test_pred=rfc.predict(X_test)
y_test_prob=rfc.predict_proba(X_test)[:,1]

print('Test')
print('Confusion Matrix:\n',confusion_matrix(y_test,y_test_pred))
print('\nAccuracy Score:',accuracy_score(y_test,y_test_pred))
print('AUC:',roc_auc_score(y_test,y_test_prob))

Test
Confusion Matrix:
 [[4176    0]
 [ 324    0]]

Accuracy Score: 0.928
AUC: 0.7726662646043234


In [5]:
print(sp_randint(1,2))

<scipy.stats._distn_infrastructure.rv_frozen object at 0x000002021E740820>
