<a href="https://colab.research.google.com/github/ankithakumari/SupervisedML/blob/master/RandomForest_Interpreter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
from sklearn import model_selection, linear_model, metrics, discriminant_analysis
from sklearn.ensemble import RandomForestClassifier
import xgboost



In [3]:
from google.colab import files
uploaded = files.upload()

credit_data = pd.read_csv('German_Final.csv')
y = credit_data['ClassificationStatus']

Saving German_Final.csv to German_Final.csv


In [0]:
def reset_data():
    # train test split 70:30 using stratify
    X_train, X_test, y_train, y_test = model_selection.train_test_split(credit_data.loc[:, credit_data.columns != 'ClassificationStatus'], credit_data['ClassificationStatus'], test_size=0.30, stratify=y)
    # Use one-hot encoding for categorical variables 
    cat_preds = [column for column in X_train.columns if len(credit_data[column].unique()) <= 10]
    continuous_preds = [column for column in X_train.columns if len(credit_data[column].unique()) > 10]
    continuous_preds.append('DurationInCurrentResidence')
    continuous_preds.append('InstallmentRate')
    cat_preds.remove('DurationInCurrentResidence')
    cat_preds.remove('InstallmentRate')
    X_train = pd.get_dummies(X_train, columns=cat_preds, drop_first= True)
    X_test = pd.get_dummies(X_test, columns=cat_preds, drop_first=True)
    return (X_train, X_test, y_train, y_test)


In [0]:
X_train, X_test, y_train, y_test = reset_data()

In [0]:
params = {
    'n_estimators' : [100, 200, 500, 800, 1000],
    'max_depth' : [3, 4, 7, 8, 10, 15, 20],
    "min_samples_split": [2, 3, 10]
    }
clf = RandomForestClassifier(class_weight='balanced')
grid_search = model_selection.GridSearchCV(clf, param_grid=params, cv=5)

In [10]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators='warn', n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100, 200, 500, 800, 1000], 'max_depth': [3, 4, 7, 8, 10, 15, 20], 'min_samples_split': [2, 3, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=8, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [0]:
rf_fit = grid_search.best_estimator_
rf_fit.fit(X_train, y_train)
y_pred = rf_fit.predict(X_test)

In [13]:
metrics.confusion_matrix(y_pred, y_test)

array([[ 43,  24],
       [ 47, 186]])

In [14]:
metrics.precision_recall_fscore_support(y_pred, y_test)

(array([0.47777778, 0.88571429]),
 array([0.64179104, 0.79828326]),
 array([0.5477707 , 0.83972912]),
 array([ 67, 233]))

In [16]:
metrics.accuracy_score(y_test, y_pred)

0.7633333333333333

In [65]:
importance = pd.DataFrame(rf_fit.feature_importances_, index=X_train.columns, 
                          columns=["Importance"])
importance.sort_values(by=['Importance'], ascending=False)[0:15]

Unnamed: 0,Importance
CreditAmount,0.102867
ExistingAccountStatus_No Account,0.090979
Age,0.086281
Duration,0.078466
ExistingAccountStatus_None,0.055942
CreditHistory_critical account/ other credits existing (not at this bank),0.034339
DurationInCurrentResidence,0.027532
InstallmentRate,0.026386
SavingsAccount_Below 100 DM,0.026053
ExistingAccountStatus_Below 200 DM,0.0247


In [64]:
importance.sort_values(by=['Importance'], ascending=False)[-5:]

Unnamed: 0,Importance
Purpose_Repairs,0.004529
"Job_Unemployed, Unskilled",0.003848
Purpose_Others,0.001432
Purpose_Retraining,0.000994
ExistingCredits_4,0.000796


In [0]:
!pip install -q treeinterpreter


In [0]:
from treeinterpreter import treeinterpreter as ti

In [61]:
rf_fit.predict_proba(np.array(X_train.iloc[1]).reshape(1, -1))

array([[0.54087488, 0.45912512]])

In [0]:
instance = np.array(X_train.iloc[1]).reshape(1, -1)

In [0]:
prediction, bias, contributions = ti.predict(rf_fit, instance)

In [60]:
for c, feature in zip(contributions[0], 
                             X_train.columns):
      if (c[1] > 0.01) or (c[1] < -0.01):
          print(feature, c)

CreditAmount [-0.01591669  0.01591669]
DurationInCurrentResidence [ 0.01416965 -0.01416965]
Age [-0.02093174  0.02093174]
ExistingAccountStatus_Below 200 DM [-0.02152897  0.02152897]
ExistingAccountStatus_No Account [-0.1105204  0.1105204]
ExistingAccountStatus_None [-0.03305077  0.03305077]
CreditHistory_critical account/ other credits existing (not at this bank) [ 0.0250396 -0.0250396]
CreditHistory_delay in paying off in the past  [ 0.01858604 -0.01858604]
Purpose_Repairs [ 0.10929355 -0.10929355]
SavingsAccount_Below 100 DM [-0.02691212  0.02691212]
SavingsAccount_None [ 0.01156269 -0.01156269]
Property_None [ 0.0405462 -0.0405462]
OtherInstallment_None [-0.01343928  0.01343928]
Housing_Own [ 0.02219738 -0.02219738]
PeopleLiable_2 [ 0.02347926 -0.02347926]
Telephone_yes [ 0.01402986 -0.01402986]
