In [None]:
# I played around with different numbers of estimators and different numbers of features
# From A. Müller (mglearn): https://github.com/amueller/mglearn/blob/master/mglearn/tools.py
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, colorConverter, LinearSegmentedColormap
import pandas as pd
import os
%matplotlib inline

# Working directory of the kernel. Not used by the load below, but kept as a
# module-level name in case other cells rely on it.
cwd = os.getcwd()

# Location of the cleaned Lending Club loan CSV.
# Set the LOAN_DATA_CSV environment variable to point at your own copy of the
# file; when it is unset, the original author's local Windows path is used so
# existing setups keep working unchanged.
DEFAULT_DATA_PATH = 'A:\\Aditya\\NYU\\Assignments\\DwD\\DM_Project\\lending-club-loan-data\\loan_data_clean.csv'
data_path = os.environ.get('LOAN_DATA_CSV', DEFAULT_DATA_PATH)

# The first CSV column is used as the DataFrame index.
df = pd.read_csv(data_path, index_col=0)


In [None]:
# Quick look at the first few values of the addr_state column.
df['addr_state'].head()

In [None]:
# Draw 70% of the rows (without replacement) for training. The fixed
# random_state makes the sample, and therefore the split, repeatable.
train_df = df.sample(frac=0.7, replace=False, random_state=42)

# Every row whose index label was not sampled goes to the test set.
in_train = df.index.isin(train_df.index)
test_df = df[~in_train]

# Sanity check: the two partitions together should account for every row.
len(df) == len(train_df) + len(test_df)

In [None]:
# Declare the target variable and the training / testing frames.
target = 'loan_status'
train = train_df
test = test_df

# Split each frame into features (X) and label (Y).
# `axis` is passed by keyword: the positional form `drop(target, 1)` is
# deprecated in pandas 1.3 and rejected by pandas 2.x.
X_train = train.drop(target, axis=1)
Y_train = train[target]
X_test = test.drop(target, axis=1)
Y_test = test[target]

In [None]:
import seaborn as sns
from sklearn.metrics import roc_curve, auc,roc_auc_score
%matplotlib inline
from sklearn.model_selection import KFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

In [None]:
#Create the function to calculate and return auc score
def AUCCal(preds, truth, label_string):
    '''
    Plot the ROC curve of `preds` against `truth` and return its AUC.

    preds is an nx1 array of predictions (scores)
    truth is an nx1 array of truth labels
    label_string is text to go into the plotting label

    If the AUC of the raw scores is below 0.5, the scores are negated and the
    curve/AUC are recomputed, so the returned value reflects how well the
    feature ranks the classes regardless of its direction.

    Side effects: draws the ROC curve and the diagonal chance line on the
    current pyplot axes and fixes both axis limits to [0, 1].

    Returns the (possibly direction-corrected) AUC as a float.
    '''

    #1. call the roc_curve function to get the ROC X and Y values
    fpr, tpr, _ = roc_curve(truth, preds)
    #2. Input fpr and tpr into the auc function to get the AUC
    roc_auc = auc(fpr, tpr)

    # A score that ranks the classes in reverse is just as informative once
    # its sign is flipped, so evaluate the negated scores instead.
    if roc_auc < 0.5:
        fpr, tpr, _ = roc_curve(truth, -1 * preds)
        roc_auc = auc(fpr, tpr)

    # Label the curve with the caller-supplied text, not a global variable,
    # so the function works outside of a loop that happens to define `col`.
    plt.plot(fpr, tpr, label = str(label_string) + ' (AUC = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])

    return roc_auc

In [None]:
# Score each feature on its own: use the raw training column as the predictor
# of the target, record its AUC, and overlay all ROC curves on one figure.
featureset = [name for name in X_train.columns.values]
feature_auc_dict = {}

fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot(111)

# AUCCal draws onto the current axes (ax) and returns the feature's AUC.
for col in X_train.columns:
    col_auc = AUCCal(X_train[col], Y_train, col)
    feature_auc_dict[col] = col_auc

ax.set(xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")

# Re-apply the axes box, then place the legend underneath the plot.
# This second legend call replaces the one created just above.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width, box.height])
legend_opts = dict(loc='upper center', bbox_to_anchor=(0.5, -0.15),
                   fancybox=True, shadow=True, ncol=4, prop={'size': 10})
ax.legend(**legend_opts)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest (entropy split criterion) on the training data.
# random_state is pinned, like the seeded train/test sample, so the fitted
# trees and the feature importances derived from them are the same on every run.
rf_best = RandomForestClassifier(criterion='entropy', random_state=42)
rf_best = rf_best.fit(X_train, Y_train)



In [None]:
%matplotlib inline
# Bar chart of the fitted random forest's per-feature importances.
cols = X_train.columns.values
rf_fi = rf_best.feature_importances_

# Bar width in x-axis units (0.8 is matplotlib's default). Defined here so the
# cell does not depend on a `width` left over from some other kernel state.
width = 0.8

fig, ax = plt.subplots(figsize=(25, 10))

# One bar per feature, positioned at 0..len(cols)-1.
positions = np.arange(len(cols))
ax.bar(positions, rf_fi, width, color='b', label='RF')

# Label each bar with its column name, rotated to reduce overlap.
ax.set_xticks(positions)
ax.set_xticklabels(cols, rotation=45)
plt.title('Feature Importance from RF')
ax.set_ylabel('Normalized Gini Importance')
plt.legend(loc=1)