In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import swifter
mpl.style.use('classic')
import seaborn as sns

from sklearn import ensemble
from sklearn.model_selection import cross_val_score

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw loan export into a DataFrame.
#   header=1              -> column names are on the file's second line; the
#                            line before it is skipped.
#   skipinitialspace=True -> ignore blanks that directly follow a delimiter.
df = pd.read_csv('LoanStats3d.csv', header=1, skipinitialspace=True)

In [3]:
# Discard the final two rows of the frame.
# NOTE(review): presumably these are non-data footer lines in the CSV - confirm.
# Not idempotent: every re-run of this cell removes two more rows.
df = df.iloc[:-2]

In [4]:
# Baseline classifier. A fixed random_state makes the forest - and therefore
# the feature-importance ranking derived from it - reproducible between runs.
rfc = ensemble.RandomForestClassifier(random_state=42)

# Split features from the target. `columns=` replaces the positional axis
# argument (`df.drop('loan_status', 1)`), which was deprecated in pandas 1.3
# and raises TypeError from pandas 2.0 onwards.
X = df.drop(columns='loan_status')
Y = df['loan_status']

In [5]:
# Snapshot of the string-typed (object dtype) columns, for inspection.
categorical = df.select_dtypes(include='object')

In [6]:
# Convert number-like text columns to numeric dtype. With errors='coerce' any
# value that cannot be parsed becomes NaN instead of raising.
id_as_number = pd.to_numeric(df['id'], errors='coerce')
# Interest rates carry a '%' sign that has to go before parsing.
rate_without_pct = df['int_rate'].str.strip('%')

df['id'] = id_as_number
df['int_rate'] = pd.to_numeric(rate_without_pct, errors='coerce')

In [7]:
# Drop categorical columns that would explode into too many dummy columns when
# one-hot encoded (per the original note: those with more than 30 options).
high_cardinality_cols = [
    'url', 'emp_title', 'zip_code', 'earliest_cr_line',
    'revol_util', 'sub_grade', 'addr_state', 'desc',
]
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts; reassigning instead of inplace=True keeps the data lineage explicit.
df = df.drop(columns=high_cardinality_cols)

In [8]:
# Build the model inputs: X holds the features, Y the target column.
# `columns=` replaces the positional axis argument (`df.drop('loan_status', 1)`),
# which raises TypeError from pandas 2.0 onwards.
# NOTE(review): `id` is still part of df here, so it can end up as a feature -
# confirm that is intended.
Y = df['loan_status']
X = (
    pd.get_dummies(df.drop(columns='loan_status'))  # one-hot encode the remaining text columns
    .dropna(axis=1)                                 # discard every feature column that contains a NaN
)

In [9]:
# Fit the forest on the full design matrix and read its per-feature importances.
rfc.fit(X, Y)
importances = list(rfc.feature_importances_)

# (feature name, importance rounded to 4 decimals) pairs, highest importance first.
feature_importances = sorted(
    ((name, round(score, 4)) for name, score in zip(X.columns, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

In [10]:
# 10-fold cross-validation of the forest on every feature, using all CPU cores.
cvscore = cross_val_score(rfc, X, Y, cv=10, n_jobs=-1)

# Report the per-fold scores, their mean, and a +/- band of two standard deviations.
for label, value in (
    ('Cross-Validation Score:\n', cvscore),
    ('Cross-Validation Mean:\n', cvscore.mean()),
    ('Cross-Validation +/- (2x STD):\n', cvscore.std() * 2),
):
    print(label, value)

Cross-Validation Score:
 [0.97910285 0.98043267 0.98119256 0.98178623 0.97573023 0.97848492
 0.9532879  0.98052673 0.98019332 0.98024035]
Cross-Validation Mean:
 0.9770977744992895
Cross-Validation +/- (2x STD):
 0.016196435144993436


In [11]:
# Names of the features whose (rounded) importance is at least 0.01, in the
# same descending-importance order as `feature_importances`.
features_ranked = [name for name, score in feature_importances if score >= 0.01]

features_ranked

['last_pymnt_amnt',
 'last_pymnt_d_Jan-2017',
 'out_prncp_inv',
 'next_pymnt_d_Feb-2017',
 'out_prncp',
 'total_rec_prncp',
 'total_pymnt_inv',
 'last_credit_pull_d_Jan-2017',
 'total_pymnt',
 'last_pymnt_d_Dec-2016',
 'recoveries',
 'last_pymnt_d_Nov-2016',
 'collection_recovery_fee',
 'total_rec_int',
 'last_pymnt_d_Oct-2016',
 'funded_amnt',
 'next_pymnt_d_Jan-2017',
 'installment']

In [12]:
# Working frame restricted to the top-ranked features; an explicit copy so the
# columns added later never touch X.
set1 = X[features_ranked].copy()

In [13]:
# Collapse the group1 features into one composite column, `var1`.
group1 = ['last_pymnt_d_Jan-2017', 'out_prncp', 'out_prncp_inv', 'next_pymnt_d_Feb-2017']

# Read the inputs from X, not set1. set1 only holds whichever columns the
# forest happened to rank highly, and this cell removes group1 from it, so
# reading from set1 raises KeyError on a re-run or when the ranking shifts.
# X has the same rows and the same values for these columns.
means = X[group1].mean(axis=0)
stds = X[group1].std(axis=0)

# Row-wise mean of the per-column z-scores ...
composite = ((X[group1] - means) / stds).mean(axis=1)
# ... shifted to start at 0 and scaled by its maximum, i.e. min-max rescaled to [0, 1].
composite = composite - composite.min()
set1['var1'] = composite / composite.max()

# Remove the raw inputs; errors='ignore' keeps the cell re-runnable once they are gone.
set1 = set1.drop(columns=group1, errors='ignore')

In [14]:
# Collapse the group2 features into one composite column, `var2`.
group2 = ['last_pymnt_amnt', 'total_rec_prncp', 'total_pymnt_inv', 'total_pymnt']

# Read the inputs from X, not set1. set1 only holds whichever columns the
# forest happened to rank highly, and this cell removes group2 from it, so
# reading from set1 raises KeyError on a re-run or when the ranking shifts.
# X has the same rows and the same values for these columns.
means = X[group2].mean(axis=0)
stds = X[group2].std(axis=0)

# Row-wise mean of the per-column z-scores ...
composite = ((X[group2] - means) / stds).mean(axis=1)
# ... shifted to start at 0 and scaled by its maximum, i.e. min-max rescaled to [0, 1].
composite = composite - composite.min()
set1['var2'] = composite / composite.max()

# Remove the raw inputs; errors='ignore' keeps the cell re-runnable once they are gone.
set1 = set1.drop(columns=group2, errors='ignore')

In [15]:
# Collapse the group3 features into one composite column, `var3`.
group3 = ['recoveries', 'collection_recovery_fee']

# Read the inputs from X, not set1. set1 only holds whichever columns the
# forest happened to rank highly, and this cell removes group3 from it, so
# reading from set1 raises KeyError on a re-run or when the ranking shifts.
# X has the same rows and the same values for these columns.
means = X[group3].mean(axis=0)
stds = X[group3].std(axis=0)

# Row-wise mean of the per-column z-scores ...
composite = ((X[group3] - means) / stds).mean(axis=1)
# ... shifted to start at 0 and scaled by its maximum, i.e. min-max rescaled to [0, 1].
composite = composite - composite.min()
set1['var3'] = composite / composite.max()

# Remove the raw inputs; errors='ignore' keeps the cell re-runnable once they are gone.
set1 = set1.drop(columns=group3, errors='ignore')

In [16]:
# Fresh forest for the reduced feature set; the fixed random_state keeps the
# cross-validation scores reproducible between runs.
rfc = ensemble.RandomForestClassifier(random_state=42)
Y = df['loan_status']

# 10-fold cross-validation on set1 (remaining top features plus the composites).
cvscore = cross_val_score(rfc, set1, Y, cv=10, n_jobs=-1)
# Labels match the recorded output and the all-features run, and state that the
# last figure is two standard deviations rather than one.
print('Cross-Validation Score:\n', cvscore)
print('Cross-Validation Mean:\n', cvscore.mean())
print('Cross-Validation +/- (2x STD):\n', (cvscore.std()*2))

Cross-Validation Score:
 [0.96276507 0.97098145 0.97392605 0.97729809 0.96874852 0.97337924
 0.97195374 0.97568216 0.97090745 0.97767539]
Cross-Validation Mean:
 0.9723317171435228
Cross-Validation +/- (2x STD):
 0.00840092934917324


In [17]:
# Repeats the 10-fold cross-validation of the forest on set1.
cvscore = cross_val_score(rfc, set1, Y, cv=10, n_jobs=-1)
# The last figure is two standard deviations, so label it as such (the old
# label said just 'STD'); labels also match the recorded output.
print('Cross-Validation Score:\n', cvscore)
print('Cross-Validation Mean:\n', cvscore.mean())
print('Cross-Validation +/- (2x STD):\n', (cvscore.std()*2))

Cross-Validation Score:
 [0.96276507 0.97098145 0.97392605 0.97729809 0.96874852 0.97337924
 0.97195374 0.97568216 0.97090745 0.97767539]
Cross-Validation Mean:
 0.9723317171435228
Cross-Validation +/- (2x STD):
 0.00840092934917324


In [18]:
# 10-fold cross-validation using only the three composite columns.
composite_cols = ['var1', 'var2', 'var3']
cvscore = cross_val_score(rfc, set1.loc[:, composite_cols], Y, cv=10, n_jobs=-1)
# The last figure is two standard deviations, so label it as such (the old
# label said just 'STD'); labels also match the recorded output.
print('Cross-Validation Score:\n', cvscore)
print('Cross-Validation Mean:\n', cvscore.mean())
print('Cross-Validation +/- (2x STD):\n', (cvscore.std()*2))

Cross-Validation Score:
 [0.86542709 0.90361188 0.89401819 0.90919237 0.87055331 0.9115412
 0.89774158 0.89479684 0.90101408 0.91462024]
Cross-Validation Mean:
 0.8962516784899247
Cross-Validation +/- (2x STD):
 0.03121137452962254


In [None]:
# Repeats the 10-fold cross-validation of the forest on the full set1 frame.
cvscore = cross_val_score(rfc, set1, Y, cv=10, n_jobs=-1)
# The last figure is two standard deviations, so label it as such (the old
# label said just 'STD').
print('Cross-Validation Score:\n', cvscore)
print('Cross-Validation Mean:\n', cvscore.mean())
print('Cross-Validation +/- (2x STD):\n', (cvscore.std()*2))

In [None]:
# NOTE(review): disabled PCA experiment, kept commented out. It refers to `x`
# and `dd`, neither of which is defined in the visible notebook (the feature
# matrix is named `X`), so it would not run if simply uncommented.
# # Conduct PCA
# pca = PCA(n_components=4)

# # data vs model parallelism -- multi core parameter

# # Fit data to PCA function, then transform
# pca.fit(x)
# principalComponents = pca.transform(x)
# # Place in dataframe
# principalDf = pd.DataFrame(data = principalComponents)
# # Concatenate dependents/independents to single dataframe
# finalDf = pd.concat([principalDf, dd])

In [None]:
# NOTE(review): disabled plot of cumulative explained variance from a PCA fit.
# It refers to `x`, which is not defined in the visible notebook (the feature
# matrix is named `X`).
# pca = PCA().fit(x)
# plt.plot(np.cumsum(pca.explained_variance_ratio_))
# plt.xlabel('number of components')
# plt.ylabel('cumulative explained variance')

In [None]:
As you can see, feature importance makes it possible to single out the most important features. I started out using PCA but couldn't find a way to rank the data on a per-feature basis. Feature importance showed that payment-related variables, as well as outstanding principal, were at the top of the importance list.