## Summary

- Classifier models were trained ...

## Data Prep

In [1]:
import pandas as pd
from os import listdir, getcwd
from os.path import isfile, join
pd.set_option('display.max_columns', 500)

In [2]:
# get filenames
data_filenames = [ file for file in listdir('data') if isfile(join(getcwd(), 'data', file))]

In [3]:
def load_data():
    """
    Load all files, return single dataframe
    """
    data_array = [] # placeholder for dataframes
    for filename in data_filenames:
        full_path = join(getcwd(), 'data', filename)
        print("Loading: " + filename)
        data_array.append(pd.read_csv(full_path))
    
    return(pd.concat(data_array, ignore_index = True))

In [4]:
raw = load_data()
# create boolean variable for charged off loan status
raw['charged_off'] = raw['loan_status'] == 'Charged Off' 

Loading: 2016Q1.csv.gz
Loading: 2016Q2.csv.gz
Loading: 2017Q1.csv.gz
Loading: 2017Q4.csv.gz
Loading: 2017Q2.csv.gz
Loading: 2017Q3.csv.gz
Loading: 2016Q3.csv.gz
Loading: 2016Q4.csv.gz


In [5]:
raw.shape

(438991, 56)

In [6]:
raw.sample(3)

Unnamed: 0,id,funded_amnt,term,int_rate,installment,emp_title,emp_length,home_ownership,annual_inc,loan_status,purpose,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_high,inq_last_6mths,mths_since_last_delinq,open_acc,revol_bal,revol_util,total_acc,application_type,annual_inc_joint,dti_joint,open_act_il,open_il_24m,il_util,open_rv_24m,all_util,total_rev_hi_lim,inq_last_12m,acc_open_past_24mths,bc_open_to_buy,bc_util,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,total_bal_ex_mort,total_bc_limit,revol_bal_joint,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,charged_off
110696,78199485,7000,36 months,9.75%,225.05,Sales Manager,6 years,MORTGAGE,109000.0,Current,other,358xx,AL,19.88,0,Dec-01,689,0.0,,11,10060,99.60%,23,Individual,,,4.0,4.0,69.0,0.0,73.0,10100,0.0,4,0.0,100.4,1,45.0,,,0,11,6,11,0,1,100.0,100.0,0,59679,9000,,,,,False
428922,90815825,12000,60 months,14.49%,282.28,Senior Analyst,4 years,MORTGAGE,79000.0,Fully Paid,home_improvement,480xx,MI,15.56,0,Sep-01,709,0.0,,6,14403,74.20%,14,Individual,,,2.0,1.0,53.0,1.0,67.0,19400,2.0,4,4997.0,74.2,3,14.0,,,0,6,2,6,0,0,100.0,33.3,1,61177,19400,,,,,False
200426,122982893,6000,36 months,9.44%,192.03,Radiologic Texhnologist,10+ years,OWN,54000.0,Fully Paid,debt_consolidation,317xx,GA,19.66,0,Feb-97,699,0.0,,5,14613,59.20%,14,Individual,,,1.0,1.0,52.0,2.0,56.0,24700,0.0,3,9113.0,61.5,0,3.0,,,0,6,3,5,0,1,100.0,33.3,1,22931,23700,,,,,False


In [7]:
raw.isna().sum()

id                                     0
funded_amnt                            0
term                                   0
int_rate                               0
installment                            0
emp_title                          30367
emp_length                         30013
home_ownership                         0
annual_inc                             0
loan_status                            0
purpose                                0
zip_code                               0
addr_state                             0
dti                                  285
delinq_2yrs                            0
earliest_cr_line                       0
fico_range_high                        0
inq_last_6mths                         0
mths_since_last_delinq            213676
open_acc                               0
revol_bal                              0
revol_util                           359
total_acc                              0
application_type                       0
annual_inc_joint

In [8]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438991 entries, 0 to 438990
Data columns (total 56 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              438991 non-null  int64  
 1   funded_amnt                     438991 non-null  int64  
 2   term                            438991 non-null  object 
 3   int_rate                        438991 non-null  object 
 4   installment                     438991 non-null  float64
 5   emp_title                       408624 non-null  object 
 6   emp_length                      408978 non-null  object 
 7   home_ownership                  438991 non-null  object 
 8   annual_inc                      438991 non-null  float64
 9   loan_status                     438991 non-null  object 
 10  purpose                         438991 non-null  object 
 11  zip_code                        438991 non-null  object 
 12  addr_state      

In [9]:
raw['id'].drop_duplicates().shape

(438991,)

In [10]:
raw.shape

(438991, 56)

In [11]:
raw['loan_status'].value_counts(dropna = False)

Current               251552
Fully Paid            132055
Charged Off            40902
Late (31-120 days)      8209
In Grace Period         4016
Late (16-30 days)       2257
Name: loan_status, dtype: int64

In [12]:
raw['purpose'].value_counts()[:15]

debt_consolidation    246935
credit_card            91374
home_improvement       32995
other                  30875
major_purchase         10685
medical                 6248
car                     5062
small_business          4930
vacation                3585
moving                  3480
house                   2525
renewable_energy         295
educational                1
wedding                    1
Name: purpose, dtype: int64

## Build Model

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import numpy as np
from yellowbrick.regressor import residuals_plot, prediction_error
from yellowbrick.classifier import DiscriminationThreshold, ClassificationReport
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn import set_config
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE 
from sklearn.metrics import roc_auc_score
from time import time

#set_config(display="diagram")


def class_report(model, test = True):
    """
    Use yellowbrick to get a classifiation report on
    a model for the test datset
    """
    visualizer = ClassificationReport(model)
    if test:
        visualizer.fit(X_test, y_test)
        visualizer.score(X_test, y_test)
    else:
        visualizer.fit(X_train, y_train)
        visualizer.score(X_train, y_train)
    visualizer.show()

def print_time_elapsed(t0):
    """
    Print time elapsed in seconds since time t0
    """
    print("%.1f seconds elapsed" % (time() - t0))


Bad key "text.kerning_factor" on line 4 in
/home/cambonator/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


In [14]:
# predictors from the original dataset to include in the models
selected_X_columns = [
    'annual_inc', 'purpose', 'home_ownership', 'dti', 'delinq_2yrs']

X_orig = raw[selected_X_columns]

In [15]:
# transform categorical and numerical columns
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(X_orig)
categorical_columns = categorical_columns_selector(X_orig)

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
#numerical_preprocessor = StandardScaler()

numeric_transformer = Pipeline([("imputer", 
    SimpleImputer(strategy="median")), ("scaler", StandardScaler())])

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard-scaler', numeric_transformer, numerical_columns)])

# create model pipelines
lr_model = LogisticRegression(solver='liblinear')

rf_model = RandomForestClassifier(n_jobs = -1, n_estimators=25)

In [16]:
# perform preprocessing on predictors
X = preprocessor.fit_transform(
    X_orig
    )

#y = np.log10(study_df[['total_gmv']].values.ravel())
y = raw[['charged_off']].values.ravel()
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

In [17]:
# Apply SMOTE to get balanced classes
t0 = time()
sm = SMOTE(random_state=42, n_jobs = -1)
X_res, y_res = sm.fit_resample(X, y)
print_time_elapsed(t0)

AttributeError: 'SMOTE' object has no attribute '_validate_data'

In [None]:
np.mean(y_res) # check class imbalance

In [None]:
# Test/Train split
X_train, X_test, y_train, y_test = train_test_split(
    X_res, # predictors
    y_res, # outcome variable
    test_size = 0.50, 
    random_state = 42
)

## Train Models

In [None]:
t0 = time()
# train models
lr_model.fit(X_train, y_train)
print_time_elapsed(t0)

In [None]:
t0 = time()
rf_model.fit(X_train, y_train)
print_time_elapsed(t0)

## Evaluate Models

In [None]:
def print_roc(model, test = True):
    """
    Print ROC metric for an input the model
    """
    if test:
        roc = roc_auc_score(y_test, model.predict(X_test))
    else:
        roc = roc_auc_score(y_train, model.predict(X_train))
    print(roc)

In [None]:
print_roc(rf_model)

In [None]:
print_roc(rf_model, test = False)

In [None]:
class_report(lr_model)

In [None]:
# Look at training set performance
class_report(lr_model, test = False)

In [None]:
class_report(rf_model)

In [None]:
# Look at training set performance
class_report(rf_model, test = False)

In [None]:
rf_model.feature_importances_

In [None]:
rf_model.get_features()

## Lift Curve

In [None]:
from scikitplot.metrics import plot_lift_curve

In [None]:
y_probas = lr_model.predict_proba(X_test)
plot_lift_curve(y_test, y_probas)
plt.show()