In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import statsmodels.stats.weightstats as wst
import pandas_ml as pdml

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

%matplotlib inline
# Show every column and every row when a frame is displayed (no truncation),
# and use a 10 x 8 inch default figure size.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
plt.rc('figure', figsize=(10.0, 8.0))

In [2]:
# Read the dataset.

# Converter used by read_csv: turns a percentage string into a float ratio.
def p2f(x):
    """Convert a percentage such as ``'13.5%'`` (or a bare number) to a ratio.

    Parameters
    ----------
    x : str, int, float or None
        The raw value. Strings may carry surrounding whitespace and/or a
        ``'%'`` sign, e.g. ``' 13.5% '``.

    Returns
    -------
    float or None
        ``x / 100`` as a float, or ``None`` when ``x`` is ``None``.

    Raises
    ------
    ValueError
        If ``x`` is a blank or otherwise non-numeric string. This is the
        same behaviour as before and is deliberately left to propagate.
    """
    if x is None:
        return None
    if isinstance(x, str):
        # Trim whitespace *before* removing '%', otherwise a value such as
        # '13.5% ' keeps its percent sign and float() rejects it.
        x = x.strip().strip('%')
    return float(x) / 100

# 2014 extract: the first line and the last two lines of the file are skipped,
# the columns at positions 15, 26, 45, 47 and 48 are parsed as dates, and the
# two percentage columns are converted to ratios by p2f while reading.
loans_2014 = pd.read_csv(
    './LoanStats2014.csv',
    engine = 'python',
    skiprows = 1,
    skipfooter = 2,
    parse_dates = [15, 26, 45, 47, 48],
    infer_datetime_format = True,
    converters = {'int_rate' : p2f, 'revol_util' : p2f},
)

In [3]:
# 2012-2013 extract, read with exactly the same options as the 2014 file:
# skip the first line and the last two lines, parse the columns at positions
# 15, 26, 45, 47 and 48 as dates, and convert the percentage columns with p2f.
loans_1213 = pd.read_csv(
    './LoanStats2012to2013.csv',
    engine = 'python',
    skiprows = 1,
    skipfooter = 2,
    parse_dates = [15, 26, 45, 47, 48],
    infer_datetime_format = True,
    converters = {'int_rate' : p2f, 'revol_util' : p2f},
)

In [64]:
#Merging the datasets
# Stack the two extracts row-wise into one frame.
# ignore_index = True relabels the rows 0..n-1. Without it each input keeps
# its own row labels, so labels repeat across the two files; the notebook
# later copies the index into an 'id' column, which must be unique per loan.
frames = [loans_2014, loans_1213]
loan_data = pd.concat(frames, axis = 0, ignore_index = True)

In [65]:
# Store revol_util as 64-bit floats.
loan_data['revol_util'] = loan_data.revol_util.astype('float64')

In [66]:
# Discard every column that holds no data at all (all values null).
# Per the original note, what remains should be the attributes described in the data dictionary.
loan_data.dropna(axis = 'columns', how = 'all', inplace = True)

# The dataset provides no usable id values, so the row index is reused as the id.
loan_data['id'] = loan_data.index.values


In [67]:
# Parse the loan term (text such as '36 months') into an integer month count.
# The digits are extracted explicitly: str.strip('months') treats its argument
# as a set of characters to trim from both ends, not as the word 'months',
# and only produced a number because the digits are not in that set.
loan_data['term_mnths'] = loan_data['term'].str.extract(r'(\d+)', expand = False).astype(int)

In [68]:
# Number of rows in the merged frame.
loan_data.shape[0]

423810

In [69]:
# Keep only the loans with a 36-month term; all other terms are discarded.
# .copy() makes the filtered result an independent frame, so the in-place
# drops and column assignments in the following cells act on it directly
# instead of on a slice of the unfiltered frame (SettingWithCopyWarning).
loan_data = loan_data[loan_data.term_mnths == 36].copy()
len(loan_data)

306462

In [70]:
# The free-text columns are removed from the dataset.
loan_data.drop(labels = ['emp_title', 'desc', 'title'], axis = 'columns', inplace = True)

In [71]:
# Flag loans on which any late fee was received (total_rec_late_fee above zero),
# used here as the marker for "had at least one late payment".
loan_data['late_fee_rec_indicator'] = loan_data['total_rec_late_fee'].gt(0)


In [12]:
#loans across different loan statuses
#sns.countplot(x = 'loan_status', hue = 'late_fee_rec_indicator', data = loan_data)

In [72]:
# Classify each loan as '0-Good' or '1-Bad' from its loan_status.

def classify_loan(row) :
    """Return the class label for one loan record.

    `row` is any object exposing a `loan_status` attribute. Loans whose
    status is 'Fully Paid' or 'Current' are labelled '0-Good'; every other
    status (including a missing one) is labelled '1-Bad'.
    """
    good_statuses = ('Fully Paid', 'Current')
    return '0-Good' if row.loan_status in good_statuses else '1-Bad'

# Label every loan with classify_loan. The function only reads loan_status,
# so it is applied over a one-column frame: a row-wise apply on the full
# frame builds a wide Series for every record, which is needlessly slow.
# The resulting labels are identical.
loan_data['loan_class'] = loan_data[['loan_status']].apply(classify_loan, axis = 1)

In [73]:
# Number of loans in each loan_class, displayed as a one-column table.
loan_data.groupby('loan_class').size().to_frame()

Unnamed: 0_level_0,0
loan_class,Unnamed: 1_level_1
0-Good,265808
1-Bad,40654


In [74]:
#Transform date fields into appropriate duration units
from datetime import datetime
# Work on a copy so the transformations below leave loan_data untouched.
loans = loan_data.copy()
# Fixed reference date against which all durations are measured.
ref_datetime = datetime.strptime('2017-04-02', '%Y-%m-%d')

# Length of one "average month": 365.2425 days / 12 = 30 days 10:29:06.
# This is the value the previous divisor np.timedelta64(1, 'M') was converted to;
# it is spelled out because 'M' is an ambiguous-length unit that newer pandas
# versions refuse to use in timedelta arithmetic.
avg_month = pd.Timedelta(days = 30, hours = 10, minutes = 29, seconds = 6)

# Earliest credit line: whole months (truncated) between it and the reference date.
loans['mo_sin_earliest_cr_line'] = ((ref_datetime - loans['earliest_cr_line']) / avg_month).astype(int)

In [75]:
#Imputing missing values and converting to duration units

#last_credit_pull_d
#Setting the nulls to May 2017 (one month after the reference date), so that the
#calculated duration field will have the value -1 for them.
loans['last_credit_pull_d'] = loans['last_credit_pull_d'].fillna(pd.to_datetime('2017-05-03'))

# Length of one "average month": 365.2425 days / 12 = 30 days 10:29:06.
# This is the value the previous divisor np.timedelta64(1, 'M') was converted to;
# it is spelled out because 'M' is an ambiguous-length unit that newer pandas
# versions refuse to use in timedelta arithmetic.
avg_month = pd.Timedelta(days = 30, hours = 10, minutes = 29, seconds = 6)

# Whole months (truncated toward zero) between the last credit pull and the reference date.
loans['mo_sin_last_credit_pull'] = ((ref_datetime - loans['last_credit_pull_d']) / avg_month).astype(int)

In [76]:
# The two raw date columns have been replaced by month-count features, so remove them.
loans.drop(labels = ['earliest_cr_line', 'last_credit_pull_d'], axis = 'columns', inplace = True)


In [77]:
# Imputing missing values
#
# For each feature below, a null is treated as "the event never happened /
# the account type does not exist" rather than as unknown data. Those nulls
# are replaced with the sentinel -1 to set them apart from real month counts.
for sentinel_col in (
    'mths_since_last_delinq',          # null: applicant has no prior delinquencies
    'mths_since_last_record',          # null: applicant has no public records
    'mths_since_last_major_derog',     # null: applicant has no previous derogatory records
    'mo_sin_old_il_acct',              # null: applicant has no bank installment accounts
    'mths_since_recent_bc_dlq',        # null: no previous bankcard delinquency
    'mths_since_recent_inq',           # null: no previous inquiries
    'mths_since_recent_revol_delinq',  # null: no previous revolving account delinquencies
) :
    loans[sentinel_col] = loans[sentinel_col].fillna(-1)

# num_tl_120dpd_2m
# null here means the info is missing; it is intentionally NOT imputed
# (the earlier fill-with-0 step is disabled).


In [78]:
# Discard every loan record that has a NaN in any of the critical features
# listed below (total credit limit, accounts past due etc).
loans.dropna(
    subset = [
        'tot_hi_cred_lim',
        'tot_cur_bal',
        'num_tl_120dpd_2m',
        'revol_util',
        'bc_open_to_buy',
        'bc_util',
        'mths_since_recent_bc',
        'pct_tl_nvr_dlq',
    ],
    axis = 'index',
    how = 'any',
    inplace = True,
)


In [79]:
# Number of loan records left after dropping the incomplete ones.
loans.shape[0]

274171

In [80]:
# addr_state is removed from the dataset.
loans.drop(labels = ['addr_state'], axis = 'columns', inplace = True)

In [81]:
# One-hot encode the categorical features with pandas get_dummies; each listed
# column is replaced by one indicator column per category value.
categorical_features = [
    'grade',
    'sub_grade',
    'home_ownership',
    'emp_length',
    'verification_status',
    'pymnt_plan',
    'purpose',
    'initial_list_status',
]
loans_with_dummies = pd.get_dummies(loans, columns = categorical_features)

In [82]:
# Build the model inputs (X) and the encoded label (Y) for the learning and test sets.

from sklearn.model_selection import train_test_split

# X: everything except the derived columns, date columns, id and constants listed here.
# The '<' in the 'emp_length_< 1 year' dummy column name is spelled out as 'lt'.
X = (
    loans_with_dummies
    .drop(
        [
            'term', 'issue_d', 'zip_code', 'loan_status', 'application_type', 'id',
            'term_mnths', 'late_fee_rec_indicator', 'loan_class',
            'next_pymnt_d', 'last_pymnt_d',
            'collection_recovery_fee', 'last_pymnt_amnt',
            'out_prncp', 'out_prncp_inv', 'recoveries',
            'total_pymnt', 'total_pymnt_inv', 'total_rec_int',
            'total_rec_late_fee', 'total_rec_prncp',
        ],
        axis = 'columns',
    )
    .rename(columns = {'emp_length_< 1 year': 'emp_length_lt 1 year'})
)

# Y: loan_class converted to a categorical and reduced to its integer category codes
# (the string labels themselves are not used as the target).
loans_with_dummies['loan_class'] = loans_with_dummies['loan_class'].astype('category')
Y = loans_with_dummies['loan_class'].cat.codes


In [83]:
# Create the ModelFrame object: bundle the feature matrix X and the label codes Y
# into a single pandas_ml ModelFrame, then write the combined frame to
# 'prepared_loans.csv' without the row index.
# NOTE(review): presumably the target is stored as a '.target' column alongside the
# features — confirm against the pandas_ml documentation.
loans_mf = pdml.ModelFrame(X, target = Y)
loans_mf.to_csv('prepared_loans.csv', index = False)

In [91]:
# Split into training and test data sets, holding out 30% of the rows for testing.
# random_state = 5 is passed so the split can be repeated
# (assumes pandas_ml forwards it to scikit-learn's train_test_split — TODO confirm).
train_loans, test_loans = loans_mf.model_selection.train_test_split(test_size = 0.30, random_state = 5)

In [119]:
#Oversampling the training dataset
# Only the training split is oversampled; the test split is left as it is.
# random_state pins the random resampling so the oversampled set (and the CSV
# written from it) is the same on every run; without it each run drew a
# different sample. The seed matches the one used for the train/test split.
sampler = train_loans.imbalance.over_sampling.RandomOverSampler(random_state = 5)
sampled_lr_loans = train_loans.fit_sample(sampler)

In [120]:
# Write the oversampled training set to 'prepared_sampled_loans.csv'.
# NOTE(review): unlike the 'prepared_loans.csv' export, index = False is not passed
# here, so the row index is presumably written as an extra leading column —
# confirm that downstream readers expect it.
sampled_lr_loans.to_csv('prepared_sampled_loans.csv')

1    164070
0    164070
Name: .target, dtype: int64