In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from datetime import datetime
%matplotlib inline
pd.set_option('display.max_rows', 200)

In [2]:
# Load the raw loan data. low_memory=False turns off pandas' chunked parsing,
# so each column's dtype is inferred from the whole file.
all_loans = pd.read_csv('./data/loans-kaggle.csv', low_memory=False)

In [3]:
# (rows, columns) of the raw frame.
all_loans.shape

(887379, 74)

In [4]:
# Clean a copy so the raw frame `all_loans` stays untouched.
loans = all_loans.copy()

In [5]:
# List the available columns.
loans.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'sub_grade',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'verification_status', u'issue_d', u'loan_status', u'pymnt_plan',
       u'url', u'desc', u'purpose', u'title', u'zip_code', u'addr_state',
       u'dti', u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths',
       u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt',
       u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int',
       u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee',
       u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d',
       u'last_credit_pull_d', u'collections_12_mths_ex_med',
       u'mths_since_last_major_derog', u'policy_code', u'application_type',
       u'annu

### Lots of null values. Let's clean up the data

In [6]:
# Missing values per column, largest first; only columns with at least one
# missing value are displayed.
missing_per_column = loans.isnull().sum()
null_value_count = missing_per_column.sort_values(ascending=False)
null_value_count[null_value_count.ne(0)]

dti_joint                      886870
verification_status_joint      886868
annual_inc_joint               886868
il_util                        868762
mths_since_rcnt_il             866569
all_util                       866007
max_bal_bc                     866007
open_rv_24m                    866007
open_rv_12m                    866007
total_cu_tl                    866007
total_bal_il                   866007
open_il_24m                    866007
open_il_12m                    866007
open_il_6m                     866007
open_acc_6m                    866007
inq_fi                         866007
inq_last_12m                   866007
desc                           761350
mths_since_last_record         750326
mths_since_last_major_derog    665676
mths_since_last_delinq         454312
next_pymnt_d                   252971
tot_cur_bal                     70276
total_rev_hi_lim                70276
tot_coll_amt                    70276
emp_title                       51457
last_pymnt_d

Joint applications were introduced late in 2015. For the vast majority of loans, which are individual applications, we set dti_joint to 0, annual_inc_joint to 0, and verification_status_joint to 'Not Applicable'.

In [7]:
# Issue months of the joint applications.
loans.loc[loans['application_type'] == 'JOINT', 'issue_d'].value_counts()

Dec-2015    250
Nov-2015    187
Oct-2015     74
Name: issue_d, dtype: int64

In [8]:
# Sanity check: count joint applications that are missing all three joint fields.
joint_without_joint_fields = (
    (loans['application_type'] == 'JOINT')
    & loans['dti_joint'].isnull()
    & loans['annual_inc_joint'].isnull()
    & loans['verification_status_joint'].isnull()
)
loans[joint_without_joint_fields].shape

(0, 74)

In [9]:
# Fill the joint-application fields that are missing.
# The result is assigned back instead of calling fillna(inplace=True) on a
# selected column: under pandas Copy-on-Write the in-place call acts on a
# temporary object and leaves `loans` unchanged.
loans = loans.fillna(value={
    'dti_joint': 0.0,
    'annual_inc_joint': 0.0,
    'verification_status_joint': 'Not Applicable',
})

These fields were added in December 2015 and only exist for new loans. We can just drop them.

In [10]:
# Columns to drop because they are missing for almost every loan.
# 'open_rv_12m' was listed twice in the original; each column now appears once.
dec_2015_columns = ['il_util', 'mths_since_rcnt_il', 'open_acc_6m', 'inq_last_12m', 'open_il_6m', 'open_il_12m',
                   'open_il_24m', 'total_bal_il', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
                   'inq_fi', 'total_cu_tl', 'all_util']
loans = loans.drop(dec_2015_columns, axis=1)

The description is missing for most loans, and since we are not doing any keyword or sentiment analysis, we can drop it.

In [11]:
# Remove the free-text description column.
loans = loans.drop('desc', axis=1)

Public records are derogatory records such as bankruptcy, civil judgment, and tax liens. mths_since_last_record is missing when a loan has no negative public records (the check below confirms that no loan with pub_rec > 0 is missing it). Let's set the missing values to a very high number, i.e., 1,000,000.

In [12]:
# Sanity check: count loans that have public records but no months-since-last-record value.
record_gap_missing = loans['mths_since_last_record'].isnull()
has_public_record = loans['pub_rec'].gt(0)
loans[record_gap_missing & has_public_record].shape

(0, 59)

In [13]:
# Use a very large sentinel for missing values.
# Assign the result back: fillna(inplace=True) on a selected column does not
# update `loans` under pandas Copy-on-Write.
loans['mths_since_last_record'] = loans['mths_since_last_record'].fillna(1000000)

Derogatories are public records or loans which have been delinquent for more than 90 days. mths_since_last_major_derog is the number of months since the last derogatory record. Since it's missing for most loans, we set it to the max of the column.

In [14]:
# Fill missing values with the largest observed value in the column.
# Assign the result back: fillna(inplace=True) on a selected column does not
# update `loans` under pandas Copy-on-Write.
loans['mths_since_last_major_derog'] = loans['mths_since_last_major_derog'].fillna(
    loans['mths_since_last_major_derog'].max()
)

mths_since_last_delinq is the number of months since last delinquency. If they currently have a delinquent account or one in the past two years, we set it to 0. Otherwise set it to max of column.

In [15]:
# Largest observed value in the column; used as the fill value for rows that
# show no current or recent delinquency.
max_months = loans.mths_since_last_delinq.max()


def impute_mths_since_last_delinq(row):
    """Return the fill value for one row whose ``mths_since_last_delinq`` is missing.

    Returns 0 when the row has ``acc_now_delinq > 0`` or ``delinq_2yrs > 0``,
    otherwise the notebook-level ``max_months``. The vectorized assignment below
    does not call this helper; it is kept so code that calls it by name still works.
    """
    if row.acc_now_delinq > 0 or row.delinq_2yrs > 0:
        return 0
    return max_months


# Vectorized equivalent of applying the helper to every missing row: two mask
# assignments instead of a Python call per row. NaN > 0 is False, so rows with
# missing delinquency counts get max_months, as they did with the helper.
# Empty masks are fine, so the cell is safe to re-run.
missing_delinq = loans['mths_since_last_delinq'].isnull()
recently_delinquent = (loans['acc_now_delinq'] > 0) | (loans['delinq_2yrs'] > 0)
loans.loc[missing_delinq & recently_delinquent, 'mths_since_last_delinq'] = 0
loans.loc[missing_delinq & ~recently_delinquent, 'mths_since_last_delinq'] = max_months

For the following, we will impute using the median
tot_cur_bal => Total current balance of all accounts
total_rev_hi_lim => Total revolving credit limit
tot_coll_amt => Total collection amounts ever owed

In [27]:
# Median-impute the three balance/limit columns.
# Assign the result back: fillna(inplace=True) on a selected column does not
# update `loans` under pandas Copy-on-Write.
for median_filled_column in ['tot_cur_bal', 'total_rev_hi_lim', 'tot_coll_amt']:
    loans[median_filled_column] = loans[median_filled_column].fillna(
        loans[median_filled_column].median()
    )

Since there are so many different employment titles, we cannot include it as a categorical feature. So we drop it.

In [29]:
# Number of distinct non-missing employment titles.
loans['emp_title'].nunique()

299272

In [31]:
# Remove the high-cardinality employment title column.
loans = loans.drop('emp_title', axis=1)

Drop the date columns last_pymnt_d, earliest_cr_line, and next_pymnt_d. (issue_d stays because a later cell still reads it.)

In [36]:
# Date columns to remove in the next cell.
date_columns = ['last_pymnt_d', 'earliest_cr_line', 'next_pymnt_d']

In [37]:
# Remove the columns listed in `date_columns`.
loans = loans.drop(date_columns, axis=1)

These 29 loans were issued in the summer of 2007, when Lending Club's screening process was still being implemented. We can simply remove these loans.

In [40]:
# Count the loans for which all six credit-history fields are missing at once.
credit_history_fields = ['acc_now_delinq', 'total_acc', 'pub_rec', 'open_acc',
                         'inq_last_6mths', 'delinq_2yrs']
all_history_missing = loans[credit_history_fields].isnull().all(axis=1)
loans[all_history_missing].shape[0]

29

In [41]:
# Issue months of the loans for which all six credit-history fields are missing.
credit_history_fields = ['acc_now_delinq', 'total_acc', 'pub_rec', 'open_acc',
                         'inq_last_6mths', 'delinq_2yrs']
all_history_missing = loans[credit_history_fields].isnull().all(axis=1)
loans.loc[all_history_missing, 'issue_d'].value_counts()

Jun-2007    21
Aug-2007     4
Jul-2007     4
Name: issue_d, dtype: int64

In [42]:
# The six credit-history columns checked in the two cells above.
missing_summer_2007 = ['delinq_2yrs', 'acc_now_delinq', 'inq_last_6mths', 'open_acc', 'pub_rec', 'total_acc']
# dropna's default how='any' removes every row that is missing at least one of these columns.
loans = loans.dropna(subset=missing_summer_2007)

Member ID and URL are unique to each loan, so we can drop those columns

In [48]:
# Remove the per-loan identifier columns member_id and url.
loans = loans.drop(['member_id', 'url'], axis=1)

In [50]:
# Re-check the number of missing values per column after the steps above.
loans.isnull().sum()

id                               0
loan_amnt                        0
funded_amnt                      0
funded_amnt_inv                  0
term                             0
int_rate                         0
installment                      0
grade                            0
sub_grade                        0
emp_length                       0
home_ownership                   0
annual_inc                       0
verification_status              0
issue_d                          0
loan_status                      0
pymnt_plan                       0
purpose                          0
title                          151
zip_code                         0
addr_state                       0
dti                              0
delinq_2yrs                      0
inq_last_6mths                   0
mths_since_last_delinq           0
mths_since_last_record           0
open_acc                         0
pub_rec                          0
revol_bal                        0
revol_util          

In [None]:

# delinq_2yrs    => number of 30+ days past-due incidences of delinquency in the
#                   borrower's credit file for the past 2 years.
# acc_now_delinq => number of accounts on which the borrower is now delinquent.
# Show acc_now_delinq for loans that have a delinquent account but no
# mths_since_last_major_derog value.
derog_missing = loans['mths_since_last_major_derog'].isnull()
currently_delinquent = loans['acc_now_delinq'] > 0
loans.loc[derog_missing & currently_delinquent, 'acc_now_delinq']

In [None]:
# Frequency of each mths_since_last_major_derog value.
loans.mths_since_last_major_derog.value_counts()

In [None]:
# Loans with no public records that are also missing mths_since_last_major_derog.
derog_missing = loans['mths_since_last_major_derog'].isnull()
no_public_record = loans['pub_rec'].eq(0)
loans[derog_missing & no_public_record].shape

In [None]:
# Plan: impute mths_since_last_major_derog with the mean of the observed values
# in the same sub-grade. Example: the floored mean for sub-grade 'D4'.
d4_mean = loans.loc[loans['sub_grade'] == 'D4', 'mths_since_last_major_derog'].mean()
np.floor(d4_mean)

In [None]:
# Number of loans still missing mths_since_last_major_derog.
loans.mths_since_last_major_derog.isnull().sum()

In [None]:
def impute_mths_since_last_major_derog(loans):
    """Fill missing ``mths_since_last_major_derog`` with the mean of its sub-grade.

    The frame is modified in place and also returned.

    Parameters
    ----------
    loans : pandas.DataFrame
        Must contain the columns ``sub_grade`` and ``mths_since_last_major_derog``.

    Returns
    -------
    pandas.DataFrame
        The same object, with missing values replaced by the mean of the observed
        values in the row's sub-grade, plus a new integer column
        ``mths_since_last_major_derog_imputed`` that is 1 where the original
        value was missing and 0 elsewhere. A row stays NaN if its sub-grade has
        no observed value.
    """
    # Record which rows are missing BEFORE filling them. The original computed
    # this mask after the fill, when it was all False, so the flag was never set.
    was_missing = loans['mths_since_last_major_derog'].isnull()

    # Per-row mean of the row's sub-grade (NaNs are skipped by the mean).
    sub_grade_means = loans.groupby('sub_grade')['mths_since_last_major_derog'].transform('mean')

    # np.where is positional, so the fill does not depend on index alignment.
    loans['mths_since_last_major_derog'] = np.where(
        was_missing, sub_grade_means, loans['mths_since_last_major_derog']
    )
    loans['mths_since_last_major_derog_imputed'] = was_missing.astype(int)
    return loans

# NOTE(review): cell In [14] above already fills mths_since_last_major_derog with
# the column max, so on a top-to-bottom run no missing values should reach this
# call - confirm which imputation is intended.
loans = impute_mths_since_last_major_derog(loans)

In [None]:
def impute_mths_since_last_major_derog(df):
    """Return a copy of ``df`` with missing ``mths_since_last_major_derog`` imputed.

    NOTE: this redefines the function of the same name from the previous cell.
    This variant leaves the input frame untouched and works on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sub_grade`` and ``mths_since_last_major_derog``.

    Returns
    -------
    pandas.DataFrame
        A copy in which missing values are replaced by the mean of the observed
        values in the row's sub-grade, plus a new integer column
        ``mths_since_last_major_derog_imputed`` that is 1 where the original
        value was missing and 0 elsewhere. A row stays NaN if its sub-grade has
        no observed value.
    """
    loans = df.copy()

    # Record which rows are missing BEFORE filling them; after the fill the
    # mask would be all False. The original then assigned np.array(), which
    # raises TypeError because np.array requires an argument.
    was_missing = loans['mths_since_last_major_derog'].isnull()

    # Per-row mean of the row's sub-grade (NaNs are skipped by the mean).
    sub_grade_means = loans.groupby('sub_grade')['mths_since_last_major_derog'].transform('mean')

    # np.where is positional, so the fill does not depend on index alignment.
    loans['mths_since_last_major_derog'] = np.where(
        was_missing, sub_grade_means, loans['mths_since_last_major_derog']
    )
    loans['mths_since_last_major_derog_imputed'] = was_missing.astype(int)
    return loans

# Store the imputed copy under a separate name; `loans` itself is not reassigned here.
loans2 = impute_mths_since_last_major_derog(loans)