In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from datetime import datetime
%matplotlib inline
pd.set_option('display.max_rows', 200)

In [None]:
# Load the raw loan export from the local data directory
all_loans = pd.read_csv('./data/loans-kaggle.csv', low_memory=False)

In [None]:
# (rows, columns) of the raw data
all_loans.shape

In [None]:
# Work on a copy so all_loans keeps the untouched raw data
loans = all_loans.copy()

In [None]:
# Column labels of the working copy
loans.columns

### Lots of null values. Let's clean up the data

In [None]:
# Missing-value count per column, largest first
null_value_count = loans.isna().sum().sort_values(ascending=False)

In [None]:
# Only the columns that actually contain missing values
null_value_count.loc[null_value_count.ne(0)]

In [None]:
# How many loans there are per application type
loans['application_type'].value_counts()

In [None]:
# Issue months of the JOINT applications
loans.loc[loans['application_type'] == 'JOINT', 'issue_d'].value_counts()

In [None]:
# JOINT applications where all three joint-specific fields are missing
loans.loc[
    loans['application_type'].eq('JOINT')
    & loans['dti_joint'].isna()
    & loans['annual_inc_joint'].isna()
    & loans['verification_status_joint'].isna()
].shape

In [None]:
# Plan: for individual applicants, set dti_joint to 0, annual_inc_joint to 0, and
# verification_status_joint to Not Verified.
# First look at the JOINT rows: joint fields next to the applicant's own fields.
loans.loc[
    loans['application_type'] == 'JOINT',
    ['dti_joint', 'annual_inc_joint', 'verification_status_joint',
     'dti', 'annual_inc', 'verification_status'],
]

In [None]:
# Rows without joint data get 0 for the numeric joint fields and the string
# 'Not Verified' for the joint verification status. Filling that text column with
# the float 0.0 would leave it holding a mix of strings and numbers.
loans['dti_joint'] = loans['dti_joint'].fillna(0.0)
loans['annual_inc_joint'] = loans['annual_inc_joint'].fillna(0.0)
loans['verification_status_joint'] = loans['verification_status_joint'].fillna('Not Verified')

In [None]:
# Preview the first rows of the working copy
loans.head()

### These fields were added in December 2015 and only exist for new loans

In [None]:
# Columns introduced in December 2015; each label is listed exactly once.
dec_2015_columns = ['il_util', 'mths_since_rcnt_il', 'open_acc_6m', 'inq_last_12m', 'open_il_6m', 'open_il_12m',
                   'open_il_24m', 'total_bal_il', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
                   'inq_fi', 'total_cu_tl', 'all_util']
loans = loans.drop(columns=dec_2015_columns)

In [None]:
# Most loans carry no description text, so the column is removed
loans = loans.drop(columns=['desc'])

In [None]:
# Public records are derogatory records such as bankruptcy, civil judgment, and tax liens.
# Count loans that report public records but have no months-since-last-record value.
loans.loc[loans['mths_since_last_record'].isna() & loans['pub_rec'].gt(0)].shape

In [None]:
# Since the other three columns for public records report 0, the months since last record should be 0
# The filled column is assigned back to the frame: fillna(inplace=True) on an
# attribute-accessed column is a chained in-place update, which pandas warns about
# and which does not modify the frame when copy-on-write is active.
# NOTE(review): 0 can also be read as "a record this month" - confirm 0 is the intended fill value.
loans['mths_since_last_record'] = loans['mths_since_last_record'].fillna(0)

In [None]:
# mths_since_last_major_derog: months since the most recent 90-day or worse rating
# delinq_2yrs: number of 30+ days past-due delinquencies in the borrower's credit file over the past 2 years
# acc_now_delinq: number of accounts on which the borrower is now delinquent
# Show acc_now_delinq for currently delinquent borrowers that have no major-derogatory value.
loans.loc[
    loans['mths_since_last_major_derog'].isna() & loans['acc_now_delinq'].gt(0),
    'acc_now_delinq',
]

In [None]:
# Frequency of each observed months-since-last-major-derogatory value
loans['mths_since_last_major_derog'].value_counts()

In [None]:
# Loans with no major-derogatory value and no public records
loans.loc[loans['mths_since_last_major_derog'].isna() & loans['pub_rec'].eq(0)].shape

In [None]:
# Missing mths_since_last_major_derog values will be imputed with the mean of the
# loan's sub-grade; as an example, the floored mean for sub-grade D4:
np.floor(loans.loc[loans['sub_grade'] == 'D4', 'mths_since_last_major_derog'].mean())

In [None]:
def print_row(row):
    """Return the ``sub_grade`` entry of ``row``; despite the name, nothing is printed."""
    grade = row['sub_grade']
    return grade

# Sub-grade of every loan whose mths_since_last_major_derog is missing
# (selected directly instead of looping over the rows with apply).
loans.loc[loans['mths_since_last_major_derog'].isnull(), 'sub_grade']


In [None]:
# Number of loans with a missing mths_since_last_major_derog
loans['mths_since_last_major_derog'].isna().sum()

In [None]:
def impute_mths_since_last_major_derog(loans):
    """Fill missing ``mths_since_last_major_derog`` with the mean of the row's sub-grade.

    The frame is modified in place and also returned, so the returned object is the
    very DataFrame that was passed in.

    A ``mths_since_last_major_derog_imputed`` column is added: 1 for rows whose value
    was missing before imputation, 0 otherwise. The flag is taken *before* filling;
    taking it afterwards would mark only the rows that are still null.

    Rows whose sub-grade has no observed value at all (or whose sub-grade is itself
    missing) have no mean to fill with and therefore stay NaN.

    Parameters
    ----------
    loans : pandas.DataFrame
        Must contain the columns ``sub_grade`` and ``mths_since_last_major_derog``.

    Returns
    -------
    pandas.DataFrame
        The same ``loans`` object, updated.
    """
    column = 'mths_since_last_major_derog'

    # Remember which rows need imputation before any value is overwritten.
    was_missing = loans[column].isnull()
    loans[column + '_imputed'] = was_missing.astype(int)

    # Mean of each row's sub-grade, aligned to the frame's index (NaNs are ignored by mean).
    sub_grade_mean = loans.groupby('sub_grade')[column].transform('mean')
    loans[column] = loans[column].fillna(sub_grade_mean)
    return loans

# The function updates `loans` in place and returns it, so loans2 is the same object as loans
loans2 = impute_mths_since_last_major_derog(loans)

In [None]:
# How many rows carry each value of the imputation flag
loans2['mths_since_last_major_derog_imputed'].value_counts()

In [None]:
# Whatever is still missing gets the sentinel -1
loans2['mths_since_last_major_derog'] = loans2['mths_since_last_major_derog'].fillna(-1)

In [None]:
# Remaining missing values in mths_since_last_major_derog
loans2['mths_since_last_major_derog'].isna().sum()

In [None]:
# Parse the 'Mon-YYYY' issue strings into date objects
loans['issue_date'] = loans['issue_d'].map(
    lambda issued: datetime.strptime(issued, '%b-%Y').date()
)

In [None]:
# Issue year as a string: the part of issue_d after the dash.
# The .str accessor replaces the per-element lambda and passes missing values through.
loans['year'] = loans['issue_d'].str.split('-').str[1]

In [None]:
# Loan status breakdown for loans issued in 2015
loans.loc[loans['year'] == '2015', 'loan_status'].value_counts()

In [None]:
# Missing values per column among loans whose status is 'Issued'
loans.loc[loans['loan_status'] == 'Issued'].isna().sum()

In [None]:
# Loan status breakdown across all loans
loans['loan_status'].value_counts()

In [None]:
# Issue months of the loans that are currently active
loans.loc[loans['loan_status'] == 'Current', 'issue_d'].value_counts()

In [None]:
# Number of loans per policy code
loans['policy_code'].value_counts()

In [None]:
# Share of loans per application type (relative frequencies rather than counts)
loans['application_type'].value_counts(normalize=True)