In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

### Read in loan data from 2007 to 2015. The data has been split into 21 different files so that they can be pushed to GitHub

In [2]:
def read_loan_data(num_files=21, data_dir='./data'):
    """Read the split LendingClub CSV files and concatenate them into one dataframe.

    Parameters
    ----------
    num_files : int, optional
        Number of consecutively numbered files to read, starting at 0
        (``LoanStats_2007_to_2015_0.csv`` ... ``_<num_files - 1>.csv``).
        Defaults to 21, the number of pieces the data was split into.
    data_dir : str, optional
        Directory containing the files. Defaults to ``'./data'``.

    Returns
    -------
    pandas.DataFrame
        All rows of every file, in file order. The index is NOT reset, so
        each file's own row labels are kept and may repeat across files.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``num_files`` is less than 1 (nothing to join).
    """
    filenames = ['{}/LoanStats_2007_to_2015_{}.csv'.format(data_dir, i)
                 for i in range(num_files)]
    # low_memory=False makes pandas read each file in one pass instead of in chunks
    dataframes = [pd.read_csv(filename, low_memory=False) for filename in filenames]
    return pd.concat(dataframes)

In [3]:
loans = read_loan_data()

In [4]:
loans.shape

(887449, 143)

In [5]:
loans.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount
0,1077501,,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,,,,,,
1,1077430,,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,,,,,,
2,1077175,,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,,,,,,
3,1076863,,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,,,,,,
4,1075358,,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,,,,,,


## Some exploratory data analysis

### What kind of interest rate are borrowers paying?

In [None]:
# Drop the 9 loans that have no interest rate recorded; these were never funded
loans = loans.dropna(subset=['int_rate'])

In [None]:
# Strip the trailing % and convert to float.
# Vectorised, and safe to re-run: astype(str) lets already-converted floats pass
# through, whereas calling .rstrip on a float would raise. `assign` builds a new
# frame rather than writing into the filtered `loans` from the previous cell.
loans = loans.assign(
    int_rate=loans.int_rate.astype(str).str.rstrip('%').astype(float)
)

In [None]:
loans.int_rate.describe()

In [None]:
sns.distplot(loans.int_rate, axlabel='Interest Rate');

### How long are the loan terms?

In [None]:
loans.term.value_counts()

In [None]:
# Fraction of loans with a 36-month term, derived from the data instead of a
# hard-coded count so it stays correct when rows are filtered upstream.
# strip() guards against whitespace padding around the term label.
three_year_loans = loans.term.str.strip().eq('36 months').mean()
print(three_year_loans)

In [None]:
loans.term.value_counts().plot(kind='pie', fontsize=16);

### How much are people borrowing?

In [None]:
loans.loan_amnt.describe()

In [None]:
# `x=` does not label the axis on a Series plot; set the label on the returned Axes.
loans.loan_amnt.plot(kind='hist', bins=20).set_xlabel('Loan Amount');

It seems that people tend to request round amounts, in multiples of 5k such as 10k and 15k, when applying.

### What are people taking these loans out for?

In [None]:
purposes = loans.purpose.value_counts()
purposes

In [None]:
purposes.plot.barh(figsize=(15, 5));

In [None]:
# Merge every loan title into one comma-separated string and build its word cloud
titles = loans.title.str.cat(sep=',')
wordcloud = WordCloud(
    stopwords=STOPWORDS,
    background_color='white',
    max_font_size=40,
    relative_scaling=0.5
).generate(titles)

# Show the cloud as an image with the axes hidden
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

### Are the borrowers renters or homeowners?

In [None]:
loans.home_ownership.value_counts()

In [None]:
# Fold the 'ANY' and 'NONE' categories into 'OTHER'
loans.loc[loans.home_ownership.isin(['ANY', 'NONE']), 'home_ownership'] = 'OTHER'

In [None]:
loans.home_ownership.value_counts().plot(kind='pie', fontsize=16);

### LendingClub rates their loans from A to G, with subgrades 1 to 5

In [None]:
loans.grade.value_counts().sort_index(ascending=False).plot(kind='barh', fontsize=16);

In [None]:
loans.sub_grade.value_counts().sort_index().plot(kind='bar', fontsize=16, figsize=(12, 4));

### Where do these borrowers live?

In [None]:
loans.addr_state.value_counts() #  51 entries: the states, with DC counted as its own

In [None]:
loans.zip_code.value_counts()

In [None]:
import vincent
vincent.core.initialize_notebook()

In [None]:
# Disabled cell: the map-building code below is wrapped in a string literal, so
# none of it runs; the cell only evaluates to that string.
# NOTE(review): each 'url' entry is given the parsed JSON object from json.load
# rather than a path/URL string -- confirm what vincent.Map expects before re-enabling.
"""
import json
with open('maps/us_counties.topo.json') as counties_file:    
    county_topo = json.load(counties_file)
with open('maps/us_states.topo.json') as states_file:
    state_topo = json.load(states_file)
geo_data = [{'name': 'counties',
             'url': county_topo,
             'feature': 'us_counties.geo'},
            {'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'}
             ]

vis = vincent.Map(geo_data=geo_data, scale=200)
"""

### What are we trying to predict?

In [6]:
loans.loan_status.value_counts()

Fully Paid                                             453555
Current                                                295717
Charged Off                                            116280
Late (31-120 days)                                      10079
In Grace Period                                          6513
Late (16-30 days)                                        2536
Does not meet the credit policy. Status:Fully Paid       1988
Does not meet the credit policy. Status:Charged Off       761
Default                                                    11
Name: loan_status, dtype: int64

### Lots of null values, drop columns where 80% or more of the values are null

In [None]:
# Drop columns where 80% or more of the values are null.
# dropna(thresh=...) counts the NON-null values a column must have, so passing
# 80% of the rows there would instead discard every column that is more than
# 20% null. Compare the per-column null counts to the threshold directly.
num_rows = loans.shape[0]
threshold = int(num_rows * 0.8)  # null count at which a column is considered too sparse
null_counts = loans.isnull().sum()
loans = loans.loc[:, null_counts < threshold]
loans.shape

In [None]:
loans.loan_status.value_counts()

In [None]:
null_value_count = loans.isnull().sum().sort_values(ascending=False)

In [None]:
# List the columns whose null count exceeds 80% of the rows.
# The cutoff is taken from the current frame rather than a hard-coded row count,
# which goes stale as soon as any rows are filtered out.
cutoff = loans.shape[0] * 0.8
mostly_null = null_value_count[null_value_count > cutoff]
columns_to_remove = mostly_null.index.tolist()
for column, null_values in mostly_null.items():
    # Single formatted string so the output is the same under Python 2 and 3
    print('{} {}'.format(column, null_values))

In [None]:
loans.desc.head()

In [None]:
loans.url.head()

In [None]:
loans.title.head(100)

In [None]:
# Keep only the columns with fewer than 80% nulls.
# dropna(thresh=cutoff) would require `cutoff` NON-null values per column, i.e.
# it would drop columns that are more than 20% null, so filter on the null
# counts explicitly instead.
cutoff = loans.shape[0] * 0.8
df = loans.loc[:, loans.isnull().sum() < cutoff]
df.shape

In [None]:
df.columns

In [None]:
loans.application_type.value_counts()

In [None]:
# Collapse the FICO range into one score: the midpoint of its low and high bounds
loans['fico'] = (loans.fico_range_low + loans.fico_range_high) / 2

In [None]:
loans.loc[:, ['fico_range_low', 'fico_range_high', 'fico']].head(20)

In [None]:
# Remove the raw FICO range columns now that the single `fico` midpoint exists.
# NOTE(review): drop raises if any listed column (e.g. 'fico_range') is absent -- confirm all three exist.
loans = loans.drop(['fico_range', 'fico_range_low', 'fico_range_high'], axis=1)

In [None]:
loans.columns

In [None]:
loans.loan_status.value_counts()

In [None]:
# Exclude loans flagged as not meeting the credit policy, whatever their outcome
loans = loans[~loans.loan_status.isin([
    'Does not meet the credit policy. Status:Charged Off',
    'Does not meet the credit policy. Status:Fully Paid',
])]

In [None]:
loans.loan_status.value_counts()

"In Grace Period" means the payment is past due by no more than 15 days; the next status is "Late (16-30 days)".

In [None]:
# In grace period means 
#loans['defaulted'] = 

### For the purposes of this analysis we'll only look at loans that have had at least two years