# Modeling Personal Loan Delinquency with LendingClub Data

## Imports and Settings

In [1]:
import numpy as np
import pandas as pd

from utils.utils import (
    load_dataframe, 
    preprocess, 
    split_data,
    reset_axes)

from utils.models import build_mle_matrix, build_markov_chain_no_priors
from utils.inference import compute_mle, infer_matrix_no_priors

Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
# Widen pandas' display limits so the tables in this notebook are shown in full.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
# None means "do not truncate cell contents". The former sentinel -1 was
# deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

## Data Loading and Preprocessing

In [3]:
# Load the raw loan-status history through the project helper; the log
# printed by this cell shows it being served from an hdf5 cache.
df = load_dataframe()

Loading raw data from hdf5 cache...
Fetching raw data took 2.60 seconds
Retrieved 40,268,594 rows, 4 columns


Our variable of interest is called `loan_status` which has eight possible states. These are the Loan Status Descriptions from the LendingClub [website](https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-):

- **Current**: Loan is up to date on all outstanding payments. 

- **Fully Paid**: Loan has been fully repaid, either at the expiration of the 3- or 5-year term or as a result of a prepayment.
 
- **Late (16-30)**: Loan has not been current for 16 to 30 days. Learn more about the tools LendingClub has to deal with delinquent borrowers.
 
- **Late (31-120)**: Loan has not been current for 31 to 120 days. Learn more about the tools LendingClub has to deal with delinquent borrowers.
 
- **Charged Off**: Loan for which there is no longer a reasonable expectation of further payments. Upon Charge Off, the remaining principal balance of the Note is deducted from the account balance. Charge Off typically occurs when a loan is 120 days or more past due and there is no reasonable expectation of sufficient payment to prevent the charge off. Loans for which borrowers have filed for bankruptcy may be charged off earlier based on the date of bankruptcy notification. 

- **Default**: Loan has not been current for an extended period of time. Learn more about the difference between Default and Charged Off [here](https://help.lendingclub.com/hc/en-us/articles/216127747).

- **In Grace Period**: Loan is past due but within the 15-day grace period. 

- **Issued**: New loan that has passed all LendingClub reviews, received full funding, and has been issued.

In [4]:
# Run the project's preprocessing step on the raw frame.
# NOTE(review): this rebinds `df`, so the raw data is no longer reachable
# afterwards and re-running this cell alone would feed already-preprocessed
# data back into preprocess().
df = preprocess(df)

Mapping column names...
Loading preprocessed data from hdf5 cache...
Fetching preprocessed data took 2.26 seconds
Preprocessed 27,641,460 rows, 4 columns


In [5]:
# Split the preprocessed data into a training part and a test part.
x_train, x_test = split_data(df)

Loading split data from hdf5 cache...
Fetching training and test data took 0.37 seconds


## Experiment 1: Markov Model with Maximum Likelihood Estimates

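The maximum likelihood estimate (MLE) of a Markov chain's transition matrix is simply the empirical frequency of each transition: the count of observed transitions from one state to another, divided by the total number of transitions out of the origin state. Even though we want to solve the problem from a Bayesian perspective, it is useful to look at this estimate first and keep it in mind as a baseline for the later experiments.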

### Model

In [6]:
# Build the matrix of observed status-to-status transition counts.
# NOTE(review): the counts are built from the full `df`, not from `x_train` —
# confirm that including the test portion here is intended.
realized_transitions = build_mle_matrix(df)

Loading transition matrix from hdf5 cache...
Fetching transition matrix took 0.01 seconds


In [7]:
# Display the transition counts (rows appear to be the origin status and
# columns the destination status — TODO confirm against build_mle_matrix).
realized_transitions

Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0,0,0,0,0,0,0,0
Current,774,24453702,3,707322,5831,0,160366,62102
Default,28897,147,2297,71,0,0,4,506
Fully Paid,0,0,0,8063,12,0,101,72
In Grace Period,0,276,0,11,22,0,59,41
Issued,0,17206,0,670,1,0,38,1
Late (16-30 days),4548,32376,0,2066,257,0,13413,119621
Late (31-120 days),105934,25434,29802,2146,56,0,3292,332762


### Inference

In [8]:
# Turn the transition counts into maximum likelihood estimates
# (presumably by normalising each row — verify in utils.inference.compute_mle).
compute_mle(realized_transitions)

Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Current,0.0,0.96,0.0,0.03,0.0,0.0,0.01,0.0
Default,0.91,0.0,0.07,0.0,0.0,0.0,0.0,0.02
Fully Paid,0.0,0.0,0.0,0.98,0.0,0.0,0.01,0.01
In Grace Period,0.0,0.67,0.0,0.03,0.05,0.0,0.14,0.1
Issued,0.0,0.96,0.0,0.04,0.0,0.0,0.0,0.0
Late (16-30 days),0.03,0.19,0.0,0.01,0.0,0.0,0.08,0.69
Late (31-120 days),0.21,0.05,0.06,0.0,0.0,0.0,0.01,0.67


### Criticism

## Experiment 2: Stationary Markov Chain without Priors

### Model

In [9]:
# Largest observed loan age; used as the length of the chain.
# Series.max() is vectorised, whereas the built-in max() walks the column
# element by element in Python. int() keeps chain_len a plain Python integer
# (assumes age_of_loan holds whole numbers — TODO confirm).
chain_len = int(df.age_of_loan.max())
# Number of distinct loan statuses, i.e. the number of states in the chain.
n_states = len(df.loan_status.unique())
# Build the model; `x` and `T` are handed to infer_matrix_no_priors later on.
x, T = build_markov_chain_no_priors(n_states, chain_len)

### Inference - Variational Inference

In [10]:
# Fit the transition matrix on the training data with variational inference.
# n_iter is passed as an ordinary keyword argument instead of unpacking a
# one-entry dict; the call is otherwise unchanged.
# NOTE(review): the estimate displayed by this cell is close to uniform in
# every row, unlike the MLE matrix from Experiment 1 — check convergence and
# the model parameterisation.
infer_matrix_no_priors(x_train, x, T, n_states, chain_len, n_iter=20000)

20000/20000 [100%] ██████████████████████████████ Elapsed: 101s | Loss: 1.833


Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0.098205,0.101345,0.102685,0.099544,0.099228,0.097762,0.097746,0.09879
Current,0.129179,0.130857,0.129489,0.126082,0.130558,0.127369,0.126617,0.127844
Default,0.118641,0.121201,0.120378,0.121411,0.124497,0.125006,0.119926,0.12165
Fully Paid,0.143039,0.140888,0.141293,0.147665,0.140984,0.143109,0.138954,0.141033
In Grace Period,0.122338,0.123548,0.119738,0.12506,0.118013,0.120828,0.127999,0.11893
Issued,0.139208,0.131497,0.13904,0.131977,0.138706,0.135345,0.13555,0.140579
Late (16-30 days),0.123957,0.124278,0.11915,0.122819,0.122666,0.121893,0.121057,0.125405
Late (31-120 days),0.125434,0.126386,0.128226,0.125442,0.125348,0.12869,0.132151,0.125768


### Inference - Gibbs Sampling

In [None]:
# WIP: Gibbs sampling for the state probabilities `pi` (not runnable yet).
# NOTE(review): `ed`, `tf`, `pi`, `counts`, `data` and `sess` are not imported
# or defined in any visible cell; they must be set up before this cell can run.

# Number of posterior samples => the "M" in our lecture on MCMC (length of the
# Markov chain used for inference). Deliberately not called `T`: that name
# already holds the second value returned by build_markov_chain_no_priors in
# Experiment 2, and rebinding it here would silently break a re-run of the
# variational inference cell.
n_samples = 5000

# In MCMC the approximating family has to be an empirical distribution;
# its parameters are initialised to uniform probabilities.
# Alternative construction kept for reference:
# qpi = ed.models.Empirical(params=tf.get_variable("qpi/params", [n_samples, n_states],
#       initializer=tf.constant_initializer(1.0 / n_states)))
qpi = ed.models.Empirical(tf.Variable(name="qpi/params", expected_shape=[n_samples, n_states],
                                      initial_value=tf.constant(1.0/n_states, shape=[n_samples, n_states])))

# Reference snippet from another model, kept while this cell is in progress:
# self.qu = ed.models.Empirical(params=tf.Variable(tf.zeros([n_iter, self.N, self.K]), name="qu"))
inference = ed.inferences.Gibbs(latent_vars={pi: qpi}, data={counts: data})
inference.run()

# CRITICISM
print("Inferred pi: {}".format(sess.run(qpi.mean())))