# Modeling Personal Loan Delinquency with LendingClub Data

## Imports and Settings

In [1]:
import numpy as np
import pandas as pd

from utils.utils import (
    load_dataframe, 
    preprocess, 
    split_data,
    reset_axes)

from utils.models import build_mle_matrix, build_markov_chain_no_priors
from utils.inference import compute_mle, infer_matrix_no_priors

Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', -1)

## Data Loading and Preprocessing

In [3]:
df = load_dataframe()

Loading raw data from hdf5 cache...
Fetching raw data took 3.02 seconds
Retrieved 40,268,594 rows, 4 columns


Our variable of interest is called `loan_status` which has eight possible states. These are the Loan Status Descriptions from the LendingClub [website](https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-):

- **Current**: Loan is up to date on all outstanding payments. 

- **Fully paid**: Loan has been fully repaid, either at the expiration of the 3- or 5-year year term or as a result of a prepayment.
 
- **Late (16-30)**: Loan has not been current for 16 to 30 days. Learn more about the tools LendingClub has to deal with delinquent borrowers.
 
- **Late (31-120)**: Loan has not been current for 31 to 120 days. Learn more about the tools LendingClub has to deal with delinquent borrowers.
 
- **Charged Off**: Loan for which there is no longer a reasonable expectation of further payments. Upon Charge Off, the remaining principal balance of the Note is deducted from the account balance. Charge Off typically occurs when a loan is 120 days or more past due and there is no reasonable expectation of sufficient payment to prevent the charge off. Loans for which borrowers have filed for bankruptcy may be charged off earlier based on the date of bankruptcy notification. 

- **Default**: Loan has not been current for an extended period of time. More about the difference between Default and Charged Off [here](https://help.lendingclub.com/hc/en-us/articles/216127747)

- **In Grace Period**: Loan is past due but within the 15-day grace period. 

- **Issued**: New loan that has passed all LendingClub reviews, received full funding, and has been issued.

In [4]:
df = preprocess(df)

Mapping column names...
Loading preprocessed data from hdf5 cache...
Fetching preprocessed data took 2.41 seconds
Preprocessed 27,641,460 rows, 4 columns


In [5]:
x_train, x_test = split_data(df)

Loading training and test data from hdf5 cache...
Fetching training and test data took 0.54 seconds
Training on 1,337,814 rows, 36 columns
Testing on 148,541 rows, 36 columns


## Experiment 1: Markov Model with Maximum Likelihood Estimates

The MLE solution of a Markov Chain is simply the empirical frequencies of each transition. Even though we want to solve the problem from a Bayesian perspective, it's good to look at this estimate and have it in mind later.

### Model

In [6]:
realized_transitions = build_mle_matrix(df)

Loading transition matrix from hdf5 cache...
Fetching transition matrix took 0.01 seconds


In [7]:
realized_transitions

Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0,0,0,0,0,0,0,0
Current,774,24453702,3,707322,5831,0,160366,62102
Default,28897,147,2297,71,0,0,4,506
Fully Paid,0,0,0,8063,12,0,101,72
In Grace Period,0,276,0,11,22,0,59,41
Issued,0,17206,0,670,1,0,38,1
Late (16-30 days),4548,32376,0,2066,257,0,13413,119621
Late (31-120 days),105934,25434,29802,2146,56,0,3292,332762


### Inference

In [8]:
compute_mle(realized_transitions)

Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Current,0.0,0.96,0.0,0.03,0.0,0.0,0.01,0.0
Default,0.91,0.0,0.07,0.0,0.0,0.0,0.0,0.02
Fully Paid,0.0,0.0,0.0,0.98,0.0,0.0,0.01,0.01
In Grace Period,0.0,0.67,0.0,0.03,0.05,0.0,0.14,0.1
Issued,0.0,0.96,0.0,0.04,0.0,0.0,0.0,0.0
Late (16-30 days),0.03,0.19,0.0,0.01,0.0,0.0,0.08,0.69
Late (31-120 days),0.21,0.05,0.06,0.0,0.0,0.0,0.01,0.67


### Criticism

## Experiment 2: Stationary Markov Chain without Priors

In [9]:
chain_len = max(df.age_of_loan)
n_states = df.loan_status.unique().shape[0]

### Model

In [10]:
x, T = build_markov_chain_no_priors(n_states, chain_len)

### Inference

In [11]:
infer_matrix_no_priors(x_train, x, T, n_states, chain_len, **{'n_iter': 20000})

20000/20000 [100%] ██████████████████████████████ Elapsed: 101s | Loss: -0.761


Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0.105033,0.109243,0.100326,0.1033,0.10553,0.102538,0.10461,0.105673
Current,0.110824,0.113342,0.115474,0.109577,0.117765,0.115646,0.113324,0.112651
Default,0.109697,0.10589,0.105548,0.107623,0.105526,0.107476,0.107594,0.113063
Fully Paid,0.193937,0.192598,0.198965,0.204528,0.192145,0.195143,0.201478,0.191486
In Grace Period,0.092327,0.097789,0.09332,0.091553,0.09255,0.096386,0.091617,0.09196
Issued,0.120743,0.115724,0.119377,0.115471,0.122313,0.118704,0.118776,0.118835
Late (16-30 days),0.108444,0.110795,0.112329,0.108209,0.109245,0.110418,0.10921,0.111982
Late (31-120 days),0.158994,0.154617,0.154659,0.159739,0.154925,0.153687,0.15339,0.154348


## Experiment 3: Stationary Markov Chain with Priors

In [12]:
import tensorflow as tf
import edward as ed
from edward.models import Bernoulli, Categorical, Normal, Empirical, Multinomial, Beta, Dirichlet

### Model

In [13]:
N = x_train.shape[0] # training size
M = 1000 # batch size

In [14]:
tf.reset_default_graph()

# create default starting state probability vector
pi_0 = Dirichlet(tf.ones(n_states))
x_0 = Categorical(pi_0, sample_shape=M)

# transition matrix
pi_T = Dirichlet(tf.ones([n_states, n_states]))

x = []
for _ in range(chain_len):
    x_tm1 = x[-1] if x else x_0
    x_t = Categorical(probs=tf.gather(pi_T, x_tm1))
    x.append(x_t)

### Inference (Batch)

In [15]:
qpi_0 = Dirichlet(tf.nn.softplus(tf.Variable(tf.ones(n_states))))
qpi_T = Dirichlet(tf.nn.softplus(tf.Variable(tf.ones([n_states, n_states]))))

In [16]:
def generator(df, batch_size):
    """Generate batches, one with respect to each array's first axis."""
    starts = 0 # pointer to where we are in iteration
    while True:
        start = starts
        stop = start + batch_size
        diff = stop - df.shape[0]
        if diff <= 0:
            batch = df.iloc[start:stop]
            starts += batch_size
        else:
            batch = pd.concat((df.iloc[start:], df.iloc[:diff]))
            starts = diff

        yield batch

data = generator(x_train, M)

In [17]:
X = np.array([tf.placeholder(tf.int32, [M]) for _ in range(chain_len)])

n_batch = int(N / M)
n_epoch = 5

inference = ed.KLqp({pi_0: qpi_0, pi_T: qpi_T}, data=dict(zip(x, X)))
inference.initialize(n_iter=n_batch * n_epoch, n_samples=5, optimizer=tf.train.AdamOptimizer(0.005))

inferred_matrix = pd.DataFrame()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for _ in range(inference.n_iter):
        x_batch = next(data)

        info_dict = inference.update(dict(zip(X, x_batch.values.T)))
        inference.print_progress(info_dict)
    
    inferred_matrix = pd.DataFrame(sess.run(pi_T))

6666/6685 [ 99%] █████████████████████████████  ETA: 0s | Loss: 20205.461

In [18]:
reset_axes(inferred_matrix)

Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0.130217,0.164067,0.274118,0.002501,0.063059,0.195958,0.018994,0.151085
Current,0.026293,0.13958,0.279588,0.002092,0.108295,0.011379,0.122812,0.309962
Default,0.117354,0.547433,0.066024,0.045885,0.013871,0.004496,0.078229,0.126709
Fully Paid,0.246612,0.004481,0.091131,0.009608,0.105153,0.344687,0.154555,0.043774
In Grace Period,0.078507,0.023229,0.14054,0.284513,0.018286,0.135009,0.309692,0.010224
Issued,0.136861,0.047957,0.007327,0.061763,0.145634,0.399721,0.106177,0.09456
Late (16-30 days),0.208707,0.018984,0.331011,0.113073,0.116742,0.012533,0.140729,0.058222
Late (31-120 days),0.31148,0.054715,0.010912,0.043954,0.045705,0.353342,0.033848,0.146043
