# Modeling Personal Loan Delinquency with LendingClub Data

## Imports and Settings

In [1]:
import numpy as np
import pandas as pd

from utils.utils import (
    load_dataframe, 
    preprocess, 
    split_data,
    reset_axes)

from utils.models import build_mle_matrix, build_markov_chain_no_priors
from utils.inference import compute_mle, infer_matrix_no_priors

Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', -1)

## Data Loading and Preprocessing

In [3]:
df = load_dataframe()

Loading raw data from hdf5 cache...
Fetching raw data took 3.01 seconds
Retrieved 40,268,594 rows, 4 columns


Our variable of interest is called `loan_status` which has eight possible states. These are the Loan Status Descriptions from the LendingClub [website](https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-):

- **Current**: Loan is up to date on all outstanding payments. 

- **Fully paid**: Loan has been fully repaid, either at the expiration of the 3- or 5-year year term or as a result of a prepayment.
 
- **Late (16-30)**: Loan has not been current for 16 to 30 days. Learn more about the tools LendingClub has to deal with delinquent borrowers.
 
- **Late (31-120)**: Loan has not been current for 31 to 120 days. Learn more about the tools LendingClub has to deal with delinquent borrowers.
 
- **Charged Off**: Loan for which there is no longer a reasonable expectation of further payments. Upon Charge Off, the remaining principal balance of the Note is deducted from the account balance. Charge Off typically occurs when a loan is 120 days or more past due and there is no reasonable expectation of sufficient payment to prevent the charge off. Loans for which borrowers have filed for bankruptcy may be charged off earlier based on the date of bankruptcy notification. 

- **Default**: Loan has not been current for an extended period of time. More about the difference between Default and Charged Off [here](https://help.lendingclub.com/hc/en-us/articles/216127747)

- **In Grace Period**: Loan is past due but within the 15-day grace period. 

- **Issued**: New loan that has passed all LendingClub reviews, received full funding, and has been issued.

In [4]:
df = preprocess(df)

Mapping column names...
Loading preprocessed data from hdf5 cache...
Fetching preprocessed data took 2.69 seconds
Preprocessed 27,641,460 rows, 4 columns


In [5]:
x_train, x_test = split_data(df)

Loading split data from hdf5 cache...
Fetching training and test data took 0.45 seconds


## Experiment 1: Markov Model with Maximum Likelihood Estimates

The MLE solution of a Markov Chain is simply the empirical frequencies of each transition. Even though we want to solve the problem from a Bayesian perspective, it's good to look at this estimate and have it in mind later.

### Model

In [6]:
realized_transitions = build_mle_matrix(df)

Loading transition matrix from hdf5 cache...
Fetching transition matrix took 0.01 seconds


In [7]:
realized_transitions

Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0,0,0,0,0,0,0,0
Current,774,24453702,3,707322,5831,0,160366,62102
Default,28897,147,2297,71,0,0,4,506
Fully Paid,0,0,0,8063,12,0,101,72
In Grace Period,0,276,0,11,22,0,59,41
Issued,0,17206,0,670,1,0,38,1
Late (16-30 days),4548,32376,0,2066,257,0,13413,119621
Late (31-120 days),105934,25434,29802,2146,56,0,3292,332762


### Inference

In [8]:
compute_mle(realized_transitions)

Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Current,0.0,0.96,0.0,0.03,0.0,0.0,0.01,0.0
Default,0.91,0.0,0.07,0.0,0.0,0.0,0.0,0.02
Fully Paid,0.0,0.0,0.0,0.98,0.0,0.0,0.01,0.01
In Grace Period,0.0,0.67,0.0,0.03,0.05,0.0,0.14,0.1
Issued,0.0,0.96,0.0,0.04,0.0,0.0,0.0,0.0
Late (16-30 days),0.03,0.19,0.0,0.01,0.0,0.0,0.08,0.69
Late (31-120 days),0.21,0.05,0.06,0.0,0.0,0.0,0.01,0.67


### Criticism

## Experiment 2: Stationary Markov Chain without Priors

In [9]:
chain_len = max(df.age_of_loan)
n_states = df.loan_status.unique().shape[0]

### Model

In [113]:
x, T = build_markov_chain_no_priors(n_states, chain_len)

### Inference

In [11]:
infer_matrix_no_priors(x_train, x, T, n_states, chain_len, **{'n_iter': 20000})

20000/20000 [100%] ██████████████████████████████ Elapsed: 113s | Loss: 2.254


Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0.090544,0.091356,0.091502,0.093266,0.089471,0.091574,0.09089,0.091522
Current,0.112818,0.112714,0.11072,0.115084,0.115543,0.109126,0.114241,0.109078
Default,0.127862,0.126773,0.128156,0.125549,0.130121,0.126625,0.124615,0.125298
Fully Paid,0.072671,0.081318,0.079279,0.080357,0.084068,0.080091,0.079148,0.080356
In Grace Period,0.199384,0.191548,0.189464,0.197563,0.189009,0.199124,0.194411,0.19861
Issued,0.109619,0.114605,0.120081,0.111959,0.117241,0.114356,0.112465,0.111935
Late (16-30 days),0.122225,0.124331,0.120604,0.122278,0.117185,0.121461,0.122313,0.125618
Late (31-120 days),0.164878,0.157356,0.160193,0.153945,0.157362,0.157644,0.161917,0.157583


## Experiment 3: Stationary Markov Chain with Priors

In [167]:
import tensorflow as tf
import edward as ed
from edward.models import Bernoulli, Categorical, Normal, Empirical, Multinomial, Beta, Dirichlet

### Model

In [168]:
N = 5000 # number of posterior samples

In [169]:
tf.reset_default_graph()

# create default starting state probability vector
pi_0 = Dirichlet(tf.ones(n_states))
x_0 = Categorical(pi_0)

# transition matrix
pi_T = Dirichlet(tf.ones([n_states, n_states]))

x = []
for _ in range(chain_len):
    x_tm1 = x[-1] if x else x_0
    x_t = Categorical(probs=tf.gather(pi_T, x_tm1))
    x.append(x_t)

### Inference

In [224]:
N = x_train.shape[0]
M = 1000
D = len(x_train.columns)
x_train_sub = x_train.sample(M)
X = tf.placeholder(tf.float32, [None, D])

In [225]:
def generator(arrays, batch_size):
    """Generate batches, one with respect to each array's first axis."""
    starts = [0] * len(arrays)  # pointers to where we are in iteration
    while True:
        batches = []
        for i, array in enumerate(arrays):
            start = starts[i]
            stop = start + batch_size
            diff = stop - array.shape[0]
            if diff <= 0:
                batch = array[start:stop]
                starts[i] += batch_size
            else:
                batch = np.concatenate((array[start:], array[:diff]))
                starts[i] = diff
            batches.append(batch)
        yield batches

data = generator([x_train], M)

In [226]:
tf.reset_default_graph()

# create default starting state probability vector
pi_0 = Dirichlet(tf.ones(n_states))
x_0 = Categorical(pi_0, sample_shape=N)

# transition matrix
pi_T = Dirichlet(tf.ones([n_states, n_states]))

x = []
for _ in range(chain_len):
    x_tm1 = x[-1] if x else x_0
    x_t = Categorical(probs=tf.gather(pi_T, x_tm1))
    x.append(x_t)

In [227]:
n_batch = int(N / M)
n_epoch = 5

qpi_0 = Dirichlet(tf.nn.softplus(tf.Variable(tf.ones(n_states))))
qpi_T = Dirichlet(tf.nn.softplus(tf.Variable(tf.ones([n_states, n_states]))))

inferred_matrix = pd.DataFrame()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
        
    inference = ed.KLqp({pi_0: qpi_0, pi_T: qpi_T}, {x[i]: x_train[i+1].values for i in range(chain_len)})
    inference.initialize(n_iter=n_batch * n_epoch, n_samples=5) #, scale={x: N / M}

    for _ in range(inference.n_iter):
        x_batch = next(data)
        info_dict = inference.update({X: x_batch})
        inference.print_progress(info_dict)
        
    
# tf.global_variables_initializer().run()


TypeError: Cannot interpret feed_dict key as Tensor: Tensor Tensor("Placeholder:0", shape=(?, 36), dtype=float32) is not an element of this graph.

In [206]:
# qpi_0 = Empirical(tf.Variable(tf.ones([N, n_states])))
# qpi_T = Empirical(tf.Variable(tf.ones([N, n_states, n_states])))

qpi_0 = Dirichlet(tf.nn.softplus(tf.Variable(tf.ones(n_states))))
qpi_T = Dirichlet(tf.nn.softplus(tf.Variable(tf.ones([n_states, n_states]))))

inferred_matrix = pd.DataFrame()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
        
    inference = ed.KLqp({pi_0: qpi_0, pi_T: qpi_T}, {x[i]: x_train_sub[i+1].values for i in range(chain_len)})
    inference.run(n_iter=20000, optimizer=tf.train.AdamOptimizer(0.005))
    inferred_matrix = pd.DataFrame(sess.run(pi_T))
    
    
#     inference = ed.HMC({pi_0: qpi_0, pi_T: qpi_T}, data={x[i]: subset_pivoted[i+1].values for i in range(chain_len)})
#     inference = ed.SGLD(dict(zip(x, qx)), data=dict(zip(x, x_train)))    
#     inference.run()

In [None]:
reset_axes(inferred_matrix)