In [1]:
from pprint import pprint
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import tensorflow as tf
import edward as ed
from edward.models import Bernoulli, Categorical, Normal, Empirical, Multinomial

from utils.utils import load_dataframe, preprocess, transition_matrix, relabel_axes

Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', -1)
sns.set_style('whitegrid')

## Data

In [3]:
df = load_dataframe()

Loading raw data from hdf5 cache...
Fetching raw data took 3.25 seconds
id             int64   
loan_status    category
age_of_loan    int64   
term           category
dtype: object
Retrieved 40,268,594 rows, 4 columns


In [4]:
# note I made this filter on term == 36 and age_of_loan <= 36 
df = preprocess(df)

Mapping transformations...
Loading preprocessed data from hdf5 cache...
Fetching preprocessed data took 2.99 seconds
id             int64
loan_status    int64
age_of_loan    int64
term           int64
dtype: object
Preprocessed 27,641,460 rows, 4 columns


In [5]:
realized_transitions = transition_matrix(df)

Building transition matrix...
Filling in empty row 0...
Filling in empty column 5...
Caching...
Building transition matrix took 19.76 seconds


In [6]:
realized_transitions

loan_status_y,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
loan_status_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Charged Off,0,0,0,0,0,0,0,0
Current,774,24453702,3,707322,5831,0,160366,62102
Default,28897,147,2297,71,0,0,4,506
Fully Paid,0,0,0,8063,12,0,101,72
In Grace Period,0,276,0,11,22,0,59,41
Issued,0,17206,0,670,1,0,38,1
Late (16-30 days),4548,32376,0,2066,257,0,13413,119621
Late (31-120 days),105934,25434,29802,2146,56,0,3292,332762


## Model

In [7]:
# from issue https://github.com/blei-lab/edward/issues/450
chain_len = 36
n_states = df.loan_status.unique().shape[0]

# create default starting state probability vector with equal probabilties for each state
p = tf.fill([n_states], 1.0 / n_states)
x_0 = Categorical(probs=p)

# transition matrix
myvars = tf.random_uniform([n_states, n_states])
T = tf.nn.softmax(tf.Variable(myvars), axis=0)

# MODEL
x = []
for _ in range(chain_len):
    x_tm1 = x[-1] if x else x_0
    x_t = Categorical(probs=T[x_tm1, :])
    x.append(x_t)

## Inference

In [10]:
df.dtypes

id                int64
loan_status       int64
age_of_loan       int64
term              int64
previous_month    int64
dtype: object

In [15]:
x_data = df.pivot(index='id', columns='age_of_loan', values='loan_status')

In [16]:
# drop where 0 column is not null - this might be a data error, then drop the 0 column
# and fill null values by propogating forward the last valid value
x_data = x_data[x_data[0].isnull()].drop(0, axis=1).fillna(axis=1, method='ffill')

In [17]:
x_data.head()

age_of_loan,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
54734,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
55521,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
55716,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
55742,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0
56121,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,7.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
qx = [Categorical(probs=tf.nn.softmax(tf.Variable(tf.ones(n_states)))) for _ in range(chain_len)]

# placeholders
initial_matrix = pd.DataFrame()
inferred_matrix = pd.DataFrame()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

#     print('Before inference')
#     print(sess.run(T))
    initial_matrix = pd.DataFrame(sess.run(T))
    inference = ed.KLqp(dict(zip(x, qx)), dict(zip(x, x_data)))
    inference.run(n_iter=20000)
    inferred_matrix = pd.DataFrame(sess.run(T))
#     print('#'*40)
#     print('After inference')
#     print(sess.run(T))
  
#     print('#'*40)
#     print('qx:')
#     pprint(sess.run([foo.probs for foo in qx]))
    
#     print('#'*40)
#     print('x:')
#     pprint(sess.run([foo.probs for foo in x]))

20000/20000 [100%] ██████████████████████████████ Elapsed: 107s | Loss: -0.948


In [20]:
# add back status names for analysis
initial_matrix = relabel_axes(initial_matrix)
inferred_matrix = relabel_axes(inferred_matrix)

In [23]:
initial_matrix

Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0.156697,0.134805,0.112641,0.101629,0.100098,0.085648,0.178734,0.165433
Current,0.085575,0.147237,0.156323,0.12164,0.151106,0.166469,0.186597,0.102486
Default,0.102992,0.10578,0.200324,0.106954,0.106618,0.094027,0.118038,0.094599
Fully Paid,0.132352,0.068923,0.080815,0.081269,0.094719,0.104378,0.084757,0.114175
In Grace Period,0.150732,0.080521,0.085264,0.146652,0.187604,0.145184,0.097994,0.125133
Issued,0.112525,0.17262,0.106135,0.212479,0.078122,0.186977,0.127408,0.112832
Late (16-30 days),0.099614,0.113532,0.178471,0.132964,0.142912,0.130162,0.113127,0.116791
Late (31-120 days),0.159513,0.176581,0.080027,0.096414,0.138823,0.087154,0.093344,0.168551


In [21]:
inferred_matrix

Unnamed: 0,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
Charged Off,0.121554,0.128261,0.124591,0.123596,0.130907,0.126439,0.128109,0.129193
Current,0.115301,0.119997,0.121378,0.116161,0.117937,0.119497,0.115391,0.117116
Default,0.148956,0.149914,0.146794,0.158191,0.148937,0.154405,0.153351,0.150634
Fully Paid,0.125608,0.125973,0.127146,0.121815,0.123717,0.126913,0.125361,0.120422
In Grace Period,0.115444,0.111002,0.111754,0.113666,0.114261,0.112906,0.110387,0.116361
Issued,0.098651,0.1008,0.096624,0.099102,0.101811,0.098967,0.099937,0.096078
Late (16-30 days),0.140425,0.136069,0.137585,0.129476,0.134335,0.131204,0.131224,0.133539
Late (31-120 days),0.134061,0.127985,0.134128,0.137993,0.128095,0.129671,0.136239,0.136656
