In [10]:
from pprint import pprint
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import tensorflow as tf
import edward as ed
from edward.models import Bernoulli, Categorical, Normal, Empirical, Multinomial

from os.path import join, abspath
import sys
sys.path.append(join(abspath('.'), '../utils'))
from utils import load_dataframe, preprocess, transition_matrix

In [11]:
# Notebook-wide display settings.
# pandas: show up to 200 rows/columns; -1 is the legacy "do not truncate"
# value for max_colwidth (newer pandas expects None — confirm against the
# installed version before changing it).
for option_name, option_value in [
    ('display.max_columns', 200),
    ('display.max_rows', 200),
    ('display.max_colwidth', -1),
]:
    pd.set_option(option_name, option_value)

# seaborn: white background with grid lines for every figure.
sns.set_style('whitegrid')

## Data

In [12]:
# Load the raw loan-status history through the project helper from ../utils.
# Per the recorded output below, the frame has the columns id, loan_status,
# age_of_loan and term, and is served from an hdf5 cache.
df = load_dataframe()

Loading raw data from hdf5 cache...
Fetching raw data took 3.50 seconds
id             int64   
loan_status    category
age_of_loan    int64   
term           category
dtype: object
Retrieved 40,268,594 rows, 4 columns


In [13]:
# NOTE (author): preprocess() was modified to keep only rows with
# term == 36 and age_of_loan <= 36.
# This rebinds `df`, so the raw frame is no longer reachable afterwards and
# re-running this cell passes already-preprocessed data back into preprocess().
df = preprocess(df)

Mapping transformations...
Loading preprocessed data from hdf5 cache...
Fetching preprocessed data took 2.83 seconds
id             int64
loan_status    int64
age_of_loan    int64
term           int64
dtype: object
Preprocessed 27,641,460 rows, 4 columns


In [14]:
# Build the table of observed status-to-status transition counts from the
# preprocessed frame (project helper; the recorded output shows it being read
# from an hdf5 cache).
realized_transitions = transition_matrix(df)

Loading transition matrix from hdf5 cache...
Fetching transition matrix took 0.06 seconds


In [15]:
# Display the count table. Rows are labelled loan_status_x and columns
# loan_status_y — presumably "from" and "to" status; confirm against
# utils.transition_matrix.
realized_transitions

loan_status_y,Charged Off,Current,Default,Fully Paid,In Grace Period,Issued,Late (16-30 days),Late (31-120 days)
loan_status_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Charged Off,0,0,0,0,0,0,0,0
Current,774,24453702,3,707322,5831,0,160366,62102
Default,28897,147,2297,71,0,0,4,506
Fully Paid,0,0,0,8063,12,0,101,72
In Grace Period,0,276,0,11,22,0,59,41
Issued,0,17206,0,670,1,0,38,1
Late (16-30 days),4548,32376,0,2066,257,0,13413,119621
Late (31-120 days),105934,25434,29802,2146,56,0,3292,332762


## Model

In [22]:
# Markov-chain model adapted from https://github.com/blei-lab/edward/issues/450
chain_len = 36
n_states = len(df.loan_status.unique())

# Starting state: every status equally likely.
uniform_start_probs = tf.fill([n_states], 1.0 / n_states)
x_0 = Categorical(probs=uniform_start_probs)

# Transition matrix: free parameters pushed through a softmax over axis 0,
# so each COLUMN is a probability vector.
transition_params = tf.Variable(tf.random_uniform([n_states, n_states]))
T = tf.nn.softmax(transition_params, axis=0)

# MODEL
# Each step is a Categorical whose probabilities are the column of T selected
# by the previous step's state (x_0 for the very first step).
x = []
for step in range(chain_len):
    x_tm1 = x_0 if step == 0 else x[step - 1]
    x_t = Categorical(probs=T[:, x_tm1])
    x.append(x_t)

## Inference

In [23]:
# Sanity check: column dtypes of the preprocessed frame.
df.dtypes

id             int64
loan_status    int64
age_of_loan    int64
term           int64
dtype: object

In [24]:
# Status sequence recorded for one loan (id 55742); the same selection is
# used as x_data in the inference cell that follows.
df.loc[df.id == 55742].loan_status.values

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3])

In [25]:
# Variational family: one Categorical per chain step, each parameterised by
# its own trainable vector (initialised to ones) passed through a softmax.
qx = []
for _ in range(chain_len):
    step_params = tf.Variable(tf.ones(n_states))
    qx.append(Categorical(probs=tf.nn.softmax(step_params)))

# Observed status sequence of a single loan.
x_data = df.loc[df.id == 55742, 'loan_status'].values

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Transition matrix before fitting.
    print(sess.run(T))

    # Every chain variable is paired with its approximation AND with the
    # observed value at the same position.
    latent_vars = dict(zip(x, qx))
    observed = dict(zip(x, x_data))
    inference = ed.KLqp(latent_vars, observed)
    inference.run(n_iter=20000)

    # Transition matrix and per-step probabilities after fitting.
    print(sess.run(T))
    print('qx:')
    pprint(sess.run([q_step.probs for q_step in qx]))
    print('x:')
    pprint(sess.run([x_step.probs for x_step in x]))

[[0.09339098 0.09090046 0.12965283 0.1391049  0.19196498 0.10267992
  0.11436833 0.09429701]
 [0.16178802 0.11638659 0.17263737 0.08341139 0.0976414  0.11560068
  0.10797748 0.11512759]
 [0.08978619 0.13569684 0.10518714 0.14952794 0.13735092 0.08365457
  0.14951542 0.08035672]
 [0.1757229  0.11401516 0.08459467 0.0786596  0.11438404 0.1304588
  0.15258092 0.07018454]
 [0.12332325 0.10607208 0.10870188 0.08217118 0.11474957 0.1822207
  0.14335462 0.15483947]
 [0.13706343 0.09313743 0.18253113 0.1740693  0.09652725 0.13538864
  0.1191893  0.18945393]
 [0.08658472 0.18747242 0.09708162 0.16092776 0.09487512 0.14392984
  0.11845443 0.18371333]
 [0.13234043 0.15631908 0.11961339 0.13212791 0.15250671 0.10606684
  0.09455957 0.11202736]]
20000/20000 [100%] ██████████████████████████████ Elapsed: 111s | Loss: 2.778
[[0.11632124 0.11554148 0.11634213 0.11561058 0.11701157 0.1099626
  0.11763285 0.11249746]
 [0.19384244 0.19605905 0.19529247 0.19724227 0.19833048 0.19882351
  0.19617704 0.1958

In [26]:
# Number of posterior samples kept per latent variable => the "M" in our
# lecture on MCMC (length of the MC used for inference).
# Deliberately NOT named `T`: `T` is the transition-matrix tensor built in the
# model cell, and rebinding it to an int breaks every later `sess.run(T)`.
n_samples = 5000

# the approximating family has to be an empirical distribution in MCMC:
# qpi = ed.models.Empirical(params=tf.get_variable("qpi/params", [T, n_statuses],
#       initializer=tf.constant_initializer(1.0 / n_statuses))) # initialize as uniform probs

# NOTE(review): qpi is not bound to any random variable of the model above
# (the model has no `pi`), so inference never updates it and its mean stays at
# the uniform initial value — confirm what it was meant to approximate.
qpi = ed.models.Empirical(tf.Variable(name="qpi/params", expected_shape=[n_samples, n_states],
                                      initial_value=tf.constant(1.0/n_states, shape=[n_samples, n_states])))

# Monte Carlo inference stores draws in Empirical approximations, so the
# Categorical `qx` used for KLqp cannot be reused here. One sample buffer per
# chain step, with the same dtype as the variable it approximates.
qx_samples = [Empirical(params=tf.Variable(tf.zeros([n_samples], dtype=x_step.dtype)))
              for x_step in x]

# self.qu = ed.models.Empirical(params=tf.Variable(tf.zeros([n_iter, self.N, self.K]), name="qu"))
# NOTE(review): as in the KLqp cell, every x_t is passed both as a latent
# variable and as observed data — confirm this is intended. Gibbs also needs
# Edward to derive each complete conditional, which may not be supported for
# this model; verify before relying on the result.
inference = ed.inferences.Gibbs(dict(zip(x, qx_samples)), dict(zip(x, x_data)))
# The iteration count is tied to the sample-buffer length (presumably one draw
# is stored per iteration); the previous n_iter=20000 exceeded the 5000 slots.
inference.run(n_iter=n_samples)

# CRITICISM
# The `sess` from the KLqp cell was closed when its `with` block exited, so
# evaluate through the session Edward itself is using.
print("Inferred pi: {}".format(ed.get_session().run(qpi.mean())))

TypeError: Latent variable value has an invalid type: <class 'numpy.int64'>