## Homogeneous (stationary) Markov Chain Implementation in Edward

### Package Imports and Options

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import tensorflow as tf
import edward as ed
from edward.models import Bernoulli, Categorical, Normal

from utils.utils import load_dataframe, load_data_dic

Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
# Widen pandas' display limits so the 151-column frame can be inspected in full.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
# "No column-width limit" is spelled None on pandas >= 1.0 (the old -1 is
# rejected by pandas 2.x) but -1 on older releases, whose validator refuses
# None with a ValueError. Try the modern spelling first, then fall back.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

### Load Data

In [3]:
# Load the project's data dictionary (presumably column descriptions — confirm in utils.utils).
dic = load_data_dic()

In [7]:
# Load the full raw loan table. This is slow (the recorded run took ~116 s),
# so avoid re-running this cell unless the raw data is really needed again.
df_raw = load_dataframe()

Fetching data from csvs...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'application_type', 'verification_status_joint', 'sec_app_earliest_cr_line', 'hardship_flag', 'hardship_type', 'hardship_reason', 'hardship_status', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date', 'hardship_loan_status', 'disbursement_method', 'debt_settlement_flag', 'debt_settlement_flag_date', 'settlement_status', 'settlement_date']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


Fetching data took 116.04 seconds
Retrieved 2,132,287 rows, 151 columns


In [8]:
# Sanity check: (rows, columns) of the raw frame.
df_raw.shape

(2132287, 151)

In [6]:
# Peek at the first rows to check column names and raw value formats
# (e.g. `term` is stored as text such as "36 months").
df_raw.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,h
ardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,10159611,,10000.0,10000.0,10000.0,36 months,9.67%,321.13,B,B1,Registered Nurse,7 years,MORTGAGE,102000.0,Not Verified,Dec-2013,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=10159611,,debt_consolidation,Clean Up,027xx,MA,15.55,2.0,Oct-1989,670.0,674.0,0.0,11.0,,9.0,0.0,9912.0,44.4%,22.0,f,0.0,0.0,11560.462185,11560.46,10000.0,1560.46,0.0,0.0,0.0,Jan-2017,320.91,,Dec-2016,629.0,625.0,0.0,54.0,1.0,Individual,,,,0.0,0.0,39143.0,,,,,,,,,,,,22300.0,,,,3.0,4349.0,973.0,89.4,0.0,0.0,243.0,290.0,23.0,8.0,0.0,25.0,11.0,8.0,11.0,1.0,3.0,4.0,3.0,6.0,9.0,6.0,13.0,4.0,9.0,0.0,0.0,0.0,1.0,77.3,66.7,0.0,0.0,58486.0,39143.0,9200.0,36186.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,10159498,,12000.0,12000.0,12000.0,36 months,6.62%,368.45,A,A2,MANAGER INFORMATION DELIVERY,10+ years,MORTGAGE,105000.0,Not Verified,Dec-2013,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=10159498,,debt_consolidation,UNIVERSAL CARD,060xx,CT,14.05,0.0,Mar-1994,760.0,764.0,1.0,43.0,,12.0,0.0,13168.0,21.6%,22.0,w,0.0,0.0,13263.954639,13263.95,12000.0,1263.95,0.0,0.0,0.0,Jan-2017,368.2,,Oct-2018,814.0,810.0,0.0,,1.0,Individual,,,,0.0,0.0,267646.0,,,,,,,,,,,,61100.0,,,,4.0,26765.0,39432.0,25.0,0.0,0.0,146.0,237.0,20.0,3.0,4.0,20.0,,3.0,43.0,0.0,2.0,2.0,5.0,5.0,9.0,8.0,9.0,2.0,12.0,0.0,0.0,0.0,2.0,95.5,0.0,0.0,0.0,333044.0,42603.0,52600.0,42769.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,10129506,,20800.0,20800.0,20800.0,36 months,13.53%,706.16,B,B5,Operations Manager,10+ years,RENT,81500.0,Verified,Dec-2013,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=10129506,Borrower added on 12/31/13 > My goal is to purchase a home. I am consolidating my debt to lower interest rate to pay off debt faster. My goal is targeted for February 2015.<br>,debt_consolidation,Reducing Debt to Purchase Home,100xx,NY,16.73,0.0,Jun-1998,685.0,689.0,2.0,64.0,,29.0,0.0,23473.0,54.5%,41.0,f,0.0,0.0,23926.640008,23926.64,20800.0,3126.64,0.0,0.0,0.0,May-2015,13334.93,,Oct-2018,644.0,640.0,0.0,71.0,1.0,Individual,,,,0.0,0.0,23473.0,,,,,,,,,,,,43100.0,,,,9.0,869.0,6811.0,54.6,0.0,0.0,115.0,186.0,0.0,0.0,0.0,0.0,70.0,0.0,70.0,1.0,8.0,24.0,11.0,17.0,1.0,29.0,40.0,24.0,29.0,0.0,0.0,0.0,3.0,90.2,50.0,0.0,0.0,43100.0,23473.0,15000.0,0.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,10149342,,27050.0,27050.0,27050.0,36 months,10.99%,885.46,B,B2,Team Leadern Customer Ops & Systems,10+ years,OWN,55000.0,Verified,Dec-2013,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=10149342,Borrower added on 12/31/13 > Combining high interest credit cards to lower interest rate.<br>,debt_consolidation,Debt Consolidation,481xx,MI,22.87,0.0,Oct-1986,730.0,734.0,0.0,,,14.0,0.0,36638.0,61.2%,27.0,w,0.0,0.0,31752.53,31752.53,27050.0,4702.53,0.0,0.0,0.0,Jul-2016,6074.19,,Mar-2018,809.0,805.0,0.0,,1.0,Individual,,,,0.0,0.0,114834.0,,,,,,,,,,,,59900.0,,,,3.0,9570.0,16473.0,53.9,0.0,0.0,117.0,326.0,16.0,6.0,4.0,16.0,,8.0,,0.0,2.0,4.0,4.0,8.0,8.0,10.0,15.0,4.0,14.0,0.0,0.0,0.0,1.0,100.0,25.0,0.0,0.0,138554.0,70186.0,35700.0,33054.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,10149488,,4800.0,4800.0,4800.0,36 months,10.99%,157.13,B,B2,Surgical Technician,2 years,MORTGAGE,39600.0,Source Verified,Dec-2013,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=10149488,"Borrower added on 12/31/13 > Just bought a house, and would like a little extra funds to improve aspects of the house such as, duct work, electrical outlets, backyard, and other minor areas.<br>",home_improvement,For The House,782xx,TX,2.49,0.0,Aug-1995,755.0,759.0,2.0,,,3.0,0.0,4136.0,16.1%,8.0,w,0.0,0.0,5157.519457,5157.52,4800.0,357.52,0.0,0.0,0.0,Sep-2014,3900.48,,Jan-2017,534.0,530.0,0.0,,1.0,Individual,,,,0.0,0.0,4136.0,,,,,,,,,,,,25700.0,,,,0.0,1379.0,21564.0,16.1,0.0,0.0,104.0,220.0,25.0,25.0,0.0,25.0,,3.0,,0.0,2.0,2.0,3.0,4.0,1.0,3.0,7.0,2.0,3.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,25700.0,4136.0,25700.0,0.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [66]:
# Whenever we need to reset to the raw df we can run this cell.
# Take an independent copy: a plain `df = df_raw` only aliases the same object,
# so the in-place cleaning applied to `df` further down would silently modify
# `df_raw` as well and there would be nothing left to reset to.
df = df_raw.copy()

In [67]:
# Drop rows with a missing term, then strip every non-digit character so that
# a value like "36 months" becomes "36". The shapes are printed before and
# after so the number of dropped rows is visible.
print(df.shape)
df.dropna(subset=['term'], inplace=True)
print(df.shape)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the column untouched.
df['term'] = df['term'].str.replace(r'\D+', '', regex=True)

(2132287, 151)
(2132256, 151)


In [58]:
# Split the cleaned loans by term length.
df_36 = df.loc[df.term == '36']
df_60 = df.loc[df.term == '60']

# Everything in the raw data that is neither a 36- nor a 60-month loan,
# including rows whose term is missing. The raw term is normalised here
# (non-digits stripped) instead of being compared as-is, so the result does
# not depend on df_raw having been cleaned as a side effect of another cell.
raw_term = df_raw.term.str.replace(r'\D+', '', regex=True)
df_other = df_raw.loc[~raw_term.isin(['36', '60'])]

In [60]:
# Report the raw shape, the shape of each term-based subset, and finally the
# total row count across the subsets for comparison with the raw row count.
print(df_raw.shape)

term_subsets = (df_36, df_60, df_other)
for subset in term_subsets:
    print(subset.shape)

print(sum(subset.shape[0] for subset in term_subsets))

(2132287, 151)
(1521575, 151)
(610681, 151)
(31, 151)
2132287


In [63]:
# Inspect the leftover rows. In the recorded output they are summary lines
# ("Total amount funded in policy code ...") with every other column empty.
df_other

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,h
ardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
188181,Total amount funded in policy code 1: 2700702175,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
188182,Total amount funded in policy code 2: 81866225,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
235629,Total amount funded in policy code 1: 3503840175,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
235630,Total amount funded in policy code 2: 873652739,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
421095,Total amount funded in policy code 1: 6417608175,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
421096,Total amount funded in policy code 2: 1944088810,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
130772,Total amount funded in policy code 1: 2080429200,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
130773,Total amount funded in policy code 2: 737901574,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
128194,Total amount funded in policy code 1: 2063142975,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
128195,Total amount funded in policy code 2: 823319310,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# resources
# http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture11.pdf
# https://github.com/blei-lab/edward/issues/450
# https://gist.github.com/fredcallaway/c7252b6326dfb502e70cad4146731aef
# https://discourse.edwardlib.org/t/a-simple-tensorflow-implementation-of-forward-backward/67
# https://gist.github.com/currymj/e903644c4e54e35fdb858c94f1631fe4

In [None]:
# experimenting with: https://gist.github.com/fredcallaway/c7252b6326dfb502e70cad4146731aef
def categorical(ps):
    """Draw one index from the categorical distribution with probabilities `ps`.

    Args:
        ps: Non-negative class weights (tensor or array-like). The softmax
            applied to the logits renormalises them, so `ps` that already sum
            to one are sampled exactly as given.

    Returns:
        The sample tensor (`.value()`) of an Edward `Categorical` variable.
    """
    # Categorical logits are log-probabilities up to an additive constant:
    # softmax(log(ps)) == ps. The element-wise log-odds log(p / (1 - p)) used
    # before only round-trips for uniform `ps`; e.g. [.2, .8] was sampled as
    # roughly [.06, .94].
    return Categorical(logits=tf.log(ps)).value()

def flip(p):
    """Return a boolean tensor that is True with probability `p`.

    Args:
        p: Success probability (scalar tensor or float) of the Bernoulli draw.
    """
    # The TensorFlow-distributions-backed Bernoulli names this argument
    # `probs`; the legacy `p=` keyword is not accepted any more.
    return tf.equal(Bernoulli(probs=p), tf.constant(1))

def append(lst, x):
    """Return `lst` with the single element `x` appended along axis 0.

    Args:
        lst: Tensor to extend.
        x: Element to append; it is wrapped in a list so it gains a leading
            axis before concatenation.
    """
    # TensorFlow >= 1.0 takes the values first and the axis second; the old
    # `tf.concat(0, values)` argument order fails on TF 1.x.
    return tf.concat([lst, [x]], axis=0)

class HMM(object):
    """A Hidden Markov Model.

    Subclasses provide `init`, `step`, `emit` and `final`; `sample` uses them
    to roll a chain forward inside a `tf.while_loop`.
    """

    def step(self, state):
        """Returns a new state following `state`."""
        raise NotImplementedError()

    def emit(self, state):
        """Returns an observable emission from `state`."""
        raise NotImplementedError()

    def init(self):
        """Returns an initial state."""
        raise NotImplementedError()

    def final(self, state):
        """Returns true if the model should stop in `state`."""
        raise NotImplementedError()

    def sample(self):
        """Samples one trajectory of hidden states and emissions.

        Starts from `init()` and keeps appending `step`/`emit` results until
        `final` reports that the most recent state is a stopping state.

        Returns:
            The result of `tf.while_loop` over the loop variables
            `[states, emissions]`, i.e. the accumulated state sequence and
            the accumulated emission sequence.
        """
        def cond(states, emissions):
            # tf.while_loop iterates for as long as `cond` is true, so keep
            # going only while the latest state is NOT final.
            s0 = states[-1]
            return tf.logical_not(self.final(s0))

        def body(states, emissions):
            # Advance one step from the latest state and record the results.
            s0 = states[-1]
            s1 = self.step(s0)
            e1 = self.emit(s1)
            return append(states, s1), append(emissions, e1)

        s0 = self.init()
        e0 = self.emit(s0)
        states = tf.convert_to_tensor([s0])
        emissions = tf.convert_to_tensor([e0])
        return tf.while_loop(
            cond, body,
            loop_vars=[states, emissions],
            # Both sequences grow every iteration, so their shapes must be
            # left unconstrained.
            shape_invariants=[tf.TensorShape(None), tf.TensorShape(None)]
        )


class DiscreteGaussianHMM(HMM):
    """HMM with discrete transitions and gaussian emissions."""

    def __init__(self, P, mu, sigma, p_init, p_final):
        """Stores the model parameters.

        Args:
            P: Transition matrix; row `P[state]` holds the next-state
                probabilities.
            mu: Per-state emission means.
            sigma: Per-state emission standard deviations.
            p_init: Probabilities of the initial state.
            p_final: Per-state probability of stopping in that state.
        """
        super().__init__()
        self.P = P
        self.mu = mu
        self.sigma = sigma
        self.p_init = p_init
        self.p_final = p_final

    def step(self, state):
        """Samples the next state from row `state` of the transition matrix."""
        return categorical(self.P[state])

    def emit(self, state):
        """Samples a gaussian emission using the parameters of `state`."""
        # TensorFlow distributions name these arguments `loc`/`scale`; the
        # legacy `mu`/`sigma` keywords are not accepted any more.
        return Normal(loc=self.mu[state], scale=self.sigma[state]).value()

    def init(self):
        """Samples the initial state from `p_init`."""
        return categorical(self.p_init)

    def final(self, state):
        """Returns true, with probability `p_final[state]`, if the chain stops."""
        # Honour the base-class contract ("true means stop"). The loop
        # condition in `sample` negates this, so a chain still continues with
        # probability 1 - p_final[state] at every step.
        return flip(tf.gather(self.p_final, state))
      

def demo():
    """Plot three emission sequences sampled from a two-state Gaussian HMM.

    The states emit around +5 and -5 respectively (unit standard deviation),
    start with equal probability, and stop with probability 0.02 per step.
    """
    P = tf.constant(np.array([
        [.6, .4],
        [.2, .8],
    ], dtype='float32'))
    mu = tf.constant([5., -5.])
    sigma = tf.constant([1., 1.])
    model = DiscreteGaussianHMM(P, mu, sigma, [0.5, 0.5], [0.02, 0.02])

    # Build the sampling graph once. Each sess.run re-executes the random ops
    # and therefore yields a fresh trajectory, whereas calling model.sample()
    # per draw would add another while-loop subgraph every time.
    emissions_op = model.sample()[1]

    sess = ed.get_session()
    fig, ax = plt.subplots()
    for _ in range(3):
        ax.plot(sess.run(emissions_op))
    ax.set(title='Sampled HMM emissions', xlabel='time step', ylabel='emission')
    plt.show()