In [7]:
# ####################################
# Settings & Imports
# ####################################

# Imports from __future__ in case we're running Python 2
from __future__ import division, print_function
from __future__ import absolute_import, unicode_literals

# import my own helper functions
from read import read_sims_result
from clean import cleanup_0IR_exp
from clean import cleanup_0IR_single

# Our numerical workhorses
import numpy as np

# Import pyplot for plotting
import matplotlib.pyplot as plt

# import logistic regression from scikit learn 
from sklearn.linear_model import LogisticRegression

# import model selection stuff from scikit learn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# import feature selection stuff from scikit learn
from sklearn.feature_selection import RFE

# sklearn.metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import brier_score_loss
from sklearn.metrics import mean_squared_error

#
import statsmodels.api as sm

#
# from sklearn import preprocessing

# Import pandas
import pandas as pd

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables SVG graphics inline.  There is a bug, so uncomment if it works.
# %config InlineBackend.figure_formats = {'svg',}

# This enables high resolution PNGs. SVG is preferred, but has problems
# rendering vertical and horizontal lines
# %config InlineBackend.figure_formats = {'png', 'retina'}

# remove some pandas warning
pd.options.mode.chained_assignment = None

Now we want to construct a model to estimate the probability of default based on some parameter when this is no borrowing or lending.

First, let's read and clean up the data.

Then, we observe the data.

In [3]:
# ####################################
# Read input, toggle env variable here
# ####################################
train_on_file = "0622/0IR300s"
df = read_sims_result("/Users/xcheng/Documents/Oberlin/honors/DataAnalysis/data/"+train_on_file, 32)
df2 = cleanup_0IR_exp(df, numSim=300, balanced=False, debts=False)

In [39]:
# ####################################
# Make Sure There is NO DEBT in this 0IR simulation
# ####################################
# df3 = df[df["defaults due to negative wealth"] # filter default rows
#   +df["defaults due to deposit shock"]
#   +df['defaults due to interest'] == 0].copy()
# for col in df.columns.values[:31]:
#     print('{0:5s} {1:15f} {2:15f}'.format(col, df3[col].sum(), df[col].sum()))
# print("total debt to pay:", df3["debt to pay"].sum())
# print("total debt owed:", df3["debt owed"].sum())

In [40]:
# ####################################
# look at how things differ for banks with different level of risk aversion
# ####################################
# avg_by_bank = df2.groupby('theta (risk aversion)').mean()
# avg_by_bank[["default-next-wealth", "default-next-deposit"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Frequency & Reason of Default for Each Bank"
# )
# avg_by_bank[["wealth", "deposits", "cash", "assets"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Investment Decisions for Each Bank"
# )
# avg_by_bank[["credit available", "credit issued"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Credit Condition for Each Bank"
# )

In [41]:
# ####################################
# look at the plots for correlation
# ####################################

# to see how independent variables correlate with each other 
# plt.scatter(df2["cash"], df2["credit available"])
# plt.scatter(df2["assets"], df2["credit available"])
# plt.scatter(df2["deposits"], df2["credit available"])
# plt.scatter(df2["cash"], df2["assets"])
# plt.scatter (df2["cash"], df2["deposits"])
# plt.scatter(df2["deposits"], df2["assets"])

# to see how dependent variable relates to independent variables
# plt.scatter(df2["default-next"], df2["cash"])
# plt.scatter(df2["default-next"], df2["assets"])
# plt.scatter(df2["default-next"], df2["deposits"])
# plt.scatter(df2["default-next"], df2["wealth"])

# plot settings
# plt.yscale("symlog")
# plt.xscale("symlog")
# plt.rcParams["figure.figsize"] = [25,15]

In [42]:
# ####################################
# look at how things differ during each period
# ####################################

# Plot time vs default
# df2['period'].hist(by=df2['default-next'], bins=14,  rwidth=0.7, stacked=True)

# Plot time vs independent variables
# avg_by_time = df2.groupby('period').mean()
# avg_by_time[["default-next-wealth", "default-next-deposit"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Frequency & Reason of Default During Each Period"
# )
# avg_by_time[["wealth", "deposits", "cash", "assets"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Investment Decisions During Each Period"
# )
# avg_by_time[["credit available", "credit issued"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Credit Condition During Each Period"
# )

In [43]:
# negative wealth not default
# df[df["wealth"]<0].loc[1:10000,"period":]
# df2[df2["wealth"]<=0]["wealth"]

In [44]:
# come back from 0 wealth
# df2[df2["theta (risk aversion)"]==0.1]

The thing about negative wealth is due to the haircut in asset when calculating wealth.

In [45]:
# check the number of default vs non-default cases
# df2.groupby('default-next').count()

In [46]:
# Try to figure out whether there is Multicollinearity
# df2["cash"]+df2["assets"]*0.8-df2["deposits"]-df2["wealth"]
# np.linalg.eig(X.corr())

In [47]:
# ####################################
# look at the data
# ####################################
# k=1
# sim1 = df.loc[k*15*31:(k+1)*15*31-1]
# sim1_ready = cleanup_0IR_single(sim1, 32)
# sim1_ready[sim1_ready["theta (risk aversion)"] == 0.05]
# sim2 = df.loc[49*15*31:50*15*31-1].copy().reset_index(drop=True)
# sim2_ready = cleanup_0IR_single(sim2, 32)
# sim2_ready
# sim2_ready[sim2_ready["theta (risk aversion)"] == 0.05].loc[:,"period":]
# df2[df2["theta (risk aversion)"] == 0.05].loc[:,"period":]

We want to construst a model to predict default (in the next period) in a simulation where there is no lending or borrowing. 

Variables we can consider includes deposits, cash, assets, credit available, credit issued. 

I exclude theta (risk aversion) because I think it is private info. 

I exclude period because I don't think banks should have this information. It feels like cheating.

I exclude defaults due to interest, debt to pay, debt owed, over leverages because there is no debt. 

I exclude credit issued becuase this should not affect anything when there is no debt. 

I exclude wealth because wealth is a linear combination of deposits, cash and assets.
(this might not be true)

Let's do a logistic regression.

In [6]:
df2.columns

Index(['period', 'theta (risk aversion)', 'defaults due to interest',
       'defaults due to negative wealth', 'defaults due to deposit shock',
       'over leverages', 'wealth', 'debt to pay', 'credit available',
       'debt owed', 'credit issued', 'deposits', 'cash', 'assets', 'sim#',
       'bankID', 'default-next-wealth', 'default-next-deposit',
       'default-next-interest', 'leverage', 'dummy-0-leverage',
       'over-leverage-frequency', 'wealth-lag', 'deposits-lag', 'cash-lag',
       'assets-lag', 'leverage-lag', 'credit-available-lag',
       'credit-issued-lag', 'dummy-0-leverage-lag', 'default-next'],
      dtype='object')

In [4]:
# independent variables (candidates)
independent = ["deposits", "cash", "assets", "credit available", "wealth", "leverage", 
         "dummy-0-leverage",
         "wealth-lag", "deposits-lag", "cash-lag", "assets-lag", "leverage-lag",
         "dummy-0-leverage-lag",
         "over-leverage-frequency"]
# dependent variable 0
dependent = "default-next"

In [5]:
X = df2[independent]
y = df2[dependent]

In [46]:
# output summary statisitics
# X.describe().T[['mean', 'min', 'max']].to_csv("what.csv", sep='&', float_format='%.3f')

In our data, there are significantly more non-default cases than default cases after I remove cases where the banks default during the previous periods. Over 98% of the cases are non-default. So when we do cross validation, the model we train would just predict not default every time and still get a very high accuracy. 

[DONE] Solution : Throw away lots of non-default cases randomly
    -> get the same amount of default and non-default cases

In [50]:
# correlation matrix
# X.corr()

We sometimes get a warning about complete quasi-separation. What does that mean? Trying to figure out.

Once I remove cash as an independent variable, the warning about possible complete quasi-separation goes away. However, I don't think we should readlly care about complete quasi-separation.

In [51]:
# RFE feature selection
# logreg = LogisticRegression()
# logreg.fit(X,y)
# chosen_rfe_X = []
# for i in range(1,6):
#     rfe = RFE(logreg, i)
#     rfe = rfe.fit(X,y)
#     print(i, "feature(s):", rfe.support_, rfe.ranking_)
#     chosen_rfe_X.append(X.columns.values[rfe.support_])
# chosen_rfe_X.append(X.columns.values)

In [52]:
# fit the model with features selected by RFE
# for predictors in chosen_rfe_X:
#     print(sm.Logit(y, sm.add_constant(df2[predictors])).fit().summary())

In [53]:
# Check the prediction
# m = LogisticRegression()
# m.fit(df2[["cash", "assets"]], df2["default-next"])
# print(m.intercept_, m.coef_)
# print(m.predict([[0,50], [50,0],[50,50],[-100,-100],[1000,1000]]))
# print(m.predict_proba([[10,0.9]]))

In [54]:
# ##############################
# K-fold cross validation
# ##############################
# fold = 9
# kf = KFold(n_splits=fold, shuffle=True)

# for pp in chosen_rfe_X[1:]:
#     xx = df2[pp]
#     score = 0
# #     f1 = 0
#     con = np.array([[0, 0], [0, 0]])
# #     mat = 0
#     brier = 0
#     for train_index, test_index in kf.split(xx):
#         logreg = LogisticRegression()
#         logreg.fit(xx.iloc[train_index], y.iloc[train_index])
#         score += logreg.score(xx.iloc[test_index], y.iloc[test_index])
#         con += confusion_matrix(y.iloc[test_index], logreg.predict(xx.iloc[test_index]))
# #         f1 += f1_score(y.iloc[test_index], logreg.predict(xx.iloc[test_index]))
# #         mat += matthews_corrcoef(y.iloc[test_index], logreg.predict(xx.iloc[test_index]))
#         brier += brier_score_loss(y.iloc[test_index], logreg.predict(xx.iloc[test_index]))
#     print("{}\n {}\n accuracy:{:24}\n  brier:{:24}\n".format(
#         xx.columns.values, con, score/fold, brier/fold))
    

# for p in X.columns.values:
#     xx = df2[p].values.reshape(-1, 1)
#     score = 0
# #     f1 = 0
#     con = np.array([[0, 0], [0, 0]])
#     brier = 0
#     for train_index, test_index in kf.split(xx):
#         logreg = LogisticRegression()
#         logreg.fit(xx[train_index], y[train_index])
#         score += logreg.score(xx[test_index], y[test_index])
#         con += confusion_matrix(y[test_index], logreg.predict(xx[test_index]))
# #         f1 += f1_score(y[test_index], logreg.predict(xx[test_index]))
#         brier += brier_score_loss(y[test_index], logreg.predict(xx[test_index]))
#     print("{}\n {}\n accuracy:{:24}\n brier:{:24}\n".format(
#         p, con, score/fold, brier/fold))

In [55]:
# ##############################
# look at the pattern in defaults across different simulation
# ##############################
# df2[df2["default-next"]==1].loc[:,"sim#"].plot(kind='hist',
#                                                title="# of defaulted banks during each simulation",
#                                                bins=300, 
#                                                figsize=(16, 6)
#                                               )
# df2[df2["default-next"]==1].plot(kind='scatter', 
#                                  x="sim#",
#                                  y="period", 
#                                  figsize=(16,6),
#                                  title="The periods that defaults happen during each simulation"
#                                 )

In [6]:
# ##############################
# Regularization + cross validation 
# trial and error for the best lambda
# ##############################
fold = 12
inv_reg_strength = 0.007
kf = KFold(n_splits=fold, shuffle=True)

accuracy = 0
conf = np.array([[0, 0], [0, 0]])
brier = 0
co_effs = pd.DataFrame(columns = np.append(X.columns.values, "const"), 
                       index=range(fold))
row = 0

for train_index, test_index in kf.split(X):
    lasso = LogisticRegression(penalty="l1", C=inv_reg_strength)
    lasso.fit(X.iloc[train_index], y.iloc[train_index])
    new_row = np.append(lasso.coef_, lasso.intercept_)
    co_effs.loc[row] = new_row
    accuracy += lasso.score(X.iloc[test_index], y.iloc[test_index])
    conf += confusion_matrix(y.iloc[test_index], lasso.predict(X.iloc[test_index]))
    brier += brier_score_loss(y.iloc[test_index], lasso.predict(X.iloc[test_index]))
    row += 1
print("{}\n accuracy:{:24}\n brier:{:24}\n".format(
        conf, accuracy/fold, brier/fold))
co_effs

[[103257     89]
 [  1060     71]]
 accuracy:      0.9890023512042108
 brier:    0.010997648795789142



Unnamed: 0,deposits,cash,assets,credit available,wealth,leverage,dummy-0-leverage,wealth-lag,deposits-lag,cash-lag,assets-lag,leverage-lag,dummy-0-leverage-lag,over-leverage-frequency,const
0,0,-0.00405699,0.0187464,-0.00295503,-0.0972679,0.0125242,-0.347231,-0.036873,0.000180019,0.0,-0.00360895,-0.0285629,0,0,-3.15878
1,0,-0.00424096,0.0199552,-0.000702847,-0.110804,0.0120581,-0.290797,-0.0363759,0.00052051,0.0,-0.00537493,-0.0280979,0,0,-3.10436
2,0,-0.00392597,0.019634,-0.00367666,-0.0989065,0.0209843,-0.314829,-0.0409095,0.000979653,0.0,-0.00582258,-0.046779,0,0,-3.10975
3,0,-0.00320477,0.020037,-0.00469506,-0.106832,0.0148513,-0.262991,-0.0372141,0.000251162,0.0,-0.00498773,-0.0332982,0,0,-3.13537
4,0,-0.00370993,0.0186168,-0.00361191,-0.109528,0.0105363,-0.300851,-0.0281913,0.000394699,0.0,-0.00429308,-0.0246577,0,0,-3.16443
5,0,-0.00415379,0.0192308,-0.00574917,-0.0958207,0.0135407,-0.34531,-0.0373825,0.000608729,0.0,-0.00472521,-0.0317393,0,0,-3.14994
6,0,-0.00432058,0.0199607,-0.00281846,-0.111233,0.0127892,-0.295612,-0.0368266,0.000453305,0.0,-0.00528737,-0.0288377,0,0,-3.07798
7,0,-0.00386932,0.0200215,0.0,-0.0993416,0.0107413,-0.310735,-0.0406549,0.0,-8.24892e-05,-0.00551553,-0.0235118,0,0,-3.16122
8,0,-0.00308033,0.0188581,-0.00319111,-0.108086,0.0107348,-0.278604,-0.03084,0.0,-0.00035885,-0.00391553,-0.0240731,0,0,-3.17471
9,0,-0.0039798,0.0196217,-0.00327363,-0.101557,0.0132482,-0.370622,-0.0385347,0.000714742,0.0,-0.00539658,-0.039921,0,0,-3.06451


In [8]:
# ################################
# Fit the model on all data
# ################################
inv_reg_strength = 0.0007
final = LogisticRegression(penalty="l1", C=inv_reg_strength)
final.fit(X,y)

LogisticRegression(C=0.0007, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [9]:
# ################################
# look at the importance of each variable
# importance = coefficient * varaible average
# ################################
var_imp = pd.DataFrame(columns = np.append(X.columns.values, "const"))
coeff = np.append(final.coef_, final.intercept_)
var_imp.loc["coefficient"] = coeff
var_imp.loc["importance"] = coeff * np.append(np.array(X.mean()), 1)
var_imp

Unnamed: 0,deposits,cash,assets,credit available,wealth,leverage,dummy-0-leverage,wealth-lag,deposits-lag,cash-lag,assets-lag,leverage-lag,dummy-0-leverage-lag,over-leverage-frequency,const
coefficient,0.0,-0.009588,0.035445,-0.018639,-0.205266,0.019904,0.0,-0.09446,0.006865,0.0,-0.019053,-0.166881,0.0,0.0,-0.785173
importance,0.0,-0.99788,0.995747,-0.189899,-2.786152,0.164262,0.0,-1.197659,0.680074,0.0,-0.463354,-1.267147,0.0,0.0,-0.785173


In [14]:
var_imp.T[var_imp.T['coefficient']!=0].to_csv("inpaper.csv", sep='&', float_format='%.6f')

In [39]:
# ##############################
# Read input for 1 interest rate
# ##############################
df_1 =  read_sims_result("/Users/xcheng/Documents/Oberlin/Summer2/DataAnalysis/data/0625/1IR", 32)
df_1c = cleanup_0IR_exp(df_1, numSim=50, balanced=False)

In [19]:
# visualize prediction results
# plt.hist(list(map(lambda x: x[1], final.predict_proba(df_1c[df_1c["default-next"]==0][independent]))))

In [58]:
# ##############################
# Examine the prediction interest rate
# ##############################
print(final.score(df_1c[independent], df_1c[dependent]))
print(confusion_matrix(df_1c[dependent], final.predict(df_1c[independent])))
print(brier_score_loss(df_1c[dependent], final.predict(df_1c[independent])))

0.9267543859649123
[[16747  1332]
 [    4   157]]
0.07324561403508772


In [40]:
# ##############################
# Read input for 2 interest rates
# ##############################
df_2 =  read_sims_result("/Users/xcheng/Documents/Oberlin/Summer2/DataAnalysis/data/0625/2IR", 32)
df_2c = cleanup_0IR_exp(df_2, numSim=50, balanced=False)

In [56]:
# ##############################
# Examine the prediction for 2 interest rates
# ##############################
print(final.score(df_2c[independent], df_2c[dependent]))
print(confusion_matrix(df_2c[dependent], final.predict(df_2c[independent])))
print(brier_score_loss(df_2c[dependent], final.predict(df_2c[independent])))

0.9244439538605883
[[16594  1362]
 [    7   156]]
0.07555604613941166


As we can see, the prediction is fairly good.

We have a lot of false negatives, but hopefully the network part would help with this.

An interesting observation: the more sample we have in 0IR, the more false negatives, the less true negatives we get (less accurate predictions).