In [1]:
# ####################################
# Settings & Imports
# ####################################

# Imports from __future__ in case we're running Python 2
from __future__ import division, print_function
from __future__ import absolute_import, unicode_literals

# import my own helper functions
from read import read_sims_result
from clean import cleanup_0IR_exp
from clean import cleanup_0IR_single

# Our numerical workhorses
import numpy as np

# Import pyplot for plotting
import matplotlib.pyplot as plt

# import logistic regression from scikit learn 
from sklearn.linear_model import LogisticRegression

# import model selection stuff from scikit learn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# import feature selection stuff from scikit learn
from sklearn.feature_selection import RFE

# sklearn.metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import brier_score_loss

#
import statsmodels.api as sm

# 
# from sklearn import preprocessing

# Import pandas
import pandas as pd

# Seaborn, useful for graphics
import seaborn as sns

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables SVG graphics inline.  There is a bug, so uncomment if it works.
# %config InlineBackend.figure_formats = {'svg',}

# This enables high resolution PNGs. SVG is preferred, but has problems
# rendering vertical and horizontal lines
# %config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2, 
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
# sns.set_context('notebook', rc=rc)
# sns.set_style('darkgrid', rc=rc)

# remove some pandas warning
pd.options.mode.chained_assignment = None

Now we want to construct a model to estimate the probability of default based on some parameter when this is no borrowing or lending.

First, let's read and clean up the data.

Then, we observe the data.

In [2]:
# ####################################
# Actually run analysis, toggle env variable here
# ####################################
df = read_sims_result("/Users/xcheng/Documents/Oberlin/Summer2/DataAnalysis/data/0622/0IR", 32)
df2 = cleanup_0IR_exp(df, numNode=32, numPeriod=15, numSim=200, balanced=True)

In [4]:
# ####################################
# Make Sure There is NO DEBT in this 0IR simulation
# ####################################
# df3 = df[df["defaults due to negative wealth"] # filter default rows
#   +df["defaults due to deposit shock"]
#   +df['defaults due to interest'] == 0].copy()
# for col in df.columns.values[:31]:
#     print('{0:5s} {1:15f} {2:15f}'.format(col, df3[col].sum(), df[col].sum()))
# print("total debt to pay:", df3["debt to pay"].sum())
# print("total debt owed:", df3["debt owed"].sum())

In [5]:
# ####################################
# look at the data
# ####################################
# k=1
# sim1 = df.loc[k*15*31:(k+1)*15*31-1]
# sim1_ready = cleanup_0IR_single(sim1, 32)
# sim1_ready[sim1_ready["theta (risk aversion)"] == 0.05]
# sim2 = df.loc[49*15*31:50*15*31-1].copy().reset_index(drop=True)
# sim2_ready = cleanup_0IR_single(sim2, 32)
# sim2_ready
# sim2_ready[sim2_ready["theta (risk aversion)"] == 0.05].loc[:,"period":]
# df2[df2["theta (risk aversion)"] == 0.05].loc[:,"period":]

In [6]:
# ####################################
# look at how things differ for banks with different level of risk aversion
# ####################################
# avg_by_bank = df2.groupby('theta (risk aversion)').mean()
# avg_by_bank[["default-next-wealth", "default-next-deposit"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Frequency & Reason of Default for Each Bank"
# )
# avg_by_bank[["wealth", "deposits", "cash", "assets"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Investment Decisions for Each Bank"
# )
# avg_by_bank[["credit available", "credit issued"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Credit Condition for Each Bank"
# )

In [7]:
# ####################################
# look at the plots for correlation
# ####################################

# to see how independent variables correlate with each other 
# plt.scatter(df2["cash"], df2["credit available"])
# plt.scatter(df2["assets"], df2["credit available"])
# plt.scatter(df2["deposits"], df2["credit available"])
# plt.scatter(df2["cash"], df2["assets"])
# plt.scatter (df2["cash"], df2["deposits"])
# plt.scatter(df2["deposits"], df2["assets"])

# to see how dependent variable relates to independent variables
# plt.scatter(df2["default-next"], df2["cash"])
# plt.scatter(df2["default-next"], df2["assets"])
# plt.scatter(df2["default-next"], df2["deposits"])
# plt.scatter(df2["default-next"], df2["wealth"])

# plot settings
# plt.yscale("symlog")
# plt.xscale("symlog")
# plt.rcParams["figure.figsize"] = [25,15]

In [8]:
# ####################################
# look at how things differ during each period
# ####################################

# Plot time vs default
# df2['period'].hist(by=df2['default-next'], bins=14,  rwidth=0.7, stacked=True)

# Plot time vs independent variables
# avg_by_time = df2.groupby('period').mean()
# avg_by_time[["default-next-wealth", "default-next-deposit"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Frequency & Reason of Default During Each Period"
# )
# avg_by_time[["wealth", "deposits", "cash", "assets"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Investment Decisions During Each Period"
# )
# avg_by_time[["credit available", "credit issued"]].plot(
#     kind="line", 
#     figsize=(12,8),
#     title="Credit Condition During Each Period"
# )

In [9]:
# negative wealth not default
# df[df["wealth"]<0].loc[1:10000,"period":]
# df2[df2["wealth"]<=0]["wealth"]

In [10]:
# come back from 0 wealth
# df2[df2["theta (risk aversion)"]==0.1]

The thing about negative wealth is due to the haircut in asset when calculating wealth.

In [11]:
# check the number of default vs non-default cases
# df2.groupby('default-next').count()

In [12]:
# Try to figure out whether there is Multicollinearity
# df2["cash"]+df2["assets"]*0.8-df2["deposits"]-df2["wealth"]
# np.linalg.eig(X.corr())

We want to construst a model to predict default (in the next period) in a simulation where there is no lending or borrowing. 

Variables we can consider includes deposits, cash, assets, credit available, credit issued. 

I exclude theta (risk aversion) because I think it is private info. 

I exclude period because I don't think banks should have this information. It feels like cheating.

I exclude defaults due to interest, debt to pay, debt owed, over leverages because there is no debt. 

I exclude credit issued becuase this should not affect anything when there is no debt. 

I exclude wealth because wealth is a linear combination of deposits, cash and assets.
(this might not be true)

Let's do a logistic regression.

In [29]:
# independent variables (candidates)
independent = ["deposits", "cash", "assets", "credit available", "wealth", "leverage", 
         "dummy-0-leverage",
         "wealth-lag", "deposits-lag", "cash-lag", "assets-lag", "leverage-lag", 
         "credit-available-lag", "credit-issued-lag", "dummy-0-leverage-lag",
         "over-leverage-frequency"]
# dependent variable 0
dependent = "default-next"

In [30]:
X = df2[independent]
y = df2[dependent]

In our data, there are significantly more non-default cases than default cases after I remove cases where the banks default during the previous periods. Over 98% of the cases are non-default. So when we do cross validation, the model we train would just predict not default every time and still get a very high accuracy. 

[DONE] Solution : Throw away lots of non-default cases randomly
    -> get the same amount of default and non-default cases

In [14]:
# correlation matrix
# X.corr()

We sometimes get a warning about complete quasi-separation. What does that mean? Trying to figure out.

Once I remove cash as an independent variable, the warning about possible complete quasi-separation goes away. However, I don't think we should readlly care about complete quasi-separation.

In [15]:
# RFE feature selection
# logreg = LogisticRegression()
# logreg.fit(X,y)
# chosen_rfe_X = []
# for i in range(1,6):
#     rfe = RFE(logreg, i)
#     rfe = rfe.fit(X,y)
#     print(i, "feature(s):", rfe.support_, rfe.ranking_)
#     chosen_rfe_X.append(X.columns.values[rfe.support_])
# chosen_rfe_X.append(X.columns.values)

In [16]:
# fit the model with features selected by RFE
# for predictors in chosen_rfe_X:
#     print(sm.Logit(y, sm.add_constant(df2[predictors])).fit().summary())

In [17]:
# Check the prediction
# m = LogisticRegression()
# m.fit(df2[["cash", "assets"]], df2["default-next"])
# print(m.intercept_, m.coef_)
# print(m.predict([[0,50], [50,0],[50,50],[-100,-100],[1000,1000]]))
# print(m.predict_proba([[10,0.9]]))

In [18]:
# ##############################
# K-fold cross validation
# ##############################
# fold = 9
# kf = KFold(n_splits=fold, shuffle=True)

# for pp in chosen_rfe_X[1:]:
#     xx = df2[pp]
#     score = 0
# #     f1 = 0
#     con = np.array([[0, 0], [0, 0]])
# #     mat = 0
#     brier = 0
#     for train_index, test_index in kf.split(xx):
#         logreg = LogisticRegression()
#         logreg.fit(xx.iloc[train_index], y.iloc[train_index])
#         score += logreg.score(xx.iloc[test_index], y.iloc[test_index])
#         con += confusion_matrix(y.iloc[test_index], logreg.predict(xx.iloc[test_index]))
# #         f1 += f1_score(y.iloc[test_index], logreg.predict(xx.iloc[test_index]))
# #         mat += matthews_corrcoef(y.iloc[test_index], logreg.predict(xx.iloc[test_index]))
#         brier += brier_score_loss(y.iloc[test_index], logreg.predict(xx.iloc[test_index]))
#     print("{}\n {}\n accuracy:{:24}\n  brier:{:24}\n".format(
#         xx.columns.values, con, score/fold, brier/fold))
    

# for p in X.columns.values:
#     xx = df2[p].values.reshape(-1, 1)
#     score = 0
# #     f1 = 0
#     con = np.array([[0, 0], [0, 0]])
#     brier = 0
#     for train_index, test_index in kf.split(xx):
#         logreg = LogisticRegression()
#         logreg.fit(xx[train_index], y[train_index])
#         score += logreg.score(xx[test_index], y[test_index])
#         con += confusion_matrix(y[test_index], logreg.predict(xx[test_index]))
# #         f1 += f1_score(y[test_index], logreg.predict(xx[test_index]))
#         brier += brier_score_loss(y[test_index], logreg.predict(xx[test_index]))
#     print("{}\n {}\n accuracy:{:24}\n brier:{:24}\n".format(
#         p, con, score/fold, brier/fold))

In [19]:
# ##############################
# Regularization + cross validation 
# trial and error for the best lambda
# ##############################
# fold = 12
# kf = KFold(n_splits=fold, shuffle=True)

# accuracy = 0
# conf = np.array([[0, 0], [0, 0]])
# brier = 0
# co_effs = pd.DataFrame(columns = np.append(X.columns.values, "const"), 
#                        index=range(fold))
# row = 0

# for train_index, test_index in kf.split(X):
#     lasso = LogisticRegression(penalty="l1", C=0.007)
#     lasso.fit(X.iloc[train_index], y.iloc[train_index])
#     new_row = np.append(lasso.coef_, lasso.intercept_)
#     co_effs.loc[row] = new_row
#     accuracy += lasso.score(X.iloc[test_index], y.iloc[test_index])
#     conf += confusion_matrix(y.iloc[test_index], lasso.predict(X.iloc[test_index]))
#     brier += brier_score_loss(y.iloc[test_index], lasso.predict(X.iloc[test_index]))
#     row += 1
# print("{}\n accuracy:{:24}\n brier:{:24}\n".format(
#         conf, accuracy/fold, brier/fold))
# co_effs

In [31]:
final = LogisticRegression(penalty="l1", C=0.007)
final.fit(X,y)
print(final.coef_, final.intercept_)

LogisticRegression(C=0.007, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
# ##############################
# Read input for 1 interest rate
# ##############################
df_1 =  read_sims_result("/Users/xcheng/Documents/Oberlin/Summer2/DataAnalysis/data/0614/1IR", 32)
df_1c = cleanup_0IR_exp(df_1, numNode=32, numPeriod=15, numSim=50, balanced=False)

In [42]:
# ##############################
# Examine the prediction interest rate
# ##############################
print(final.score(df_1c[independent], df_1c[dependent]))
print(confusion_matrix(df_1c[dependent], final.predict(df_1c[independent])))
print(brier_score_loss(df_1c[dependent], final.predict(df_1c[independent])))

0.9093400687876182
[[16786  1686]
 [    1   135]]
0.09065993121238178


In [43]:
# ##############################
# Read input for 2 interest rates
# ##############################
df_2 =  read_sims_result("/Users/xcheng/Documents/Oberlin/Summer2/DataAnalysis/data/0614/2IR", 32)
df_2c = cleanup_0IR_exp(df_2, numNode=32, numPeriod=15, numSim=50, balanced=False)

In [44]:
# ##############################
# Examine the prediction for 2 interest rates
# ##############################
print(final.score(df_2c[independent], df_2c[dependent]))
print(confusion_matrix(df_2c[dependent], final.predict(df_2c[independent])))
print(brier_score_loss(df_2c[dependent], final.predict(df_2c[independent])))

0.8994513553153349
[[16407  1848]
 [    3   151]]
0.10054864468466511


As we can see, the prediction is fairly good.