In [148]:
# the packages
import pandas as pd
import numpy as np
import networkx as nx

# import my own helper functions
from read import read_sims_result
from clean import cleanup_0IR_exp
from clean import cleanup_network

# pearson correlation coeffcient
from scipy.stats.stats import pearsonr

# deque
from collections import deque

# deep copy
from copy import deepcopy

# random [0,1)
from random import random

# Page Rank
from networkx.algorithms.link_analysis.pagerank_alg import pagerank, pagerank_numpy, pagerank_scipy

# distance
from networkx.algorithms.shortest_paths.generic import shortest_path_length

# logistic regression
from sklearn.linear_model import LogisticRegression

# plot
import matplotlib.pyplot as plt

In [2]:
# independent variables
independent = ["deposits", "cash", "assets", "credit available", "wealth", "leverage", 
         "dummy-0-leverage",
         "wealth-lag", "deposits-lag", "cash-lag", "assets-lag", "leverage-lag", 
         "credit-available-lag", "credit-issued-lag", "dummy-0-leverage-lag",
         "over-leverage-frequency"]

In [3]:
# ###########################
# Read OIR results, and fit the model
# ###########################
df0 = read_sims_result("/Users/xcheng/Documents/Oberlin/Summer2/DataAnalysis/data/0622/0IR300s", 32)
df0c = cleanup_0IR_exp(df0, numNode=32, numPeriod=15, numSim=100, balanced=True)

X = df0c[independent]
y = df0c["default-next"]

final = LogisticRegression(penalty="l1", C=0.007)
final.fit(X,y)

LogisticRegression(C=0.007, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [4]:
# make sure no defaults in 0IR
# sum(df0[df0["defaults due to interest"]
#     +df0["defaults due to negative wealth"]
#     +df0["defaults due to deposit shock"] == 0].loc[:,"dot0":"dot30"].values)

In [5]:
# ###########################
# Read & process positive IR results
# ###########################
df_1 = read_sims_result("/Users/xcheng/Documents/Oberlin/Summer2/DataAnalysis/data/0625/1IR", 32)
mx_1n = cleanup_network(df_1, numNode=32, numPeriod=15, numSim=50)
df_1c = cleanup_0IR_exp(df_1, numNode=32, numPeriod=15, numSim=50)

In [6]:
def create_edge_weight(N, dff, mid):
    """
    Calculate weight for edges
    Each debt is divided by lenders' wealth (w/o haircut)，
    The result number r is scaled to [0, 1) using g(r)=r/(mid+r)
    
    Parameters
    ----------
    N: 4D numpy array [n_simulations, n_periods, n_borrowers, n_lenders]
        debt adjacency matrices 
    df: Pandas dataframe (no cleanup)
        where we get banks' wealth (w/o haircut)
    mid: int
        the debt-to-wealth ratio resulting in 50% probability of spreading default
    
    Returns
    ----------
    WN: 4D numpy array [n_simulations, n_periods, n_borrowers, n_lenders]
        new weighted debt adjacency matrices
    """
    
    WN = np.copy(N)
    simNum, periodNum, bankNum, _= N.shape
    dff["book_wealth"] = (dff["assets"] + dff["cash"] + dff["debt owed"] 
                         - dff["debt to pay"] - dff["deposits"])
    
    for s in range(simNum):
        for p in range(1,periodNum+1):
            for lender in range(bankNum):
                w = dff[np.array(dff["sim#"]==s) &
                        np.array(dff["period"]==p) & 
                        np.array(dff["bankID"]==lender)
                       ]["book_wealth"].values[0]
                
                # helper function
                def f(a):
                    if a > 0: # there is debt
                        if w > 0: # positive wealth
                            t = a/w
                            return t/(t+mid)
                        else: # 0 or negative wealth
                            return 100/(100+mid)
                    else: # no debt or weird data 
                        return 0
                
                WN[s, p-1, :, lender] = [f(k) for k in WN[s, p-1, :, lender]]
                    
    return WN

In [98]:
def create_node_weight(N, dff, model, variables):
    """
    Calculate weight for nodes
    Each debt is multiplied by lenders' predicted default probability
    
    Parameters
    ----------
    N: 4D numpy array [n_simulations, n_periods, n_borrowers, n_lenders]
        debt adjacency matrices 
    dff: Pandas dataframe (yes cleanup)
        where we get bank's balance sheet info
    model: model for default probability
        scikit learn LogisticRegression
    variables: a list of strings
        independent variables for the model
    
    Returns
    ----------
    WN: 3D numpy array [n_simulations, n_periods, n_banks]
        array of predicted probability of default
    """
    
    simNum, periodNum, bankNum, _= N.shape
    WN = np.empty((simNum, periodNum, bankNum))
    WN.fill(-1)
    
    for s in range(simNum):
        for p in range(2,periodNum):
            for b in range(bankNum):
                X = dff[np.array(dff["sim#"]==s) &
                        np.array(dff["period"]==p) & 
                        np.array(dff["bankID"]==b)
                       ][variables].values
                if X.any():
                    predicted_default_probability = model.predict_proba(X)[0][1]
                    WN[s, p-1, b] = predicted_default_probability
                    
    return WN

In [99]:
edge_weights_all = create_edge_weight(mx_1n, df_1, 0.6)
node_weights_all = create_node_weight(mx_1n, df_1c, final, independent)

In [100]:
# visualize the debt adjacency matrix in a 2d graph
# plt.figure(figsize=(10,10))
# plt.imshow(edge_weights_all[0,1], interpolation='nearest')
# plt.show()

In [133]:
def customized_random_walk_single(N, solvent, iterations=10):
    """
    NEW NEW NEW modified random walk algorithm
    
    Parameters
    ----------
    N: 2D numpy array [n_borrowers, n_lenders]
        debt adjacency matrices 
    solvent: list 
        solvent[solvent bank] = predicted default probability
        solvent[insolvent bank] = -1
    iterations: int
        number of iterations
    
    Returns
    ----------
    result: dict {BankID : # of default}
        index for the bank's default probability
    """
    num_bank, _ = N.shape
    G = nx.DiGraph(N)
    nextDefault = deque()
    defaulted = frozenset([i for i in range(num_bank) if solvent[i] == -1])
    tempDefault = set(defaulted)
    probDefault = {i:solvent[i] for i in range(num_bank) if solvent[i] >= 0}
    result = dict(zip(list(probDefault.keys()), [0 for _ in probDefault.keys()]))
    def coin(p=0.5):
        return random() < p
    
    for _ in range(iterations):
        for b in probDefault.keys():
#             print("initial bank:", b)
            if coin(probDefault[b]): 
                nextDefault.append(b)
            while len(nextDefault) > 0: # queue not empty
                n = nextDefault.popleft() # next on the queue
                if n not in tempDefault: # not aleady defaulted
                    tempDefault.add(n)
                    result[n] += 1
                    for s in G.successors(n): # creditors of n
                        if s not in tempDefault and coin(G.edges[(n,s)]['weight']):
                            nextDefault.append(s)
#                 print("--- currently default:", tempDefault)
#                 print("--- death roll:", nextDefault)
            
#             print("currently default:", tempDefault)
#             print("death roll:", nextDefault)
                
            tempDefault = set(defaulted)
            
#         print(result)
    return result

In [140]:
def customized_random_walk_exp(N, solvent, iterations=10):
    """
    NEW NEW NEW modified random walk algorithm
    
    Parameters
    ----------
    N: 4D numpy array [n_simulations, n_periods, n_borrowers, n_lenders]
        debt adjacency matrices 
    solvent: 3D numpy array [n_simulations, n_periods, n_borrowers, n_lenders] 
        predicted default probability OR -1
    iterations: int
        number of iterations
    
    Returns
    ----------
    result: 1D numpy array
        index for the bank's default probability   
    """
    n_s, n_p, n_b, _ = N.shape
    s_s, s_p, s_b = solvent.shape
    big_result = []
    
    if n_s != s_s or n_p != s_p or n_b != s_b:
        raise ValueError('Two arrays have incompatible sizes.')
        
    for i, j in np.ndindex((n_s,n_p)):
        big_result.extend(customized_random_walk_single(N[i,j], 
                                                        solvent[i,j], 
                                                        iterations=iterations
                                                       ).values())
        
    return np.array(big_result)

In [155]:
overall_prediction = customized_random_walk_exp(edge_weights_all, node_weights_all, 50)

In [156]:
# pearson correlation coeffcient is not very big 
# it gets slightly bigger with lots of iterations
pearsonr(overall_prediction, df_1c["default-next"])

(0.28435375895423526, 0.0)

In [161]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import brier_score_loss

fold = 12
kf = KFold(n_splits=fold, shuffle=True)

accuracy = 0
conf = np.array([[0, 0], [0, 0]])
brier = 0

for train_index, test_index in kf.split(X):
    model = LogisticRegression()
    model.fit(overall_prediction[train_index].reshape(-1, 1), 
              df_1c["default-next"].iloc[train_index])
    print(model.coef_, model.intercept_)
    accuracy += model.score(overall_prediction[train_index].reshape(-1, 1), 
                            df_1c["default-next"].iloc[train_index])
    conf += confusion_matrix(df_1c["default-next"].iloc[train_index], 
                             model.predict(overall_prediction[train_index].reshape(-1, 1)))
    brier += brier_score_loss(df_1c["default-next"].iloc[train_index], 
                              model.predict(overall_prediction[train_index].reshape(-1, 1)))
print("{}\n accuracy:{:24}\n brier:{:24}\n".format(
        conf, accuracy/fold, brier/fold))

[[0.06957806]] [-5.14169066]
[[0.07195801]] [-5.19075611]
[[0.07508638]] [-5.25283331]
[[0.06973751]] [-5.18959742]
[[0.06885175]] [-5.15957947]
[[0.07250736]] [-5.19995719]
[[0.08079058]] [-5.32340788]
[[0.07036459]] [-5.2266497]
[[0.06147661]] [-5.04879686]
[[0.07190421]] [-5.24258101]
[[0.06786615]] [-5.27154027]
[[0.068674]] [-5.14397034]
[[8193   13]
 [ 125    7]]
 accuracy:       0.983449090871395
 brier:    0.016550909128604897



----------------------------------------------------------------------
Stuff Below this are old stuff that I might or might not need.
----------------------------------------------------------------------
----------------------------------------------------------------------

In [125]:
def dist_avg_max(N):
    """
    calculate average & max distances between all pair of nodes
    
    Parameters
    ----------
    N: 4D numpy array [n_simulations, n_periods, n_borrowers, n_lenders]
        debt adjacency matrices 
    
    Returns
    ----------
    avg_d: 2D numpy array [n_simulations, n_periods]
        average distances between all pair of nodes
    max_d: 2D numpy array [n_simulations, n_periods]
        max distances between all pair of nodes
    """
    numSim, numPeriod, _, _ = N.shape
    avg_d = np.empty((numSim, numPeriod-2))
    max_d = np.empty((numSim, numPeriod-2))
    
    for s in range(numSim):
        for p in range(1,numPeriod-1):
            disG = nx.DiGraph(N[s,p])
            dists = shortest_path_length(disG, weight=None)
            curlist=[]
            for source in dists:
                curlist.extend(source[1].values())
            avg_d[s,p-1] = sum(curlist) / float(len(curlist))
            max_d[s,p-1] = max(curlist)
            
    return avg_d, max_d

In [120]:
# ###########################
# Visualize max/avg distances between banks
# ###########################
# avgg, maxx = dist_avg_max(mx_1n)
#
# pavg = pd.DataFrame(avgg)
# pmax = pd.DataFrame(maxx)
# # pavg.mean().plot()
# abc = pmax.stack().value_counts().sort_index().plot(
#     kind="bar",
#     title="max distances, 1 interest rates, 50 simulations, 15 periods",
#     figsize=(8,6),
#     fontsize=12
# )
# abc.set_xlabel("max distance between any pair of reachable nodes")
# abc.set_ylabel("frequncy")
# abc.title.set_fontsize(15)
# abc.xaxis.label.set_fontsize(15)
# abc.yaxis.label.set_fontsize(15)

In [6]:
def weigh_networks(N, model, variables):
    """
    Add weight to network
    Each debt is multiplied by lenders' predicted default probability
    
    Parameters
    ----------
    N: 4D numpy array [n_simulations, n_periods, n_borrowers, n_lenders]
        debt adjacency matrices 
    model: scikit learn LogisticRegression
        model for default probability
    variables: a list of strings
        independent variables for the model
    
    Returns
    ----------
    WN: 4D numpy array [n_simulations, n_periods, n_borrowers, n_lenders]
        new weighted debt adjacency matrices
    """
    
    WN = np.copy(N)
    simNum, periodNum, bankNum, _= N.shape
    
    for s in range(simNum):
        for p in range(1,periodNum-1):
            for b in range(bankNum):
                X = df_1c[np.array(df_1c["sim#"]==s) &
                          np.array(df_1c["period"]==p) & 
                          np.array(df_1c["bankID"]==b)
                         ][variables].values
                if X.any():
                    predicted_default_probability = model.predict_proba(X)[0][1]
                    WN[s, p-1, b] *= predicted_default_probability
                    
    return WN

In [46]:
def my_pagerank_numpy(G, alpha=0.85, personalization=None, weight='weight', dangling=None):
    """
    This is basically pagerank_numpy without normalization.
    """
    from networkx.algorithms.link_analysis.pagerank_alg import google_matrix
    
    if len(G) == 0:
        return {}
    M = google_matrix(G, alpha, personalization=personalization,
                      weight=weight, dangling=dangling)
    # use numpy LAPACK solver
    eigenvalues, eigenvectors = np.linalg.eig(M.T)
    ind = np.argmax(eigenvalues)
    # eigenvector of largest eigenvalue is at ind
    largest = np.array(eigenvectors[:, ind]).flatten().real
    return dict(zip(G, map(float, abs(largest))))

In [8]:
def apply_to_networks(f, N):
    """
    Calculate Page Rank scores for all the networks 
    
    Parameters
    ----------
    f: function (2D numpy array -> matrix)
        the function to apply to each network (e.g. Page Rank)
    N: 4D numpy array [n_simulations, n_periods, n_borrowers, n_lenders]
        debt adjacency matrices (netowrks)
    
    Returns
    ----------
    PG: 3D numpy array [n_simulations, n_periods, n_banks]
        Page Rank scores
    """
    
    simNum, periodNum, bankNum, _= N.shape
    PG = np.empty((simNum, periodNum, bankNum))
    
    for s in range(simNum):
        for p in range(1,periodNum-1):
            PG[s, p] = np.array(list(f(nx.DiGraph(N[s, p])).values()))
            
    return PG

In [9]:
# ###########################
# Let's add the weight
# ###########################
weighted = weigh_networks(mx_1n, final, independent)

In [50]:
# ###########################
# Let's calculate pagerank
# ###########################
pg_iter = apply_to_networks(pagerank, weighted)
pg_norm = apply_to_networks(pagerank_numpy, weighted)
pg_not_norm = apply_to_networks(my_pagerank_numpy, weighted)