In [10]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

path = 'C:\\CRA-Assignment\\'
filename0 = 'bank financials.pickle'
filename = path + filename0    
fininfo = pd.read_pickle(filename)
filename1 = 'default_data.pickle'
filename = path + filename1
defaults = pd.read_pickle(filename)

In [11]:
data = pd.merge(fininfo, defaults[['IDRSSD', 'dflt_date']], on = 'IDRSSD', how = 'left')
data.loc[pd.isnull(data.date), 'date'] = dt.date(2999,12,31)
data['default'] = 0
data.loc[(data.dflt_date >= (data.date + dt.timedelta(days=183))) & 
         (data.dflt_date < (data.date + dt.timedelta(days=549))), 'default'] = 1
data = data[data.deposits > 0]
data['expense']=data.nonIE/(data.nonII+data.TII)
data['loan_to_deposit']=data.loans/data.deposits
factors = ['assets','expense','loan_to_deposit']
for i in ['TCOs', 'tot_recoveries', 'allowance']:
    data[i + '_loans'] = data[i] / data.loans
    factors = factors + [i + '_loans'] 
for i in ['TII', 'nonII', 'nonIE', 'staff', 'cash', 'securities', 'loans', 'equity']:
    data[i + '_assets'] = data[i] / data.assets
    factors = factors + [i + '_assets']
print(factors)
print(data.describe)

['assets', 'expense', 'loan_to_deposit', 'TCOs_loans', 'tot_recoveries_loans', 'allowance_loans', 'TII_assets', 'nonII_assets', 'nonIE_assets', 'staff_assets', 'cash_assets', 'securities_assets', 'loans_assets', 'equity_assets']
<bound method NDFrame.describe of         IDRSSD                                         name     mutual  trust  \
0       991340                              1st Source Bank      Stock      1   
1       282208                        Abington Savings Bank      Stock      0   
2         5210                   Adams County National Bank      Stock      1   
3       774730                               Admiralty Bank      Stock      0   
4      2079493                                 Advance Bank      Stock      1   
...        ...                                          ...        ...    ...   
23941   833749                               WOODTRUST BANK      Stock      1   
23942   384018                           WOORI AMERICA BANK      Stock      0   
23943   

In [12]:
numeric_data=data.select_dtypes(include=[float, int])
for i in factors:
    correlations=numeric_data['default'].corr(numeric_data[i])
    print(f'{correlations:2.4f}\t{i}')
    print( f'{100*roc_auc_score(data.default, data[ i ]):2.1f} \t {i}')

0.0001	assets
49.5 	 assets
0.0125	expense
47.4 	 expense
-0.0005	loan_to_deposit
55.6 	 loan_to_deposit
0.2273	TCOs_loans
88.6 	 TCOs_loans
0.0029	tot_recoveries_loans
49.6 	 tot_recoveries_loans
0.1402	allowance_loans
86.2 	 allowance_loans
0.0560	TII_assets
75.6 	 TII_assets
-0.0205	nonII_assets
28.2 	 nonII_assets
0.0145	nonIE_assets
58.5 	 nonIE_assets
-0.0218	staff_assets
40.8 	 staff_assets
-0.0042	cash_assets
44.9 	 cash_assets
-0.0325	securities_assets
36.9 	 securities_assets
0.0179	loans_assets
57.7 	 loans_assets
-0.1017	equity_assets
12.0 	 equity_assets


In [13]:
print(factors)

['assets', 'expense', 'loan_to_deposit', 'TCOs_loans', 'tot_recoveries_loans', 'allowance_loans', 'TII_assets', 'nonII_assets', 'nonIE_assets', 'staff_assets', 'cash_assets', 'securities_assets', 'loans_assets', 'equity_assets']


In [14]:
from itertools import combinations
factor_combs = list(combinations(factors, 4))
print(len(factor_combs))


1001


In [15]:
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [16]:
results = []
failed_combinations = []  

for comb in factor_combs:
    comb = list(comb)  
    y = data['default']
    x = data[comb]  
    x = sm.add_constant(x)  

    if x.iloc[:, 1:].corr().abs().unstack().loc[lambda x: x != 1].max() < 0.5:
        try:
            model = sm.Logit(y, x).fit(disp=0)
            pseudo_r2 = 1 - (model.llf / model.llnull)
            pred_probs = model.predict(x)
            auc = roc_auc_score(y, pred_probs)
            results.append((comb, model, pseudo_r2, auc))
        except:
            continue
            
print(f"Total number of valid models: {len(results)}")

Total number of valid models: 718


In [17]:
print(x.corr())

                   const  cash_assets  securities_assets  loans_assets  \
const                NaN          NaN                NaN           NaN   
cash_assets          NaN     1.000000          -0.156368     -0.165350   
securities_assets    NaN    -0.156368           1.000000     -0.575542   
loans_assets         NaN    -0.165350          -0.575542      1.000000   
equity_assets        NaN     0.042064          -0.048442      0.219928   

                   equity_assets  
const                        NaN  
cash_assets             0.042064  
securities_assets      -0.048442  
loans_assets            0.219928  
equity_assets           1.000000  


In [18]:
#Sort and select the top three models based on the pseudo R-square
top_models = sorted(results, key=lambda x: x[2], reverse=True)[:3]
#print(data[top_models[0][0]].corr())
#print(data[top_models[1][0]].corr())
for i, (comb, model, pseudo_r2, auc) in enumerate(top_models, 1):
    print(f"\nModel {i}: Factors: {comb}, Pseudo R-squared: {pseudo_r2:.4f}, AUC: {auc:.4f}")
    print(model.summary())

    #Calculate the predicted PDs for the top 10 samples
    predictions = model.predict(sm.add_constant(data[comb]))
    print(f"Predicted default values for Model {i}:")
    print(predictions.head(10))

    #Output Pseudo R-squared
    print(f"Pseudo R-squared for Model {i}: {pseudo_r2:.4f}\n")
    print(data[comb].corr())
    



Model 1: Factors: ['TCOs_loans', 'TII_assets', 'loans_assets', 'equity_assets'], Pseudo R-squared: 0.3287, AUC: 0.9415
                           Logit Regression Results                           
Dep. Variable:                default   No. Observations:                23945
Model:                          Logit   Df Residuals:                    23940
Method:                           MLE   Df Model:                            4
Date:                Thu, 05 Sep 2024   Pseudo R-squ.:                  0.3287
Time:                        17:39:34   Log-Likelihood:                -517.59
converged:                       True   LL-Null:                       -771.06
Covariance Type:            nonrobust   LLR p-value:                2.115e-108
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -4.6598      0.581     -8.022      0.000      -5.798      -3.521
TC

Comment: We compare the AUC values and Pseudo R-squared of three regressions to determine whether the models are significantly different.
According to the output, while Model 2 has a slightly higher AUC compared to Model 1 and Model 3, the minor difference can't indicate that three models are significantly different. The models have similar performance in predicting PDs.

Pseudo R-squared of three models are also very close, indicating they have similar fit quality.

While the models are not significantly different in terms of quantitative metrics (AUC and pseudo R-squared), they are qualitatively different due to the different factor combinations used.

All top three combinations include TCOs_loans and equity_assets, indicating their importance in predicting default risk. High charge-offs relative to loans and low equity relative to assets are warning signs for a bank's financial health. TII_assets, nonII_assets, and loans_assets appear in different models, reflecting a bank's revenue structure from different aspects. These three factors also play important roles in predicting default risks and evaluating banks' capabilities in withstanding risks.

In addition to TCOs_loans and equity_assets, Model 1 includes TII_assets and loans_assets, suitable for scenarios emphasizing traditional banking revenue. Model 2 includes both total interest income and non-interest income, suitable for evaluating banks with diversified income sources. Model 3 has nonII_assets and loans_assets, suitable for assessing banks with a mixed income structure.

In [19]:
chosen_model_comb, chosen_model, chosen_model_pseudo_r2, chosen_model_auc = top_models[0]

silicon_valley_data = data[data['name'] == "Silicon Valley Bank"]
signature_bank_data = data[data['name'] == "Signature Bank"]
print(silicon_valley_data['default'])
print(signature_bank_data['default']) 

required_features = list(chosen_model.params.index)
required_features.remove('const')  

silicon_valley_data = silicon_valley_data[required_features].dropna()
signature_bank_data = signature_bank_data[required_features].dropna()

silicon_valley_data_with_const = sm.add_constant(silicon_valley_data)  
predicted_silicon_pds = chosen_model.predict(silicon_valley_data_with_const)
print(f"Predicted default probabilities for Silicon Valley Bank:")
print(predicted_silicon_pds)

signature_bank_data_with_const = sm.add_constant(signature_bank_data)  
predicted_signature_pds = chosen_model.predict(signature_bank_data_with_const)
print(f"Predicted default probabilities for Signature Bank:")
print(predicted_signature_pds)

463      0
1133     0
1843     0
2645     0
3557     0
4568     0
5662     0
6797     0
7927     0
8986     0
10010    0
11022    0
12068    0
13157    0
14302    0
Name: default, dtype: int64
1842     0
2644     0
3556     0
12065    0
13154    0
14299    0
Name: default, dtype: int64
Predicted default probabilities for Silicon Valley Bank:
463      0.000434
1133     0.000373
1843     0.000352
2645     0.002342
3557     0.002567
4568     0.003338
5662     0.005487
6797     0.007439
7927     0.004605
8986     0.003940
10010    0.003425
11022    0.003511
12068    0.004813
13157    0.004046
14302    0.003412
dtype: float64
Predicted default probabilities for Signature Bank:
1842     0.000288
2644     0.000061
3556     0.000398
12065    0.000640
13154    0.002795
14299    0.005718
dtype: float64


comment: the model does not caputure the default of these banks well because the predicted default probabilities are at very low levels.
For Silicon Valley Bank, their default was largely influenced by macro interest rate changes, and they had invested heavily in treasuries. Additionally, panic-induced withdrawals significantly contributed to their default. Therefore, incorporating these factors into the model could improve the prediction of their default probability.

And for Signature bank, maybe we should also consider the factors that we mentioned above.

In other common situations, for the PD model of the bank, We could potentially improve the prediction of defaults by incorporating the following factors:
Macroeconomic Indicators:
GDP Growth Rate: Economic downturns often lead to higher default rates.
Unemployment Rate: Higher unemployment can increase the risk of loan defaults.
Interest Rates: Rising interest rates can increase the cost of borrowing, leading to higher defaults.
Bank-Specific Financial Ratios:
Non-Performing Loans (NPL) Ratio: A higher NPL ratio indicates a higher risk of default.
Capital Adequacy Ratio (CAR): This ratio indicates the bank's capital buffer to absorb potential losses.
Liquidity Coverage Ratio (LCR): This ratio measures the bank's ability to meet short-term obligations.
By integrating these factors into our model, we can achieve a more comprehensive and accurate assessment of the default risk for banks.