In [78]:
#Import the required libraries
import pandas as pd
import os.path as osp

#Build the path for the Airfares data file
data_path = osp.join(osp.curdir,'banks.csv')

#Use the read_excel function to pull data from the 'Airfares'
df = pd.read_csv(data_path)
df.head()

In [79]:
df.dtypes

Financial Condition      int64
TotExp/Assets          float64
TotLns&Lses/Assets     float64
dtype: object

In [121]:
df.describe()

Unnamed: 0,Obs,Financial Condition,TotCap/Assets,TotExp/Assets,TotLns&Lses/Assets
count,20.0,20.0,20.0,20.0,20.0
mean,10.5,0.5,9.32,0.1045,0.6285
std,5.91608,0.512989,4.797214,0.026052,0.159779
min,1.0,0.0,1.0,0.07,0.3
25%,5.75,0.0,7.125,0.08,0.525
50%,10.5,0.5,9.2,0.1,0.64
75%,15.25,1.0,11.3,0.12,0.7225
max,20.0,1.0,20.5,0.16,1.02


In [122]:
#Check if there is null data in the dataset
df.isna().sum()

Obs                    0
Financial Condition    0
TotCap/Assets          0
TotExp/Assets          0
TotLns&Lses/Assets     0
dtype: int64

In [125]:
#Split data into train and test, also we need to standardize the predictor variable before fitting in the model
from sklearn.model_selection import train_test_split
y = df['Financial Condition']
X = df[['TotExp/Assets','TotLns&Lses/Assets']]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,y,train_size=0.7,random_state=0)

In [126]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
X_train_1 = sm.add_constant(X_train)

model = sm.OLS(y_train, X_train_1).fit()
print(model.summary())

                             OLS Regression Results                            
Dep. Variable:     Financial Condition   R-squared:                       0.500
Model:                             OLS   Adj. R-squared:                  0.409
Method:                  Least Squares   F-statistic:                     5.501
Date:                 Sun, 21 May 2023   Prob (F-statistic):             0.0221
Time:                         13:00:08   Log-Likelihood:                -5.1642
No. Observations:                   14   AIC:                             16.33
Df Residuals:                       11   BIC:                             18.25
Df Model:                            2                                         
Covariance Type:             nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5252      0.108      4.85



In [127]:
from sklearn.linear_model import LogisticRegression
#using the liblinear version this time since the dataset is pretty small
logit_reg = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
logit_reg.fit(X_train, y_train)

In [129]:
from dmba.metric import AIC_score
print('intercept ', logit_reg.intercept_[0])
print(pd.DataFrame({'coeff': logit_reg.coef_[0]}, index=X.columns).transpose())
print('AIC', AIC_score(y_test, logit_reg.predict(X_test), df = len(X_train)
+ 1))

intercept  0.1635275101912511
       TotExp/Assets  TotLns&Lses/Assets
coeff       2.133089            0.967546
AIC 38.27670558308775


In [136]:
import numpy as np
from dmba import classificationSummary

y_pred = model.predict(X_train_1)
cutoff = 0.5
y_pred_classes = np.zeros_like(y_pred)
y_pred_classes[y_pred > cutoff] = 1

classificationSummary(y_train, y_pred_classes)

Confusion Matrix (Accuracy 0.9286)

       Prediction
Actual 0 1
     0 5 1
     1 0 8


In [137]:
from dmba import classificationSummary
classificationSummary(y_train, logit_reg.predict(X_train))

Confusion Matrix (Accuracy 0.9286)

       Prediction
Actual 0 1
     0 5 1
     1 0 8


In [144]:
B0 = logit_reg.intercept_[0]
B1 = logit_reg.coef_[0][0]
B2 = logit_reg.coef_[0][1]

In [149]:
print('Logit = ',B0,'+',B1,' * TotExp/Assets +', B2,'* TotLns&Lses/Assets')
print('Odds = exp(',B0,'+',B1,' * TotExp/Assets +', B2,'* TotLns&Lses/Assets')
print('Probability = 1/1 +exp(-(',B0,'+',B1,' * TotExp/Assets +', B2,'* TotLns&Lses/Assets ))')

Logit =  0.1635275101912511 + 2.133088779833814  * TotExp/Assets + 0.9675463795900938 * TotLns&Lses/Assets
Odds = exp( 0.1635275101912511 + 2.133088779833814  * TotExp/Assets + 0.9675463795900938 * TotLns&Lses/Assets
Probability = 1/1 +exp(-( 0.1635275101912511 + 2.133088779833814  * TotExp/Assets + 0.9675463795900938 * TotLns&Lses/Assets ))


#1:Write the estimated equation that associates the financial condition of a bank with its two predictors in three formats:
A.	The logit as a function of the predictors
B.	The odds as a function of the predictors
C.	The probability as a function of the predictors


1.By run the above logistic regression, we have the logit as a function of the predictors: Financial Conditions = 0.163528 + 2.133089 * TotExp/Assets + 0.967546 *TotLns&Lses/Assets
the odds function would be:Financial Conditions = exp(0.163528 + 2.133089 * TotExp/Assets + 0.967546 *TotLns&Lses/Assets)
the probability function would be:p(Financial Conditions =1) =1/(1+ exp(-(0.163528 + 2.133089 * TotExp/Assets + 0.967546 *TotLns&Lses/Assets)))

In [138]:
#2 Consider a new bank whose total loans and leases/assets ratio = 0.6 and total expenses/assets ratio = 0.11. From your logistic regression model, estimate the following four quantities for this bank (use R to do all the intermediate calculations; show your final answers to four decimal places): the logit, the odds, the probability of being financially weak, and the classification of the bank (use cutoff = 0.5).

In [150]:
Total_loans_and_Leases_Asstes = 0.6
Total_expense_Assets = 0.11

In [151]:
Logit = B0 + B1 * Total_expense_Assets +B2 * Total_loans_and_Leases_Asstes
print(f'Logit: {Logit:.4f}')

Logit: 0.9787


In [152]:
odds = np.exp(Logit)
print(f'Odds: {odds:.4f}')

Odds: 2.6610


In [153]:
probability = 1 / (1 + np.exp(-Logit))
print(f'Probability: {probability:.4f}')

Probability: 0.7268


In [154]:
if(probability > 0.5):
    print('The bank is financially weak')
else:
    print('The bank is financially strong')

The bank is financially weak


In [159]:
#3.	The cutoff value of 0.5 is used in conjunction with the probability of being financially weak. Compute the threshold that should be used if we want to make a classification based on the odds of being financially weak, and the threshold for the corresponding logit.
odds_cutoff = cutoff/(1-cutoff)
print('The corresponding odd cutoff is', odds_cutoff)
print('The corresponding logit cutoff is', np.log(odds_cutoff))

The corresponding odd cutoff is 1.0
The corresponding logit cutoff is 0.0


We know that the logit function can also be written as : 1-p/p  = 1/ odds function, thus the odds function can be revised as: odds function = p/1-p which is the cutoff we need to calculate. based on the cutoff probability is 0.5, we calculated the corresponding odds cut off is 1. the corresponding logit is the ln(odds_cutoff), which is 0.

In [164]:
#4.	Interpret the estimated coefficient for the total loans & leases to total assets ratio (TotLns&Lses/Assets) in terms of the odds of being financially weak.
print('the coefficient for total loans & leases to total assets is',B2)
print('the coefficient for total loans & leases to total assetes in terms of odds is', np.log(B2))

the coefficient for total loans & leases to total assets is 0.9675463795900938
the coefficient for total loans & leases to total assetes in terms of odds is -0.03299191766634785


The coefficient of total loans & leases to total assets is 0.9675 which represents, that total loans & leases to total assets ratio increased by 1, the log of the bank being financially weak increases. in terms of odds,the coefficient is -0.033, which means the decreasing on the coefficient that the total loans & leases to total assets ratio decreased by 1, the odds of the bank being financially strong is increasing.

In [None]:
#5.	When a bank that is in poor financial condition is misclassified as financially strong, the misclassification cost is much higher than when a financially strong bank is misclassified as weak. To minimize the expected cost of misclassification, should the cutoff value for classification (which is currently at 0.5) be increased or decreased?

When the cost of misclassfication is high, we should reduce the false negative, since the strong bank is misclassified as weak. To reduce the false negative, we should lower the cutoff value. In this way, it increased the probability of predicting a bank as weak which will have higher chance to have more actual weak bank and reduce the chance of misclassifying the strong banks to the weak.