In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('display.max_columns', None)
sba_loans = pd.read_csv('selected_sba_loans_data.csv')
sba_loans

Unnamed: 0,ChgOffPrinGr,Term,DisbursementGross,UrbanRural,RevLineCr,LowDoc,BankState,State,NAICS,MIS_Status
0,-0.211808,-0.277691,60000.0,0,0,1,38,15,831,0.0
1,-0.211808,-0.595858,40000.0,0,0,1,18,15,1233,0.0
2,-0.211808,0.994975,287000.0,0,0,0,18,15,1157,0.0
3,-0.211808,-0.595858,35000.0,0,0,1,39,36,0,0.0
4,-0.211808,1.790392,229000.0,0,0,0,11,9,0,0.0
...,...,...,...,...,...,...,...,...,...,...
615477,-0.211808,-0.277691,79000.0,0,0,1,8,43,0,0.0
615478,-0.211808,-0.595858,85000.0,0,1,0,17,35,832,0.0
615479,-0.211808,0.040475,300000.0,0,0,0,5,4,436,0.0
615480,0.625731,-0.595858,75000.0,0,0,1,14,11,0,1.0


In [3]:
# Drop rows with any NaN values
sba_loans = sba_loans.dropna()

In [4]:
sba_loans.info()

<class 'pandas.core.frame.DataFrame'>
Index: 613723 entries, 0 to 615481
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ChgOffPrinGr       613723 non-null  float64
 1   Term               613723 non-null  float64
 2   DisbursementGross  613723 non-null  float64
 3   UrbanRural         613723 non-null  int64  
 4   RevLineCr          613723 non-null  int64  
 5   LowDoc             613723 non-null  int64  
 6   BankState          613723 non-null  int64  
 7   State              613723 non-null  int64  
 8   NAICS              613723 non-null  int64  
 9   MIS_Status         613723 non-null  float64
dtypes: float64(4), int64(6)
memory usage: 51.5 MB


In [5]:
# List of columns to convert to int64
columns_to_convert = ['UrbanRural', 'RevLineCr', 'LowDoc', 'BankState', 'State', 'NAICS', 'MIS_Status']

# Convert each specified column to int64
for column in columns_to_convert:
    sba_loans[column] = sba_loans[column].astype('int64', errors='ignore')  # errors='ignore' will skip incompatible columns

In [6]:
sba_loans['MIS_Status'].value_counts()

MIS_Status
0    502794
1    110929
Name: count, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LassoCV, ElasticNetCV
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, confusion_matrix

# Prepare the data
X = sba_loans.drop(columns=['MIS_Status', 'DisbursementGross'])
y = sba_loans['MIS_Status']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Dictionary to store model results
model_results = {}

In [8]:
# 1. k-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
knn_probs = knn.predict_proba(X_test)[:, 1]
model_results['kNN'] = {
    'accuracy': accuracy_score(y_test, knn_preds),
    'confusion_matrix': confusion_matrix(y_test, knn_preds),
    'probability_default': knn_probs
}

In [9]:
# 2. Decision Tree
tree = DecisionTreeClassifier(random_state=123)
tree.fit(X_train, y_train)
tree_preds = tree.predict(X_test)
tree_probs = tree.predict_proba(X_test)[:, 1]
model_results['Decision Tree'] = {
    'accuracy': accuracy_score(y_test, tree_preds),
    'confusion_matrix': confusion_matrix(y_test, tree_preds),
    'probability_default': tree_probs
}

In [10]:
# 3. Bagging (using Decision Tree)
bagging = BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=123)
bagging.fit(X_train, y_train)
bagging_preds = bagging.predict(X_test)
bagging_probs = bagging.predict_proba(X_test)[:, 1]
model_results['Bagging'] = {
    'accuracy': accuracy_score(y_test, bagging_preds),
    'confusion_matrix': confusion_matrix(y_test, bagging_preds),
    'probability_default': bagging_probs
}

In [11]:
# 4. Random Forest
random_forest = RandomForestClassifier(random_state=123)
random_forest.fit(X_train, y_train)
rf_preds = random_forest.predict(X_test)
rf_probs = random_forest.predict_proba(X_test)[:, 1]
model_results['Random Forest'] = {
    'accuracy': accuracy_score(y_test, rf_preds),
    'confusion_matrix': confusion_matrix(y_test, rf_preds),
    'probability_default': rf_probs
}

In [12]:
# 5. Boosting (Gradient Boosting)
boosting = GradientBoostingClassifier(random_state=123)
boosting.fit(X_train, y_train)
boosting_preds = boosting.predict(X_test)
boosting_probs = boosting.predict_proba(X_test)[:, 1]
model_results['Boosting'] = {
    'accuracy': accuracy_score(y_test, boosting_preds),
    'confusion_matrix': confusion_matrix(y_test, boosting_preds),
    'probability_default': boosting_probs
}

In [13]:
# 6. Logistic Regression with Lasso, Ridge, and ElasticNet
# Lasso
lasso = LassoCV(cv=5, random_state=123)
lasso.fit(X_train, y_train)
lasso_preds = np.round(lasso.predict(X_test))
model_results['Lasso Logistic Regression'] = {
    'accuracy': accuracy_score(y_test, lasso_preds),
    'confusion_matrix': confusion_matrix(y_test, lasso_preds),
    'probability_default': lasso.predict(X_test)
}

# Logistic regression with L2 penalty (Ridge)
ridge_logistic = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000, random_state=123)
ridge_logistic.fit(X_train, y_train)
ridge_preds = ridge_logistic.predict(X_test)
model_results['Ridge Logistic Regression'] = {
    'accuracy': accuracy_score(y_test, ridge_preds),
    'confusion_matrix': confusion_matrix(y_test, ridge_preds),
    'probability_default': ridge_logistic.predict_proba(X_test)[:, 1]
}

# ElasticNet
elastic_net = ElasticNetCV(cv=5, random_state=123)
elastic_net.fit(X_train, y_train)
elastic_net_preds = np.round(elastic_net.predict(X_test))
model_results['ElasticNet Logistic Regression'] = {
    'accuracy': accuracy_score(y_test, elastic_net_preds),
    'confusion_matrix': confusion_matrix(y_test, elastic_net_preds),
    'probability_default': elastic_net.predict(X_test)
}

In [14]:
# 7. Neural Network
nn = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=1000, random_state=123)
nn.fit(X_train, y_train)
nn_preds = nn.predict(X_test)
nn_probs = nn.predict_proba(X_test)[:, 1]
model_results['Neural Network'] = {
    'accuracy': accuracy_score(y_test, nn_preds),
    'confusion_matrix': confusion_matrix(y_test, nn_preds),
    'probability_default': nn_probs
}

In [15]:
# 8. Discriminant Analysis
# Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
lda_preds = lda.predict(X_test)
lda_probs = lda.predict_proba(X_test)[:, 1]
model_results['LDA'] = {
    'accuracy': accuracy_score(y_test, lda_preds),
    'confusion_matrix': confusion_matrix(y_test, lda_preds),
    'probability_default': lda_probs
}

# Quadratic Discriminant Analysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_preds = qda.predict(X_test)
qda_probs = qda.predict_proba(X_test)[:, 1]
model_results['QDA'] = {
    'accuracy': accuracy_score(y_test, qda_preds),
    'confusion_matrix': confusion_matrix(y_test, qda_preds),
    'probability_default': qda_probs
}

In [16]:
# Display results
for model_name, results in model_results.items():
    print(f"\n{model_name} Results:")
    print("Accuracy:", results['accuracy'])
    print("Confusion Matrix:\n", results['confusion_matrix'])
    print("Probability of Default (first 5 predictions):", results['probability_default'][:5])


kNN Results:
Accuracy: 0.9035657722363799
Confusion Matrix:
 [[122348   3507]
 [ 11289  16287]]
Probability of Default (first 5 predictions): [0.  0.2 0.  0.4 0. ]

Decision Tree Results:
Accuracy: 0.9878381813323253
Confusion Matrix:
 [[125053    802]
 [  1064  26512]]
Probability of Default (first 5 predictions): [0. 0. 0. 0. 0.]

Bagging Results:
Accuracy: 0.9927915479922571
Confusion Matrix:
 [[125054    801]
 [   305  27271]]
Probability of Default (first 5 predictions): [0. 0. 0. 0. 0.]

Random Forest Results:
Accuracy: 0.993312955009092
Confusion Matrix:
 [[125042    813]
 [   213  27363]]
Probability of Default (first 5 predictions): [0. 0. 0. 0. 0.]

Boosting Results:
Accuracy: 0.9935475881666678
Confusion Matrix:
 [[125061    794]
 [   196  27380]]
Probability of Default (first 5 predictions): [0.00062248 0.00056137 0.00059558 0.00070109 0.00056137]

Lasso Logistic Regression Results:
Accuracy: 0.8404364176730909
Confusion Matrix:
 [[125837     17      0      1      0      0