In [1]:
import numpy as np
import pandas as pd

In [2]:
pd.set_option('display.max_columns', None)
sba_loans = pd.read_csv('selected_sba_loans_data.csv')
sba_loans

Unnamed: 0,ChgOffPrinGr,Term,UrbanRural,RevLineCr,LowDoc,BankState,State,NAICS,MIS_Status
0,-0.207258,-0.339513,0,0.0,1.0,38,15,831,0.0
1,-0.207258,-0.643861,0,0.0,1.0,18,15,1233,0.0
2,-0.207258,0.877876,0,0.0,0.0,18,15,1157,0.0
3,-0.207258,-0.643861,0,0.0,1.0,39,36,0,0.0
4,-0.207258,1.638745,0,0.0,0.0,11,9,0,0.0
...,...,...,...,...,...,...,...,...,...
899159,-0.207258,-0.643861,0,0.0,0.0,17,35,831,0.0
899160,-0.207258,-0.643861,0,1.0,0.0,17,35,832,0.0
899161,-0.207258,-0.035166,0,0.0,0.0,5,4,436,0.0
899162,0.504659,-0.643861,0,0.0,1.0,14,11,0,1.0


In [3]:
# Drop rows with any NaN values
sba_loans = sba_loans.dropna()

In [4]:
sba_loans.info()

<class 'pandas.core.frame.DataFrame'>
Index: 872919 entries, 0 to 899163
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ChgOffPrinGr  872919 non-null  float64
 1   Term          872919 non-null  float64
 2   UrbanRural    872919 non-null  int64  
 3   RevLineCr     872919 non-null  float64
 4   LowDoc        872919 non-null  float64
 5   BankState     872919 non-null  int64  
 6   State         872919 non-null  int64  
 7   NAICS         872919 non-null  int64  
 8   MIS_Status    872919 non-null  float64
dtypes: float64(5), int64(4)
memory usage: 66.6 MB


In [5]:
# List of columns to convert to int64
columns_to_convert = ['UrbanRural', 'RevLineCr', 'LowDoc', 'BankState', 'State', 'NAICS', 'MIS_Status']

# Convert each specified column to int64
for column in columns_to_convert:
    sba_loans[column] = sba_loans[column].astype('int64', errors='ignore')  # errors='ignore' will skip incompatible columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sba_loans[column] = sba_loans[column].astype('int64', errors='ignore')  # errors='ignore' will skip incompatible columns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sba_loans[column] = sba_loans[column].astype('int64', errors='ignore')  # errors='ignore' will skip incompatible columns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

In [6]:
sba_loans['MIS_Status'].value_counts()

MIS_Status
0    723540
1    149379
Name: count, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LassoCV, ElasticNetCV
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, confusion_matrix

# Prepare the data
X = sba_loans.drop(columns=['MIS_Status'])
y = sba_loans['MIS_Status']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Dictionary to store model results
model_results = {}

In [8]:
# 1. k-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
knn_probs = knn.predict_proba(X_test)[:, 1]
model_results['kNN'] = {
    'accuracy': accuracy_score(y_test, knn_preds),
    'confusion_matrix': confusion_matrix(y_test, knn_preds),
    'probability_default': knn_probs
}

In [9]:
# 2. Decision Tree
tree = DecisionTreeClassifier(random_state=123)
tree.fit(X_train, y_train)
tree_preds = tree.predict(X_test)
tree_probs = tree.predict_proba(X_test)[:, 1]
model_results['Decision Tree'] = {
    'accuracy': accuracy_score(y_test, tree_preds),
    'confusion_matrix': confusion_matrix(y_test, tree_preds),
    'probability_default': tree_probs
}

In [10]:
# 3. Bagging (using Decision Tree)
bagging = BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=123)
bagging.fit(X_train, y_train)
bagging_preds = bagging.predict(X_test)
bagging_probs = bagging.predict_proba(X_test)[:, 1]
model_results['Bagging'] = {
    'accuracy': accuracy_score(y_test, bagging_preds),
    'confusion_matrix': confusion_matrix(y_test, bagging_preds),
    'probability_default': bagging_probs
}

In [11]:
# 4. Random Forest
random_forest = RandomForestClassifier(random_state=123)
random_forest.fit(X_train, y_train)
rf_preds = random_forest.predict(X_test)
rf_probs = random_forest.predict_proba(X_test)[:, 1]
model_results['Random Forest'] = {
    'accuracy': accuracy_score(y_test, rf_preds),
    'confusion_matrix': confusion_matrix(y_test, rf_preds),
    'probability_default': rf_probs
}

In [12]:
# 5. Boosting (Gradient Boosting)
boosting = GradientBoostingClassifier(random_state=123)
boosting.fit(X_train, y_train)
boosting_preds = boosting.predict(X_test)
boosting_probs = boosting.predict_proba(X_test)[:, 1]
model_results['Boosting'] = {
    'accuracy': accuracy_score(y_test, boosting_preds),
    'confusion_matrix': confusion_matrix(y_test, boosting_preds),
    'probability_default': boosting_probs
}

In [13]:
# 6. Logistic Regression with Lasso, Ridge, and ElasticNet
# Lasso
lasso = LassoCV(cv=5, random_state=123)
lasso.fit(X_train, y_train)
lasso_preds = np.round(lasso.predict(X_test))
model_results['Lasso Logistic Regression'] = {
    'accuracy': accuracy_score(y_test, lasso_preds),
    'confusion_matrix': confusion_matrix(y_test, lasso_preds),
    'probability_default': lasso.predict(X_test)
}

# Logistic regression with L2 penalty (Ridge)
ridge_logistic = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000, random_state=123)
ridge_logistic.fit(X_train, y_train)
ridge_preds = ridge_logistic.predict(X_test)
model_results['Ridge Logistic Regression'] = {
    'accuracy': accuracy_score(y_test, ridge_preds),
    'confusion_matrix': confusion_matrix(y_test, ridge_preds),
    'probability_default': ridge_logistic.predict_proba(X_test)[:, 1]
}

# ElasticNet
elastic_net = ElasticNetCV(cv=5, random_state=123)
elastic_net.fit(X_train, y_train)
elastic_net_preds = np.round(elastic_net.predict(X_test))
model_results['ElasticNet Logistic Regression'] = {
    'accuracy': accuracy_score(y_test, elastic_net_preds),
    'confusion_matrix': confusion_matrix(y_test, elastic_net_preds),
    'probability_default': elastic_net.predict(X_test)
}

In [14]:
# 7. Neural Network
nn = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=1000, random_state=123)
nn.fit(X_train, y_train)
nn_preds = nn.predict(X_test)
nn_probs = nn.predict_proba(X_test)[:, 1]
model_results['Neural Network'] = {
    'accuracy': accuracy_score(y_test, nn_preds),
    'confusion_matrix': confusion_matrix(y_test, nn_preds),
    'probability_default': nn_probs
}

In [15]:
# 8. Discriminant Analysis
# Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
lda_preds = lda.predict(X_test)
lda_probs = lda.predict_proba(X_test)[:, 1]
model_results['LDA'] = {
    'accuracy': accuracy_score(y_test, lda_preds),
    'confusion_matrix': confusion_matrix(y_test, lda_preds),
    'probability_default': lda_probs
}

# Quadratic Discriminant Analysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_preds = qda.predict(X_test)
qda_probs = qda.predict_proba(X_test)[:, 1]
model_results['QDA'] = {
    'accuracy': accuracy_score(y_test, qda_preds),
    'confusion_matrix': confusion_matrix(y_test, qda_preds),
    'probability_default': qda_probs
}

In [16]:
# Display results
for model_name, results in model_results.items():
    print(f"\n{model_name} Results:")
    print("Accuracy:", results['accuracy'])
    print("Confusion Matrix:\n", results['confusion_matrix'])
    print("Probability of Default (first 5 predictions):", results['probability_default'][:5])


kNN Results:
Accuracy: 0.9078036933510516
Confusion Matrix:
 [[176666   4333]
 [ 15787  21444]]
Probability of Default (first 5 predictions): [0. 0. 0. 0. 0.]

Decision Tree Results:
Accuracy: 0.9882463455986803
Confusion Matrix:
 [[179884   1115]
 [  1450  35781]]
Probability of Default (first 5 predictions): [0. 0. 0. 0. 0.]

Bagging Results:
Accuracy: 0.9931448471795812
Confusion Matrix:
 [[179861   1138]
 [   358  36873]]
Probability of Default (first 5 predictions): [0. 0. 0. 0. 0.]

Random Forest Results:
Accuracy: 0.9938596893186088
Confusion Matrix:
 [[179847   1152]
 [   188  37043]]
Probability of Default (first 5 predictions): [0. 0. 0. 0. 0.]

Boosting Results:
Accuracy: 0.9939925766393255
Confusion Matrix:
 [[179866   1133]
 [   178  37053]]
Probability of Default (first 5 predictions): [0.00049095 0.00045283 0.00093967 0.00033494 0.00032668]

Lasso Logistic Regression Results:
Accuracy: 0.8491591440223617
Confusion Matrix:
 [[180979     17      3      0      0      0]
 [