In [8]:
# Using Logistic Regression

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm

# Replace this with the path to your CSV file
file_path = '/Users/christopherfrye/Library/Mobile Documents/com~apple~CloudDocs/NYU Stern/2025_Summer Term/AI in Finance/home_credit_cleaned.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Ensure all categorical variables are encoded
categorical_columns = ['code_gender', 'flag_own_car', 'flag_own_realty', 'age_range', 'educated', 'children', 'married']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Define your features and target
X = df[['amt_income_total', 'credit_score_mean', 'amt_credit', 'days_employed', 
        'document_count', 'credit_score_stdev', 'age_range', 'educated', 
        'children', 'married', 'flag_own_car', 'flag_own_realty']]
y = df['default']

# Add a constant to the model (for the intercept term in OLS)
X = sm.add_constant(X)

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Model (using statsmodels for detailed statistics)
logit_model = sm.Logit(y_train, X_train).fit()

# Output the logistic regression summary statistics (p-values, odds ratios, etc.)
print("Logistic Regression Summary:")
print(logit_model.summary())

# Get predicted probabilities from logistic regression model
y_pred_prob = logit_model.predict(X_test)

# Convert probabilities to binary outcomes (1 for default, 0 for repaid) using a 0.5 threshold
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Evaluate the performance using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.4f}')

# Additional evaluation: confusion matrix and classification report
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Optimization terminated successfully.
         Current function value: 0.255309
         Iterations 7
Logistic Regression Summary:
                           Logit Regression Results                           
Dep. Variable:                default   No. Observations:               137968
Model:                          Logit   Df Residuals:                   137955
Method:                           MLE   Df Model:                           12
Date:                Mon, 26 May 2025   Pseudo R-squ.:                  0.1021
Time:                        15:41:00   Log-Likelihood:                -35224.
converged:                       True   LL-Null:                       -39232.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  0.0470      0.052      0.909      0.363 

In [12]:
# Compare all ML models

# Count the number of 0s (non-defaults) and 1s (defaults) in the 'default' column (for reference)
default_counts = df['default'].value_counts()

# Print the counts
print(default_counts)

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Ensure all categorical variables are encoded
categorical_columns = ['code_gender', 'flag_own_car', 'flag_own_realty', 'age_range', 'educated', 'children', 'married']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Define your features and target
X = df[['amt_income_total', 'credit_score_mean', 'amt_credit', 'days_employed', 
        'document_count', 'credit_score_stdev', 'age_range', 'educated', 
        'children', 'married', 'flag_own_car', 'flag_own_realty']]
y = df['default']

# Add a constant to the model (for the intercept term in statsmodels)
X = sm.add_constant(X)

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------
# 1. Linear Regression (Logistic)
# ---------------------------
# Initialize the logistic regression model
linear_model = LogisticRegression(solver='liblinear')
linear_model.fit(X_train, y_train)

# Predict and evaluate linear regression model
y_pred_linear = linear_model.predict(X_test)
accuracy_linear = accuracy_score(y_test, y_pred_linear)

# -------------------------
# 2. Decision Tree Classifier
# -------------------------
# Initialize the decision tree classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict and evaluate decision tree model
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# ------------------------
# 3. Neural Network (MLP)
# ------------------------
# Initialize the MLP classifier (simple neural network)
mlp_model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)

# Predict and evaluate neural network model
y_pred_mlp = mlp_model.predict(X_test)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)

# ------------------------
# Output and Evaluation
# ------------------------

# Accuracy Comparison
print("Model Comparison (Accuracy):")
print(f"Linear Regression (Logistic): {accuracy_linear:.4f}")
print(f"Decision Tree: {accuracy_dt:.4f}")
print(f"Neural Network: {accuracy_mlp:.4f}")

# Confusion Matrices
print("\nConfusion Matrix (Linear Regression):")
print(confusion_matrix(y_test, y_pred_linear))

print("\nConfusion Matrix (Decision Tree):")
print(confusion_matrix(y_test, y_pred_dt))

print("\nConfusion Matrix (Neural Network):")
print(confusion_matrix(y_test, y_pred_mlp))

# Classification Report Comparison
print("\nClassification Report (Linear Regression):")
print(classification_report(y_test, y_pred_linear))

print("\nClassification Report (Decision Tree):")
print(classification_report(y_test, y_pred_dt))

print("\nClassification Report (Neural Network):")
print(classification_report(y_test, y_pred_mlp))

default
0    158311
1     14149
Name: count, dtype: int64
Model Comparison (Accuracy):
Linear Regression (Logistic): 0.9190
Decision Tree: 0.8522
Neural Network: 0.9184

Confusion Matrix (Linear Regression):
[[31698     0]
 [ 2794     0]]

Confusion Matrix (Decision Tree):
[[28918  2780]
 [ 2318   476]]

Confusion Matrix (Neural Network):
[[31677    21]
 [ 2792     2]]

Classification Report (Linear Regression):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     31698
           1       0.00      0.00      0.00      2794

    accuracy                           0.92     34492
   macro avg       0.46      0.50      0.48     34492
weighted avg       0.84      0.92      0.88     34492


Classification Report (Decision Tree):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.93      0.91      0.92     31698
           1       0.15      0.17      0.16      2794

    accuracy                           0.85     34492
   macro avg       0.54      0.54      0.54     34492
weighted avg       0.86      0.85      0.86     34492


Classification Report (Neural Network):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     31698
           1       0.09      0.00      0.00      2794

    accuracy                           0.92     34492
   macro avg       0.50      0.50      0.48     34492
weighted avg       0.85      0.92      0.88     34492

