In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm

# Replace this with the path to your CSV file
file_path = '/Users/christopherfrye/Library/Mobile Documents/com~apple~CloudDocs/NYU Stern/2025_Summer Term/AI in Finance/home_credit_cleaned.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Ensure all categorical variables are encoded
categorical_columns = ['code_gender', 'flag_own_car', 'flag_own_realty', 'age_range', 'educated', 'children', 'married']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Define your features and target
X = df[['amt_income_total', 'credit_score_mean', 'amt_credit', 'days_employed', 
        'document_count', 'credit_score_stdev', 'age_range', 'educated', 
        'children', 'married', 'flag_own_car', 'flag_own_realty']]
y = df['default']

# Add a constant to the model (for the intercept term in statsmodels)
X = sm.add_constant(X)

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------------
# 1. Predict all good (baseline model)
# -------------------------
# The "predict all good" model predicts "0" (non-default) for all instances
y_pred_all_good_train = np.zeros_like(y_train)
y_pred_all_good_test = np.zeros_like(y_test)

accuracy_train_all_good = accuracy_score(y_train, y_pred_all_good_train)
accuracy_test_all_good = accuracy_score(y_test, y_pred_all_good_test)

# -------------------------
# 2. Logistic Regression (Logit)
# -------------------------
# Initialize and train the logistic regression (Logit) model using statsmodels
logit_model = sm.Logit(y_train, X_train)
logit_model_fitted = logit_model.fit()

# Predict on train and test data
y_pred_logit_train = logit_model_fitted.predict(X_train)
y_pred_logit_test = logit_model_fitted.predict(X_test)

# Convert probabilities to binary outcomes (1 for default, 0 for repaid) using a 0.5 threshold
y_pred_logit_train_bin = np.where(y_pred_logit_train > 0.5, 1, 0)
y_pred_logit_test_bin = np.where(y_pred_logit_test > 0.5, 1, 0)

accuracy_train_logit = accuracy_score(y_train, y_pred_logit_train_bin)
accuracy_test_logit = accuracy_score(y_test, y_pred_logit_test_bin)

# -------------------------
# 3. Decision Tree
# -------------------------
# Initialize and train the decision tree classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on train and test data
y_pred_dt_train = dt_model.predict(X_train)
y_pred_dt_test = dt_model.predict(X_test)

accuracy_train_dt = accuracy_score(y_train, y_pred_dt_train)
accuracy_test_dt = accuracy_score(y_test, y_pred_dt_test)

# ------------------------
# 4. Neural Network (MLP)
# ------------------------
# Initialize and train the MLP classifier (neural network)
mlp_model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)

# Predict on train and test data
y_pred_mlp_train = mlp_model.predict(X_train)
y_pred_mlp_test = mlp_model.predict(X_test)

accuracy_train_mlp = accuracy_score(y_train, y_pred_mlp_train)
accuracy_test_mlp = accuracy_score(y_test, y_pred_mlp_test)

# ------------------------
# Output and Evaluation
# ------------------------

# Print the results in the requested table format
print("\nIn-Sample vs Out-of-Sample Performance:")
print(f"Model\t\tTrain Accuracy\tTest Accuracy")
print(f"Predict all good\t{accuracy_train_all_good:.4f}\t{accuracy_test_all_good:.4f}")
print(f"Logistic Regression (Logit)\t{accuracy_train_logit:.4f}\t{accuracy_test_logit:.4f}")
print(f"Decision Tree\t\t{accuracy_train_dt:.4f}\t{accuracy_test_dt:.4f}")
print(f"Neural Network\t\t{accuracy_train_mlp:.4f}\t{accuracy_test_mlp:.4f}")

# -------------------------
# Optional: Display the Logit summary for the Logistic Regression model
print("\nLogistic Regression (Logit) Model Summary:")
print(logit_model_fitted.summary())

# Get the depth of the tree (number of levels)
tree_depth = dt_model.tree_.max_depth

print(f"Depth (number of levels) of the Decision Tree: {tree_depth}")

Optimization terminated successfully.
         Current function value: 0.255309
         Iterations 7

In-Sample vs Out-of-Sample Performance:
Model		Train Accuracy	Test Accuracy
Predict all good	0.9177	0.9190
Logistic Regression (Logit)	0.9178	0.9192
Decision Tree		1.0000	0.8522
Neural Network		0.9174	0.9184

Logistic Regression (Logit) Model Summary:
                           Logit Regression Results                           
Dep. Variable:                default   No. Observations:               137968
Model:                          Logit   Df Residuals:                   137955
Method:                           MLE   Df Model:                           12
Date:                Mon, 26 May 2025   Pseudo R-squ.:                  0.1021
Time:                        16:14:29   Log-Likelihood:                -35224.
converged:                       True   LL-Null:                       -39232.
Covariance Type:            nonrobust   LLR p-value:                     0.000
             