In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm

# Replace this with the path to your CSV file
file_path = '/Users/christopherfrye/Library/Mobile Documents/com~apple~CloudDocs/NYU Stern/2025_Summer Term/AI in Finance/home_credit_cleaned.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Ensure all categorical variables are encoded
categorical_columns = ['code_gender', 'flag_own_car', 'flag_own_realty', 'age_range', 'educated', 'children', 'married']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Define your features and target
X = df[['amt_income_total', 'credit_score_mean', 'amt_credit', 'days_employed', 
        'document_count', 'credit_score_stdev', 'age_range', 'educated', 
        'children', 'married', 'flag_own_car', 'flag_own_realty','code_gender']]
y = df['default']

# Add a constant to the model (for the intercept term in statsmodels)
X = sm.add_constant(X)

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression Model with gender
logit_model_with_gender = sm.Logit(y_train, X_train)
logit_model_fitted_with_gender = logit_model_with_gender.fit()

# Check the parameter names to ensure 'code_gender' is in the model
print("\nLogistic Regression Coefficients:")
print(logit_model_fitted_with_gender.params)

# Extract and print the coefficient for 'code_gender'
# Check if 'code_gender' is included in the params, which could have a different name (e.g., 'const' for the intercept)
if 'code_gender' in logit_model_fitted_with_gender.params.index:
    gender_coef = logit_model_fitted_with_gender.params['code_gender']
    print(f"Coefficient for code_gender: {gender_coef}")
else:
    print("'code_gender' not found in the model parameters. The column might be named differently.")

# Initialize and train the decision tree classifier with gender
dt_model_with_gender = DecisionTreeClassifier(random_state=42)
dt_model_with_gender.fit(X_train, y_train)

# Check the first few splits (feature used and thresholds)
print("First few splits in the Decision Tree:")
# Retrieve the feature names and thresholds used in the first few splits
tree_features = dt_model_with_gender.feature_importances_
split_features = dt_model_with_gender.tree_.feature

# Print feature names that are used in the first few splits
print(f"Features used in the first few splits: {X_train.columns[split_features[:5]]}")

Optimization terminated successfully.
         Current function value: 0.255010
         Iterations 7

Logistic Regression Coefficients:
const                -4.744049e-02
amt_income_total      2.534591e-08
credit_score_mean    -5.533946e+00
amt_credit            2.241374e-08
days_employed        -8.343578e-05
document_count        2.563134e-01
credit_score_stdev    5.461444e-01
age_range             5.505579e-02
educated             -4.283181e-01
children              4.055944e-02
married              -3.620657e-02
flag_own_car         -2.550386e-01
flag_own_realty       7.884399e-02
code_gender           1.990322e-01
dtype: float64
Coefficient for code_gender: 0.19903218815061957
First few splits in the Decision Tree:
Features used in the first few splits: Index(['credit_score_mean', 'credit_score_mean', 'credit_score_mean',
       'credit_score_mean', 'credit_score_mean'],
      dtype='object')


In [11]:
# Re-train models without gender

# Remove 'code_gender' column from the features for training
X_train_no_gender = X_train.drop(columns=['code_gender'])
X_test_no_gender = X_test.drop(columns=['code_gender'])

# Logistic Regression (Logit) without gender
logit_model_no_gender = sm.Logit(y_train, X_train_no_gender)
logit_model_fitted_no_gender = logit_model_no_gender.fit()

# Decision Tree without gender
dt_model_no_gender = DecisionTreeClassifier(random_state=42)
dt_model_no_gender.fit(X_train_no_gender, y_train)

# Neural Network (MLP) without gender
mlp_model_no_gender = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42)
mlp_model_no_gender.fit(X_train_no_gender, y_train)

# -------------------------
# Evaluate the models again on training and test data

# Logistic Regression (Logit) - Convert predicted probabilities to binary outcomes using a threshold of 0.5
y_pred_logit_train_bin = np.where(logit_model_fitted_no_gender.predict(X_train_no_gender) > 0.5, 1, 0)
y_pred_logit_test_bin = np.where(logit_model_fitted_no_gender.predict(X_test_no_gender) > 0.5, 1, 0)

accuracy_train_logit_no_gender = accuracy_score(y_train, y_pred_logit_train_bin)
accuracy_test_logit_no_gender = accuracy_score(y_test, y_pred_logit_test_bin)

# Decision Tree without gender
accuracy_train_dt_no_gender = accuracy_score(y_train, dt_model_no_gender.predict(X_train_no_gender))
accuracy_test_dt_no_gender = accuracy_score(y_test, dt_model_no_gender.predict(X_test_no_gender))

# Neural Network (MLP) without gender
accuracy_train_mlp_no_gender = accuracy_score(y_train, mlp_model_no_gender.predict(X_train_no_gender))
accuracy_test_mlp_no_gender = accuracy_score(y_test, mlp_model_no_gender.predict(X_test_no_gender))

# Display the performance drop
print("\nPerformance Comparison (With and Without Gender):")
print(f"Logistic Regression Train Accuracy: {accuracy_train_logit_no_gender:.4f}, Test Accuracy: {accuracy_test_logit_no_gender:.4f}")
print(f"Decision Tree Train Accuracy: {accuracy_train_dt_no_gender:.4f}, Test Accuracy: {accuracy_test_dt_no_gender:.4f}")
print(f"Neural Network Train Accuracy: {accuracy_train_mlp_no_gender:.4f}, Test Accuracy: {accuracy_test_mlp_no_gender:.4f}")

Optimization terminated successfully.
         Current function value: 0.255309
         Iterations 7

Performance Comparison (With and Without Gender):
Logistic Regression Train Accuracy: 0.9178, Test Accuracy: 0.9192
Decision Tree Train Accuracy: 1.0000, Test Accuracy: 0.8522
Neural Network Train Accuracy: 0.9174, Test Accuracy: 0.9184


In [12]:
# Testing for Proxy Discimination

# Get the predicted default probabilities for the test set from all models
y_pred_logit_no_gender_test = logit_model_fitted_no_gender.predict(X_test_no_gender)
y_pred_dt_no_gender_test = dt_model_no_gender.predict(X_test_no_gender)
y_pred_mlp_no_gender_test = mlp_model_no_gender.predict(X_test_no_gender)

# Create DataFrames to compare the predicted default probabilities by gender
df_pred_logit = pd.DataFrame({'code_gender': y_test, 'predicted_default': y_pred_logit_no_gender_test})
df_pred_dt = pd.DataFrame({'code_gender': y_test, 'predicted_default': y_pred_dt_no_gender_test})
df_pred_mlp = pd.DataFrame({'code_gender': y_test, 'predicted_default': y_pred_mlp_no_gender_test})

# Group by 'code_gender' and calculate the average predicted default probability for each gender
logit_avg_pred_default_by_gender = df_pred_logit.groupby('code_gender')['predicted_default'].mean()
dt_avg_pred_default_by_gender = df_pred_dt.groupby('code_gender')['predicted_default'].mean()
mlp_avg_pred_default_by_gender = df_pred_mlp.groupby('code_gender')['predicted_default'].mean()

# Print the results to see if there's a significant difference between genders
print("\nAverage Predicted Default by Gender (Logit Model):")
print(logit_avg_pred_default_by_gender)

print("\nAverage Predicted Default by Gender (Decision Tree):")
print(dt_avg_pred_default_by_gender)

print("\nAverage Predicted Default by Gender (Neural Network):")
print(mlp_avg_pred_default_by_gender)


Average Predicted Default by Gender (Logit Model):
code_gender
0    0.076428
1    0.146871
Name: predicted_default, dtype: float64

Average Predicted Default by Gender (Decision Tree):
code_gender
0    0.087703
1    0.170365
Name: predicted_default, dtype: float64

Average Predicted Default by Gender (Neural Network):
code_gender
0    0.000663
1    0.000716
Name: predicted_default, dtype: float64
