In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm

# Path to the cleaned Home Credit CSV. This is a machine-specific absolute
# path; replace it with the location of your own copy before running.
file_path = '/Users/christopherfrye/Library/Mobile Documents/com~apple~CloudDocs/NYU Stern/2025_Summer Term/AI in Finance/home_credit_cleaned.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Integer-encode the categorical columns in place (one fresh encoder per column).
# Note: 'code_gender' is encoded here but is not part of the feature list below.
categorical_columns = ['code_gender', 'flag_own_car', 'flag_own_realty', 'age_range', 'educated', 'children', 'married']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Define the features and the target ('default': 1 for default, 0 for repaid)
X = df[['amt_income_total', 'credit_score_mean', 'amt_credit', 'days_employed', 
        'document_count', 'credit_score_stdev', 'age_range', 'educated', 
        'children', 'married', 'flag_own_car', 'flag_own_realty']]
y = df['default']

# Add an explicit intercept column ('const'); statsmodels OLS does not add one itself
X = sm.add_constant(X)

# Split the data into training and testing sets (80-20 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the sklearn linear model. X already carries the 'const' column, so
# sklearn's own intercept is switched off: fitting both would specify the
# intercept twice, and with it disabled the coefficients line up one-to-one
# with the statsmodels OLS fit further down.
model = LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)

# Predicted scores from the linear model. These are used like probabilities
# below, but a linear regression does not constrain them to [0, 1].
y_pred_prob = model.predict(X_test)

# Convert scores to binary outcomes (1 for default, 0 for repaid) using a 0.5 threshold
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Evaluate the performance using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.4f}')

# Additional evaluation: confusion matrix and classification report
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# zero_division=0 reports precision/recall as 0 for a class that is never
# predicted instead of emitting an UndefinedMetricWarning for each metric.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Fit the same specification with statsmodels to get regression statistics
ols_model = sm.OLS(y_train, X_train).fit()

# Output the OLS regression summary statistics (R², p-values, t-statistics, etc.)
print("\nLinear Regression Statistics (from statsmodels):")
print(ols_model.summary())


Accuracy: 0.9190

Confusion Matrix:
[[31698     0]
 [ 2794     0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     31698
           1       0.00      0.00      0.00      2794

    accuracy                           0.92     34492
   macro avg       0.46      0.50      0.48     34492
weighted avg       0.84      0.92      0.88     34492



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Linear Regression Statistics (from statsmodels):
                            OLS Regression Results                            
Dep. Variable:                default   R-squared:                       0.059
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     727.3
Date:                Mon, 26 May 2025   Prob (F-statistic):               0.00
Time:                        15:24:36   Log-Likelihood:                -13334.
No. Observations:              137968   AIC:                         2.669e+04
Df Residuals:                  137955   BIC:                         2.682e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------

In [15]:
# Backward Elimination on Linear Regression

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Location of the cleaned Home Credit CSV -- point this at your own copy
file_path = '/Users/christopherfrye/Library/Mobile Documents/com~apple~CloudDocs/NYU Stern/2025_Summer Term/AI in Finance/home_credit_cleaned.csv'

# Read the dataset into memory
df = pd.read_csv(file_path)

# Replace each categorical column's labels with integer codes,
# learning the label set for every column with its own encoder
categorical_columns = ['code_gender', 'flag_own_car', 'flag_own_realty', 'age_range', 'educated', 'children', 'married']
for col in categorical_columns:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])

# Predictor matrix, with the intercept column statsmodels needs, and the target
X = sm.add_constant(
    df[['amt_income_total', 'credit_score_mean', 'amt_credit', 'days_employed',
        'document_count', 'credit_score_stdev', 'age_range', 'educated',
        'children', 'married', 'flag_own_car', 'flag_own_realty']]
)
y = df['default']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Backward Elimination Process
def backward_elimination(X_train, y_train, significance_level=0.05):
    """Iteratively drop the least significant feature from an OLS model.

    On each pass an OLS model is fit on the current columns, and the feature
    with the largest p-value is removed if that p-value exceeds
    ``significance_level``. The loop stops when every remaining feature is
    significant or when no features remain.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Design matrix. A column named 'const' is treated as the intercept and
        is never a candidate for removal. The caller's frame is not modified;
        columns are dropped on a copy.
    y_train : array-like
        Target values passed to ``sm.OLS`` as the dependent variable.
    significance_level : float, default 0.05
        p-value threshold above which a feature is eliminated.

    Returns
    -------
    tuple
        ``(X_reduced, model)`` where ``X_reduced`` holds the surviving columns
        and ``model`` is the fitted OLS result for exactly those columns.
    """
    while True:
        # Fit the model with the current features
        model = sm.OLS(y_train, X_train).fit()

        # Exclude the intercept by label rather than by position, so a frame
        # without 'const' (or with it in another position) does not silently
        # protect its first real feature from elimination.
        p_values = model.pvalues.drop(labels='const', errors='ignore')

        # Nothing left to test: every feature was eliminated. Stop here rather
        # than calling idxmax() on an empty Series, which raises.
        if p_values.empty:
            break

        # A NaN maximum compares False and ends the loop, same as a
        # below-threshold value.
        max_p_value = p_values.max()
        if not max_p_value > significance_level:
            break  # Every remaining feature is significant

        # Remove the least significant feature and refit on the next pass
        X_train = X_train.drop(columns=[p_values.idxmax()])

    return X_train, model

# Perform backward elimination to get the optimal set of features
X_train_optimal, optimal_model = backward_elimination(X_train, y_train)

# Output the final model summary with optimal features
print("\nFinal Model Summary (After Backward Elimination):")
print(optimal_model.summary())

# X_test was split from the same constant-augmented X as X_train, so it already
# holds the 'const' column; selecting the surviving training columns is enough
# to give the test matrix the same columns, in the same order, as the model.
X_test_optimal = X_test[X_train_optimal.columns]

# Predicted scores of the final model on the test set (linear outputs, so they
# are not constrained to [0, 1])
y_pred_optimal = optimal_model.predict(X_test_optimal)

# Convert the scores to binary outcomes (1 for default, 0 for repaid) using a 0.5 threshold
y_pred_binary = np.where(y_pred_optimal > 0.5, 1, 0)

# Performance evaluation: accuracy, confusion matrix, and classification report
accuracy = accuracy_score(y_test, y_pred_binary)
print(f'\nOptimal Model Accuracy: {accuracy:.4f}')

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_binary))

# zero_division=0 reports precision/recall as 0 for a class that is never
# predicted instead of emitting an UndefinedMetricWarning for each metric.
print("\nClassification Report:")
print(classification_report(y_test, y_pred_binary, zero_division=0))


Final Model Summary (After Backward Elimination):
                            OLS Regression Results                            
Dep. Variable:                default   R-squared:                       0.059
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     872.6
Date:                Mon, 26 May 2025   Prob (F-statistic):               0.00
Time:                        15:27:44   Log-Likelihood:                -13334.
No. Observations:              137968   AIC:                         2.669e+04
Df Residuals:                  137957   BIC:                         2.680e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
