In [None]:
from google.colab import files
import io
import pandas as pd

uploaded = files.upload()  # select your CSV file
df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
print("Loaded dataset with shape:", df.shape)


Saving Delinquency_prediction_dataset.csv to Delinquency_prediction_dataset (3).csv
Loaded dataset with shape: (500, 19)


In [None]:
# 3. Feature Engineering
import numpy as np

# Binary target
df['Delinquent_Account'] = (df['Delinquent_Account'] > 0).astype(int)

# Derive payment-history features
payment_cols = [f"Month_{i}" for i in range(1, 7)]
df['Missed_Payments_Count'] = (df[payment_cols] == 'Missed').sum(axis=1)
df['Late_Payments_Count']   = (df[payment_cols] == 'Late').sum(axis=1)
df['OnTime_Payments_Count'] = (df[payment_cols] == 'On-time').sum(axis=1)
df['Payment_Consistency']    = (df[payment_cols].nunique(axis=1) == 1).astype(int)

# Drop original month columns
df.drop(columns=payment_cols, inplace=True)


In [None]:
# 4. Balance Classes by Oversampling
from sklearn.utils import resample

df_majority = df[df.Delinquent_Account == 0]
df_minority = df[df.Delinquent_Account == 1]

df_min_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)

df_balanced = pd.concat([df_majority, df_min_upsampled]).sample(frac=1, random_state=42)
print("Balanced class counts:\n", df_balanced.Delinquent_Account.value_counts())


Balanced class counts:
 Delinquent_Account
1    420
0    420
Name: count, dtype: int64


In [None]:
# 5. Split Data & Build Preprocessor
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

X = df_balanced.drop(columns=['Customer_ID', 'Delinquent_Account'])
y = df_balanced['Delinquent_Account']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

numeric_feats = [
    'Age','Income','Credit_Score','Credit_Utilization',
    'Loan_Balance','Debt_to_Income_Ratio','Account_Tenure',
    'Missed_Payments_Count','Late_Payments_Count','OnTime_Payments_Count',
    'Payment_Consistency'
]
categorical_feats = ['Employment_Status','Credit_Card_Type','Location']

numeric_pipe = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler',  StandardScaler())
])
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot',  OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipe, numeric_feats),
    ('cat', categorical_pipe, categorical_feats)
])


In [None]:
# 6. Train Base Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))
])
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced', n_estimators=200, max_depth=10, random_state=42))
])

print("Training Logistic Regression...")
lr_pipeline.fit(X_train, y_train)
print("Training Random Forest...")
rf_pipeline.fit(X_train, y_train)

print("\n=== Evaluation of Base Models ===")
for name, model in zip(['Logistic Regression', 'Random Forest'], [lr_pipeline, rf_pipeline]):
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, model.predict(X_test)))

# Choose best model
best_model = rf_pipeline

Training Logistic Regression...
Training Random Forest...

=== Evaluation of Base Models ===

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.60      0.57        84
           1       0.56      0.52      0.54        84

    accuracy                           0.56       168
   macro avg       0.56      0.56      0.56       168
weighted avg       0.56      0.56      0.56       168


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98        84
           1       0.98      0.99      0.98        84

    accuracy                           0.98       168
   macro avg       0.98      0.98      0.98       168
weighted avg       0.98      0.98      0.98       168



In [None]:
# 7. Save Model
import joblib
joblib.dump(best_model, 'bdelinquency_model.pkl')
print("Saved best model to bdelinquency_model.pkl")


Saved best model to bdelinquency_model.pkl


In [None]:
# 8. Fairness Audit with Fairlearn
from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate, false_positive_rate
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference

# Predictions
y_pred = best_model.predict(X_test)

# Sensitive features
sf_emp = X_test['Employment_Status']
sf_loc = X_test['Location']

metrics = {
    "Selection Rate":      selection_rate,
    "True Positive Rate":  true_positive_rate,
    "False Positive Rate": false_positive_rate
}

# Group metrics
mf_emp = MetricFrame(metrics=metrics, y_true=y_test, y_pred=y_pred, sensitive_features=sf_emp)
mf_loc = MetricFrame(metrics=metrics, y_true=y_test, y_pred=y_pred, sensitive_features=sf_loc)

print("\nGroup metrics by Employment_Status:")
display(mf_emp.by_group)
print("\nGroup metrics by Location:")
display(mf_loc.by_group)

# Disparity measures
dp_emp = demographic_parity_difference(y_true=y_test, y_pred=y_pred, sensitive_features=sf_emp)
eo_emp = equalized_odds_difference(y_true=y_test, y_pred=y_pred, sensitive_features=sf_emp)
dp_loc = demographic_parity_difference(y_true=y_test, y_pred=y_pred, sensitive_features=sf_loc)
eo_loc = equalized_odds_difference(y_true=y_test, y_pred=y_pred, sensitive_features=sf_loc)

print(f"\nEmployment_Status → Demographic Parity Δ: {dp_emp:.3f}")
print(f"Employment_Status → Equalized Odds Δ:    {eo_emp:.3f}")
print(f"Location         → Demographic Parity Δ: {dp_loc:.3f}")
print(f"Location         → Equalized Odds Δ:    {eo_loc:.3f}")



Group metrics by Employment_Status:


Unnamed: 0_level_0,Selection Rate,True Positive Rate,False Positive Rate
Employment_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EMP,0.4,1.0,0.0
Employed,0.555556,1.0,0.058824
Self-employed,0.380952,1.0,0.0
Unemployed,0.617647,1.0,0.071429
employed,0.416667,0.909091,0.0
retired,0.571429,1.0,0.0



Group metrics by Location:


Unnamed: 0_level_0,Selection Rate,True Positive Rate,False Positive Rate
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chicago,0.484848,1.0,0.0
Houston,0.4,1.0,0.0
Los Angeles,0.604651,0.961538,0.058824
New York,0.558824,1.0,0.0625
Phoenix,0.424242,1.0,0.0



Employment_Status → Demographic Parity Δ: 0.237
Employment_Status → Equalized Odds Δ:    0.091
Location         → Demographic Parity Δ: 0.205
Location         → Equalized Odds Δ:    0.062


In [None]:
# 9. In-Processing Mitigation with ExponentiatedGradient
from fairlearn.reductions import ExponentiatedGradient, DemographicParity, EqualizedOdds
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Configure constraint
constraint = DemographicParity()  # or EqualizedOdds()
expgrad = ExponentiatedGradient(
    estimator=RandomForestClassifier(class_weight='balanced', n_estimators=200, max_depth=10, random_state=42),
    constraints=constraint,
    eps=0.01,
    max_iter=50
)

fair_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('fair', expgrad)
])

print(f"\nTraining fairness-aware model with {constraint.__class__.__name__}...")
fair_pipeline.fit(
    X_train,
    y_train,
    fair__sensitive_features=X_train['Employment_Status']
)

# Evaluate fairness-aware model
y_pred_fair = fair_pipeline.predict(X_test)
print("\n=== Fairness-Aware Model Evaluation ===")
print("Accuracy :", accuracy_score(y_test, y_pred_fair))
print("Precision:", precision_score(y_test, y_pred_fair))
print("Recall   :", recall_score(y_test, y_pred_fair))

# Re-audit group metrics
mf_emp_fair = MetricFrame(metrics=metrics, y_true=y_test, y_pred=y_pred_fair, sensitive_features=sf_emp)
print("\nGroup metrics by Employment_Status (fair model):")
display(mf_emp_fair.by_group)

dp_fair = demographic_parity_difference(y_true=y_test, y_pred=y_pred_fair, sensitive_features=sf_emp)
eo_fair = equalized_odds_difference(y_true=y_test, y_pred=y_pred_fair, sensitive_features=sf_emp)
print(f"\nPost-training Demographic Parity Δ: {dp_fair:.3f}")
print(f"Post-training Equalized Odds Δ:    {eo_fair:.3f}")


Training fairness-aware model with DemographicParity...

=== Fairness-Aware Model Evaluation ===
Accuracy : 0.9523809523809523
Precision: 0.9318181818181818
Recall   : 0.9761904761904762

Group metrics by Employment_Status (fair model):


Unnamed: 0_level_0,Selection Rate,True Positive Rate,False Positive Rate
Employment_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EMP,0.44,1.0,0.066667
Employed,0.472222,0.894737,0.0
Self-employed,0.428571,1.0,0.076923
Unemployed,0.588235,1.0,0.0
employed,0.5,1.0,0.076923
retired,0.678571,1.0,0.25



Post-training Demographic Parity Δ: 0.250
Post-training Equalized Odds Δ:    0.250


In [None]:
from fairlearn.postprocessing import ThresholdOptimizer
from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate, false_positive_rate
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
import pandas as pd

# 1) Load your already-trained pipeline
#    (best_model should already include the preprocessor + classifier)
pipeline = best_model

# 2) Apply ThresholdOptimizer for demographic parity
postproc = ThresholdOptimizer(
    estimator=pipeline,
    constraints="equalized_odds",  # instead of demographic_parity
    predict_method="predict_proba",
    prefit=True,
    grid_size=100
)


# Fit on training data + sensitive feature
postproc.fit(
    X_train,
    y_train,
    sensitive_features=X_train["Employment_Status"]
)

# 3) Predict on test set
y_pred_post = postproc.predict(
    X_test,
    sensitive_features=X_test["Employment_Status"]
)

# 4) Re‑evaluate overall metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("Post‑processing Metrics:")
print(" Accuracy :", accuracy_score(y_test, y_pred_post))
print(" Precision:", precision_score(y_test, y_pred_post))
print(" Recall   :", recall_score(y_test, y_pred_post))

# 5) Re‑audit group fairness
metrics = {
    "Selection Rate":      selection_rate,
    "True Positive Rate":  true_positive_rate,
    "False Positive Rate": false_positive_rate
}
mf = MetricFrame(
    metrics=metrics,
    y_true=y_test,
    y_pred=y_pred_post,
    sensitive_features=X_test["Employment_Status"]
)
print("\nGroup metrics by Employment_Status (post‑processing):")
display(mf.by_group)

print("\nΔ Demographic Parity:", demographic_parity_difference(
    y_true=y_test, y_pred=y_pred_post, sensitive_features=X_test["Employment_Status"]))
print("Δ Equalized Odds:   ", equalized_odds_difference(
    y_true=y_test, y_pred=y_pred_post, sensitive_features=X_test["Employment_Status"]))


Post‑processing Metrics:
 Accuracy : 0.9880952380952381
 Precision: 0.9880952380952381
 Recall   : 0.9880952380952381

Group metrics by Employment_Status (post‑processing):


Unnamed: 0_level_0,Selection Rate,True Positive Rate,False Positive Rate
Employment_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EMP,0.4,1.0,0.0
Employed,0.527778,1.0,0.0
Self-employed,0.380952,1.0,0.0
Unemployed,0.617647,1.0,0.071429
employed,0.416667,0.909091,0.0
retired,0.571429,1.0,0.0



Δ Demographic Parity: 0.2366946778711485
Δ Equalized Odds:    0.09090909090909094


In [None]:
from google.colab import files

# Download the trained and post-processed model
files.download('bdelinquency_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>