In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the pre-split training and test partitions from the processed-data folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/train_set.csv",
        "../data/processed/test_set.csv",
    )
)

### Logistic Regression Model

In [4]:
# Define feature columns: every column except the row identifier ('id')
# and the target ('credit_status').
feature_columns = [col for col in train_df.columns if col not in ["credit_status", "id"]]

# Split features and target.
# The label encoder is fitted ONCE, on the training labels, and then reused
# for the test labels so both splits share the same class -> integer mapping.
# Fitting a second encoder on the test split could assign different codes
# whenever the two splits do not contain exactly the same set of classes;
# with a shared encoder an unseen test label raises instead of being
# silently mis-mapped.
label_encoder = LabelEncoder().fit(train_df["credit_status"])
X_train = train_df[feature_columns]
y_train = label_encoder.transform(train_df["credit_status"])
X_test = test_df[feature_columns]
y_test = label_encoder.transform(test_df["credit_status"])

# Standardize features: the scaler statistics come from the training set only
# and are then applied unchanged to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression model (class_weight="balanced" reweights the
# classes to compensate for the imbalance between the two credit statuses).
logreg = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")
logreg.fit(X_train_scaled, y_train)


### Evaluation

In [5]:
# Score the held-out set with the fitted model.
y_pred = logreg.predict(X_test_scaled)

# Pair every customer id with the label predicted for it.
results_df = test_df[["id"]].assign(predicted_credit_status=y_pred)

# Preview the first few predictions.
print(results_df.head())

# Compute the evaluation metrics, then report them.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)

        id  predicted_credit_status
0  5052720                        0
1  5087861                        1
2  5068206                        0
3  5137255                        0
4  5023163                        1
Accuracy: 0.8414701042238069

Confusion Matrix:
 [[  95  546]
 [ 610 6041]]

Classification Report:
               precision    recall  f1-score   support

           0       0.13      0.15      0.14       641
           1       0.92      0.91      0.91      6651

    accuracy                           0.84      7292
   macro avg       0.53      0.53      0.53      7292
weighted avg       0.85      0.84      0.84      7292



### Evaluating fairness using the fairlearn library

In [17]:
# Explicit imports instead of `import *`, so it is clear where each name
# comes from. Extend this list if other fairlearn metrics are needed.
from fairlearn.metrics import (
    MetricFrame,
    demographic_parity_difference,
    equalized_odds_difference,
    false_positive_rate,
    true_positive_rate,
)

# Fairness audit of the logistic regression model on the test set:
# y_test holds the true labels and y_pred the predicted labels.

# Age is a sensitive feature but continuous, so it is cut into 5 equal-width
# bins to obtain groups that can be compared.
test_df['age_bin'] = pd.cut(test_df['age'], bins=5)

# Per-group metrics reported for every sensitive feature.
FAIRNESS_METRICS = {
    "Accuracy": accuracy_score,
    "TPR": true_positive_rate,
    "FPR": false_positive_rate,
}


def fairness_by_group(y_true, y_pred, sensitive_features):
    """Return a MetricFrame of FAIRNESS_METRICS split by a sensitive feature.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    sensitive_features : array-like
        Group membership of each sample (e.g. a DataFrame column).

    Returns
    -------
    MetricFrame
        Its ``by_group`` attribute holds one row of metrics per group.
    """
    return MetricFrame(
        metrics=FAIRNESS_METRICS,
        y_true=y_true,
        y_pred=y_pred,
        sensitive_features=sensitive_features,
    )


# Accuracy / TPR / FPR per gender group.
fairness_log_reg_gender = fairness_by_group(y_test, y_pred, test_df['gender'])
print("Logistic Regression Fairness Metrics:\n", fairness_log_reg_gender.by_group)

# Accuracy / TPR / FPR per age bin.
fairness_log_reg_age = fairness_by_group(y_test, y_pred, test_df['age_bin'])
print("Logistic Regression Fairness Metrics:\n", fairness_log_reg_age.by_group)

# Scalar disparity summaries (0 means no disparity between groups).
# Printed in the order: DPD gender, EOD gender, DPD age, EOD age.
dpd_log_reg_gender = demographic_parity_difference(y_test, y_pred, sensitive_features=test_df['gender'])
eod_log_reg_gender = equalized_odds_difference(y_test, y_pred, sensitive_features=test_df['gender'])
dpd_log_reg_age = demographic_parity_difference(y_test, y_pred, sensitive_features=test_df['age_bin'])
eod_log_reg_age = equalized_odds_difference(y_test, y_pred, sensitive_features=test_df['age_bin'])
print(dpd_log_reg_gender)
print(eod_log_reg_gender)
print(dpd_log_reg_age)
print(eod_log_reg_age)



Logistic Regression Fairness Metrics:
         Accuracy       TPR       FPR
gender                              
0       0.822317  0.883783  0.835322
1       0.880753  0.958948  0.882883
Logistic Regression Fairness Metrics:
                    Accuracy       TPR       FPR
age_bin                                        
(-0.00098, 0.196]  0.924623  0.987124  1.000000
(0.196, 0.392]     0.885002  0.960293  0.947977
(0.392, 0.588]     0.844695  0.913129  0.826923
(0.588, 0.784]     0.779040  0.842847  0.769697
(0.784, 0.98]      0.756124  0.805848  0.750000
0.0722418823159875
0.0751651585777513
0.18708772831141862
0.25
