In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 1. Load your fully encoded feature dataset + original DISPOSIT column
#    (Ensure 'DISPOSIT' wasn't one-hot encoded in Phase 3 if it's truly the target)
feature_file = "../data/cleaned_data_phase3.csv"
encoded_features_df = pd.read_csv(feature_file)

# You might need to merge or re-load DISPOSIT from your original or intermediate dataset
# if you haven't included 'DISPOSIT' in 'cleaned_data_phase3_features.csv'.
# Example:
original_df = pd.read_csv("../data/cleaned_data_phase3_unencoded_DISPOSIT.csv")
final_df = encoded_features_df.join(original_df['DISPOSIT'])

# 2. Encode the Target Variable DISPOSIT
label_encoder = LabelEncoder()
final_df['DISPOSIT_ENCODED'] = label_encoder.fit_transform(final_df['DISPOSIT'])

# 3. Define Feature Matrix X and Target y
X = final_df.drop(columns=['DISPOSIT', 'DISPOSIT_ENCODED'])
y = final_df['DISPOSIT_ENCODED']

# 4. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Shapes:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)

# 5. (Optional) Verify Class Distribution
print("\nTrain distribution of DISPOSIT_ENCODED:")
print(y_train.value_counts(normalize=True))

print("\nTest distribution of DISPOSIT_ENCODED:")
print(y_test.value_counts(normalize=True))

print("\nData is now ready for baseline modeling.")


Shapes:
X_train: (59803, 439) y_train: (59803,)
X_test: (14951, 439) y_test: (14951,)

Train distribution of DISPOSIT_ENCODED:
DISPOSIT_ENCODED
0    0.974650
2    0.023260
3    0.000936
4    0.000836
1    0.000318
Name: proportion, dtype: float64

Test distribution of DISPOSIT_ENCODED:
DISPOSIT_ENCODED
0    0.974651
2    0.023276
3    0.000936
4    0.000870
1    0.000268
Name: proportion, dtype: float64

Data is now ready for baseline modeling.


  original_df = pd.read_csv("../data/cleaned_data_phase3_unencoded_DISPOSIT.csv")


In [2]:
# 1. Load the Final Dataset from Sub-Phase 4.2
# Assuming you've done a train/test split and have X_train, X_test, y_train, y_test

# 2. Select a Simple Classifier (Logistic Regression)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter=1000, random_state=42)

# 3. Train the Model & Evaluate Performance
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("=== Baseline Logistic Regression ===")

# Training Performance
train_acc = accuracy_score(y_train, y_pred_train)
print(f"Training Accuracy: {train_acc:.3f}")

# Testing Performance
test_acc = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_acc:.3f}")

# Detailed Classification Report
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))


=== Baseline Logistic Regression ===
Training Accuracy: 0.975
Test Accuracy: 0.975

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     14572
           1       0.00      0.00      0.00         4
           2       0.50      0.01      0.01       348
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00        13

    accuracy                           0.97     14951
   macro avg       0.29      0.20      0.20     14951
weighted avg       0.96      0.97      0.96     14951



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
import numpy as np

# 1. Identify Protected Attribute(s)
# Example: 'NEWRACE_Hispanic' might be a column in X_test after get_dummies
# Or you might re-merge the original 'NEWRACE' label if you dropped it.
protected_col = 'NEWRACE_Black'  # Example column

# 2. Predict on the Test Set
y_pred_test = model.predict(X_test)  # Already done in Sub-Phase 4.3

# 3. Compute Group-Level Metrics
# We'll create a small function to compute accuracy for a given group
def group_accuracy(X, y_true, y_pred, group_col, group_val):
    idx = X[group_col] == group_val
    return accuracy_score(y_true[idx], y_pred[idx])

# Example: If your protected_col is binary (0 or 1)
acc_group_1 = group_accuracy(X_test, y_test, y_pred_test, protected_col, 1)
acc_group_0 = group_accuracy(X_test, y_test, y_pred_test, protected_col, 0)

print(f"\nAccuracy for {protected_col} = 1:", acc_group_1)
print(f"Accuracy for {protected_col} = 0:", acc_group_0)

# 4. (Optional) More Fairness Metrics
# - Demographic Parity: Compare positive prediction rates (y_pred=1) across groups
# - Equalized Odds: Compare TPR/FPR across groups

from sklearn.metrics import confusion_matrix

def demographic_parity(X, y_pred, group_col):
    # Probability of predicting "positive" for each group
    groups = np.unique(X[group_col])
    for g in groups:
        idx = X[group_col] == g
        pos_rate = np.mean(y_pred[idx])
        print(f"Demographic Parity -> Group: {g}, Positive Rate: {pos_rate:.3f}")

demographic_parity(X_test, y_pred_test, protected_col)


Accuracy for NEWRACE_Black = 1: 0.9540191862388356
Accuracy for NEWRACE_Black = 0: 0.9798792756539235
Demographic Parity -> Group: False, Positive Rate: 0.000
Demographic Parity -> Group: True, Positive Rate: 0.001


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score

# 1. Train the Baseline Model (Logistic Regression)
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# 2. Predict on Test Set
y_pred_test = model.predict(X_test)

# 3. Evaluate Performance Metrics
print("=== Baseline Logistic Regression ===")

# Accuracy
train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, y_pred_test)
print(f"Training Accuracy: {train_acc:.3f}")
print(f"Test Accuracy: {test_acc:.3f}")

# Classification Report
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

# Precision, Recall, F1 Score with appropriate averaging
precision = precision_score(y_test, y_pred_test, average='weighted')
recall = recall_score(y_test, y_pred_test, average='weighted')
f1 = f1_score(y_test, y_pred_test, average='weighted')

print(f"Precision (weighted): {precision:.3f}")
print(f"Recall (weighted): {recall:.3f}")
print(f"F1 Score (weighted): {f1:.3f}")

# ROC AUC Score (only for binary classification)
# If multiclass, use 'ovr' or 'ovo' strategies
if len(np.unique(y)) == 2:
    roc_auc = roc_auc_score(y_test, y_pred_test)
    print(f"ROC AUC Score: {roc_auc:.3f}")
else:
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
    print(f"ROC AUC Score (OvR): {roc_auc:.3f}")


=== Baseline Logistic Regression ===
Training Accuracy: 0.975
Test Accuracy: 0.975

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     14572
           1       0.00      0.00      0.00         4
           2       0.50      0.01      0.01       348
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00        13

    accuracy                           0.97     14951
   macro avg       0.29      0.20      0.20     14951
weighted avg       0.96      0.97      0.96     14951

Precision (weighted): 0.962
Recall (weighted): 0.975
F1 Score (weighted): 0.962
ROC AUC Score (OvR): 0.740


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
from sklearn.metrics import confusion_matrix

def compute_tpr_fpr(y_true, y_pred):
    # Ensure the confusion matrix has both classes
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    
    if cm.shape == (2, 2):
        tn, fp, fn, tp = cm.ravel()
    else:
        # Handle cases with missing classes
        tn, fp, fn, tp = 0, 0, 0, 0
        if len(cm) == 1:
            if cm.shape[0] == 1:
                # Only one class present in y_true and y_pred
                if y_true.iloc[0] == 0:
                    tn = cm[0, 0]
                else:
                    tp = cm[0, 0]
    
    # Calculate TPR and FPR with checks to avoid division by zero
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    
    return tpr, fpr

# Example usage:
# Assuming 'NEWRACE_Black' is one-hot encoded as 'NEWRACE_Black'
protected_col = 'NEWRACE_Black'

# Compute metrics for Black group
tpr_black, fpr_black = compute_tpr_fpr(y_test[X_test[protected_col] == 1], y_pred_test[X_test[protected_col] == 1])

# Compute metrics for Non-Black group
tpr_non_black, fpr_non_black = compute_tpr_fpr(y_test[X_test[protected_col] == 0], y_pred_test[X_test[protected_col] == 0])

print(f"\nBlack Group TPR: {tpr_black:.3f}, FPR: {fpr_black:.3f}")
print(f"Non-Black Group TPR: {tpr_non_black:.3f}, FPR: {fpr_non_black:.3f}")



Black Group TPR: 0.000, FPR: 0.000
Non-Black Group TPR: 0.000, FPR: 0.000
