In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Read the COMPAS two-year recidivism table from disk
data = pd.read_csv("data/compas-scores-two-years.csv")

# Label column, followed by the predictor columns fed to the model
target = "two_year_recid"
features = [
    "age",
    "sex",
    "juv_misd_count",
    "juv_fel_count",
    "priors_count",
    "c_charge_degree",
    "c_charge_desc",
]

# Columns that get one-hot encoded (first level dropped). 'race' is carried
# along un-encoded so it can be used to slice the evaluation, not to train.
categorical = ["sex", "c_charge_degree", "c_charge_desc"]
data = pd.get_dummies(
    data[features + ['race', target]],
    columns=categorical,
    drop_first=True,
)

# Race labels (kept aside for per-group evaluation), target vector, and the
# design matrix with both the target and 'race' removed
races = data['race']
y = data[target]
X = data.drop(columns=[target, 'race'])

# Per-split metric accumulators, grouped by population
overall_accuracies, overall_fprs, overall_fnrs = [], [], []
black_accuracies, black_fprs, black_fnrs = [], [], []
white_accuracies, white_fprs, white_fnrs = [], [], []

# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Return ``(accuracy, fpr, fnr)`` for a set of binary predictions.

    Args:
        y_true: Ground-truth labels. Assumed to be encoded as 0 (negative)
            and 1 (positive) -- the label set is pinned to ``[0, 1]`` below.
        y_pred: Predicted labels, same encoding and length as ``y_true``.

    Returns:
        A 3-tuple:
        accuracy -- value of ``accuracy_score(y_true, y_pred)``;
        fpr -- FP / (FP + TN), or 0.0 when there are no actual negatives;
        fnr -- FN / (FN + TP), or 0.0 when there are no actual positives.
    """
    # Pin the label set so the matrix is always 2x2. Without `labels`, a
    # subset in which only one class occurs produces a 1x1 matrix and the
    # four-way unpacking below raises ValueError -- which would also make the
    # zero-denominator guards unreachable in exactly the case they exist for.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    accuracy = accuracy_score(y_true, y_pred)
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0.0
    return accuracy, fpr, fnr

# Repeat the train/test split and evaluation. The loop runs 100 times (an
# earlier version of this comment said 1000, which did not match the code).
n_splits = 100

# Race label as it appears in the 'race' column -> the three lists that
# collect that group's accuracy, FPR and FNR for every split.
group_metric_lists = {
    'African-American': (black_accuracies, black_fprs, black_fnrs),
    'Caucasian': (white_accuracies, white_fprs, white_fnrs),
}

for _ in range(n_splits):
    # Hold out 20% of the rows, stratified on the target; `races` is split
    # alongside X and y so that races_test lines up row-for-row with X_test.
    X_train, X_test, y_train, y_test, races_train, races_test = train_test_split(
        X, y, races, test_size=0.20, stratify=y
    )

    # Fit the logistic regression model and predict on the held-out rows
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics over the whole test set
    acc, fpr, fnr = calculate_metrics(y_test, y_pred)
    overall_accuracies.append(acc)
    overall_fprs.append(fpr)
    overall_fnrs.append(fnr)

    # Metrics per racial group. Convert to plain NumPy arrays so the boolean
    # mask selects the same positions in y_test and y_pred, rather than
    # relying on implicit pandas -> NumPy conversion of the mask.
    y_test_values = y_test.to_numpy()
    races_test_values = races_test.to_numpy()
    for race_label, (group_accs, group_fprs, group_fnrs) in group_metric_lists.items():
        group_mask = races_test_values == race_label
        acc, fpr, fnr = calculate_metrics(y_test_values[group_mask], y_pred[group_mask])
        group_accs.append(acc)
        group_fprs.append(fpr)
        group_fnrs.append(fnr)

# Calculate average and confidence intervals
def calculate_average_and_confidence_interval(metric_list):
    """Summarise a collection of metric values.

    Args:
        metric_list: Sequence of numeric metric values.

    Returns:
        A pair ``(mean, interval)`` where ``mean`` is the arithmetic mean of
        the values and ``interval`` is a NumPy array holding their 2.5th and
        97.5th percentiles, in that order.
    """
    samples = np.asarray(metric_list)
    percentile_bounds = np.percentile(samples, (2.5, 97.5))
    return samples.mean(), percentile_bounds

# Build the results dictionary: for every (label, samples) pair store the mean
# under the label and the percentile interval under "<label> CI". The pair
# order below fixes the dictionary's insertion order and hence the print order.
labelled_samples = [
    ('Overall Accuracy', overall_accuracies),
    ('Black Accuracy', black_accuracies),
    ('White Accuracy', white_accuracies),
    ('Overall FPR', overall_fprs),
    ('Black FPR', black_fprs),
    ('White FPR', white_fprs),
    ('Overall FNR', overall_fnrs),
    ('Black FNR', black_fnrs),
    ('White FNR', white_fnrs),
]

results = {}
for label, samples in labelled_samples:
    mean_value, interval = calculate_average_and_confidence_interval(samples)
    results[label] = mean_value
    results[label + ' CI'] = interval

# Print everything as percentages. Scalars are means; arrays are
# (lower, upper) intervals and are followed by a blank line.
for metric, value in results.items():
    if not isinstance(value, np.ndarray):
        print(f"{metric}: {value*100:.2f}%")
        continue
    print(f"{metric}: {value[0]*100:.2f}%-{value[1]*100:.2f}%\n")

# The 'results' dictionary now contains all the average metrics and their confidence intervals


Overall Accuracy: 66.81%
Overall Accuracy CI: 64.89%-68.99%

Black Accuracy: 66.45%
Black Accuracy CI: 63.07%-69.48%

White Accuracy: 66.67%
White Accuracy CI: 62.12%-70.34%

Overall FPR: 23.15%
Overall FPR CI: 19.79%-26.17%

Black FPR: 30.12%
Black FPR CI: 25.05%-35.17%

White FPR: 17.93%
White FPR CI: 13.87%-22.02%

Overall FNR: 45.44%
Overall FNR CI: 42.07%-48.77%

Black FNR: 36.79%
Black FNR CI: 32.81%-41.01%

White FNR: 57.05%
White FNR CI: 50.00%-63.66%

