In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler

# --- Data preparation -------------------------------------------------------
# Everything below builds new frames instead of mutating one in place, so the
# stage can be re-run on its own and always yields the same `df`, `X` and `y`.
file_path = "/kaggle/input/recidivism/NIJ_s_Recidivism_Challenge_Full_Dataset_20241018.csv"

# Columns removed before modelling (identifier, per-year outcome flags and the
# sample marker); names that are absent from the CSV are skipped silently.
drop_cols = ['ID', 'Recidivism_Arrest_Year1', 'Recidivism_Arrest_Year2', 'Recidivism_Arrest_Year3', 'Training_Sample']

# Name of the label column that the models predict.
target = 'Recidivism_Within_3years'

df = (
    pd.read_csv(file_path)
    .drop(columns=drop_cols, errors='ignore')
    # Numeric gaps -> per-column median (computed after the drop above).
    .pipe(lambda frame: frame.fillna(frame.median(numeric_only=True)))
    # Whatever is still missing is non-numeric -> explicit "Unknown" level.
    .fillna("Unknown")
    # One-hot encode the categorical columns, dropping the first level of each.
    .pipe(pd.get_dummies, drop_first=True)
)

# Label vector and feature matrix.
y = df[target]
X = df.drop(columns=[target])

# Train/test split
# Split BEFORE scaling: the scaler's mean/std must be learned from the training
# rows only. Fitting it on the full matrix lets test-set statistics leak into
# the training features and can make the held-out metrics optimistic.
# The stratified split depends only on `y` and the seed, so the same rows land
# in each partition as when the split was done on the already-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics come from training rows only
X_test = scaler.transform(X_test_raw)        # test rows reuse the training statistics

# Full matrix scaled with the training-fitted scaler; kept so that any later
# code that still refers to `X_scaled` continues to work.
X_scaled = scaler.transform(X)

def _fit_and_score(model, X_tr, y_tr, X_te, y_te):
    """Fit `model` on the training split and evaluate it on the test split.

    Parameters
    ----------
    model : estimator exposing `fit`, `predict` and `predict_proba`.
    X_tr, y_tr : training features and labels.
    X_te, y_te : held-out features and labels.

    Returns
    -------
    tuple
        `(pred, acc, auc)` - predicted labels for `X_te`, their accuracy
        against `y_te`, and the ROC AUC of the predicted probabilities.
    """
    model.fit(X_tr, y_tr)
    pred = model.predict(X_te)
    acc = accuracy_score(y_te, pred)
    # Column 1 of predict_proba is the probability of the second class in
    # `model.classes_`, i.e. the positive class of a binary target.
    auc = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])
    return pred, acc, auc


# saga shuffles the data while optimising, so it needs a fixed seed for the
# notebook to reproduce the same numbers on every Restart & Run All.
# lbfgs is deterministic and does not need one.
SAGA_RANDOM_STATE = 42

# Logistic Regression (default = lbfgs)
lr1 = LogisticRegression(max_iter=2000, solver='lbfgs')
pred1, acc1, auc1 = _fit_and_score(lr1, X_train, y_train, X_test, y_test)

# Logistic Regression (saga solver)
# Same default L2-penalised objective as lr1, only the optimiser differs, so
# near-identical metrics are expected.
lr2 = LogisticRegression(max_iter=2000, solver='saga', random_state=SAGA_RANDOM_STATE)
pred2, acc2, auc2 = _fit_and_score(lr2, X_train, y_train, X_test, y_test)

# Logistic Regression (L1 penalty with saga)
lr3 = LogisticRegression(max_iter=2000, solver='saga', penalty='l1', random_state=SAGA_RANDOM_STATE)
pred3, acc3, auc3 = _fit_and_score(lr3, X_train, y_train, X_test, y_test)

# Side-by-side summary of the three fitted models: assemble the report lines
# first, then emit them with a single print call (one line per model).
report_lines = [
    "\n🔍 Model Comparison (Accuracy and AUC):",
    f"Default Logistic Regression (lbfgs):   Accuracy = {acc1:.4f} | AUC = {auc1:.4f}",
    f"Logistic Regression (saga):            Accuracy = {acc2:.4f} | AUC = {auc2:.4f}",
    f"Logistic Regression (L1, saga):        Accuracy = {acc3:.4f} | AUC = {auc3:.4f}",
]
print("\n".join(report_lines))



🔍 Model Comparison (Accuracy and AUC):
Default Logistic Regression (lbfgs):   Accuracy = 0.7109 | AUC = 0.7740
Logistic Regression (saga):            Accuracy = 0.7109 | AUC = 0.7740
Logistic Regression (L1, saga):        Accuracy = 0.7114 | AUC = 0.7740
