In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Load data
file_path = "/kaggle/input/recidivism/NIJ_s_Recidivism_Challenge_Full_Dataset_20241018.csv"
df = pd.read_csv(file_path)

# Drop leakage/irrelevant columns
drop_cols = ['ID', 'Recidivism_Arrest_Year1', 'Recidivism_Arrest_Year2', 'Recidivism_Arrest_Year3', 'Training_Sample']
df.drop(columns=drop_cols, inplace=True, errors='ignore')

# Handle missing values
df.fillna(df.median(numeric_only=True), inplace=True)
df.fillna("Unknown", inplace=True)

# Encode categorical features
df = pd.get_dummies(df, drop_first=True)

# Define target and features
target = 'Recidivism_Within_3years'
X = df.drop(columns=[target])
y = df[target]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

### 🧠 MODEL TRAINING ###

# 1. Logistic Regression (default = lbfgs)
lr1 = LogisticRegression(max_iter=2000, solver='lbfgs')
lr1.fit(X_train, y_train)
pred1 = lr1.predict(X_test)
acc1 = accuracy_score(y_test, pred1)
auc1 = roc_auc_score(y_test, lr1.predict_proba(X_test)[:, 1])

# 2. Logistic Regression (saga solver)
lr2 = LogisticRegression(max_iter=2000, solver='saga')
lr2.fit(X_train, y_train)
pred2 = lr2.predict(X_test)
acc2 = accuracy_score(y_test, pred2)
auc2 = roc_auc_score(y_test, lr2.predict_proba(X_test)[:, 1])

# 3. Logistic Regression (L1 penalty with saga)
lr3 = LogisticRegression(max_iter=2000, solver='saga', penalty='l1')
lr3.fit(X_train, y_train)
pred3 = lr3.predict(X_test)
acc3 = accuracy_score(y_test, pred3)
auc3 = roc_auc_score(y_test, lr3.predict_proba(X_test)[:, 1])

# 4. Decision Tree
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
pred_tree = tree.predict(X_test)
acc_tree = accuracy_score(y_test, pred_tree)
auc_tree = roc_auc_score(y_test, tree.predict_proba(X_test)[:, 1])

# 5. K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, pred_knn)
auc_knn = roc_auc_score(y_test, knn.predict_proba(X_test)[:, 1])

# 6. Naïve Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
pred_nb = nb.predict(X_test)
acc_nb = accuracy_score(y_test, pred_nb)
auc_nb = roc_auc_score(y_test, nb.predict_proba(X_test)[:, 1])

### 📊 OUTPUT COMPARISON ###
print("\n🔍 Model Comparison (Accuracy and AUC):")
print(f"1. Logistic Regression (lbfgs):           Accuracy = {acc1:.4f} | AUC = {auc1:.4f}")
print(f"2. Logistic Regression (saga):            Accuracy = {acc2:.4f} | AUC = {auc2:.4f}")
print(f"3. Logistic Regression (L1, saga):        Accuracy = {acc3:.4f} | AUC = {auc3:.4f}")
print(f"4. Decision Tree:                         Accuracy = {acc_tree:.4f} | AUC = {auc_tree:.4f}")
print(f"5. K-Nearest Neighbors (K=5):             Accuracy = {acc_knn:.4f} | AUC = {auc_knn:.4f}")
print(f"6. Naïve Bayes:                           Accuracy = {acc_nb:.4f} | AUC = {auc_nb:.4f}")



🔍 Model Comparison (Accuracy and AUC):
1. Logistic Regression (lbfgs):           Accuracy = 0.7109 | AUC = 0.7740
2. Logistic Regression (saga):            Accuracy = 0.7109 | AUC = 0.7740
3. Logistic Regression (L1, saga):        Accuracy = 0.7114 | AUC = 0.7740
4. Decision Tree:                         Accuracy = 0.6322 | AUC = 0.6247
5. K-Nearest Neighbors (K=5):             Accuracy = 0.6368 | AUC = 0.6608
6. Naïve Bayes:                           Accuracy = 0.6488 | AUC = 0.7013


ValueError: could not convert string to float: '23-27'