# Recidivism Analysis Using COMPAS Dataset
This notebook analyzes recidivism rates using the COMPAS dataset. It explores whether the length of incarceration influences the likelihood of reoffending, considering other factors such as age, prior offenses, and offense severity (charge degree). Educational attainment is not among the columns selected below, so it is not part of this analysis.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
url = "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
raw_data = pd.read_csv(url)

# Length of the jail stay for the current charge, in whole days.
# NOTE: `days_b_screening_arrest` is the gap between the COMPAS screening and
# the arrest date, not a length of incarceration, so it must not be used as
# `incarceration_length`. The stay is derived from the jail in/out timestamps.
jail_stay_days = (
    pd.to_datetime(raw_data['c_jail_out']) - pd.to_datetime(raw_data['c_jail_in'])
).dt.days

# Build the analysis frame in one chain from `raw_data`: no in-place mutation
# of a column subset (avoids SettingWithCopyWarning) and the cell stays
# idempotent when re-run.
data = (
    raw_data[['sex', 'age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count',
              'priors_count', 'c_charge_degree', 'decile_score', 'is_recid']]
    # Renaming columns
    .rename(columns={'is_recid': 'recidivism', 'priors_count': 'prior_arrests'})
    .assign(
        # Convert categorical variables -- Misdemeanor: 0, Felony: 1
        c_charge_degree=lambda df: df['c_charge_degree'].map({'M': 0, 'F': 1}),
        # Aligned on the row index of `raw_data`; rows without jail dates get NaN.
        incarceration_length=jail_stay_days,
    )
    # Drop missing values (including rows with no recorded jail stay)
    .dropna()
)

# Show dataset info
data.info()
data.head()


In [None]:
# Exploratory Data Analysis

# Summary statistics
summary_stats = data.describe()
print(summary_stats)

# Histograms: one (column, panel title, histplot options) entry per panel.
# Continuous/count variables get binned histograms with a KDE overlay; the
# binary variables use discrete bars without a KDE.
histogram_specs = [
    ('incarceration_length', "Incarceration Length", dict(bins=30, kde=True)),
    ('age', "Age", dict(bins=20, kde=True)),
    ('prior_arrests', "Prior Arrests", dict(bins=15, kde=True)),
    ('recidivism', "Recidivism", dict(discrete=True, kde=False)),
    ('c_charge_degree', "Charge Degree", dict(discrete=True, kde=False)),
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
panels = axes.ravel()

for ax, (column, title, options) in zip(panels, histogram_specs):
    sns.histplot(data[column], ax=ax, **options)
    ax.set_title(title)

# The 2x3 grid has one more slot than there are variables; hide the leftover
# axes so the figure does not show an empty frame.
for unused_ax in panels[len(histogram_specs):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# PMF Comparison: Recidivism by Age Group

# Bins are right-closed: [18, 30] and (30, 100]. `include_lowest=True` is
# required so that age 18 itself lands in the first group; without it the
# first interval is (18, 30] and 18-year-olds are assigned NaN and silently
# dropped from the group means below.
data['age_group'] = pd.cut(
    data['age'],
    bins=[18, 30, 100],
    labels=['18-30', '30+'],
    include_lowest=True,
)

# Compute PMF for each age group: the mean of the 0/1 recidivism flag is the
# share of people in the group who reoffended.
# `observed=False` keeps every age-group category in the result and makes the
# categorical-groupby behaviour explicit.
pmf_recid = data.groupby('age_group', observed=False)['recidivism'].mean()

print("Recidivism Rate by Age Group:")
print(pmf_recid)


In [None]:
# CDF Analysis: Incarceration Length

# Empirical CDF: after sorting, the i-th smallest observation (1-based) is
# plotted at height i / n, so the curve ends at exactly 1.0.
sorted_data = np.sort(data['incarceration_length'])
n_obs = len(sorted_data)
cdf = np.arange(1, n_obs + 1) / n_obs

# Points only (no connecting line) so each observation is visible as a step.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(sorted_data, cdf, marker=".", linestyle="none")
ax.set_xlabel("Incarceration Length (Days)")
ax.set_ylabel("CDF")
ax.set_title("CDF of Incarceration Length")
plt.show()


In [None]:
# Scatter Plots and Correlation

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: age against incarceration length.
sns.scatterplot(x=data['age'], y=data['incarceration_length'], ax=axes[0])
axes[0].set_title("Age vs. Incarceration Length")

# Right panel: prior arrests against the 0/1 recidivism flag.
sns.scatterplot(x=data['prior_arrests'], y=data['recidivism'], ax=axes[1])
axes[1].set_title("Prior Arrests vs. Recidivism")

plt.tight_layout()
plt.show()

# Compute Pearson correlation (the default method of DataFrame.corr)
correlation_matrix = data[['age', 'incarceration_length', 'prior_arrests', 'recidivism']].corr()
# The newline must be written as the escape sequence "\n": a physical line
# break inside a single-quoted string literal is a SyntaxError.
print("Correlation Matrix:\n", correlation_matrix)


In [None]:
# Hypothesis Testing: Chi-Square Test for Recidivism and Charge Degree

# Observed counts: rows are recidivism outcomes, columns are charge degrees.
contingency_table = pd.crosstab(
    index=data['recidivism'],
    columns=data['c_charge_degree'],
)

# Chi-square test of independence on the observed counts. Returns the test
# statistic, its p-value, the degrees of freedom and the expected counts
# under independence.
# NOTE(review): if both variables are binary this is a 2x2 table, for which
# scipy applies Yates' continuity correction by default -- confirm intended.
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print(f"Chi-Square Test Results: chi2={chi2}, p-value={p}")


In [None]:
# Logistic Regression: Predicting Recidivism

# Predictors and binary target.
feature_columns = ['age', 'incarceration_length', 'prior_arrests', 'c_charge_degree']
X = data[feature_columns]
y = data['recidivism']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The features are on very different scales and are not standardised, so the
# solver may need more than the default 100 iterations to converge. A higher
# cap does not change the fit when convergence is reached earlier.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Model Accuracy:", accuracy_score(y_test, y_pred))
# The newline must be written as the escape sequence "\n": a physical line
# break inside a single-quoted string literal is a SyntaxError.
print("Classification Report:\n", classification_report(y_test, y_pred))
