In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

import kaggle


In [None]:
# Read the arrests/prosecutions CSV into a DataFrame and preview the first rows.
csv_path = "data/Arrests_Presented_and_Prosecutions.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Drop incomplete rows BEFORE parsing: a missing charge list is read as NaN (a float),
# and calling .strip on it would raise AttributeError.
# .copy() gives an independent frame so the column assignment below is unambiguous.
data = data.dropna().copy()

# Turn the comma-separated charge string into a list of charges.
# Values that are no longer strings (the cell was already run once) are left as they are,
# so re-running this cell is harmless.
data['Suspect Charge List'] = data['Suspect Charge List'].apply(
    lambda x: x.strip(',').split(',') if isinstance(x, str) else x
)
data.head()

In [None]:
# Variant without the suspect charge list: train on the remaining columns only.

# Keep just the columns this variant needs (copy so `data` itself is not modified)
feature_columns = ['Case Filed', 'Case Type', 'Arrest Date', 'Status']
data_no_charge = data[feature_columns].copy()

# Express each arrest date as the number of whole days since a fixed reference date
reference_date = pd.to_datetime("2010-12-01")
arrest_timestamps = pd.to_datetime(data_no_charge['Arrest Date'])
data_no_charge['Arrest Date'] = (arrest_timestamps - reference_date).dt.days

# Integer-encode the categorical columns. The single encoder is refit for each column,
# so after this loop it only holds the classes of 'Case Type'.
label_encoder = LabelEncoder()
for categorical_column in ('Case Filed', 'Case Type'):
    data_no_charge[categorical_column] = label_encoder.fit_transform(data_no_charge[categorical_column])

data_no_charge.head()

In [None]:
# Setting training and test data.
# Features and target are both taken from data_no_charge so that their rows are
# guaranteed to line up (previously the target was read from `data` instead).
x_vals = data_no_charge.drop(columns=['Status'])
y_vals = data_no_charge['Status']
# 80/20 split; random_state fixes the shuffle so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(x_vals, y_vals, test_size=0.2, random_state=21)

In [None]:
# Random forest trained on the features without the charge list
model = RandomForestClassifier(n_estimators=20, random_state=21)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
# This function is super helpful - look for similar ones for other classification models.
# labels is pinned to the classes the model was trained on: without it, classification_report
# raises ValueError whenever the test split happens to miss one of the three classes.
# NOTE(review): target_names are matched positionally to the sorted class labels - assumes the
# sorted Status values correspond to this order; TODO confirm against the raw data.
report = classification_report(
    y_test,
    y_pred,
    labels=model.classes_,
    target_names=['Other Action', 'Discharge Only', 'Filed'],
)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

In [None]:
# Heatmap of the confusion matrix for the predictions made above
class_names = ['Other Action', 'Discharge Only', 'Filed']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(cm, cmap='Blues')

# One tick per class on both axes, labelled with the class names
tick_positions = range(len(class_names))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)

# Tilt the x labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write the count into every cell (row i is plotted on the y axis, column j on the x axis)
for i in tick_positions:
    for j in tick_positions:
        text = ax.text(j, i, cm[i, j], ha="center", va="center", color="black")

ax.set(title="Confusion Matrix", xlabel="Predicted", ylabel="Actual")

# Colorbar as a legend for the cell shading
fig.colorbar(im, ax=ax)

plt.tight_layout()
plt.show()

In [None]:
# Variant with the Suspect Charge List kept, to be one-hot encoded in the next cell.

columns_with_charge = ['ID', 'Case Filed', 'Case Type', 'Arrest Date', 'Status', 'Suspect Charge List']
data_with_charge = data[columns_with_charge].copy()

# Express each arrest date as the number of whole days since a fixed reference date
reference_date = pd.to_datetime("2010-12-01")
arrest_dates = pd.to_datetime(data_with_charge['Arrest Date'])
data_with_charge['Arrest Date'] = (arrest_dates - reference_date).dt.days

# Integer-encode the categorical columns. The single encoder is refit for each column,
# so after this loop it only holds the classes of 'Case Type'.
label_encoder = LabelEncoder()
for column in ('Case Filed', 'Case Type'):
    data_with_charge[column] = label_encoder.fit_transform(data_with_charge[column])

data_with_charge.head()

In [None]:
# Collect every distinct charge that appears in any suspect's charge list
charges = set(charge for sublist in data["Suspect Charge List"] for charge in sublist)

# Iterate the charges in sorted order: the iteration order of a set of strings can change
# between kernel sessions (string hash randomization), which would reorder the feature
# columns and make the seeded random forest below give different results on each restart.
charge_columns = sorted(charges)

# One indicator column per charge: 1 if the charge is in the row's list, else 0
encoded_data = pd.DataFrame(
    {charge: data["Suspect Charge List"].apply(lambda x: 1 if charge in x else 0) for charge in charge_columns}
)

# Replace the raw list column with the indicator columns (rows are aligned on the index)
data_with_charge = pd.concat([data_with_charge.drop(columns=["Suspect Charge List"]), encoded_data], axis=1)
data_with_charge.head()

In [None]:
# Features: everything except the identifier and the target; target: 'Status'
non_feature_columns = ['ID', 'Status']
x_vals = data_with_charge.drop(columns=non_feature_columns)
y_vals = data_with_charge['Status']

# 80/20 split with a fixed seed so the partition is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x_vals,
    y_vals,
    test_size=0.2,
    random_state=21,
)

In [None]:
# Random forest trained on the features that include the one-hot encoded charges
model = RandomForestClassifier(n_estimators=20, random_state=21)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
# This function is super helpful - look for similar ones for other classification models.
# labels is pinned to the classes the model was trained on: without it, classification_report
# raises ValueError whenever the test split happens to miss one of the three classes.
# NOTE(review): target_names are matched positionally to the sorted class labels - assumes the
# sorted Status values correspond to this order; TODO confirm against the raw data.
report = classification_report(
    y_test,
    y_pred,
    labels=model.classes_,
    target_names=['Other Action', 'Discharge Only', 'Filed'],
)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

In [None]:
# Heatmap of the confusion matrix for the predictions made above
class_names = ['Other Action', 'Discharge Only', 'Filed']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(cm, cmap='Blues')

# One tick per class on both axes, labelled with the class names
tick_positions = range(len(class_names))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)

# Tilt the x labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write the count into every cell (row i is plotted on the y axis, column j on the x axis)
for i in tick_positions:
    for j in tick_positions:
        text = ax.text(j, i, cm[i, j], ha="center", va="center", color="black")

ax.set(title="Confusion Matrix", xlabel="Predicted", ylabel="Actual")

# Colorbar as a legend for the cell shading
fig.colorbar(im, ax=ax)

plt.tight_layout()
plt.show()

In [None]:
# K-means variant: cluster the rows after expanding the charge lists.

# Explode so that every charge in a list becomes its own row, then name that column "Charge"
charge_column_rename = {"Suspect Charge List": "Charge"}
expanded_with_charge_list = data.explode("Suspect Charge List")
expanded_with_charge_list = expanded_with_charge_list.rename(columns=charge_column_rename)
expanded_with_charge_list.head()

In [None]:
# Elbow Method: fit k-means for k = 1..9 and plot the inertia against k.

# Build the scaled feature matrix in this cell. The notebook-level `features_scaled` is only
# created by a later cell, so relying on it raises NameError on Restart & Run All.
elbow_arrest_dates = expanded_with_charge_list["Arrest Date"]
if not pd.api.types.is_numeric_dtype(elbow_arrest_dates):
    # Still raw dates: convert them to day ordinals. (If the column is already numeric, the
    # later cell has been run and the values are ordinals already.)
    elbow_arrest_dates = pd.to_datetime(elbow_arrest_dates).map(pd.Timestamp.toordinal)

# Same three features the clustering cell uses; nothing in expanded_with_charge_list is modified
elbow_matrix = np.column_stack([
    elbow_arrest_dates.to_numpy(),
    LabelEncoder().fit_transform(expanded_with_charge_list["Case Filed"]),
    LabelEncoder().fit_transform(expanded_with_charge_list["Case Type"]),
])
elbow_features_scaled = StandardScaler().fit_transform(elbow_matrix)

cluster_range = range(1, 10)  # candidate cluster counts k = 1..9
inertia = []

for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)  # Set n_init explicitly
    kmeans.fit(elbow_features_scaled)
    inertia.append(kmeans.inertia_)

# Plot the Elbow Curve
plt.figure(figsize=(10, 5))
plt.plot(cluster_range, inertia, marker="o")
plt.title("Elbow Method for Best Number of Clusters")
plt.xlabel("Clusters (k)")
plt.ylabel("Inertia")
plt.show()

In [None]:
# Encode the categorical columns in place, keeping one fitted encoder per column
label_encoders = {}
for col in ["Case Filed", "Case Type"]:
    label_encoders[col] = LabelEncoder()
    expanded_with_charge_list[col] = label_encoders[col].fit_transform(expanded_with_charge_list[col])

# Convert dates to day ordinals. Skip the conversion when the column is already numeric:
# on a re-run pd.to_datetime would read the ordinals as nanoseconds since the epoch and
# silently collapse every date to the same value.
if not pd.api.types.is_numeric_dtype(expanded_with_charge_list["Arrest Date"]):
    expanded_with_charge_list["Arrest Date"] = pd.to_datetime(expanded_with_charge_list["Arrest Date"]).map(pd.Timestamp.toordinal)

# Standardize the three features so no single one dominates the distance computation
features = expanded_with_charge_list[["Arrest Date", "Case Filed", "Case Type"]]
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# n_init is set explicitly (matching the elbow cell): the library default differs between
# scikit-learn versions, which would make the cluster assignment version-dependent.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
expanded_with_charge_list["Cluster"] = kmeans.fit_predict(features_scaled)

expanded_with_charge_list.head()

In [None]:
# Show the distinct charges in each cluster.
# Charges are de-duplicated and sorted: the exploded frame repeats a charge once per row,
# so printing the raw list floods the output. Values are cast to str so join never fails.
charge_groups = expanded_with_charge_list.groupby("Cluster")["Charge"].apply(
    lambda cluster_rows: sorted(cluster_rows.astype(str).unique())
)

# Loop names are distinct from the notebook-level `charges` set so it is not overwritten
for cluster_id, cluster_charges in charge_groups.items():
    print(f"Cluster {cluster_id}:")
    print(", ".join(cluster_charges))
    print("\n")

In [None]:
# Train and evaluate one random forest per k-means cluster.

# Encode the target as integers 0..n-1 (LabelEncoder assigns codes in sorted class order)
expanded_with_charge_list["Status"] = LabelEncoder().fit_transform(expanded_with_charge_list["Status"])

# NOTE(review): these names are matched positionally to the encoded labels 0, 1, 2 - assumes the
# sorted Status values correspond to this order; TODO confirm against the raw data.
status_names = ["Other Action", "Discharge Only", "Filed"]
status_labels = list(range(len(status_names)))
cluster_feature_columns = ["Arrest Date", "Case Filed", "Case Type"]

# NOTE(review): rows were exploded per charge, so rows that came from the same arrest share
# identical features and may land in both train and test - the accuracy is presumably optimistic.

# Split the data by clusters
results = {}
for cluster in expanded_with_charge_list["Cluster"].unique():
    cluster_data = expanded_with_charge_list[expanded_with_charge_list["Cluster"] == cluster]

    # A train/test split needs at least one row on each side; record a message instead of crashing
    if len(cluster_data) < 2:
        results[cluster] = f"Skipped: only {len(cluster_data)} row(s), too few to split."
        continue

    X = cluster_data[cluster_feature_columns]
    y = cluster_data["Status"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # labels is pinned to all encoded classes: a single cluster's test split can easily miss a
    # class, and without labels classification_report then raises ValueError because the
    # number of classes found no longer matches the three target_names.
    report = classification_report(
        y_test,
        y_pred,
        labels=status_labels,
        target_names=status_names,
        zero_division=0,
    )

    # Keep results for each cluster
    results[cluster] = {"accuracy": accuracy, "report": report}

# Print results (a plain string marks a cluster that was skipped)
for cluster, metrics in results.items():
    print(f"Cluster {cluster}:")
    if isinstance(metrics, str):
        print(metrics)
    else:
        print(f"Accuracy: {metrics['accuracy']:.2f}")
        print("Classification Report:")
        print(metrics["report"])
    print("\n")