In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import kaggle


In [2]:
data = pd.read_csv("data/Arrests_Presented_and_Prosecutions.csv")
data.head()

  data = pd.read_csv("data/Arrests_Presented_and_Prosecutions.csv")


Unnamed: 0,ID,Incident Number,Arrest Date,Court Number,Case Filed,Status,Suspect Charge List,Case Type
0,1,21650836,2011/12/01,2085441,N,Other Action,",11352,",Felony
1,2,110000346,2011/01/01,11000030,N,Discharge Only,",21658(A),23152(A),11550(A),",Misdemeanor
2,3,Z20110101-11000037,2011/01/01,11000037,Y,Filed,",23152(A),23152(B),12500(A),",Misdemeanor
3,4,Z20110101-11000039,2011/01/01,11000039,Y,Filed,",23152(A),23152(B),14601.2(A),",Misdemeanor
4,5,Z20110101-11000046,2011/01/01,11000046,Y,Filed,",14601(A),23152(A),23152(B),",Misdemeanor


In [3]:
data['Suspect Charge List'] = data['Suspect Charge List'].apply(lambda x: x.strip(',').split(','))
data = data.dropna()
data.head()

Unnamed: 0,ID,Incident Number,Arrest Date,Court Number,Case Filed,Status,Suspect Charge List,Case Type
0,1,21650836,2011/12/01,2085441,N,Other Action,[11352],Felony
1,2,110000346,2011/01/01,11000030,N,Discharge Only,"[21658(A), 23152(A), 11550(A)]",Misdemeanor
2,3,Z20110101-11000037,2011/01/01,11000037,Y,Filed,"[23152(A), 23152(B), 12500(A)]",Misdemeanor
3,4,Z20110101-11000039,2011/01/01,11000039,Y,Filed,"[23152(A), 23152(B), 14601.2(A)]",Misdemeanor
4,5,Z20110101-11000046,2011/01/01,11000046,Y,Filed,"[14601(A), 23152(A), 23152(B)]",Misdemeanor


In [4]:
#This is for the case of removing the suspect charge list and training using other metrics

# Select the necessary columns
data_no_charge = data[['Case Filed', 'Case Type', 'Arrest Date', 'Status']].copy()

# Convert 'Arrest Date' to datetime
data_no_charge['Arrest Date'] = pd.to_datetime(data_no_charge['Arrest Date'])

# Define a reference date
reference_date = pd.to_datetime("2010-12-01")

# Subtract the reference date to calculate days
data_no_charge['Arrest Date'] = (data_no_charge['Arrest Date'] - reference_date).dt.days

# Encode categorical variables
label_encoder = LabelEncoder()
data_no_charge['Case Filed'] = label_encoder.fit_transform(data_no_charge['Case Filed'])
data_no_charge['Case Type'] = label_encoder.fit_transform(data_no_charge['Case Type'])

data_no_charge.head()

Unnamed: 0,Case Filed,Case Type,Arrest Date,Status
0,0,0,365,Other Action
1,0,1,31,Discharge Only
2,1,1,31,Filed
3,1,1,31,Filed
4,1,1,31,Filed


In [5]:
#setting training and test data
x_vals = data_no_charge.drop(columns=['Status'])
y_vals = data['Status']
x_train, x_test, y_train, y_test = train_test_split(x_vals, y_vals, test_size=0.2, random_state=21)

In [6]:
#Here is the model for random forest
model = RandomForestClassifier(n_estimators=20, random_state=21)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
#This function is super helpful - look for similar ones for other classification models
report = classification_report(y_test, y_pred, target_names=['Other Action', 'Discharge Only', 'Filed'])

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.8249849729513123

Classification Report:
                 precision    recall  f1-score   support

  Other Action       0.73      0.87      0.80      3915
Discharge Only       0.98      1.00      0.99      4600
         Filed       0.35      0.15      0.21      1467

      accuracy                           0.82      9982
     macro avg       0.69      0.68      0.67      9982
  weighted avg       0.79      0.82      0.80      9982



In [22]:
#This is the example with one hote encoded the Suspect Charge List


data_with_charge = data[['ID', 'Case Filed', 'Case Type', 'Arrest Date', 'Status']].copy()

# Convert 'Arrest Date' to datetime
data_with_charge['Arrest Date'] = pd.to_datetime(data_with_charge['Arrest Date'])

# Define a reference date
reference_date = pd.to_datetime("2010-12-01")

# Subtract the reference date to calculate days
data_with_charge['Arrest Date'] = (data_with_charge['Arrest Date'] - reference_date).dt.days

# Encode categorical variables
label_encoder = LabelEncoder()
data_with_charge['Case Filed'] = label_encoder.fit_transform(data_with_charge['Case Filed'])
data_with_charge['Case Type'] = label_encoder.fit_transform(data_with_charge['Case Type'])

data_with_charge.head()

Unnamed: 0,ID,Case Filed,Case Type,Arrest Date,Status
0,1,0,0,365,Other Action
1,2,0,1,31,Discharge Only
2,3,1,1,31,Filed
3,4,1,1,31,Filed
4,5,1,1,31,Filed


In [23]:
data_with_charge['Status'] = data_with_charge['Status'].astype('category').cat.codes
data_with_charge = pd.get_dummies(data_with_charge, columns=['Case Type'], drop_first=True)
data_with_charge.head()

Unnamed: 0,ID,Case Filed,Arrest Date,Status,Case Type_1
0,1,0,365,2,False
1,2,0,31,0,True
2,3,1,31,1,True
3,4,1,31,1,True
4,5,1,31,1,True


In [15]:
x_vals = data_with_charge.drop(columns=['ID', 'Status'])
y_vals = data_with_charge['Status']
x_train, x_test, y_train, y_test = train_test_split(x_vals, y_vals, test_size=0.2, random_state=21)

In [16]:
model = RandomForestClassifier(n_estimators=20, random_state=21)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
#This function is super helpful - look for similar ones for other classification models
report = classification_report(y_test, y_pred, target_names=['Other Action', 'Discharge Only', 'Filed'])

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

ValueError: could not convert string to float: 'Y'