In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import kaggle


In [2]:
kaggle.api.dataset_download_files("vivovinco/san-francisco-arrests-presented-and-prosecutions", path="data", unzip=True)

Dataset URL: https://www.kaggle.com/datasets/vivovinco/san-francisco-arrests-presented-and-prosecutions


In [3]:
data = pd.read_csv("data/Arrests_Presented_and_Prosecutions.csv")
data.head()

  data = pd.read_csv("data/Arrests_Presented_and_Prosecutions.csv")


Unnamed: 0,ID,Incident Number,Arrest Date,Court Number,Case Filed,Status,Suspect Charge List,Case Type
0,1,21650836,2011/12/01,2085441,N,Other Action,",11352,",Felony
1,2,110000346,2011/01/01,11000030,N,Discharge Only,",21658(A),23152(A),11550(A),",Misdemeanor
2,3,Z20110101-11000037,2011/01/01,11000037,Y,Filed,",23152(A),23152(B),12500(A),",Misdemeanor
3,4,Z20110101-11000039,2011/01/01,11000039,Y,Filed,",23152(A),23152(B),14601.2(A),",Misdemeanor
4,5,Z20110101-11000046,2011/01/01,11000046,Y,Filed,",14601(A),23152(A),23152(B),",Misdemeanor


In [4]:
data['Suspect Charge List'] = data['Suspect Charge List'].apply(lambda x: x.strip(',').split(','))
data.head()

Unnamed: 0,ID,Incident Number,Arrest Date,Court Number,Case Filed,Status,Suspect Charge List,Case Type
0,1,21650836,2011/12/01,2085441,N,Other Action,[11352],Felony
1,2,110000346,2011/01/01,11000030,N,Discharge Only,"[21658(A), 23152(A), 11550(A)]",Misdemeanor
2,3,Z20110101-11000037,2011/01/01,11000037,Y,Filed,"[23152(A), 23152(B), 12500(A)]",Misdemeanor
3,4,Z20110101-11000039,2011/01/01,11000039,Y,Filed,"[23152(A), 23152(B), 14601.2(A)]",Misdemeanor
4,5,Z20110101-11000046,2011/01/01,11000046,Y,Filed,"[14601(A), 23152(A), 23152(B)]",Misdemeanor


In [5]:
data = data.drop(['Incident Number', 'Court Number'], axis=1)
data.head()

Unnamed: 0,ID,Arrest Date,Case Filed,Status,Suspect Charge List,Case Type
0,1,2011/12/01,N,Other Action,[11352],Felony
1,2,2011/01/01,N,Discharge Only,"[21658(A), 23152(A), 11550(A)]",Misdemeanor
2,3,2011/01/01,Y,Filed,"[23152(A), 23152(B), 12500(A)]",Misdemeanor
3,4,2011/01/01,Y,Filed,"[23152(A), 23152(B), 14601.2(A)]",Misdemeanor
4,5,2011/01/01,Y,Filed,"[14601(A), 23152(A), 23152(B)]",Misdemeanor


In [6]:
charges_list = data.explode('Suspect Charge List')
charge_dummies = pd.get_dummies(charges_list['Suspect Charge List']).groupby(charges_list['ID']).max()
data = data.drop(columns=['Suspect Charge List'])
data = data.merge(charge_dummies, on='ID', how='left')

data.head()

Unnamed: 0,ID,Arrest Date,Case Filed,Status,Case Type,000,100,10085.6A,1009.22(E),1009.22C,...,PAROLVIO,PROBVIO,RA148,RAMEYWT,REENTRY,RWS647F,SAFEKEEP,TFWT,WRNTJUV,XX
0,1,2011/12/01,N,Other Action,Felony,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,2011/01/01,N,Discharge Only,Misdemeanor,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,2011/01/01,Y,Filed,Misdemeanor,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4,2011/01/01,Y,Filed,Misdemeanor,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,5,2011/01/01,Y,Filed,Misdemeanor,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
data['Status'] = data['Status'].astype('category').cat.codes
data = pd.get_dummies(data, columns=['Case Type'], drop_first=True)
data.head()

Unnamed: 0,ID,Arrest Date,Case Filed,Status,000,100,10085.6A,1009.22(E),1009.22C,1009.22D,...,PROBVIO,RA148,RAMEYWT,REENTRY,RWS647F,SAFEKEEP,TFWT,WRNTJUV,XX,Case Type_Misdemeanor
0,1,2011/12/01,N,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,2011/01/01,N,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,3,2011/01/01,Y,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,4,2011/01/01,Y,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,5,2011/01/01,Y,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [8]:
x_vals = data.drop(columns=['ID', 'Arrest Date', 'Case Filed', 'Status'])
y_vals = data['Status']
x_train, x_test, y_train, y_test = train_test_split(x_vals, y_vals, test_size=0.2, random_state=21)

In [9]:
#Want to do elbow method
def elbow_method(max_estimators, x_vals, y_vals, random_state=50):
    accuracies = []
    x_train, x_test, y_train, y_test = train_test_split(x_vals, y_vals, test_size=0.2, random_state=random_state)
    for estimator in range(1, max_estimators + 1):
        model = RandomForestClassifier(n_estimators=estimator, random_state=random_state)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracies.append(accuracy_score(y_test, y_pred))
    
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, max_estimators + 1), accuracies, marker='o')
    plt.xlabel("Number of Estimators")
    plt.ylabel("Accuracy")
    plt.title("Elbow Method for Random Forest: Accuracy vs. Number of Estimators")
    plt.show()

In [10]:
#elbow_method(50, x_vals, y_vals);

In [11]:
model = RandomForestClassifier(n_estimators=20, random_state=21)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
#This function is super helpful - look for similar ones for other classification models
report = classification_report(y_test, y_pred, target_names=['Other Action', 'Discharge Only', 'Filed'])

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.6419311222460828

Classification Report:
                 precision    recall  f1-score   support

  Other Action       0.61      0.67      0.64      9903
Discharge Only       0.70      0.74      0.72     11483
         Filed       0.45      0.26      0.33      3760

      accuracy                           0.64     25146
     macro avg       0.59      0.56      0.56     25146
  weighted avg       0.63      0.64      0.63     25146

