In [4]:
import json

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [5]:
# Read the cleaned homicide records, keep only rows that have coordinates and
# a weapon description, then derive the calendar features used as model inputs.
df = (
    pd.read_csv('../data/homicide_cleaned.csv', parse_dates=['Incident_Date'])
    .dropna(subset=['Latitude', 'Longitude', 'Weapon'])
    .assign(
        Year=lambda frame: frame['Incident_Date'].dt.year,
        Month=lambda frame: frame['Incident_Date'].dt.month,
        # pandas dayofweek: Monday=0 ... Sunday=6
        Weekday=lambda frame: frame['Incident_Date'].dt.dayofweek,
    )
)

# Substrings (lower-case) that identify each coarse weapon class.
GUN_TERMS = ('gun', 'firearm', 'rifle', 'shotgun')
BLADE_TERMS = ('knife', 'cutting instrument', 'sharp', 'razor', 'screwdriver')


def simplify_weapon(weapon):
    """Collapse a free-text weapon description into 'Gun', 'Blade' or 'Other'.

    The description may list several weapons separated by commas. Entries are
    examined in order and the first one that contains a gun or blade keyword
    decides the class; within a single entry, gun keywords are checked before
    blade keywords. Matching is case-insensitive substring matching.

    Parameters
    ----------
    weapon : str or any
        Raw weapon description. Non-string values (e.g. NaN) map to 'Other'.

    Returns
    -------
    str
        One of 'Gun', 'Blade', 'Other'.
    """
    if not isinstance(weapon, str):
        return 'Other'

    for entry in weapon.lower().split(','):
        entry = entry.strip()
        if any(term in entry for term in GUN_TERMS):
            return 'Gun'
        if any(term in entry for term in BLADE_TERMS):
            return 'Blade'

    # Fall back only after every listed weapon has been inspected; returning
    # inside the loop would ignore everything after the first entry.
    return 'Other'

#Apply mapping
df['Weapon_Simplified'] = df['Weapon'].apply(simplify_weapon)
print(df['Weapon_Simplified'].value_counts())

#Prepare features and labels
X = df[['Year', 'Month', 'Weekday']]
y = df['Weapon_Simplified']

#Encode target labels
# Build the categorical once so the integer codes and the code -> name mapping
# are guaranteed to come from the same category ordering.
y_categorical = y.astype('category')
y_encoded = y_categorical.cat.codes
label_mapping = dict(enumerate(y_categorical.cat.categories))
print("Label mapping:", label_mapping)

#Train-test split (stratified so each weapon class keeps its share in both sets)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3,
    random_state=42, stratify=y_encoded)

#Train Random Forest model
# class_weight='balanced' reweights classes inversely to their frequency.
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

#Predictions
y_pred = model.predict(X_test)

#Evaluation
# Pass labels explicitly so rows/columns always follow label_mapping order and
# the report does not fail if a class is missing from the evaluated data.
class_ids = list(label_mapping.keys())
class_names = list(label_mapping.values())
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_ids))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

#Save model for later use
joblib.dump(model, '../models/random_forest_weapon_classifier.pkl')

#Save label mapping for decoding
# JSON object keys are always strings, so the integer codes are written as
# "0", "1", ...; convert them back with int() when loading.
with open('../models/label_mapping.json', 'w') as f:
    json.dump(label_mapping, f)

# NOTE(review): this overwrites the input CSV with the filtered rows and the
# added Year/Month/Weekday/Weapon_Simplified columns, so rows dropped for
# missing Latitude/Longitude/Weapon are permanently removed from that file.
# Consider writing to a separate path if other notebooks need the original.
df.to_csv('../data/homicide_cleaned.csv', index=False)

Weapon_Simplified
Gun      762
Other     88
Blade     62
Name: count, dtype: int64
Label mapping: {0: 'Blade', 1: 'Gun', 2: 'Other'}
Confusion Matrix:
[[  3  15   1]
 [ 11 202  16]
 [  2  23   1]]

Classification Report:
              precision    recall  f1-score   support

       Blade       0.19      0.16      0.17        19
         Gun       0.84      0.88      0.86       229
       Other       0.06      0.04      0.05        26

    accuracy                           0.75       274
   macro avg       0.36      0.36      0.36       274
weighted avg       0.72      0.75      0.74       274

