In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import class_weight


In [4]:
df = pd.read_csv('../data/homicide_cleaned.csv', parse_dates=['Incident_Date'])
df = df.dropna(subset=['Latitude','Longitude','Weapon'])


In [5]:
le = LabelEncoder()
df['Weapon_Label'] = le.fit_transform(df['Weapon_Simplified'])

In [6]:
df['Year'] = df['Incident_Date'].dt.year
df['Month'] = df['Incident_Date'].dt.month
df['Weekday'] = df['Incident_Date'].dt.dayofweek

In [7]:
def age_group(age):
    if pd.isnull(age):
        return'Unknown'
    elif age < 18:
        return 'Minor'
    elif 18 <= age < 30:
        return 'Young Adult'
    elif 30 <= age < 50:
        return 'Adult'
    elif 50 <= age < 70:
        return 'Senior'
    else:
        return 'Elderly'
df['Age_Group'] = df['Age'].apply(age_group)
print(df['Age_Group'].value_counts())

Age_Group
Young Adult    415
Adult          295
Senior          91
Minor           85
Unknown         21
Elderly          5
Name: count, dtype: int64


In [8]:
#Season features

def season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'
df['Season'] = df['Month'].apply(season)

print(df['Season'].value_counts())

Season
Summer    239
Spring    233
Fall      224
Winter    216
Name: count, dtype: int64


In [9]:
df['IsWeekend'] = df['Weekday'].apply(lambda x: 1 if x >= 5 else 0)

In [10]:
features = ['Year', 'Month', 'Weekday', 'Season', 'Age_Group', 'IsWeekend']

df_encoded = pd.get_dummies(df[features], drop_first=True)

X = df_encoded
y = df['Weapon_Label']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [12]:
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=le.classes_,
    y=df['Weapon_Simplified']
)

class_weights_dict = {i:weight for i, weight in enumerate(weights)}

print('Class Weights:', class_weights_dict)

Class Weights: {0: 4.903225806451613, 1: 0.3989501312335958, 2: 3.4545454545454546}


In [13]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

clf = RandomForestClassifier(class_weight=class_weights_dict, random_state=42)

grid = GridSearchCV(clf, param_grid, cv=3, n_jobs=-1, scoring='f1_weighted')
grid.fit(X_train, y_train)

print('Best Parameters:', grid.best_params_)

Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}


In [14]:
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print('Tuned Classification Report:')
print(classification_report(y_test, y_pred, target_names=le.classes_))

Tuned Classification Report:
              precision    recall  f1-score   support

       Blade       0.10      0.08      0.09        12
         Gun       0.85      0.90      0.87       153
       Other       0.17      0.11      0.13        18

    accuracy                           0.77       183
   macro avg       0.37      0.36      0.37       183
weighted avg       0.73      0.77      0.75       183

