In [19]:
# Enchanced Random Forest Performance
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier

In [20]:
# Load Dataset
df = pd.read_csv('../data/pd_collisions_datasd.csv')

#Fix datetime
df['date_time'] = pd.to_datetime(df['date_time'])
df['hour'] = df['date_time'].dt.hour
df['day_of_week'] = df['date_time'].dt.dayofweek
df['month'] = df['date_time'].dt.month
df['is_weekend'] = df['day_of_week'].apply(lambda x: x >= 5)
df['rush_hour'] = df['hour'].apply(lambda x: 7 <= x <= 9 or 16 <= x <= 18)


In [21]:
# Target Engineering
df['severity'] = df['injured'] + 3 * df['killed']
df['severity_label'] = pd.cut(df['severity'], bins = [-1, 0, 2, 100], labels=['none', 'injury', 'fatal'])

# Interaction features
df['beat_rush'] = df['police_beat'].astype(str) + "_" + df['rush_hour'].astype(str)
df['beat_weekend'] = df['police_beat'].astype(str) + '_' + df['is_weekend'].astype(str)

# Select features
features = ['hour', 'day_of_week', 'month', 'is_weekend', 'rush_hour',
            'violation_type', 'hit_run_lvl', 'beat_rush', 'beat_weekend']


In [22]:
# Add one-hot encoded police_beat
df = pd.get_dummies(df, columns=['police_beat'], drop_first=True)
features += [col for col in df.columns if col.startswith('police_beat_')]

target = 'severity_label'
df_model = df[features + [target]].dropna()

# Encode categorical features
cat_cols = ['violation_type', 'hit_run_lvl', 'beat_rush', 'beat_weekend']
for col in cat_cols:
    df_model[col] = LabelEncoder().fit_transform(df_model[col].astype(str))

df_model['is_weekend'] = df_model['is_weekend'].astype(int)
df_model['rush_hour'] = df_model['rush_hour'].astype(int)
df_model[target] = LabelEncoder().fit_transform(df_model[target])

In [23]:
# Prepare data
X = df_model.drop(columns=[target])
y = df_model[target]

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

# Oversample with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [24]:
# Initalize models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=20, class_weight='balanced',random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=5000, class_weight='balanced', random_state=42),
    'XGBoost' : XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, event_metric='mlogloss', use_label_encoder=False, random_state=42 )
}

results = {}

In [25]:
# Fit models and eval
for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    results[name] = report
    print(f"Classification Report for {name}: ")
    print(classification_report(y_test, y_pred))



Classification Report for Random Forest: 
              precision    recall  f1-score   support

           0       0.04      0.25      0.07        51
           1       0.17      0.23      0.19      1018
           2       0.88      0.79      0.83      6543

    accuracy                           0.71      7612
   macro avg       0.36      0.42      0.36      7612
weighted avg       0.78      0.71      0.74      7612



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report for LogisticRegression: 
              precision    recall  f1-score   support

           0       0.14      0.20      0.16        51
           1       0.19      0.24      0.22      1018
           2       0.88      0.84      0.86      6543

    accuracy                           0.76      7612
   macro avg       0.40      0.43      0.41      7612
weighted avg       0.78      0.76      0.77      7612



Parameters: { "event_metric", "use_label_encoder" } are not used.



Classification Report for XGBoost: 
              precision    recall  f1-score   support

           0       0.05      0.22      0.08        51
           1       0.17      0.32      0.22      1018
           2       0.88      0.73      0.80      6543

    accuracy                           0.67      7612
   macro avg       0.37      0.42      0.37      7612
weighted avg       0.78      0.67      0.72      7612



In [27]:
#Build ensemble with weighted voting
ensemble = VotingClassifier(estimators=[
    ('rf', models['Random Forest']),
    ('lr', models['LogisticRegression']),
    ('xgb', models['XGBoost'])
], voting='soft', weights=[3,1,2])

ensemble.fit(X_train_res, y_train_res)
y_pred_ensemble = ensemble.predict(X_test)
print("\nClassification Report for Voting Ensemble: ")
print(classification_report(y_test, y_pred_ensemble))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "event_metric", "use_label_encoder" } are not used.




Classification Report for Voting Ensemble: 
              precision    recall  f1-score   support

           0       0.07      0.24      0.11        51
           1       0.17      0.25      0.20      1018
           2       0.88      0.80      0.84      6543

    accuracy                           0.72      7612
   macro avg       0.37      0.43      0.38      7612
weighted avg       0.78      0.72      0.75      7612

