In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_text
from sklearn.tree import plot_tree

In [2]:
df = pd.read_csv("ksi_collisions.csv")

In [3]:
df['ROAD_CONDITION'] = df['ROAD_CONDITION'].fillna('Unknown')

In [4]:
df.sample(2)

Unnamed: 0,DATE,STREET1,STREET2,DISTRICT,LATITUDE,LONGITUDE,VISIBILITY,LIGHT,ROAD_CONDITION,ACCLASS,...,FATAL_NO,HOOD_158,NEIGHBOURHOOD_158,DOW,HOUR,YEAR,MONTH,SEVERE_COLLISION,TIME_OF_DAY,SEASON
17726,2022-06-04,MARKHAM RD,STEELES AVE E,Scarborough,43.836528,-79.251148,Clear,Daylight,Dry,Non-Fatal Injury,...,0.0,144,Morningside Heights,Saturday,19,2022,6,1,Night,Summer
12029,2015-10-24,DON VALLEY PARKWAY N,DON MILLS RD,,43.700781,-79.33662,Rain,Daylight,Wet,Non-Fatal Injury,...,0.0,58,Old East York,Saturday,14,2015,10,1,Evening,Fall


In [5]:
categorical_features = ['LIGHT', 'VISIBILITY', 'ROAD_CONDITION', 'DOW', 'TIME_OF_DAY', 'SEASON']

In [6]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [7]:
encoded_array = encoder.fit_transform(df[categorical_features])
encoded_cols = encoder.get_feature_names_out(categorical_features)

In [8]:
encoded_df = pd.DataFrame(encoded_array, columns=encoded_cols, index=df.index)

In [9]:
df_encoded = pd.concat([df, encoded_df], axis=1)

In [10]:
df_encoded = df_encoded.drop(columns=categorical_features)

In [11]:
corr = df_encoded.corr(numeric_only=True)
corr_with_target = corr['SEVERE_COLLISION'].sort_values(ascending=False)
print(corr_with_target)

SEVERE_COLLISION                 1.000000
FATAL_NO                         0.195410
YEAR                             0.066061
TIME_OF_DAY_Late Night           0.025684
LIGHT_Dark                       0.025149
LATITUDE                         0.019677
DOW_Tuesday                      0.018317
DOW_Thursday                     0.015684
MONTH                            0.015181
SEASON_Fall                      0.014702
LONGITUDE                        0.012806
ROAD_CONDITION_Wet               0.007554
VISIBILITY_Rain                  0.005435
LIGHT_Dusk                       0.004547
SEASON_Winter                    0.003167
TIME_OF_DAY_Morning_Afternoon    0.002464
TIME_OF_DAY_Night                0.002347
VISIBILITY_Clear                 0.000853
DOW_Wednesday                   -0.000944
ROAD_CONDITION_Dry              -0.001031
LIGHT_Dawn                      -0.001584
SEASON_Summer                   -0.002329
DOW_Monday                      -0.004175
DOW_Friday                      -0

In [35]:
X = encoded_df
y = df['SEVERE_COLLISION']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(
    n_estimators=8,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features = None,
    random_state=42
)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,8
,criterion,'gini'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [36]:
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.65      0.60      2040
           1       0.50      0.40      0.45      1752

    accuracy                           0.53      3792
   macro avg       0.53      0.53      0.52      3792
weighted avg       0.53      0.53      0.53      3792



In [38]:
user_input = pd.DataFrame([{
    'LIGHT': 'Day',
    'VISIBILITY': 'Clear',
    'ROAD_CONDITION': 'Dry',
    'DOW': 'Tuesday',
    'TIME_OF_DAY': 'Evening Rush',
    'SEASON': 'Spring'
}])
encoded_input = encoder.transform(user_input)
encoded_input_df = pd.DataFrame(encoded_input, columns=encoder.get_feature_names_out(categorical_features))
rf.predict_proba(encoded_input_df)[:, 1][0]

np.float64(0.46693346860181784)

In [39]:
rf.score(X_test, y_test)

0.5348101265822784