# Accident Severity Prediction

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import numpy as np
import joblib
import pickle

In [5]:
!pip install imblearn



In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

In [7]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

In [8]:
df_full_data = pd.read_csv("df_ready_for_ml.csv")

In [9]:
df_full_data.count()

Severity            1785064
Start_Lat           1785064
Start_Lng           1785064
Start_Time          1785064
Temperature(F)      1785064
Visibility(mi)      1785064
Wind_Speed(mph)     1785064
Weather_Category    1785064
Pressure(in)        1785064
dtype: int64

In [10]:
# Convert Start_Time and End_Time to datetypes
df_full_data['Start_Time'] = pd.to_datetime(df_full_data['Start_Time'], errors='coerce')

# Extract year, month, day, hour and weekday
df_full_data['Year']=df_full_data['Start_Time'].dt.year
df_full_data['Month']=df_full_data['Start_Time'].dt.strftime('%b')
df_full_data['Day']=df_full_data['Start_Time'].dt.day
df_full_data['Hour']=df_full_data['Start_Time'].dt.hour
df_full_data['Weekday']=df_full_data['Start_Time'].dt.strftime('%a')

In [11]:
df_full_noUN=df_full_data[df_full_data["Weather_Category"] != "UNKNOWN"]

In [12]:
# df_full_data[df_full_data["Weather_Category"] == "Rain"]

In [13]:
df_full_noUN.count()

Severity            1781072
Start_Lat           1781072
Start_Lng           1781072
Start_Time          1781072
Temperature(F)      1781072
Visibility(mi)      1781072
Wind_Speed(mph)     1781072
Weather_Category    1781072
Pressure(in)        1781072
Year                1781072
Month               1781072
Day                 1781072
Hour                1781072
Weekday             1781072
dtype: int64

In [15]:
df_full_noUN.groupby("Severity").count()

Unnamed: 0_level_0,Start_Lat,Start_Lng,Start_Time,Temperature(F),Visibility(mi),Wind_Speed(mph),Weather_Category,Pressure(in),Year,Month,Day,Hour,Weekday
Severity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2,1159736,1159736,1159736,1159736,1159736,1159736,1159736,1159736,1159736,1159736,1159736,1159736,1159736
3,566943,566943,566943,566943,566943,566943,566943,566943,566943,566943,566943,566943,566943
4,54393,54393,54393,54393,54393,54393,54393,54393,54393,54393,54393,54393,54393


In [16]:
print(df_full_noUN['Severity'].value_counts()/len(df_full_noUN))

2    0.651145
3    0.318316
4    0.030539
Name: Severity, dtype: float64


In [None]:
# df_full_normalized = df_full_noUN.sample(frac=1)

# # sevr1_df = df_full_normalized.loc[df_full_normalized['Severity'] == 1]
# # sevr2_df = df_full_normalized.loc[df_full_normalized['Severity'] == 2][:1159736]
# # sevr3_df = df_full_normalized.loc[df_full_normalized['Severity'] == 3][:1726679]

# sevr1_df = df_full_normalized.loc[df_full_normalized['Severity'] == 1]
# sevr2_df = df_full_normalized.loc[df_full_normalized['Severity'] == 2]
# sevr3_df = df_full_normalized.loc[df_full_normalized['Severity'] == 3]

# normal_distributed_df = pd.concat([sevr1_df, sevr2_df, sevr3_df])

# # Shuffle dataframe rows
# normal_distributed_df = normal_distributed_df.sample(frac=1, random_state=101)

# normal_distributed_df.tail()

In [None]:
# print(normal_distributed_df['Severity'].value_counts()/len(normal_distributed_df))

In [14]:
# normal_distributed_df.count()

In [27]:
# X = df_full_noUN[["Start_Lat","Start_Lng", "Temperature(F)", "Visibility(mi)", "Wind_Speed(mph)", "Weather_Category", "Weekday", "Hour"]]
X = df_full_noUN[["Start_Lat","Start_Lng","Temperature(F)", "Visibility(mi)", "Weather_Category","Weekday" ]]
# X = normal_distributed_df[["Start_Lat","Start_Lng", "Temperature(F)", "Visibility(mi)", "Weather_Category","Weekday" ]]
label_encoder = LabelEncoder()
data = X.copy()

label_encoder.fit(data["Weather_Category"])
X["Weather_Category"]=label_encoder.transform(data["Weather_Category"])

In [28]:
label_encoder.classes_

array(['Clear', 'Cloudy', 'Low Visibility', 'Rain', 'Slippery'],
      dtype=object)

In [29]:
label_encoder.fit(data["Weekday"])
X["Weekday"]=label_encoder.transform(data["Weekday"])

In [30]:
label_encoder.classes_

array(['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed'], dtype=object)

In [31]:
y = df_full_noUN["Severity"]
# y = normal_distributed_df["Severity"]
print(X.shape, y.shape)

(1781072, 6) (1781072,)


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101, stratify =y)
data = X.copy()

# With Random Forest Classifier (UnderSampling)

In [35]:
pipe_rf = make_pipeline_imb(RandomUnderSampler(),
                         StandardScaler(),
                         RandomForestClassifier())

pipe_rf.fit(X_train, y_train)
y_pred = pipe_rf.predict(X_test)

In [36]:
print(classification_report_imbalanced(y_test, y_pred))
print(f"Training Data Score with Random Forest Classifier US: {pipe_rf.score(X_train, y_train)}")
print(f"Testing Data Score with Random Forest Classifier US: {pipe_rf.score(X_test, y_test)}")

                   pre       rec       spe        f1       geo       iba       sup

          2       0.85      0.62      0.80      0.72      0.70      0.48    289934
          3       0.55      0.62      0.76      0.58      0.69      0.46    141736
          4       0.14      0.75      0.85      0.23      0.80      0.63     13598

avg / total       0.73      0.62      0.79      0.66      0.70      0.48    445268

Training Data Score with Random Forest Classifier US: 0.6544957194318927
Testing Data Score with Random Forest Classifier US: 0.6232111896655498


# With Random Forest Classifier (Over Sampling)

In [37]:
pipe_rf_OS = make_pipeline_imb(SMOTE(),
                         StandardScaler(),
                         RandomForestClassifier())

pipe_rf_OS.fit(X_train, y_train)
y_pred = pipe_rf_OS.predict(X_test)

In [38]:
print(classification_report_imbalanced(y_test, y_pred))
print(f"Training Data Score with Random Forest Classifier OS: {pipe_rf_OS.score(X_train, y_train)}")
print(f"Testing Data Score with Random Forest Classifier OS: {pipe_rf_OS.score(X_test, y_test)}")

                   pre       rec       spe        f1       geo       iba       sup

          2       0.85      0.79      0.74      0.82      0.76      0.58    289934
          3       0.63      0.71      0.81      0.67      0.76      0.57    141736
          4       0.40      0.50      0.98      0.45      0.70      0.47     13598

avg / total       0.77      0.76      0.77      0.76      0.76      0.58    445268

Training Data Score with Random Forest Classifier OS: 0.6544957194318927
Testing Data Score with Random Forest Classifier OS: 0.6232111896655498


# RFC with GridsearchCV

In [25]:
model1 = make_pipeline_imb(SMOTE(), StandardScaler(), RandomForestClassifier())
paramgrid = {"randomforestclassifier__n_estimators": [1,5,10],
             "randomforestclassifier__max_depth":[10, 15, 20] }
clf_grid_model1 = GridSearchCV(model1, paramgrid)
clf_grid_model1.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('smote',
                                        SMOTE(k_neighbors=5, kind='deprecated',
                                              m_neighbors='deprecated',
                                              n_jobs=1, out_step='deprecated',
                                              random_state=None, ratio=None,
                                              sampling_strategy='auto',
                                              svm_estimator='deprecated')),
                                       ('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestc...
                                                               min_weight_fraction_lea

In [57]:
y_pred = clf_grid_model1.predict(X_test)

In [58]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          2       0.85      0.71      0.76      0.77      0.74      0.54    289934
          3       0.58      0.69      0.77      0.63      0.73      0.53    141736
          4       0.21      0.52      0.94      0.30      0.70      0.47     13598

avg / total       0.74      0.70      0.77      0.71      0.73      0.53    445268



In [26]:
print(f"Training Accuracy with Random Forest Classifier with grid search: {clf_grid_model1.score(X_train, y_train)}")
print(f"Testing Accuracy with Random Forest Classifier with grid search: {clf_grid_model1.score(X_test, y_test)}")

Training Accuracy with Random Forest Classifier with grid search: 0.7779741638743408
Testing Accuracy with Random Forest Classifier with grid search: 0.6996550392123395


In [28]:
clf_grid_model1.best_params_

{'randomforestclassifier__max_depth': 20,
 'randomforestclassifier__n_estimators': 10}

# Save the model with best accuracy

In [33]:
filename = 'model\final_acc_sev_pred_grid.pkl'
joblib.dump(clf_grid_model1, filename)

['final_acc_sev_pred_grid.pkl']

# RFC with Grid Search to try another Hyper Parameter

In [39]:
pipe_rf_OS_grid = make_pipeline_imb(SMOTE(), StandardScaler(), RandomForestClassifier(n_estimators=10))
paramgrid = {"randomforestclassifier__max_depth":[50, 75, 100] }
rf_grid_OS = GridSearchCV(pipe_rf_OS_grid, paramgrid)
rf_grid_OS.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('smote',
                                        SMOTE(k_neighbors=5, kind='deprecated',
                                              m_neighbors='deprecated',
                                              n_jobs=1, out_step='deprecated',
                                              random_state=None, ratio=None,
                                              sampling_strategy='auto',
                                              svm_estimator='deprecated')),
                                       ('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestc...
                                                               min_impurity_split=None

In [40]:
y_pred_OS = rf_grid_OS.predict(X_test)

In [51]:
rf_grid_OS.best_params_

{'randomforestclassifier__max_depth': 75}

In [41]:
print(classification_report_imbalanced(y_test, y_pred_OS))
print(f"Training Data Score with Random Forest Classifier OS_Gridsearch: {rf_grid_OS.score(X_train, y_train)}")
print(f"Testing Data Score with Random Forest Classifier OS_Gridsearch: {rf_grid_OS.score(X_test, y_test)}")

                   pre       rec       spe        f1       geo       iba       sup

          2       0.82      0.79      0.69      0.81      0.74      0.55    289934
          3       0.62      0.66      0.81      0.64      0.73      0.53    141736
          4       0.38      0.48      0.98      0.42      0.68      0.44     13598

avg / total       0.75      0.74      0.73      0.74      0.73      0.54    445268

Training Data Score with Random Forest Classifier OS_Gridsearch: 0.9869831202781246
Testing Data Score with Random Forest Classifier OS_Gridsearch: 0.7393412506625223


In [48]:
pipe_rf_OS_gridpm = make_pipeline_imb(SMOTE(), StandardScaler(), RandomForestClassifier(n_estimators=10, max_depth=20))

pipe_rf_OS_gridpm.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('smote',
                 SMOTE(k_neighbors=5, kind='deprecated',
                       m_neighbors='deprecated', n_jobs=1,
                       out_step='deprecated', random_state=None, ratio=None,
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=20, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
           

In [49]:
y_pred_OS_gridpm = pipe_rf_OS_gridpm.predict(X_test)

In [50]:
print(classification_report_imbalanced(y_test, y_pred_OS_gridpm))
print(f"Training Data Score with Random Forest Classifier OS_Gridsearch: {pipe_rf_OS_gridpm.score(X_train, y_train)}")
print(f"Testing Data Score with Random Forest Classifier OS_Gridsearch: {pipe_rf_OS_gridpm.score(X_test, y_test)}")

                   pre       rec       spe        f1       geo       iba       sup

          2       0.84      0.71      0.74      0.77      0.72      0.52    289934
          3       0.57      0.67      0.76      0.62      0.71      0.51    141736
          4       0.21      0.51      0.94      0.29      0.69      0.46     13598

avg / total       0.73      0.69      0.76      0.70      0.72      0.52    445268

Training Data Score with Random Forest Classifier OS_Gridsearch: 0.7672824755727636
Testing Data Score with Random Forest Classifier OS_Gridsearch: 0.6891422693748485


# Random Forest Classifier (UnderSampling) with GridSearch

In [33]:
pipe_rf_undersample = make_pipeline_imb(RandomUnderSampler(),
                         StandardScaler(),
                         RandomForestClassifier(n_estimators=10, max_depth=75))

pipe_rf_undersample.fit(X_train, y_train)
y_pred_undersample = pipe_rf_undersample.predict(X_test)

In [34]:
print(classification_report_imbalanced(y_test, y_pred_undersample))
print(f"Training Data Score with Random Forest Classifier US: {pipe_rf_undersample.score(X_train, y_train)}")
print(f"Testing Data Score with Random Forest Classifier US: {pipe_rf_undersample.score(X_test, y_test)}")

                   pre       rec       spe        f1       geo       iba       sup

          2       0.83      0.63      0.75      0.72      0.69      0.47    289934
          3       0.54      0.58      0.77      0.56      0.67      0.44    141736
          4       0.14      0.71      0.86      0.23      0.78      0.60     13598

avg / total       0.72      0.62      0.76      0.65      0.69      0.46    445268

Training Data Score with Random Forest Classifier US: 0.6509922114322161
Testing Data Score with Random Forest Classifier US: 0.619123763665927


# Testing the saved model

In [53]:
my_model_rf_grid = joblib.load("model/final_acc_sev_pred_grid.pkl")

# Austin Weather-severity

In [54]:
my_model_rf_grid.predict([[30.26, -97.74, 60, 2.5, 3, 2]])

array([2])

In [108]:
my_model_rf_grid.predict([[30.26, -97.74, 60, 2.5, 3, 2]])

array([2])

In [55]:
my_model_rf_grid.predict([[30.26, -97.74, 90, 20, 0, 2]])

array([2])

In [57]:
my_model_rf_grid.predict([[39.77, -84.19, 85, 2.5, 0, 4]])

array([2])

In [58]:
my_model_rf_grid.predict([[39.77, -84.19, 78.2, 0, 1, 4]])

array([3])

In [60]:
my_model_rf_grid.predict([[30.77, -91.44, 60, 2.5, 3, 4]])

array([4])

In [56]:
my_model_rf_grid.predict([[30.77, -91.49, 60, 2.5, 3, 2]])

array([3])

In [61]:
my_model_rf_grid.predict([[40.44, -79.99, 60, 2.5, 3, 4]])

array([3])

In [62]:
my_model_rf_grid.predict([[40.44, -79.99, 60, 0, 4, 4]])

array([3])

In [63]:
my_model_rf_grid.predict([[40.44, -79.99, 60, 0, 2, 4]])

array([2])

In [66]:
model1_US = make_pipeline_imb(RandomUnderSampler(), StandardScaler(), RandomForestClassifier(n_estimators=10))
paramgrid = {"randomforestclassifier__max_depth":[30, 75, 90] }
clf_grid_model1_US = GridSearchCV(model1_US, paramgrid)
clf_grid_model1_US.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('randomundersampler',
                                        RandomUnderSampler(random_state=None,
                                                           ratio=None,
                                                           replacement=False,
                                                           return_indices=False,
                                                           sampling_strategy='auto')),
                                       ('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(bootstrap=True,
                                               

In [67]:
y_pred = clf_grid_model1_US.predict(X_test)

In [68]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          2       0.82      0.63      0.75      0.71      0.68      0.46    289934
          3       0.53      0.58      0.76      0.55      0.66      0.43    141736
          4       0.13      0.70      0.86      0.23      0.78      0.59     13598

avg / total       0.71      0.61      0.76      0.65      0.68      0.46    445268



In [69]:
print(f"Training Accuracy with Random Forest Classifier with grid search: {clf_grid_model1_US.score(X_train, y_train)}")
print(f"Testing Accuracy with Random Forest Classifier with grid search: {clf_grid_model1_US.score(X_test, y_test)}")


Training Accuracy with Random Forest Classifier with grid search: 0.6452286413276199
Testing Accuracy with Random Forest Classifier with grid search: 0.6130981790741755


In [70]:
clf_grid_model1_US.best_params_

{'randomforestclassifier__max_depth': 75}

In [65]:
filename = 'final_acc_sev_pred_grid_US.pkl'
joblib.dump(clf_grid_model1_US, filename)

['final_acc_sev_pred_grid_US.pkl']

In [71]:
my_model_rf_US = joblib.load("final_acc_sev_pred_grid_US.pkl")

In [72]:
my_model_rf_US.predict([[30.77, -91.44, 60, 2.5, 3, 2]])

array([3])

In [73]:
my_model_rf_US.predict([[30.77, -91.44, 60, 5, 0, 2]])

array([4])

In [74]:
my_model_rf_US.predict([[30.77, -91.44, 60, 0, 0, 2]])

array([2])

# Logistic Regression with Over Sampling

In [64]:
lg_pipe = make_pipeline_imb(SMOTE(),
                         StandardScaler(),
                         LogisticRegression())

lg_pipe.fit(X_train, y_train)
y_pred = lg_pipe.predict(X_test)

In [65]:
print(classification_report_imbalanced(y_test, y_pred))
print(f"Training Data Score with Logistic Classifier: {lg_pipe.score(X_train, y_train)}")
print(f"Testing Data Score with Logistic Classifier: {lg_pipe.score(X_test, y_test)}")

                   pre       rec       spe        f1       geo       iba       sup

          2       0.67      0.42      0.62      0.52      0.51      0.25    289934
          3       0.33      0.20      0.81      0.25      0.40      0.15    141736
          4       0.05      0.59      0.60      0.08      0.60      0.36     13598

avg / total       0.54      0.35      0.68      0.42      0.48      0.22    445268

Training Data Score with Logistic Classifier: 0.35346952097762846
Testing Data Score with Logistic Classifier: 0.35336696102122767


# Support Vector Classifier with Under Sampling

In [153]:
svc_pipe = make_pipeline_imb(RandomUnderSampler(),
                         StandardScaler(),
                         SVC(kernel='linear'))

svc_pipe.fit(X_train, y_train)
y_pred = svc_pipe.predict(X_test)

In [None]:
print(classification_report_imbalanced(y_test, y_pred))
print(f"Training Data Score with Support Vector Classifier: {svc_pipe.score(X_train, y_train)}")
print(f"Testing Data Score with Support Vector Classifier: {svc_pipe.score(X_test, y_test)}")

                   pre       rec       spe        f1       geo       iba       sup

          2       0.66      0.52      0.50      0.58      0.51      0.26    289934
          3       0.28      0.02      0.98      0.04      0.14      0.02    141736
          4       0.04      0.66      0.54      0.08      0.60      0.36     13598

avg / total       0.52      0.36      0.65      0.39      0.39      0.19    445268

