In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [94]:
df = pd.read_csv("../Dataset/accidents_2012_to_2014.csv", low_memory=False)

In [95]:
#Checking For Null Values
df.isna().sum()

Accident_Index                                      0
Location_Easting_OSGR                               0
Location_Northing_OSGR                              0
Longitude                                           0
Latitude                                            0
Police_Force                                        0
Accident_Severity                                   0
Number_of_Vehicles                                  0
Number_of_Casualties                                0
Date                                                0
Day_of_Week                                         0
Time                                               13
Local_Authority_(District)                          0
Local_Authority_(Highway)                           0
1st_Road_Class                                      0
1st_Road_Number                                     0
Road_Type                                           0
Speed_limit                                         0
Junction_Detail             

In [96]:
df.shape

(464697, 33)

In [97]:
def preProcess(data):
    data=data.drop(["Junction_Detail","Junction_Control"] , axis=1)
    data=data.dropna(axis=0)
    cols_to_remove=['Pedestrian_Crossing-Human_Control','Urban_or_Rural_Area','1st_Road_Class','1st_Road_Number','Road_Type','Pedestrian_Crossing-Physical_Facilities']
    data=data.drop(cols_to_remove ,axis=1)
    return data

In [98]:
df= preProcess(df)
df.isna().sum()

Accident_Index                                 0
Location_Easting_OSGR                          0
Location_Northing_OSGR                         0
Longitude                                      0
Latitude                                       0
Police_Force                                   0
Accident_Severity                              0
Number_of_Vehicles                             0
Number_of_Casualties                           0
Date                                           0
Day_of_Week                                    0
Time                                           0
Local_Authority_(District)                     0
Local_Authority_(Highway)                      0
Speed_limit                                    0
2nd_Road_Class                                 0
2nd_Road_Number                                0
Light_Conditions                               0
Weather_Conditions                             0
Road_Surface_Conditions                        0
Special_Conditions_a

In [133]:
df.shape

(550, 25)

In [99]:
df['Light_Conditions'].unique()

array(['Darkness: Street lights present and lit',
       'Daylight: Street light present', 'Darkeness: No street lighting',
       'Darkness: Street lighting unknown',
       'Darkness: Street lights present but unlit'], dtype=object)

In [100]:
df["Weather_Conditions"].unique()

array(['Fine without high winds', 'Unknown', 'Fine with high winds',
       'Raining without high winds', 'Snowing without high winds',
       'Raining with high winds', 'Fog or mist', 'Other',
       'Snowing with high winds'], dtype=object)

In [101]:
df["Road_Surface_Conditions"].unique()

array(['Dry', 'Wet/Damp', 'Frost/Ice', 'Flood (Over 3cm of water)',
       'Snow'], dtype=object)

In [102]:
def encodeData(data):
    encod={"Daylight: Street light present":1,"Darkness: Street lights present and lit":2,"Darkness: Street lighting unknown":3,"Darkness: Street lights present but unlit":4,"Darkeness: No street lighting":5}
    encod2={"Raining without high winds":1,"Fine without high winds":2,"Unknown":3,"Snowing without high winds":4,"Other":5, "Fine with high winds":6,"Raining with high winds":7,"Fog or mist":8}
    encod3={"Dry":1,"Wet/Damp":2,"Frost/Ice":3,"Snow":4}
    data["Light_Conditions"]=data["Light_Conditions"].map(encod)
    data["Weather_Conditions"]=data["Weather_Conditions"].map(encod2)
    data["Road_Surface_Conditions"]=data["Road_Surface_Conditions"].map(encod3)
    return data

In [103]:
df = encodeData(df)

In [104]:
df.isna().sum()

Accident_Index                                 0
Location_Easting_OSGR                          0
Location_Northing_OSGR                         0
Longitude                                      0
Latitude                                       0
Police_Force                                   0
Accident_Severity                              0
Number_of_Vehicles                             0
Number_of_Casualties                           0
Date                                           0
Day_of_Week                                    0
Time                                           0
Local_Authority_(District)                     0
Local_Authority_(Highway)                      0
Speed_limit                                    0
2nd_Road_Class                                 0
2nd_Road_Number                                0
Light_Conditions                               0
Weather_Conditions                             1
Road_Surface_Conditions                        7
Special_Conditions_a

In [105]:
df.dropna(inplace=True)

In [106]:
df.isna().sum()

Accident_Index                                 0
Location_Easting_OSGR                          0
Location_Northing_OSGR                         0
Longitude                                      0
Latitude                                       0
Police_Force                                   0
Accident_Severity                              0
Number_of_Vehicles                             0
Number_of_Casualties                           0
Date                                           0
Day_of_Week                                    0
Time                                           0
Local_Authority_(District)                     0
Local_Authority_(Highway)                      0
Speed_limit                                    0
2nd_Road_Class                                 0
2nd_Road_Number                                0
Light_Conditions                               0
Weather_Conditions                             0
Road_Surface_Conditions                        0
Special_Conditions_a

In [107]:
def convertTime(data):
    data["Time"]=pd.to_datetime(data["Time"], format='%H:%M').dt.time
    return data

In [132]:
df.shape

(550, 25)

In [108]:
df = convertTime(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 550 entries, 82 to 455552
Data columns (total 25 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Accident_Index                               550 non-null    object 
 1   Location_Easting_OSGR                        550 non-null    int64  
 2   Location_Northing_OSGR                       550 non-null    int64  
 3   Longitude                                    550 non-null    float64
 4   Latitude                                     550 non-null    float64
 5   Police_Force                                 550 non-null    int64  
 6   Accident_Severity                            550 non-null    int64  
 7   Number_of_Vehicles                           550 non-null    int64  
 8   Number_of_Casualties                         550 non-null    int64  
 9   Date                                         550 non-null    object 
 10  Day

In [109]:
#Start Training
data=df.iloc[:200000:]
#This is the data reserved for testing later on...
dataTest=df.iloc[300000:300400:]
data.shape

(550, 25)

In [110]:
# y_test = dataTest['Accident_Severity']
# print(y_test.to_string())

In [111]:
x=data[["Location_Easting_OSGR","Location_Northing_OSGR","Longitude","Latitude","Day_of_Week","Speed_limit","2nd_Road_Class","Number_of_Vehicles","Light_Conditions","Weather_Conditions", "Road_Surface_Conditions"]]
y=data['Accident_Severity']

In [112]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [113]:
dtc=DecisionTreeClassifier()
rfc=RandomForestClassifier(n_estimators=200,criterion="entropy")
nb=GaussianNB()
lr=LogisticRegression()

In [114]:
sc=StandardScaler()
x=sc.fit_transform(x)

In [115]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20)

In [116]:
data.isnull().sum()

Accident_Index                                 0
Location_Easting_OSGR                          0
Location_Northing_OSGR                         0
Longitude                                      0
Latitude                                       0
Police_Force                                   0
Accident_Severity                              0
Number_of_Vehicles                             0
Number_of_Casualties                           0
Date                                           0
Day_of_Week                                    0
Time                                           0
Local_Authority_(District)                     0
Local_Authority_(Highway)                      0
Speed_limit                                    0
2nd_Road_Class                                 0
2nd_Road_Number                                0
Light_Conditions                               0
Weather_Conditions                             0
Road_Surface_Conditions                        0
Special_Conditions_a

In [117]:
np.isfinite(x_train).all()

True

In [118]:
dtc.fit(x_train,y_train)
rfc.fit(x_train,y_train)
nb.fit(x_train,y_train)
lr.fit(x_train,y_train)

In [119]:
#PREDICTING TARGET VALUES AND CALCULAING ACCURACY SCORES OF MODEL
de_pred=dtc.predict(x_test)
rf_pred=rfc.predict(x_test)
nb_pred=nb.predict(x_test)
lr_pred=lr.predict(x_test)

In [120]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [121]:
print("accuracy for decision tree is:{}%".format(accuracy_score(y_test,de_pred)*100))
print(classification_report(y_test,de_pred))

accuracy for decision tree is:77.27272727272727%
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.18      0.12      0.14        17
           3       0.85      0.90      0.87        92

    accuracy                           0.77       110
   macro avg       0.34      0.34      0.34       110
weighted avg       0.74      0.77      0.75       110



In [122]:
print("accuracy for random forest is:{}%".format(accuracy_score(y_test,rf_pred)*100))
print(classification_report(y_test,rf_pred))

accuracy for random forest is:82.72727272727273%
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00        17
           3       0.83      0.99      0.91        92

    accuracy                           0.83       110
   macro avg       0.28      0.33      0.30       110
weighted avg       0.70      0.83      0.76       110



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [123]:
print("accuracy for naive bayes is:{}%".format(accuracy_score(y_test,nb_pred)*100))
print(classification_report(y_test,nb_pred))

accuracy for naive bayes is:80.9090909090909%
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00        17
           3       0.83      0.97      0.89        92

    accuracy                           0.81       110
   macro avg       0.28      0.32      0.30       110
weighted avg       0.70      0.81      0.75       110



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [124]:
print("accuracy for Logistic Regression is:{}%".format(accuracy_score(y_test,lr_pred)*100))
print(classification_report(y_test,lr_pred))

accuracy for Logistic Regression is:83.63636363636363%
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00        17
           3       0.84      1.00      0.91        92

    accuracy                           0.84       110
   macro avg       0.28      0.33      0.30       110
weighted avg       0.70      0.84      0.76       110



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [125]:
# Voting Classifier - Multiple Model Ensemble 
from sklearn.ensemble import VotingClassifier
dtc=DecisionTreeClassifier()
rfc=RandomForestClassifier(n_estimators=200,criterion="entropy")
nb=GaussianNB()
lr=LogisticRegression()

In [126]:
evc = VotingClassifier( estimators= [('lr',lr),('dtc',dtc),('rfc',rfc),('nb',nb)], voting = 'hard')

In [127]:
evc.fit(x_train,y_train)

In [128]:
evc.score(x_test,y_test)

0.8363636363636363

In [129]:
pred_evc = evc.predict(x_test)
print("Accuracy of the final voting classifier: {}%".format(round(accuracy_score(y_test, pred_evc)*100,2)))

Accuracy of the final voting classifier: 83.64%


In [130]:
#Testing The Model using Custom Test Data
customTestData=dataTest
customTestData.shape

(0, 25)

In [137]:
customX=customTestData[["Location_Easting_OSGR","Location_Northing_OSGR","Longitude","Latitude","Day_of_Week","Speed_limit","2nd_Road_Class","Number_of_Vehicles","Light_Conditions","Weather_Conditions", "Road_Surface_Conditions"]]
# print(customX.columns);
customY=evc.predict(customX)



ValueError: Found array with 0 sample(s) (shape=(0, 11)) while a minimum of 1 is required by LogisticRegression.

In [None]:
print(customY)

NameError: name 'customY' is not defined