In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import os
import numpy as np
import plotly.express as px
import scipy.stats as sts
from sklearn import datasets
import statsmodels.api as sm
from datetime import datetime
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
accident_df=pd.read_csv("Resources/accident-data.csv")
accident_df.head()

Unnamed: 0,accident_index,accident_year,accident_reference,longitude,latitude,accident_severity,number_of_vehicles,number_of_casualties,date,day_of_week,...,second_road_class,second_road_number,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area
0,2020010219808,2020,10219808,-0.254001,51.462262,3,1,1,04/02/2020,3,...,6,0,9,9,1,9,9,0,0,1
1,2020010220496,2020,10220496,-0.139253,51.470327,3,1,2,27/04/2020,2,...,6,0,0,4,1,1,1,0,0,1
2,2020010228005,2020,10228005,-0.178719,51.529614,3,1,1,01/01/2020,4,...,6,0,0,0,4,1,2,0,0,1
3,2020010228006,2020,10228006,-0.001683,51.54121,2,1,1,01/01/2020,4,...,6,0,0,4,4,1,1,0,0,1
4,2020010228011,2020,10228011,-0.137592,51.515704,3,1,2,01/01/2020,4,...,5,0,0,0,4,1,1,0,0,1


In [3]:
accident_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91199 entries, 0 to 91198
Data columns (total 27 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   accident_index                           91199 non-null  object 
 1   accident_year                            91199 non-null  int64  
 2   accident_reference                       91199 non-null  object 
 3   longitude                                91185 non-null  float64
 4   latitude                                 91185 non-null  float64
 5   accident_severity                        91199 non-null  int64  
 6   number_of_vehicles                       91199 non-null  int64  
 7   number_of_casualties                     91199 non-null  int64  
 8   date                                     91199 non-null  object 
 9   day_of_week                              91199 non-null  int64  
 10  time                                     91199

In [4]:
accident_df.dropna()

Unnamed: 0,accident_index,accident_year,accident_reference,longitude,latitude,accident_severity,number_of_vehicles,number_of_casualties,date,day_of_week,...,second_road_class,second_road_number,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area
0,2020010219808,2020,10219808,-0.254001,51.462262,3,1,1,04/02/2020,3,...,6,0,9,9,1,9,9,0,0,1
1,2020010220496,2020,10220496,-0.139253,51.470327,3,1,2,27/04/2020,2,...,6,0,0,4,1,1,1,0,0,1
2,2020010228005,2020,10228005,-0.178719,51.529614,3,1,1,01/01/2020,4,...,6,0,0,0,4,1,2,0,0,1
3,2020010228006,2020,10228006,-0.001683,51.541210,2,1,1,01/01/2020,4,...,6,0,0,4,4,1,1,0,0,1
4,2020010228011,2020,10228011,-0.137592,51.515704,3,1,2,01/01/2020,4,...,5,0,0,0,4,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91194,2020991027064,2020,991027064,-2.926320,56.473539,2,2,1,12/08/2020,4,...,4,959,0,0,1,1,1,0,0,1
91195,2020991029573,2020,991029573,-4.267565,55.802353,3,1,1,13/11/2020,6,...,6,0,0,0,1,1,1,0,0,1
91196,2020991030297,2020,991030297,-2.271903,57.186317,2,2,1,15/04/2020,4,...,6,0,0,0,1,1,1,0,0,2
91197,2020991030900,2020,991030900,-3.968753,55.950940,3,2,1,15/12/2020,3,...,6,0,0,0,1,1,1,0,0,1


In [5]:
cleaned = accident_df.drop(columns=["accident_year","accident_reference","first_road_class","first_road_number",\
                                   "road_type","junction_detail","junction_control","second_road_class","second_road_number",\
                                   "pedestrian_crossing_human_control","pedestrian_crossing_physical_facilities",\
                                   "special_conditions_at_site","carriageway_hazards"])

In [6]:
cleaned.loc[cleaned["accident_severity"] ==3,"severity"] = "Slight"
cleaned.loc[cleaned["accident_severity"] ==2,"severity"] = "Serious"
cleaned.loc[cleaned["accident_severity"] ==1,"severity"] = "Fatal"

In [7]:
cleaned.loc[cleaned["road_surface_conditions"] ==1,"road_surface"] = "Dry"
cleaned.loc[cleaned["road_surface_conditions"] ==2,"road_surface"] = "Wet or damp"
cleaned.loc[cleaned["road_surface_conditions"] ==3,"road_surface"] = "Snow"
cleaned.loc[cleaned["road_surface_conditions"] ==4,"road_surface"] = "Frost or ice"
cleaned.loc[cleaned["road_surface_conditions"] ==5,"road_surface"] = "Flood over 3cm. deep"
cleaned.loc[cleaned["road_surface_conditions"] ==6,"road_surface"] = "Oil or diesel"
cleaned.loc[cleaned["road_surface_conditions"] ==7,"road_surface"] = "Mud"
cleaned.loc[cleaned["road_surface_conditions"] ==-1,"road_surface"] = "Data missing or out of range"
cleaned.loc[cleaned["road_surface_conditions"] ==9,"road_surface"] = "unknown (self reported)"

In [8]:
unknown_road=cleaned[cleaned["road_surface_conditions"] ==9]
cleaned.drop(unknown_road.index[unknown_road['road_surface_conditions'] == 9], inplace=True)
missingdata=cleaned[cleaned["road_surface_conditions"] ==-1]
cleaned.drop(missingdata.index[missingdata['road_surface_conditions'] == -1], inplace=True)
speedlimit=cleaned[cleaned["speed_limit"] ==-1]
cleaned.drop(speedlimit.index[speedlimit['speed_limit'] == -1], inplace=True)
weather=cleaned[cleaned["weather_conditions"] ==9]
cleaned.drop(weather.index[weather['weather_conditions'] == 9], inplace=True)

In [9]:
cleaned

Unnamed: 0,accident_index,longitude,latitude,accident_severity,number_of_vehicles,number_of_casualties,date,day_of_week,time,speed_limit,light_conditions,weather_conditions,road_surface_conditions,urban_or_rural_area,severity,road_surface
1,2020010220496,-0.139253,51.470327,3,1,2,27/04/2020,2,13:55,20,1,1,1,1,Slight,Dry
2,2020010228005,-0.178719,51.529614,3,1,1,01/01/2020,4,01:25,30,4,1,2,1,Slight,Wet or damp
3,2020010228006,-0.001683,51.541210,2,1,1,01/01/2020,4,01:50,30,4,1,1,1,Serious,Dry
4,2020010228011,-0.137592,51.515704,3,1,2,01/01/2020,4,02:25,30,4,1,1,1,Slight,Dry
5,2020010228012,-0.025880,51.476278,3,1,1,01/01/2020,4,01:30,20,4,1,1,1,Slight,Dry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91194,2020991027064,-2.926320,56.473539,2,2,1,12/08/2020,4,14:30,30,1,1,1,1,Serious,Dry
91195,2020991029573,-4.267565,55.802353,3,1,1,13/11/2020,6,15:05,30,1,1,1,1,Slight,Dry
91196,2020991030297,-2.271903,57.186317,2,2,1,15/04/2020,4,12:42,60,1,1,1,2,Serious,Dry
91197,2020991030900,-3.968753,55.950940,3,2,1,15/12/2020,3,14:00,30,1,1,1,1,Slight,Dry


## Road Conditions vs # of casualties/ severity

In [94]:
px.box(cleaned, x="number_of_casualties", y="road_surface", orientation="h")

In [98]:
dry = cleaned[cleaned["road_surface"] == "Dry"]["number_of_casualties"]
wod = cleaned[cleaned["road_surface"] == "Wet or damp"]["number_of_casualties"]
flood = cleaned[cleaned["road_surface"] == "Flood over 3cm. deep"]["number_of_casualties"]
frost = cleaned[cleaned["road_surface"] == "Frost or ice"]["number_of_casualties"]
snow = cleaned[cleaned["road_surface"] == "Snow"]["number_of_casualties"]

In [70]:
new ={"Condition": ["Dry","Flood","Snow","Wet or Dry","Frost"],"Slight":[47974,138,122,20155,583], "Serious":[12493,39,26,5290,165],"Fatal":[917,4,1,439,12]}
new2= pd.DataFrame(new,columns=["Condition","Slight","Serious","Fatal"])
sum_column = new2["Slight"] + new2["Serious"] +new2["Fatal"]
new2["Total"]=sum_column
fatal=new2["Fatal"]/new2["Total"]
slight=new2["Slight"]/new2["Total"]
serious=new2["Serious"]/new2["Total"]
new2["Fatal %"]=fatal
new2["Slight %"]=slight
new2["Serious %"]=serious
new2

Unnamed: 0,Condition,Slight,Serious,Fatal,Total,Fatal %,Slight %,Serious %
0,Dry,47974,12493,917,61384,0.014939,0.781539,0.203522
1,Flood,138,39,4,181,0.022099,0.762431,0.21547
2,Snow,122,26,1,149,0.006711,0.818792,0.174497
3,Wet or Dry,20155,5290,439,25884,0.01696,0.778666,0.204373
4,Frost,583,165,12,760,0.015789,0.767105,0.217105


In [122]:
import plotly.graph_objs as go
fig = go.Figure()
fig.add_trace(go.Bar(
    y=new2["Fatal %"],
    x=new2.Condition,
    name="Fatal %",))
fig.add_trace(go.Bar(
    y=new2["Slight %"],
    x=new2.Condition,
    name="Slight %",))
fig.add_trace(go.Bar(
    y=new2["Serious %"],
    x=new2.Condition,
    name="Serious %",))
fig.update_layout(
        yaxis=dict(
        title_text="Percentage",
        ticktext=["0%", "20%", "40%", "60%","80%","100%"],
        tickvals=[0, 20, 40, 60, 80, 100],
        tickmode="array",
        titlefont=dict(size=15),
    ),
    autosize=False,
    width=1000,
    height=500,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    title={
        'text': "Accident Severity by Road Conditions",
        }, barmode="stack")
fig.show()

In [113]:
sts.kruskal(dry,wod, flood, frost, snow)

KruskalResult(statistic=166.1497633841841, pvalue=7.009766655756153e-35)

In [114]:
tukey = pairwise_tukeyhsd(endog=cleaned["number_of_casualties"],
                          groups=cleaned["road_surface"],
                          alpha=0.05)
print(tukey)

              Multiple Comparison of Means - Tukey HSD, FWER=0.05              
       group1               group2        meandiff p-adj   lower  upper  reject
-------------------------------------------------------------------------------
                 Dry Flood over 3cm. deep   0.1209 0.1244 -0.0184 0.2603  False
                 Dry         Frost or ice   0.0308  0.708 -0.0375 0.0991  False
                 Dry                 Snow   0.1009 0.3782 -0.0526 0.2545  False
                 Dry          Wet or damp    0.054  0.001  0.0401 0.0679   True
Flood over 3cm. deep         Frost or ice  -0.0902 0.5036  -0.245 0.0646  False
Flood over 3cm. deep                 Snow    -0.02    0.9  -0.227  0.187  False
Flood over 3cm. deep          Wet or damp  -0.0669 0.6631 -0.2065 0.0727  False
        Frost or ice                 Snow   0.0702 0.7574 -0.0975 0.2379  False
        Frost or ice          Wet or damp   0.0232 0.8831 -0.0456 0.0921  False
                Snow          Wet or dam

## Light Conditions vs # of casualties

## Other work

In [16]:
y = pd.get_dummies(cleaned.road_surface, prefix='Road')
print(y.head())

   Road_Dry  Road_Flood over 3cm. deep  Road_Frost or ice  Road_Snow  \
1         1                          0                  0          0   
2         0                          0                  0          0   
3         1                          0                  0          0   
4         1                          0                  0          0   
5         1                          0                  0          0   

   Road_Wet or damp  
1                 0  
2                 1  
3                 0  
4                 0  
5                 0  


In [17]:
result = pd.concat([cleaned, y], axis=1)
result.head()

Unnamed: 0,accident_index,longitude,latitude,accident_severity,number_of_vehicles,number_of_casualties,date,day_of_week,time,speed_limit,...,weather_conditions,road_surface_conditions,urban_or_rural_area,severity,road_surface,Road_Dry,Road_Flood over 3cm. deep,Road_Frost or ice,Road_Snow,Road_Wet or damp
1,2020010220496,-0.139253,51.470327,3,1,2,27/04/2020,2,13:55,20,...,1,1,1,Slight,Dry,1,0,0,0,0
2,2020010228005,-0.178719,51.529614,3,1,1,01/01/2020,4,01:25,30,...,1,2,1,Slight,Wet or damp,0,0,0,0,1
3,2020010228006,-0.001683,51.54121,2,1,1,01/01/2020,4,01:50,30,...,1,1,1,Serious,Dry,1,0,0,0,0
4,2020010228011,-0.137592,51.515704,3,1,2,01/01/2020,4,02:25,30,...,1,1,1,Slight,Dry,1,0,0,0,0
5,2020010228012,-0.02588,51.476278,3,1,1,01/01/2020,4,01:30,20,...,1,1,1,Slight,Dry,1,0,0,0,0


In [19]:
X = result[['Road_Dry',"Road_Flood over 3cm. deep","Road_Frost or ice","Road_Snow","Road_Wet or damp"]]
y = result['number_of_casualties']
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                             OLS Regression Results                             
Dep. Variable:     number_of_casualties   R-squared:                       0.001
Model:                              OLS   Adj. R-squared:                  0.001
Method:                   Least Squares   F-statistic:                     22.20
Date:                  Fri, 22 Apr 2022   Prob (F-statistic):           2.60e-22
Time:                          21:21:50   Log-Likelihood:                -92091.
No. Observations:                 88358   AIC:                         1.842e+05
Df Residuals:                     88352   BIC:                         1.842e+05
Df Model:                             5                                         
Covariance Type:              nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
co


In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only



In [128]:
fig=px.parallel_coordinates(cleaned,dimensions=['accident_severity',"number_of_casualties"], color="accident_severity")
fig.show()

In [127]:
px.histogram(cleaned, "number_of_vehicles")