In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import os
import numpy as np
import plotly.express as px
import scipy.stats as sts
from sklearn import datasets
import statsmodels.api as sm
from datetime import datetime
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import plotly.graph_objs as go

In [2]:
# Read the raw accident records from the Resources folder and preview the first rows.
accident_df = pd.read_csv("Resources/accident-data.csv")
accident_df.head()

Unnamed: 0,accident_index,accident_year,accident_reference,longitude,latitude,accident_severity,number_of_vehicles,number_of_casualties,date,day_of_week,...,second_road_class,second_road_number,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area
0,2020010219808,2020,10219808,-0.254001,51.462262,3,1,1,04/02/2020,3,...,6,0,9,9,1,9,9,0,0,1
1,2020010220496,2020,10220496,-0.139253,51.470327,3,1,2,27/04/2020,2,...,6,0,0,4,1,1,1,0,0,1
2,2020010228005,2020,10228005,-0.178719,51.529614,3,1,1,01/01/2020,4,...,6,0,0,0,4,1,2,0,0,1
3,2020010228006,2020,10228006,-0.001683,51.54121,2,1,1,01/01/2020,4,...,6,0,0,4,4,1,1,0,0,1
4,2020010228011,2020,10228011,-0.137592,51.515704,3,1,2,01/01/2020,4,...,5,0,0,0,4,1,1,0,0,1


In [3]:
# Summarise column names, non-null counts and dtypes to spot missing values before cleaning.
accident_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91199 entries, 0 to 91198
Data columns (total 27 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   accident_index                           91199 non-null  object 
 1   accident_year                            91199 non-null  int64  
 2   accident_reference                       91199 non-null  object 
 3   longitude                                91185 non-null  float64
 4   latitude                                 91185 non-null  float64
 5   accident_severity                        91199 non-null  int64  
 6   number_of_vehicles                       91199 non-null  int64  
 7   number_of_casualties                     91199 non-null  int64  
 8   date                                     91199 non-null  object 
 9   day_of_week                              91199 non-null  int64  
 10  time                                     91199

In [4]:
# NOTE(review): dropna() returns a new frame and the result is not assigned here, so this
# cell only displays the rows without missing values; accident_df itself is unchanged and
# the cells below still work on the original rows. Assign the result if the rows were
# meant to be removed — TODO confirm intent.
accident_df.dropna()

Unnamed: 0,accident_index,accident_year,accident_reference,longitude,latitude,accident_severity,number_of_vehicles,number_of_casualties,date,day_of_week,...,second_road_class,second_road_number,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area
0,2020010219808,2020,10219808,-0.254001,51.462262,3,1,1,04/02/2020,3,...,6,0,9,9,1,9,9,0,0,1
1,2020010220496,2020,10220496,-0.139253,51.470327,3,1,2,27/04/2020,2,...,6,0,0,4,1,1,1,0,0,1
2,2020010228005,2020,10228005,-0.178719,51.529614,3,1,1,01/01/2020,4,...,6,0,0,0,4,1,2,0,0,1
3,2020010228006,2020,10228006,-0.001683,51.541210,2,1,1,01/01/2020,4,...,6,0,0,4,4,1,1,0,0,1
4,2020010228011,2020,10228011,-0.137592,51.515704,3,1,2,01/01/2020,4,...,5,0,0,0,4,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91194,2020991027064,2020,991027064,-2.926320,56.473539,2,2,1,12/08/2020,4,...,4,959,0,0,1,1,1,0,0,1
91195,2020991029573,2020,991029573,-4.267565,55.802353,3,1,1,13/11/2020,6,...,6,0,0,0,1,1,1,0,0,1
91196,2020991030297,2020,991030297,-2.271903,57.186317,2,2,1,15/04/2020,4,...,6,0,0,0,1,1,1,0,0,2
91197,2020991030900,2020,991030900,-3.968753,55.950940,3,2,1,15/12/2020,3,...,6,0,0,0,1,1,1,0,0,1


In [5]:
# Build the working frame `cleaned` by discarding the year/reference, road-class,
# junction, pedestrian-crossing and site-hazard columns from the raw data.
cleaned = accident_df.drop(
    columns=[
        "accident_year",
        "accident_reference",
        "first_road_class",
        "first_road_number",
        "road_type",
        "junction_detail",
        "junction_control",
        "second_road_class",
        "second_road_number",
        "pedestrian_crossing_human_control",
        "pedestrian_crossing_physical_facilities",
        "special_conditions_at_site",
        "carriageway_hazards",
    ]
)

In [6]:
# Translate the numeric accident_severity code into a readable label stored in the
# "severity" column; rows carrying any other code are not assigned a label.
for severity_code, severity_label in ((3, "Slight"), (2, "Serious"), (1, "Fatal")):
    cleaned.loc[cleaned["accident_severity"] == severity_code, "severity"] = severity_label

In [7]:
# Translate the numeric road_surface_conditions code into a readable label stored in the
# "road_surface" column. The pairs are applied in the listed order.
road_surface_names = (
    (1, "Dry"),
    (2, "Wet or damp"),
    (3, "Snow"),
    (4, "Frost or ice"),
    (5, "Flood over 3cm. deep"),
    (6, "Oil or diesel"),
    (7, "Mud"),
    (-1, "Data missing or out of range"),
    (9, "unknown (self reported)"),
)
for surface_code, surface_label in road_surface_names:
    cleaned.loc[cleaned["road_surface_conditions"] == surface_code, "road_surface"] = surface_label

In [8]:
# Remove, in place, the rows whose coded fields hold an "unknown"/"missing" sentinel.
# Each step first captures the offending rows, then drops them from `cleaned` by index.

# Road surface: 9 is labelled "unknown (self reported)", -1 "Data missing or out of range".
unknown_road = cleaned.loc[cleaned["road_surface_conditions"].eq(9)]
cleaned.drop(index=unknown_road.index, inplace=True)

missingdata = cleaned.loc[cleaned["road_surface_conditions"].eq(-1)]
cleaned.drop(index=missingdata.index, inplace=True)

# Speed limit: -1 is presumably the "data missing" sentinel — TODO confirm against the data guide.
speedlimit = cleaned.loc[cleaned["speed_limit"].eq(-1)]
cleaned.drop(index=speedlimit.index, inplace=True)

# Weather: 9 is presumably the "unknown" code — TODO confirm against the data guide.
weather = cleaned.loc[cleaned["weather_conditions"].eq(9)]
cleaned.drop(index=weather.index, inplace=True)

# Urban/rural: 3 is presumably "unallocated" and -1 "missing" — TODO confirm against the data guide.
unallocated = cleaned.loc[cleaned["urban_or_rural_area"].eq(3)]
cleaned.drop(index=unallocated.index, inplace=True)

missingurban = cleaned.loc[cleaned["urban_or_rural_area"].eq(-1)]
cleaned.drop(index=missingurban.index, inplace=True)

In [9]:
# Display the cleaned working frame, now including the "severity" and "road_surface" label columns.
cleaned

Unnamed: 0,accident_index,longitude,latitude,accident_severity,number_of_vehicles,number_of_casualties,date,day_of_week,time,speed_limit,light_conditions,weather_conditions,road_surface_conditions,urban_or_rural_area,severity,road_surface
1,2020010220496,-0.139253,51.470327,3,1,2,27/04/2020,2,13:55,20,1,1,1,1,Slight,Dry
2,2020010228005,-0.178719,51.529614,3,1,1,01/01/2020,4,01:25,30,4,1,2,1,Slight,Wet or damp
3,2020010228006,-0.001683,51.541210,2,1,1,01/01/2020,4,01:50,30,4,1,1,1,Serious,Dry
4,2020010228011,-0.137592,51.515704,3,1,2,01/01/2020,4,02:25,30,4,1,1,1,Slight,Dry
5,2020010228012,-0.025880,51.476278,3,1,1,01/01/2020,4,01:30,20,4,1,1,1,Slight,Dry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91194,2020991027064,-2.926320,56.473539,2,2,1,12/08/2020,4,14:30,30,1,1,1,1,Serious,Dry
91195,2020991029573,-4.267565,55.802353,3,1,1,13/11/2020,6,15:05,30,1,1,1,1,Slight,Dry
91196,2020991030297,-2.271903,57.186317,2,2,1,15/04/2020,4,12:42,60,1,1,1,2,Serious,Dry
91197,2020991030900,-3.968753,55.950940,3,2,1,15/12/2020,3,14:00,30,1,1,1,1,Slight,Dry


In [None]:
# Distribution of the number of casualties per accident.
px.histogram(data_frame=cleaned, x="number_of_casualties")

## Road Conditions vs. Number of Casualties / Severity

In [32]:
# Horizontal box plot of casualties per accident, one box per road-surface label.
px.box(
    cleaned,
    x="number_of_casualties",
    y="road_surface",
    orientation="h",
    title="Number of Casualties for each Road Condition",
    labels={
        "road_surface": "Road Surface",
        "number_of_casualties": "Number of Casualties",
    },
)

In [10]:
# Hand-entered counts per road condition (one list entry per condition); each key other
# than "Condition" is presumably a number-of-casualties value — TODO confirm the figures
# against `cleaned`.
new = {
    "Condition": ["Dry", "Wet or Damp", "Flood", "Frost", "Snow"],
    "1": [50441, 20354, 134, 596, 106],
    "2": [7841, 3945, 31, 131, 34],
    "3": [2086, 1039, 11, 22, 8],
    "4": [673, 363, 5, 5, 1],
    "5": [220, 119, 0, 3, 0],
    "6": [75, 37, 0, 3, 0],
    "7": [28, 14, 0, 0, 0],
    "8": [12, 2, 0, 0, 0],
    "9": [5, 2, 0, 0, 0],
    "10": [1, 2, 0, 0, 0],
    "11": [1, 3, 0, 0, 0],
    "12": [0, 1, 0, 0, 0],
    "13": [0, 1, 0, 0, 0],
    "17": [0, 1, 0, 0, 0],
    "19": [0, 1, 0, 0, 0],
    "41": [1, 0, 0, 0, 0],
}
new2 = pd.DataFrame(new, columns=list(new))

# Every key except the label column holds counts.
count_columns = [key for key in new if key != "Condition"]

# Row total across all count columns.
sum_column = new2[count_columns].sum(axis=1)
new2["Total"] = sum_column

# Share of each count column within its row, stored as a fraction under "<key> %".
for count_column in count_columns:
    new2[count_column + " %"] = new2[count_column] / new2["Total"]
new2

Unnamed: 0,Condition,1,2,3,4,5,6,7,8,9,...,7 %,8 %,9 %,10 %,11 %,12 %,13 %,17 %,19 %,41 %
0,Dry,50441,7841,2086,673,220,75,28,12,5,...,0.000456,0.000195,8.1e-05,1.6e-05,1.6e-05,0.0,0.0,0.0,0.0,1.6e-05
1,Wet or Damp,20354,3945,1039,363,119,37,14,2,2,...,0.000541,7.7e-05,7.7e-05,7.7e-05,0.000116,3.9e-05,3.9e-05,3.9e-05,3.9e-05,0.0
2,Flood,134,31,11,5,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Frost,596,131,22,5,3,3,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Snow,106,34,8,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
# Stacked bar chart: for every road condition, the share held by each "<n> %" column of
# new2. One trace per column; `go` comes from the notebook's import cell.

# Explicit colours for the last six traces; the others use Plotly's default colourway.
colors = {'11 %': 'silver',
          '12 %': 'steelblue',
          '13 %': 'salmon',
          '17 %': 'hotpink',
          '19 %': 'maroon',
          '41 %': 'burlywood'}

# Trace names, in stacking order (listed explicitly so the chart does not depend on
# whatever other "%" columns new2 may carry).
casualty_counts = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
                   "11", "12", "13", "17", "19", "41"]
# Only these traces carry a percentage label on the bar.
labelled_counts = ("1", "2", "3")

fig = go.Figure()
for casualty_count in casualty_counts:
    share_column = f"{casualty_count} %"
    bar_options = {}
    if casualty_count in labelled_counts:
        # Labels are formatted from the data so they cannot drift from hand-typed values.
        bar_options["text"] = [f"{share:.2%}" for share in new2[share_column]]
    if share_column in colors:
        bar_options["marker"] = {'color': colors[share_column]}
    fig.add_trace(go.Bar(
        x=new2.Condition,
        y=new2[share_column],
        name=casualty_count,
        **bar_options))

# barnorm="percent" rescales each stack to 100, so the fractional shares plot as percentages.
fig.update_layout(
    barmode="stack",
    barnorm="percent",
    yaxis=dict(title_text="Percentage"),
    autosize=False,
    width=1000,
    height=700,
    title={'text': "Number of Casualties by Road Conditions"})
fig.show()

In [33]:
# Accident count per road-surface label, with bars coloured by number of casualties.
px.histogram(data_frame=cleaned, x="road_surface", color="number_of_casualties")

In [None]:
# Tree map: a constant "Road Conditions" root, split by road-surface label and then by
# number of casualties; tile size is the summed number_of_casualties.
fig = px.treemap(
    cleaned,
    path=[px.Constant("Road Conditions"), "road_surface", "number_of_casualties"],
    values="number_of_casualties",
)
fig.update_layout(
    margin={"t": 50, "l": 25, "r": 25, "b": 25},
    treemapcolorway=["tomato", "olivedrab", "steelblue", "gold", "darkgrey", "midnightblue"],
)
# Show label, value and the share of parent/root on every tile.
fig.update_traces(
    go.Treemap(
        textinfo="label+value+percent parent+percent root",
        root_color="whitesmoke",
    )
)
fig.show()

In [35]:
# Pie chart of the total number of casualties per road-surface label.
# `values` makes each sector the sum of number_of_casualties; with `names` alone the
# sectors would be sized by the number of rows (accidents), which contradicts the title.
px.pie(
    cleaned,
    names="road_surface",
    values="number_of_casualties",
    title="Total Number of Casualties for each Road Condition",
)

In [59]:
# One sample of number_of_casualties per road-surface label, used by the tests below.
dry = cleaned.loc[cleaned["road_surface"] == "Dry", "number_of_casualties"]
wod = cleaned.loc[cleaned["road_surface"] == "Wet or damp", "number_of_casualties"]
flood = cleaned.loc[cleaned["road_surface"] == "Flood over 3cm. deep", "number_of_casualties"]
frost = cleaned.loc[cleaned["road_surface"] == "Frost or ice", "number_of_casualties"]
snow = cleaned.loc[cleaned["road_surface"] == "Snow", "number_of_casualties"]

In [None]:
# Kruskal-Wallis test comparing casualties per accident across the five road-surface
# samples (dry, wet or damp, flood, frost or ice, snow).
sts.kruskal(dry,wod,flood,frost,snow)

In [None]:
# Kruskal-Wallis test repeated without the Dry sample, comparing only the wet or damp,
# flood, frost or ice and snow samples.
sts.kruskal(wod,flood,frost,snow)

In [None]:
# Mood's median test comparing casualties per accident across the five road-surface samples.
sts.median_test(dry,wod,flood,frost,snow)

In [None]:
# Mood's median test repeated without the Dry sample.
sts.median_test(wod,flood,frost,snow)

In [None]:
# Tukey HSD pairwise comparison of number_of_casualties between road-surface groups,
# at alpha = 0.05; the printed result is the comparison table.
tukey = pairwise_tukeyhsd(
    cleaned["number_of_casualties"],
    cleaned["road_surface"],
    alpha=0.05,
)
print(tukey)

In [60]:
# One sample of road-surface labels per casualty count (accidents with 1 to 5 casualties).
one = cleaned.loc[cleaned["number_of_casualties"] == 1, "road_surface"]
two = cleaned.loc[cleaned["number_of_casualties"] == 2, "road_surface"]
three = cleaned.loc[cleaned["number_of_casualties"] == 3, "road_surface"]
four = cleaned.loc[cleaned["number_of_casualties"] == 4, "road_surface"]
five = cleaned.loc[cleaned["number_of_casualties"] == 5, "road_surface"]

In [61]:
# Kruskal-Wallis test across the samples for accidents with 1, 2, 3, 4 and 5 casualties.
# NOTE(review): each sample holds road_surface text labels, not numbers, so the test is
# presumably ranking the labels in string sort order — confirm this is intended; a test
# for categorical data (e.g. chi-square on a contingency table) may be more appropriate.
sts.kruskal(one,two, three, four,five)

KruskalResult(statistic=155.2297909267082, pvalue=1.5409339654191827e-32)

In [62]:
# Kruskal-Wallis test on the road_surface samples for accidents with 2 to 5 casualties
# (the single-casualty sample is left out).
sts.kruskal(two,three,four,five)

KruskalResult(statistic=1.2994595009559777, pvalue=0.7292614271348525)

In [63]:
# Kruskal-Wallis test on the road_surface samples for accidents with 3 to 5 casualties.
sts.kruskal(three,four,five)

KruskalResult(statistic=1.2443172597036112, pvalue=0.5367844670969663)

## Other work

In [None]:
# Hand-entered accident counts per road condition and severity.
# NOTE: this cell reuses the names `new` / `new2`, replacing the casualty-count table
# built earlier in the notebook; re-run that earlier cell before re-plotting its chart.
new = {
    # "Wet or Damp" was previously mistyped as "Wet or Dry".
    "Condition": ["Dry", "Flood", "Snow", "Wet or Damp", "Frost"],
    "Slight": [47974, 138, 122, 20155, 583],
    "Serious": [12493, 39, 26, 5290, 165],
    "Fatal": [917, 4, 1, 439, 12],
}
new2 = pd.DataFrame(new, columns=["Condition", "Slight", "Serious", "Fatal"])

# Row total across the three severity columns.
sum_column = new2[["Slight", "Serious", "Fatal"]].sum(axis=1)
new2["Total"] = sum_column

# Share of each severity within its road condition, stored as a fraction under "<name> %".
for severity_name in ("Fatal", "Slight", "Serious"):
    new2[f"{severity_name} %"] = new2[severity_name] / new2["Total"]
new2

In [None]:
# Stacked bar chart of the fatal / slight / serious share of accidents per road
# condition, read from the "<name> %" columns of new2. `go` comes from the notebook's
# import cell.
fig = go.Figure()
for share_column in ("Fatal %", "Slight %", "Serious %"):
    fig.add_trace(go.Bar(
        x=new2.Condition,
        y=new2[share_column],
        name=share_column))
fig.update_layout(
    yaxis=dict(
        # title.font replaces the deprecated `titlefont` attribute.
        title=dict(text="Percentage", font=dict(size=15)),
        # The plotted shares are fractions between 0 and 1, so the tick positions must
        # be on that scale for the percentage labels to line up with the bars.
        tickmode="array",
        tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
        ticktext=["0%", "20%", "40%", "60%", "80%", "100%"],
    ),
    autosize=False,
    width=1000,
    height=500,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    title={'text': "Accident Severity by Road Conditions"},
    barmode="stack")
fig.show()

In [None]:
# Fill the scratch column "new" with (number_of_casualties of the last row) + 1.
# That is the end state of assigning `number + 1` to the whole column once per pair of
# rows in a nested row-by-row loop; a single assignment gives the same frame without
# roughly n*n full-column writes.
# NOTE(review): a constant column looks like an unfinished experiment — confirm what
# "new" is supposed to hold, or delete this cell.
if len(cleaned) > 0:
    cleaned["new"] = cleaned["number_of_casualties"].iloc[-1] + 1

In [None]:
# Parallel-coordinates view of accident_severity against number_of_casualties,
# with lines coloured by the severity code.
fig = px.parallel_coordinates(
    cleaned,
    dimensions=["accident_severity", "number_of_casualties"],
    color="accident_severity",
)
fig.show()