In [None]:
import pandas as pd
import matplotlib as plt
import os
import numpy as np
import plotly.express as px
import scipy.stats as sts
from sklearn import datasets
import statsmodels.api as sm

In [None]:
# Load the raw accident records; the path is relative to the notebook's
# working directory. The last line previews the first five rows.
accident_csv_path = "Resources/accident-data.csv"
accident_df = pd.read_csv(accident_csv_path)
accident_df.head(n=5)

In [None]:
# Print a summary of the raw frame (columns, non-null counts, dtypes) to spot
# missing values and non-numeric columns before cleaning.
accident_df.info()

In [None]:
# Count accidents per calendar month.
#
# The dates are read from `accident_df`, which is already loaded at this point.
# The previous version read them from `cleaned`, which is only created in a
# later cell, so the cell raised NameError on a fresh "Restart & Run All".
# `cleaned` only drops two unrelated columns, so the `date` values are the same.
accident_dates = pd.to_datetime(accident_df["date"], format="%d/%m/%Y")

# Group every accident by the month (Period) its date falls in.
s = accident_dates.groupby(accident_dates.dt.to_period("M")).size()

# Re-index over every month between the first and last observed month so that
# a month without accidents shows up as 0 instead of being absent.
s = s.reindex(pd.period_range(s.index.min(), s.index.max(), freq="M"), fill_value=0)
print(s)

In [None]:
# Plot accidents by month.
# `s` is the per-month accident count built in the previous cell. The title and
# axis labels are set so the figure can be read on its own.
ax = s.plot.bar()
ax.set(
    title="Accidents per month",
    xlabel="Month",
    ylabel="Number of accidents",
);

In [None]:
# Build the working frame without the year and reference columns.
# `drop` returns a new DataFrame, so `accident_df` itself is left untouched.
columns_to_drop = ["accident_year", "accident_reference"]
cleaned = accident_df.drop(columns=columns_to_drop)

In [None]:
# Add a human-readable `severity` column next to the numeric
# `accident_severity` code. Rows whose code is not in the table are not assigned.
severity_labels = {
    3: "Slight",
    2: "Serious",
    1: "Fatal",
}
for severity_code, severity_label in severity_labels.items():
    has_code = cleaned["accident_severity"] == severity_code
    cleaned.loc[has_code, "severity"] = severity_label

In [None]:
# Add a human-readable `road_surface` column next to the numeric
# `road_surface_conditions` code. Rows whose code is not in the table are not
# assigned.
road_surface_labels = {
    1: "Dry",
    2: "Wet or damp",
    3: "Snow",
    4: "Frost or ice",
    5: "Flood over 3cm. deep",
    6: "Oil or diesel",
    7: "Mud",
    -1: "Data missing or out of range",
    9: "unknown (self reported)",
}
for surface_code, surface_label in road_surface_labels.items():
    has_code = cleaned["road_surface_conditions"] == surface_code
    cleaned.loc[has_code, "road_surface"] = surface_label

## Linear Regression on Road Conditions vs # of casualties

In [None]:
# One-hot encode the road-surface label: one "Road_<label>" indicator column
# per distinct value, then print the first rows as a quick check.
road_surface_column = cleaned["road_surface"]
y = pd.get_dummies(road_surface_column, prefix="Road")
print(y.head())

In [None]:
# Put the indicator columns side by side with the cleaned data (aligned on the
# row index) and preview the combined frame.
result = pd.concat(objs=[cleaned, y], axis="columns")
result.head()

In [None]:
# Ordinary least squares: number of casualties explained by road-surface
# indicator columns. Indicator columns that are not listed here are left out of
# the design matrix, so those categories are absorbed by the intercept.
road_surface_predictors = [
    "Road_Dry",
    "Road_Flood over 3cm. deep",
    "Road_Frost or ice",
    "Road_Snow",
    "Road_Wet or damp",
    "Road_unknown (self reported)",
]

# `pd.get_dummies` yields boolean columns on pandas >= 2.0. Together with the
# float constant added below, that gives an object-dtype design matrix, which
# statsmodels refuses to fit. Casting to float keeps the 0/1 coding and is a
# no-op for the numeric dummies produced by older pandas.
X = result[road_surface_predictors].astype(float)
y = result["number_of_casualties"]

# statsmodels does not add an intercept on its own.
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

In [None]:
# Horizontal box plot: distribution of casualties per accident for each
# road-surface label.
px.box(
    data_frame=cleaned,
    x="number_of_casualties",
    y="road_surface",
    orientation="h",
)

## Other work

1. What do we do with unknown data?
2. Do we want to show the actual descriptions instead of the numeric codes?
3. Which columns should we drop?
4. How should we categorize time? (e.g., morning/night, or by hour: 8:00 am, 9:00 am)
5. How do we build a regression using only categorical data?
6. What are our hypotheses?

In [None]:
# Map every accident at its latitude/longitude, coloured by the numeric
# `accident_severity` code.
#
# No Mapbox access token is configured in this notebook, so the base map uses
# the "open-street-map" style, which does not need one. The default style does
# need a token, and without one the points are drawn with no map underneath.
fig = px.scatter_mapbox(
    data_frame=accident_df,
    lat="latitude",
    lon="longitude",
    color="accident_severity",
    mapbox_style="open-street-map",
)
fig.show()

In [None]:
# Parallel-coordinates view linking the severity code to the casualty count;
# lines are coloured by severity code.
parallel_dimensions = ["accident_severity", "number_of_casualties"]
fig = px.parallel_coordinates(
    data_frame=cleaned,
    dimensions=parallel_dimensions,
    color="accident_severity",
)
fig.show()

In [None]:
# One-sample t-test per numeric column: does the mean of a random 50% sample
# differ from the mean of the full data set?
#
# Fixing the seed makes the drawn sample, and therefore the test result,
# reproducible across kernel restarts.
SAMPLE_SEED = 42
sample = cleaned.sample(frac=0.5, random_state=SAMPLE_SEED)
population = cleaned

# `cleaned` also holds text columns (for example the `severity` and
# `road_surface` labels added above). A mean and a t-test are only defined for
# numbers, so the test is restricted to the numeric columns; passing the whole
# frame raises a TypeError.
numeric_cols = population.select_dtypes(include="number").columns

# pandas skips NaN when computing the population mean; nan_policy="omit" makes
# scipy treat missing values in the sample the same way instead of returning
# NaN for any column that contains one.
sts.ttest_1samp(
    sample[numeric_cols],
    population[numeric_cols].mean(),
    nan_policy="omit",
)

In [None]:
# Number of accidents recorded under each road-surface label.
px.histogram(data_frame=cleaned, x="road_surface")

In [None]:
# Casualty counts for the two road-surface groups compared in the next cell.
is_dry = cleaned["road_surface"] == "Dry"
is_wet_or_damp = cleaned["road_surface"] == "Wet or damp"
dry = cleaned.loc[is_dry, "number_of_casualties"]
wod = cleaned.loc[is_wet_or_damp, "number_of_casualties"]

In [None]:
# One-way ANOVA on number of casualties: dry roads vs. wet-or-damp roads.
anova_groups = (dry, wod)
sts.f_oneway(*anova_groups)