# Import data

In [190]:
import pandas
base_data_url = "../datasets/project/road safety/road-safety-data-{}"

In [191]:
accidents_16 = pandas.read_csv(base_data_url.format("accidents-2016.csv"))
accidents_15 = pandas.read_csv(base_data_url.format("accidents-2015.csv"))
accidents_14 = pandas.read_csv(base_data_url.format("accidents-2005-2014.csv"))

In [192]:
accidents = pandas.DataFrame()
accidents = pandas.concat([accidents, accidents_15, accidents_14], axis=0)

In [193]:
# accidents

# Investigate correlations
1. Are serious or fatal accidents more likely in bad weather?
2. Are serious or fatal accidents more likely during the night?
3. Do serious or fatal accidents have, on average, a longer distance from a streetlight?
4. Do any times of day, days of week, or weeks of year have more serious or fatal accidents?

# 1: Bad weather
## Plotting

In [194]:
# Replace the weather with a number: 1 for bad weather, 0 for good
replacement_list = {'Fine + high winds': 1,
  'Fine no high winds': 0,
  'Fog or mist': 1,
  'Other': 0,
  'Raining + high winds': 1,
  'Raining no high winds': 1,
  'Snowing + high winds': 1,
  'Snowing no high winds': 1}

In [195]:
severities = ["Serious", "Slight", "Fatal"]

In [196]:
w_dict = {}
for severity in severities:
    w_dict[severity] = {}
    for weather_type in replacement_list.keys():
        matches = len(accidents[(accidents["Weather_Conditions"] == weather_type) &
                               (accidents["Accident_Severity"] == severity)])
        w_dict[severity][weather_type] = matches

In [197]:
# Plot results
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (40,20)

d_serious = w_dict["Serious"]
d_slight = w_dict["Slight"]
d_fatal = w_dict["Fatal"]


In [198]:
def plot_wtype(wtype, title):
    plt.figure()
    plt.title = title
    plt.bar(range(len(wtype)), list(wtype.values()), align='center')
    plt.xticks(range(len(wtype)), list(wtype.keys()))
    plt.show()

In [199]:
# plot_wtype(d_fatal, "Fatal")

## Hypothesis testing
$H_0$: Prop(bad, (serious|fatal)) = Prop(bad, slight)


$H_1$: Prop(bad, (serious|fatal)) > Prop(bad, slight)

For a p-value of 0.05.

In [200]:
# Replace the weather
accidents["Weather_Conditions"] = accidents["Weather_Conditions"].map(replacement_list)

In [201]:
slight_list = list(accidents[
    (accidents["Accident_Severity"] == "Slight")]
                   .Weather_Conditions)

In [202]:
bad_list = list(accidents[
    (accidents["Accident_Severity"] == ("Serious" or "Fatal"))]
                   .Weather_Conditions)

In [203]:
# Actually do the hypothesis test
from scipy.stats import ttest_ind
ttest_ind(slight_list, bad_list)

Ttest_indResult(statistic=1.5539014982290393, pvalue=0.12033987342840149)

p > 0.05, so the result is not significant and we accept the null hypothesis.

# 2: Times of day
$H_0$: Prop(serious, nighttime) = Prop(slight, nighttime)

$H_1$: Prop(serious, nighttime) > Prop(slight, nighttime)

p = 0.05.

In [273]:
accidents = pandas.concat([accidents, accidents_15, accidents_14], axis=0)

In [274]:
import datetime
from astral import Astral
city_name = 'Edinburgh'
a = Astral()
a.solar_depression = 'civil'
city = a[city_name]

def sun_at_day(day):
    sun = city.sun(date = day, local=False)
    return sun

def is_day(x):
    time_x = datetime.datetime.strptime(x + "+0000", '%d/%m/%Y %H:%M%z')
    sun = sun_at_day(time_x)
    
    
    if (sun['sunrise'] < time_x < sun['sunset']):
        return 0
    return 1

In [275]:
bad_accidents = accidents[(accidents["Accident_Severity"] == ("Serious" or "Fatal"))]
slight_accidents = accidents[(accidents["Accident_Severity"] == "Slight")]

In [276]:
bad_dt = bad_accidents.Date + " " + bad_accidents.Time
slight_dt = slight_accidents.Date + " " + slight_accidents.Time

In [277]:
bad_dt = bad_dt.map(lambda x: is_day(x))
slight_dt = slight_dt.map(lambda x: is_day(x))

In [278]:
from scipy.stats import ttest_ind
ttest_ind(bad_dt, slight_dt)

Ttest_indResult(statistic=2.8316363990551485, pvalue=0.0046406849367310807)

p < 0.05 so we reject the null hypothesis: the proportion of bad accidents during the night is greater than slight accidents during the night.

# 3: Distance to streetlight
$H_0$: avg(dist_to_streetlight(serious)) = avg(dist_to_streetlight(slight))

$H_1$: avg(dist_to_streetlight(serious)) > avg(dist_to_streetlight(slight))

p = 0.05

In [300]:
streetlights = pandas.read_csv("../datasets/project/v_street_lights.csv")

In [301]:
streetlights_coords = pandas.concat([streetlights.LATITUDE, 
                                     streetlights.LONGITUDE], axis=1)

streetlights_coords_tup = [tuple(x) for x in streetlights_coords.values]

In [309]:
from geopy.distance import vincenty

def dist_point_to_point(p1, p2):
    return vincenty(p1, p2).meters

def nearest_streetlight(point):
    print(point)
    min_dist = dist_point_to_point(point, streetlights_coords_tup[0])
    for x in streetlights_coords_tup[1:]:
        dist = dist_point_to_point(point, x)
        if dist < min_dist:
            min_dist = dist
    return min_dist

In [310]:
bad_accidents = accidents[(accidents["Accident_Severity"] == ("Serious" or "Fatal"))]
slight_accidents = accidents[(accidents["Accident_Severity"] == "Slight")]

In [318]:
bad_accidents.map(lambda x:
                 nearest_streetlight((x.Latitude, x.Longitude)))

AttributeError: 'DataFrame' object has no attribute 'map'

In [None]:
slight_accidents["min_dist"] = (bad_accidents.Longitude + bad_accidents.Latitude).map(lambda x: nearest_streetlight(x))