In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
accidents = pd.read_excel("Accidents_csv.xlsx")
accidents.head()

Unnamed: 0,Timestamp,Gender,Age Range,Regular Driver,Driving License Length,Weekly Driving Frequency,Long Distance Driving,Stop in Long Distance Trips,Involved in Traffic Accident?,Date of the accident,...,Human factor,Type of vehicle,Vehicle brand,Vehicle model,Year of Registration,Level of protection of the vehicle,Continuous time driving,Difficulty driving post pandemic,Injured people,Displacement (cc)
0,1,Woman,18 - 24,Yes,5 to 10 years,Every day of the week,Once a month,Every 2 hours,Yes,2019-12-01 00:00:00,...,,Car,Nissan,Nissan micra,2008.0,8.0,Less than 30 minutes,Yes,No,1000.0
1,2,Woman,18 - 24,No,2 to 5 years,No day of the week,I do not make long trips,I do not make long journeys,Yes,2019-06-01 00:00:00,...,Distraction,Car,Mercedes,Mercedes A180,2008.0,10.0,Less than 30 minutes,Yes,No,1991.0
2,3,Woman,18 - 24,No,Less than 2 years,Once a week,I do not make long trips,I do not make long journeys,Yes,2020-06-01 00:00:00,...,Speeding,Car,Renault,Renault Clio,2013.0,10.0,Less than 30 minutes,Yes,No,1461.0
3,4,Man,More than 54,Yes,More than 20 years,Every day of the week,Once every 2-3 months,After more than 2 hours,Yes,1975-03-01 00:00:00,...,Others,Car,BMW,BMW X3,2005.0,10.0,Less than 30 minutes,No,No,1991.0
4,5,Woman,18 - 24,Yes,5 to 10 years,Between 3 and 5 days a week,Once a month,After more than 2 hours,Yes,2018-05-01 00:00:00,...,,Car,Zip,Zip,2014.0,1.0,Less than 30 minutes,Yes,Yes,


### Lets do some probability and calculations

In [3]:
pr = dict()

In [4]:
# Count of gender
accidents["Gender"].count()

103

In [5]:
# From all my sample, how many are men
accidents[accidents['Gender'] == 'Man']["Gender"].count()

48

In [6]:
# How many are men AND involved in a traffic accident, directly or indirectly?
accidents[(accidents['Gender'] == 'Man') & (accidents['Involved in Traffic Accident?'] == 'Yes')]['Gender'].count() 

32

In [7]:
print('So, from all the men we have (48), 32 were involved directly or indirectly on a traffic accident, which is quite a high number')

So, from all the men we have (48), 32 were involved directly or indirectly on a traffic accident, which is quite a high number


In [8]:
# To compute the proportion of men involved in an accident, we divide the number of men involved in an accident over the total men of my sample
men_accident = (accidents[(accidents['Gender'] == 'Man') & (accidents['Involved in Traffic Accident?'] == 'Yes')]['Gender'].count())/(accidents[accidents['Gender'] == 'Man']["Gender"].count())
men_accident

0.6666666666666666

In [9]:
print('The probability, according to my sample available, of being a man and having a traffic accident is 0,66, which is quite a high number. Lets check for women')

The probability, according to my sample available, of being a man and having a traffic accident is 0,66, which is quite a high number. Lets check for women


In [10]:
# Same process with women. Number of women involved in a traffic accident over the total women of my sample
women = (accidents[accidents['Gender'] == 'Woman']["Gender"].count())
women_accident = (accidents[(accidents['Gender'] == 'Woman') & (accidents['Involved in Traffic Accident?'] == 'Yes')]['Gender'].count())/(accidents[accidents['Gender'] == 'Woman']["Gender"].count())
print(women)
print(women_accident)

55
0.7454545454545455


In [11]:
print('Cool result! It shows that, from my sample, almost a 75% of women were directly or indirectly involved in a traffic accident')

Cool result! It shows that, from my sample, almost a 75% of women were directly or indirectly involved in a traffic accident


In [12]:
print('We can apply some Bayes Theorem here')

We can apply some Bayes Theorem here


In [13]:
# Probability of having an accident, regardless of the gender
involvement = accidents[accidents['Involved in Traffic Accident?'] == 'Yes']['Involved in Traffic Accident?'].count()
involvement

73

In [14]:
73/103

0.7087378640776699

 ### Probability of being a man/woman and having an accident - Naive Bayes Theorem

In [15]:
pr = dict()
pr["Woman"] = 0.53
pr["Man"] = 0.47
pr["Having an Accident"] = 0.71
pr["Accident | Woman"] = 0.75
pr["Accident | Man"] = 0.66

In [16]:
print('Bayes Theorem: The probability of having an accident being a man is: P(Man|Accident) = (P(Man)*P(Accident|Man))/(P(Accident))')

Bayes Theorem: The probability of having an accident being a man is: P(Man|Accident) = (P(Man)*P(Accident|Man))/(P(Accident))


In [17]:
Accident_Man = (pr['Man']*pr['Accident | Man'])/(pr['Having an Accident'])
Accident_Man

0.4369014084507042

In [18]:
Accident_Woman = (pr['Woman']*pr['Accident | Woman'])/(pr['Having an Accident'])
Accident_Woman

0.5598591549295775

In [19]:
print('So, from the sample, the conditional probability of having an accident when being a man is 0.43 (43%), and when being a woman is 0.55 (55%), which is significantly higher')

So, from the sample, the conditional probability of having an accident when being a man is 0.43 (43%), and when being a woman is 0.55 (55%), which is significantly higher


### Probability of suffering injuries in an accident with a certain level of cylinder capacity

In [20]:
print('First, some previous calculus')

First, some previous calculus


In [21]:
# Lets see the number of people who had an accident and, sadly, there were injured
(accidents[accidents['Injured people'] == 'Yes']['Injured people'].count())

31

In [22]:
# From people who had an accident, how much there were injured?
31/73

0.4246575342465753

In [23]:
# Proportion of people injured (for future uses)
print('So, from the 73 people involved in a traffic accident, 31 were injured, which is a proportion of a 42%')

So, from the 73 people involved in a traffic accident, 31 were injured, which is a proportion of a 42%


In [24]:
# Probability of drive a car with a cylinder higher than 2500 cc
cylinder = accidents[accidents['Displacement (cc)'] < 2000]['Displacement (cc)'].count()
cylinder

53

In [25]:
# From all the cars available and involved in an accident, which ones had a low cilinder?
53/73

0.726027397260274

In [26]:
# Probability of having injured people and a cylinder lower than 2000 cc
injured_cylinder = (accidents[(accidents['Injured people'] == 'Yes') & (accidents['Displacement (cc)'] < 2000.0)]['Injured people'].count())
injured_cylinder

18

In [27]:
# Probability of being injured with a low cylinder  
18/31

0.5806451612903226

In [28]:
pr = dict()
pr["Injured"] = 0.42
pr["Cylinder < 2500 cc"] = 0.73
pr["Injured | Cylinder < 2500 cc"] = 0.58
pr["Injured | Cylinder > 2500 cc"] = 0.42

In [29]:
# Same procedure, applying Naive Bayes Theorem, the Probability of having an accident with a cylinder lower than 2000, the probability of suffering an accident with injured people is:
lowcylinder_injured = (pr['Injured']*pr["Injured | Cylinder < 2500 cc"])/(pr["Cylinder < 2500 cc"])
lowcylinder_injured

0.3336986301369863

In [30]:
print('There is a 33% of Probability to suffer an accident with injured people when the cylinder is lower than 2500 cc')

There is a 33% of Probability to suffer an accident with injured people when the cylinder is lower than 2500 cc


### Probability of suffering an accident due to speeding for people older than 54 years

In [31]:
# From all my sample, how many are 54 years old
accidents[accidents['Age Range'] == 'More than 54']["Age Range"].count()

11

In [32]:
# So, from all my sample (103), 11 are 54 years old
11/103

0.10679611650485436

In [33]:
# Out of all who suffered a traffic accident, how many are 54 years old? Lets remind that only 73 people suffered an accident
11/73

0.1506849315068493

In [34]:
# Out of all the accidents, which were caused by speeding?
accidents[accidents['Human factor'] == 'Speeding']['Human factor'].count()

14

In [35]:
14/73

0.1917808219178082

In [36]:
human_54 = (accidents[(accidents['Age Range'] == 'More than 54') & (accidents['Human factor'] == 'Speeding')]['Age Range'].count())
human_54

2

In [37]:
# So, from all the accidents caused by Human Factor, how many were 54 years old or more?
2/14

0.14285714285714285

In [38]:
pr = dict()
pr["Speeding"] = 0.19
pr["54 years old"] = 0.11
pr["54 years old | Speeding"] = 0.14

In [39]:
# Same procedure, applying Naive Bayes Theorem, the Probability of having an accident with a cylinder lower than 2000, the probability of suffering an accident with injured people is:
older_speeding = (pr['54 years old']*pr["54 years old | Speeding"])/(pr["Speeding"])
older_speeding

0.08105263157894738

In [40]:
print('Not really an interesting result, it just shows that the probability of suffering an accident due to speed when being older than 54 y/o is very low, maybe they are more prudent')

Not really an interesting result, it just shows that the probability of suffering an accident due to speed when being older than 54 y/o is very low, maybe they are more prudent


### Probability for young people of suffering an accident due to distraction

In [41]:
# From all my sample, how many are 54 years old
accidents[accidents['Age Range'] == '18 - 24']["Age Range"].count()

58

In [42]:
# So from the 73 people who suffered an accident, 58 were between 18-24 years old
# It doesnt mean that just young people suffer accidents since we are talking about SAMPLES
58/73

0.7945205479452054

In [43]:
# Now, lets compute accidents due to distraction
accidents[accidents['Human factor'] == 'Distraction']['Human factor'].count()

41

In [44]:
41/73

0.5616438356164384

In [45]:
# Now, young people that suffer accidents due to distraction
human_young = (accidents[(accidents['Age Range'] == '18 - 24') & (accidents['Human factor'] == 'Distraction')]['Age Range'].count())
human_young

23

In [46]:
22/73

0.3013698630136986

In [47]:
pr = dict()
pr["Distraction"] = 0.56
pr["Young Age"] = 0.8
pr["Young Age | Distraction"] = 0.30

In [48]:
older_human = (pr['Young Age']*pr["Young Age | Distraction"])/(pr["Distraction"])
older_human

0.4285714285714285

In [49]:
print('So, according to the sample, the probability of suffering a car accident with an age range from 18-24 years old due to distraction is a 43%')

So, according to the sample, the probability of suffering a car accident with an age range from 18-24 years old due to distraction is a 43%


### Probability of drive for more than two following hours and have an accident

In [50]:
# From my sample, we already know that 73 people out of 103 have had an accident. Proportionally, that is:
73/103

0.7087378640776699

In [51]:
# Now, which proportion of people has been driving for more than two hours?
accidents[accidents['Continuous time driving'] == 'More than two hours']["Continuous time driving"].count()

5

In [52]:
5/73

0.0684931506849315

In [53]:
more_2hours = (accidents[(accidents['Involved in Traffic Accident?'] == 'Yes') & (accidents['Continuous time driving'] == 'More than two hours')]['Involved in Traffic Accident?'].count())
more_2hours

5

In [54]:
print('Every person who drove more than two hours was involved in a traffic accident... wow*')

Every person who drove more than two hours was involved in a traffic accident... wow*


### Probability of having a car accident due to speed with a high cylindric level

In [55]:
# From previous exercises, we already know that the probability of having a car accident due to speediding is 0.19
pr = dict()
pr["Speeding"] = 0.19

In [56]:
# We also know the probability of having a low cylinder level (below 2500 cc), so we can compute the opposite for a high level
pr["Cylinder < 2500 cc"] = 0.73
pr["Cylinder > 2500 cc"] = 1-0.73
pr["Cylinder > 2500 cc"]

0.27

In [57]:
# Now, conditional probability;
cy_speed = (accidents[(accidents['Human factor'] == 'Speeding') & (accidents['Displacement (cc)'] >= 2500)]['Human factor'].count())
cy_speed

2

In [58]:
print('From the 14 accidents caused by speeding, 2 involved cars with a high CC (every car with CC > 2500 had accident due to speed*')
5/5

From the 14 accidents caused by speeding, 2 involved cars with a high CC (every car with CC > 2500 had accident due to speed*


1.0

### Probability of having the driving license for less than two years and suffering a difficulty post pandemic

In [59]:
accidents.head()

Unnamed: 0,Timestamp,Gender,Age Range,Regular Driver,Driving License Length,Weekly Driving Frequency,Long Distance Driving,Stop in Long Distance Trips,Involved in Traffic Accident?,Date of the accident,...,Human factor,Type of vehicle,Vehicle brand,Vehicle model,Year of Registration,Level of protection of the vehicle,Continuous time driving,Difficulty driving post pandemic,Injured people,Displacement (cc)
0,1,Woman,18 - 24,Yes,5 to 10 years,Every day of the week,Once a month,Every 2 hours,Yes,2019-12-01 00:00:00,...,,Car,Nissan,Nissan micra,2008.0,8.0,Less than 30 minutes,Yes,No,1000.0
1,2,Woman,18 - 24,No,2 to 5 years,No day of the week,I do not make long trips,I do not make long journeys,Yes,2019-06-01 00:00:00,...,Distraction,Car,Mercedes,Mercedes A180,2008.0,10.0,Less than 30 minutes,Yes,No,1991.0
2,3,Woman,18 - 24,No,Less than 2 years,Once a week,I do not make long trips,I do not make long journeys,Yes,2020-06-01 00:00:00,...,Speeding,Car,Renault,Renault Clio,2013.0,10.0,Less than 30 minutes,Yes,No,1461.0
3,4,Man,More than 54,Yes,More than 20 years,Every day of the week,Once every 2-3 months,After more than 2 hours,Yes,1975-03-01 00:00:00,...,Others,Car,BMW,BMW X3,2005.0,10.0,Less than 30 minutes,No,No,1991.0
4,5,Woman,18 - 24,Yes,5 to 10 years,Between 3 and 5 days a week,Once a month,After more than 2 hours,Yes,2018-05-01 00:00:00,...,,Car,Zip,Zip,2014.0,1.0,Less than 30 minutes,Yes,Yes,


In [60]:
accidents[accidents['Driving License Length'] == '2 to 5 years']['Driving License Length'].count()

44

In [61]:
# Out of 103 people, 44 have had the driving license from 2 to 5 years (during pandemic period)
44/103

0.42718446601941745

In [62]:
# Difficulty experienced post pandemic?
accidents[accidents['Difficulty driving post pandemic'] == 'Yes']['Difficulty driving post pandemic'].count()

59

In [63]:
# From 103 people, 59 experienced difficulties driving when pandemic ended (more than the half of the sample)
59/103

0.5728155339805825

In [64]:
pandemic = (accidents[(accidents['Driving License Length'] == '2 to 5 years') & (accidents['Difficulty driving post pandemic'] == 'Yes')]['Driving License Length'].count())
pandemic

33

In [65]:
# 33 people out of the 44 with a driving licence from 2-5 years have experienced difficulty driving post pandemic
33/44

0.75

In [66]:
pr = dict()
pr["Driving License 2-5 years"] = 0.43
pr["Difficulty Pandemic"] = 0.57
pr["Driving License 2-5 years | Difficulty Pandemic"] = 0.75

In [67]:
young_pand = (pr["Driving License 2-5 years"]*pr["Driving License 2-5 years | Difficulty Pandemic"])/(pr["Difficulty Pandemic"])
young_pand

0.5657894736842106

In [68]:
print('There is a 57% of probability to have a driving license for less than 5 years and have experienced difficulty driving post pandemic, according to our sample')

There is a 57% of probability to have a driving license for less than 5 years and have experienced difficulty driving post pandemic, according to our sample


In [69]:
# Correlación entre nivel de protección y cilindrada
# correlacion entre cilindrada alta y accidente
# Correlación entre marca y accidente

In [70]:
# No utilizar tanta probabilidad, si no más CI e Hypothesis testing ya que al fin y al cabo estamos hablando de una muestra
# Por ejemplo, que posibilidad hay que, siendo mujer, sufra un accidente. No se con exactitud al tratarse de una muestra,
# pero si que lo puedo estimar

### Hypothesis Testing

In [71]:
import numpy as np 
from scipy.stats import t
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

In [72]:
#!pip install --upgrade scipy

In [73]:
# Significance: Probability of rejecting the null if it happens to be true -- most damaging error
# We set the significance depending on how damaging can it be to get it wrong
# alpha = 5%

#### More than half of the sample are involved in a traffic accident have high CC

In [84]:
# h0: The people involved in traffic accidents were driving a car with high CC equal 2500
# h1: The people involved in traffic accidents were NOT driving a car with high CC different 2500
accidents.dropna()
involvement = accidents[(accidents['Involved in Traffic Accident?'] == 'Yes') & (accidents['Displacement (cc)'] < 2000.0)]['Displacement (cc)']
involvement

0      1000.0
1      1991.0
2      1461.0
3      1991.0
7      1368.0
11     1999.0
12     1498.0
15     1905.0
17     1905.0
19     1149.0
20     1560.0
22     1991.0
24     1991.0
26     1400.0
28     1149.0
32     1422.0
34     1400.0
40     1390.0
41     1200.0
42     1991.0
44     1598.0
46     1200.0
49     1242.0
52     1910.0
53     1560.0
54     1498.0
56     1364.0
57     1600.0
59     1600.0
60     1560.0
61     1600.0
62     1198.0
63     1800.0
64     1796.0
65     1400.0
67     1698.0
68     1995.0
70      998.0
71     1598.0
75     1984.0
76     1595.0
77     1600.0
78     1197.0
79     1595.0
80     1998.0
82     1200.0
86     1120.0
88     1600.0
94     1390.0
97     1998.0
98     1900.0
99     1587.0
102    1386.0
Name: Displacement (cc), dtype: float64

In [88]:
st.ttest_1samp(involvement, 2000)

Ttest_1sampResult(statistic=-10.469560077080121, pvalue=2.0807279329807654e-14)

In [89]:
st.ttest_1samp(involvement, 2000, alternative='greater')

Ttest_1sampResult(statistic=-10.469560077080121, pvalue=0.9999999999999896)

In [90]:
st.ttest_1samp(involvement, 2000, alternative='less')

Ttest_1sampResult(statistic=-10.469560077080121, pvalue=1.0403639664903827e-14)

In [78]:
print("P is not low, so it can't go! Under a significance level of a 5%, we can not reject H0, which states that people who drive cars with a high cylindric level (higher than 2500), are more likely to be involved in traffic accidents")

P is not low, so it can't go! Under a significance level of a 5%, we can not reject H0, which states that people who drive cars with a high cylindric level (higher than 2500), are more likely to be involved in traffic accidents


#### Out of the people involved in traffic accident, more than the half are man

In [79]:
st.ttest_1samp(involvement, 50)

Ttest_1sampResult(statistic=37.35263738606945, pvalue=9.245220918304062e-43)