In [1]:
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

%matplotlib  inline               
import pandas as pd
import numpy as np

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import folium
import folium.plugins 

#pd.set_option('html', False)  
# unknown error, skip this step
pd.set_option('max_columns', 30)
pd.set_option('max_rows', 20)

from sklearn.linear_model import LinearRegression
from sklearn import linear_model

from sklearn.metrics import r2_score as Rsq
from sklearn.metrics import mean_squared_error as MSE
from math import sqrt

## 1. Based on organzied csv files, get a table about all vehicle collisions from 2014 to 2017

### 1.1 combine vehicle collisions csv files from 2014 to 2017 (in the order of time)

In [2]:
df_csv1417 = pd.DataFrame()

for y in range(2014, 2018):
    fname = "collision%d.csv" % (y)
    tmp_df = pd.read_csv(fname)
    tmp_df["Year"] = y
    df_csv1417 = df_csv1417.append(tmp_df, sort = False)

df_csv1417.index = range(df_csv1417.shape[0])
df_csv1417["Date"] = pd.to_datetime(df_csv1417["Date"])

df_csv1417["Month"] = df_csv1417["Date"].apply(lambda x: x.month)

def DayOfWeek(date):
    dw = date.weekday()
    rst = {
        0: 'T1_Mon',
        1: 'T2_Tue',
        2: 'T3_Wed',
        3: 'T4_Thu',
        4: 'T5_Fri',
        5: 'T6_Sat',
        6: 'T7_Sun'
    }[dw]
    return rst

df_csv1417["Day_Of_Week"] = df_csv1417["Date"].apply(lambda x: DayOfWeek(x))

df_csv1417["Environment"] = df_csv1417["Environment"].astype("category")
df_csv1417["Road_Surface"] = df_csv1417["Road_Surface"].astype("category")
df_csv1417["Traffic_Control"] = df_csv1417["Traffic_Control"].astype("category")
df_csv1417["Collision_Location"] = df_csv1417["Collision_Location"].astype("category")
df_csv1417["Light"] = df_csv1417["Light"].astype("category")
df_csv1417["Collision_Classification"] = df_csv1417["Collision_Classification"].astype("category")
df_csv1417["Impact_type"] = df_csv1417["Impact_type"].astype("category")

df_csv1417.head(3)

Unnamed: 0,Record,Location,Date,Time,Environment,Road_Surface,Traffic_Control,Collision_Location,Light,Collision_Classification,Impact_type,longitude,latitude,Year,Month,Day_Of_Week
0,2014000001,RIDEAU ST @ WALLER ST,2014-02-21,06:07:00,02 - Rain,02 - Wet,01 - Traffic signal,03 - At intersection,07 - Dark,01 - Fatal injury,07 - SMV other,-75.688726,45.427533,2014,2,T5_Fri
1,2014000002,HINES RD btwn INNOVATION DR & SOLANDT RD,2014-08-02,13:55:00,01 - Clear,01 - Dry,10 - No control,04 - At/near private drive,01 - Daylight,01 - Fatal injury,02 - Angle,-75.921033,45.343152,2014,8,T6_Sat
2,2014000003,LOGGERS WAY btwn KINGDON MINE RD & GALETTA SID...,2014-06-20,23:15:00,01 - Clear,01 - Dry,10 - No control,01 - Non intersection,07 - Dark,01 - Fatal injury,07 - SMV other,-76.247045,45.438627,2014,6,T5_Fri


In [3]:
df_csv1417.tail(3)

Unnamed: 0,Record,Location,Date,Time,Environment,Road_Surface,Traffic_Control,Collision_Location,Light,Collision_Classification,Impact_type,longitude,latitude,Year,Month,Day_Of_Week
58335,2017014392,YORK ST WB btwn TO BE DETERMINED & CUMBERLAND ST,2017-01-23,20:30:00,01 - Clear,01 - Dry,10 - No control,01 - Non intersection,07 - Dark,03 - P.D. only,06 - SMV unattended vehicle,-75.689421,45.429755,2017,1,T1_Mon
58336,2017014393,YORKS CORNERS RD btwn PANA RD & VICTORIA ST,2017-07-18,18:03:00,01 - Clear,01 - Dry,10 - No control,01 - Non intersection,01 - Daylight,03 - P.D. only,07 - SMV other,-75.431838,45.258973,2017,7,T2_Tue
58337,2017014394,YORKS CORNERS RD btwn PARKWAY RD & COOPER HILL RD,2017-03-23,16:21:00,01 - Clear,01 - Dry,10 - No control,01 - Non intersection,01 - Daylight,02 - Non-fatal injury,07 - SMV other,-75.453559,45.299801,2017,3,T4_Thu


### 1.2 append collision table with daily weather, and mark holidays

In [4]:
wt1417 = pd.read_csv("weather1417.csv")

casesByDay = df_csv1417.groupby(['Date']).size().tolist()
wt1417["Number of Collisions"] = casesByDay

wt1417["Date"] = pd.to_datetime(wt1417["Date"])

wt1417["Year"] = wt1417["Date"].apply(lambda x: x.year)
wt1417["Month"] = wt1417["Date"].apply(lambda x: x.month)
wt1417["Day in month"] = wt1417["Date"].apply(lambda x: x.day)

hd1417 = pd.read_csv('holiday1417.csv')
hd1417["Date"] = pd.to_datetime(hd1417["Date"])

for idx, row in hd1417.iterrows():
    wt1417.loc[wt1417["Date"] == row['Date'], 'Holiday'] = row['Holiday']
    wt1417.loc[wt1417["Date"] == row['Date'], 'Type of holiday'] = row['Type of holiday']
    df_csv1417.loc[df_csv1417["Date"] == row['Date'], 'Holiday'] = row['Holiday']

wt1417["Holiday"] = wt1417["Holiday"].astype("category")
df_csv1417["Holiday"] = df_csv1417["Holiday"].astype("category")

wt1417["Day_Of_Week"] = wt1417["Date"].apply(lambda x: DayOfWeek(x))

wt1417.head(3)

Unnamed: 0,Date,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Total Rain (mm),Total Snow (cm),Total Precip (mm),Snow on Grnd (cm),Number of Collisions,Year,Month,Day in month,Holiday,Type of holiday,Day_Of_Week
0,2014-01-01,-19.2,-23.0,-21.1,0.0,0.0,0.0,30,31,2014,1,1,New Year Day,onHoliday,T3_Wed
1,2014-01-02,-20.8,-27.1,-24.0,0.0,0.0,0.0,29,74,2014,1,2,the second day of new year,byHoliday,T4_Thu
2,2014-01-03,-21.0,-28.1,-24.6,0.0,0.3,0.0,29,151,2014,1,3,the workday after holidays,postHoliday,T5_Fri


In [5]:
hd1417 = pd.read_csv('holiday1417.csv')
hd1417

Unnamed: 0,Date,Holiday,Day_of_week,Type of holiday
0,1/1/2014,New Year Day,Wednesday,onHoliday
1,1/2/2014,the second day of new year,Thursday,byHoliday
2,1/3/2014,the workday after holidays,Wednesday,postHoliday
3,2/14/2014,the workday before holidays,Friday,preHoliday
4,2/15/2014,weekend close to holiday,Saturday,byHoliday
5,2/16/2014,weekend close to holiday,Sunday,byHoliday
6,2/17/2014,Family Day,Monday,onHoliday
7,2/18/2014,the workday after holidays,Tuesday,postHoliday
8,4/17/2014,the workday before holidays,Thurday,preHoliday
9,4/18/2014,Good Friday,Friday,onHoliday


In [6]:
wt1417.tail(3)

Unnamed: 0,Date,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Total Rain (mm),Total Snow (cm),Total Precip (mm),Snow on Grnd (cm),Number of Collisions,Year,Month,Day in month,Holiday,Type of holiday,Day_Of_Week
1458,2017-12-29,-17.5,-26.2,-21.9,0.0,0.0,0.0,20,63,2017,12,29,the workday before holidays,preHoliday,T5_Fri
1459,2017-12-30,-16.0,-27.0,-21.5,0.0,1.0,0.4,19,47,2017,12,30,weekend close to holiday,byHoliday,T6_Sat
1460,2017-12-31,-19.5,-24.5,-22.0,0.0,0.0,0.0,20,43,2017,12,31,weekend close to holiday,byHoliday,T7_Sun


## 2. Based on the table we get, do analysis about single variables

### 2.1 Overview

In [7]:
countOfCollision = df_csv1417.groupby(['Year']).size()
growthRate = (countOfCollision.pct_change()*100).apply(lambda x: '' if pd.isna(x) else format(x,'.2f') + '%')
barOfYear = go.Bar(x = range(2014,2018), 
                   y = countOfCollision,
                   text = countOfCollision,
                   textposition = 'auto',
                   opacity=0.6, 
                   name = 'Number of vehicle collisions in year')
scatterOfYear = go.Scatter(x = range(2014,2018), 
                           y = countOfCollision,
                           text = growthRate,
                           mode = 'lines+text+markers',
                           textposition = 'top center',
                           name = 'Growth rate of vehicle collisions (compared with the previous year)')

fig = dict(data = [barOfYear,scatterOfYear], 
           layout = go.Layout(title = 'Number of vehicle collisions in year', xaxis=dict(title='Year', dtick=1)))
iplot(fig)

print("Total number of cases: " + str(countOfCollision.sum()))

Total number of cases: 58338


**Brief Conclusion:**
* From January 1, 2014 to December 31, 2017, there were **58338** vehicle collision cases happened in Ottawa. 
* The differences of vehicle collision cases during 4 years are small. 
* There is a relatively big decrease of the number of cases between 2015 and 2016(-7.0%), but the rest of growth rates between years are increase. 

**Hypothesis:**
* There's no relationship between year and vehicle collisions, or to say driving in a specific year does not lead to a vehicle collision. 

### 2.2 Percentage of single possible causes of accident

In [8]:
def analysisInSinglePercent(Col): 
    count = '# of Cases'
    df_cause = pd.crosstab(index = df_csv1417[Col], columns = count, margins = False)
    df_cause[Col] = df_cause.index
    sumOf = df_cause[count].sum()
    df_cause['Percentage'] = df_cause[count]/sumOf*100
    df_cause = df_cause[[Col, count, 'Percentage']]
    df_cause = df_cause.sort_values([count], ascending=[0])
    
    tab = go.Table(header = dict(values = ['<b>' + a + '</b>' for a in df_cause.columns.tolist()]),
                   domain = dict(x=[0, 0.4], y=[0,1.0]),
                   columnwidth = [110,55,55], 
                   cells = dict(values = [df_cause[b].tolist() for b in df_cause.columns.tolist()],
                                font = dict(color=['rgb(40, 40, 40)'] * 5, size=12),
                                format = [None] + [None] + ['.2f'],
                                suffix = ["", "", "%"], 
                                align = ["left", "right", "right"]),
                  )
    pie = go.Pie(values = df_cause[count], 
                 labels = df_cause.index, 
                 name = df_cause.index.name,
                 domain=dict(x=[0.6, 1.0], y=[0, 1.0]),
                 hoverinfo = "label+percent+name", hole = .4, showlegend = True)
    fig = dict(data = [tab,pie], 
               layout = go.Layout(title = Col))
    iplot(fig)
    
for n in ['Environment', 'Road_Surface', 'Traffic_Control', 'Collision_Location', \
          'Light', 'Collision_Classification', 'Impact_type', 'Month', "Day_Of_Week", "Holiday"]:
    analysisInSinglePercent(n) 

**Brief Conclusion - Most of car collisions happen in:**
* environment: clear (78.73%)
* road surface: dry (65.76%)
* traffic control: no traffic control (47.97%)
* collision location: at intersection or intersection related (21.47%+31.34% = 52.81%)
* light condition: daylight (68.36%)
* collision classification: P.D. only (80.99%)
* impact type: varies
* month: December, January, February (11.14%+10.37%+9.90% = 31.41%)
* day of week: workdays, the trend keeps rising from Monday to Friday
* holiday: Boxing Day (17.74%), Family Day (13.14%), New Years Day (12.09%)

**Hypothesis:**
* Those weather that seems common, such as clear environment and dry road surface might lead drivers to be careless(it might be not a dependable factor: the proportions of those weathers such as clear days and dry road surfaces is much more bigger than the proportions of wet road surface and snowy environment). 
* A place without traffic control might lead drivers to ignore some details that perhaps cause accidents, or some drivers could break the rules. 
* The location where is close to of at intersection requires drivers to control their wheels and gases carefully due to the heavy traffic and roads to different directions. Under this kind of stressful situation, people without enough driving experience are easy to make mistakes. 
* Most of collisions did not cause injuries, and the impact type varies, either. P.D. only collisions are highly related to SMV other and rear end impacts. 
* December, January and February are in winter of Ottawa. The season of winter in Ottawa is cold, so that this season causes many potential factors of collision: snow and ice creates terrible road surface condition that let cars slip, low temperature makes drivers' reaction slower, snow and long night reduces their visability, etc. 
* Objectively, the traffic is heavier on workdays in a week. The stress of people on workdays increases step by step, and the proportion of collisions is also divided by days of week step by step.
* Boxing Day, Family Day and New Years Day include multiple possible factors that cause collisions: time is in winter, heavy traffic, many crowded people, used to gather at places like supermarkets with complicated routes. 

### 2.3 Temperature & Weather Related Variables

In [9]:
df_avg = wt1417.groupby(["Year","Month"]).mean()
df_avg["Year"] = [x[0] for x in df_avg.index]
df_avg["Month"] = [x[1] for x in df_avg.index]

df_sum = wt1417.groupby(["Year","Month"]).sum()
df_sum["Year"] = [x[0] for x in df_sum.index]
df_sum["Month"] = [x[1] for x in df_sum.index]

def printTemInYear(n):
    monthCase = go.Bar(x = df_sum[df_sum["Year"]==n]["Month"],
                       y = df_sum[df_sum["Year"]==n]["Number of Collisions"],
                       text = df_sum[df_sum["Year"]==n]["Number of Collisions"],
                       textposition = 'auto',
                       opacity = 0.6,
                       name = 'Number of Collisions')
    max_temp = go.Scatter(x = df_avg[df_avg["Year"]==n]["Month"],
                         y = df_avg[df_avg["Year"]==n]["Max Temp (°C)"],
                         mode = 'lines',
                         name = 'Monthly Mean Max Temp',
                         xaxis = 'x2', 
                         yaxis = 'y2')
    min_temp = go.Scatter(x = df_avg[df_avg["Year"]==n]["Month"],
                         y = df_avg[df_avg["Year"]==n]["Min Temp (°C)"],
                         mode = 'lines',
                         name = 'Monthly Mean Min Temp',
                         fill = 'tonexty', 
                         xaxis = 'x2', 
                         yaxis = 'y2')
    fig = dict(data = [monthCase, max_temp,min_temp], 
               layout = go.Layout(title = "Temperature Analysis (" + str(n) + ")",
                                  xaxis = dict(title='Month', dtick=1),
                                  yaxis = dict(title='# of Collisions', dtick=500, domain = [0,0.5]),
                                  yaxis2 = dict(title='Temp (°C)', dtick=5, domain = [0.55,1.0],autorange='reversed'),
                                  xaxis2 = dict(dtick=1, anchor = 'y2')
                                 )
              )
    iplot(fig)
    
for m in range(2014,2018):
    printTemInYear(m)
    
print("The correlation between number of collisions and mean temperature: \n" 
      + "all months = %.4f" % df_avg["Number of Collisions"].corr(df_avg["Mean Temp (°C)"]) + "\n" 
      + "In each month: " + "\n")

def corrInMonth(l):
      s = {
        1: 'January',
        2: 'February',
        3: 'March',
        4: 'April',
        5: 'May',
        6: 'June',
        7: 'July',
        8: 'August',
        9: 'September',
       10: 'October',
       11: 'November',
       12: 'December'
      }[l]
      print(s + " = %.4f" % df_avg[df_avg["Month"] == l]["Number of Collisions"].corr(df_avg["Mean Temp (°C)"]))
        
for f in range(1,13):
    corrInMonth(f)

The correlation between number of collisions and mean temperature: 
all months = -0.7044
In each month: 

January = -0.9936
February = -0.9209
March = -0.7807
April = -0.8263
May = 0.9109
June = -0.4081
July = -0.0211
August = -0.5915
September = 0.3481
October = -0.1837
November = -0.7828
December = -0.9908


**Brief Conclusion:**
* The number of collisions in winter is always higher than other months in a year. 
* The correlation between the number of collisions and month becomes extremely low in winter. 

**Hypothesis:**
* Extreme low temperature leads to vehicle collisions, such as vehicle faults and several weathers that are related to low temperature. 

In [10]:
df_avg = wt1417.groupby(["Year","Month"]).mean()
df_avg["Year"] = [x[0] for x in df_avg.index]
df_avg["Month"] = [x[1] for x in df_avg.index]

df_sum = wt1417.groupby(["Year","Month"]).sum()
df_sum["Year"] = [x[0] for x in df_sum.index]
df_sum["Month"] = [x[1] for x in df_sum.index]

def printRainInYear(n):
    monthCase = go.Bar(x = df_sum[df_sum["Year"]==n]["Month"],
                       y = df_sum[df_sum["Year"]==n]["Number of Collisions"],
                       text = df_sum[df_sum["Year"]==n]["Number of Collisions"],
                       textposition = 'auto',
                       opacity = 0.6,
                       name = 'Number of Collisions')
    total_rain = go.Scatter(x = df_sum[df_sum["Year"]==n]["Month"],
                         y = df_sum[df_sum["Year"]==n]["Total Rain (mm)"],
                         mode = 'lines',
                         name = 'Monthly Total Rain',
                         xaxis = 'x2', 
                         yaxis = 'y2')
    fig = dict(data = [monthCase, total_rain], 
               layout = go.Layout(title = "Rain Analysis (" + str(n) + ")",
                                  xaxis = dict(title='Month', dtick=1),
                                  yaxis = dict(title='# of Collisions', dtick=500, domain = [0,0.5]),
                                  yaxis2 = dict(title='Total Rain (mm)', domain = [0.55,1.0]),
                                  xaxis2 = dict(dtick=1, anchor = 'y2')
                                 )
              )
    iplot(fig)
    
for m in range(2014,2018):
    printRainInYear(m)

**Brief Conclusion:**
* No clear relationship between rain and collisions

In [11]:
df_avg = wt1417.groupby(["Year","Month"]).mean()
df_avg["Year"] = [x[0] for x in df_avg.index]
df_avg["Month"] = [x[1] for x in df_avg.index]

df_sum = wt1417.groupby(["Year","Month"]).sum()
df_sum["Year"] = [x[0] for x in df_sum.index]
df_sum["Month"] = [x[1] for x in df_sum.index]

def printSnowInYear(n):
    monthCase = go.Bar(x = df_sum[df_sum["Year"]==n]["Month"],
                       y = df_sum[df_sum["Year"]==n]["Number of Collisions"],
                       text = df_sum[df_sum["Year"]==n]["Number of Collisions"],
                       textposition = 'auto',
                       opacity = 0.6,
                       name = 'Number of Collisions')
    total_snow = go.Scatter(x = df_sum[df_sum["Year"]==n]["Month"],
                         y = df_sum[df_sum["Year"]==n]["Total Snow (cm)"],
                         mode = 'lines',
                         name = 'Monthly Total Snow',
                         xaxis = 'x2', 
                         yaxis = 'y2')
    fig = dict(data = [monthCase, total_snow], 
               layout = go.Layout(title = "Snow Analysis (" + str(n) + ")",
                                  xaxis = dict(title='Month', dtick=1),
                                  yaxis = dict(title='# of Collisions', dtick=500, domain = [0,0.5]),
                                  yaxis2 = dict(title='Total Snow (cm)', domain = [0.55,1.0]),
                                  xaxis2 = dict(dtick=1, anchor = 'y2')
                                 )
              )
    iplot(fig)
    
for m in range(2014,2018):
    printSnowInYear(m)

In [12]:
df_avg = wt1417.groupby(["Year","Month"]).mean()
df_avg["Year"] = [x[0] for x in df_avg.index]
df_avg["Month"] = [x[1] for x in df_avg.index]

df_sum = wt1417.groupby(["Year","Month"]).sum()
df_sum["Year"] = [x[0] for x in df_sum.index]
df_sum["Month"] = [x[1] for x in df_sum.index]

def printSnowOnGrndInYear(n):
    monthCase = go.Bar(x = df_sum[df_sum["Year"]==n]["Month"],
                       y = df_sum[df_sum["Year"]==n]["Number of Collisions"],
                       text = df_sum[df_sum["Year"]==n]["Number of Collisions"],
                       textposition = 'auto',
                       opacity = 0.6,
                       name = 'Number of Collisions')
    total_snowOnGrnd = go.Scatter(x = df_sum[df_sum["Year"]==n]["Month"],
                         y = df_sum[df_sum["Year"]==n]["Snow on Grnd (cm)"],
                         mode = 'lines',
                         name = 'Monthly Total Snow on Ground',
                         xaxis = 'x2', 
                         yaxis = 'y2')
    fig = dict(data = [monthCase, total_snowOnGrnd], 
               layout = go.Layout(title = "Snow on Ground Analysis (" + str(n) + ")",
                                  xaxis = dict(title='Month', dtick=1),
                                  yaxis = dict(title='# of Collisions', dtick=500, domain = [0,0.5]),
                                  yaxis2 = dict(title='Snow on Grnd (cm)', domain = [0.55,1.0]),
                                  xaxis2 = dict(dtick=1, anchor = 'y2')
                                 )
              )
    iplot(fig)
    
for m in range(2014,2018):
    printSnowOnGrndInYear(m)

**Brief Conclusion:**
* The curve of monthly snowfall and snow on ground is similar to the trend of monthly collisions. 

**Hypothesis:**
* Snowy weather causes collisions from the perspective of reducing visibility and grip. 

In [13]:
df_avg = wt1417.groupby(["Year","Month"]).mean()
df_avg["Year"] = [x[0] for x in df_avg.index]
df_avg["Month"] = [x[1] for x in df_avg.index]

df_sum = wt1417.groupby(["Year","Month"]).sum()
df_sum["Year"] = [x[0] for x in df_sum.index]
df_sum["Month"] = [x[1] for x in df_sum.index]

def printPrecipInYear(n):
    monthCase = go.Bar(x = df_sum[df_sum["Year"]==n]["Month"],
                       y = df_sum[df_sum["Year"]==n]["Number of Collisions"],
                       text = df_sum[df_sum["Year"]==n]["Number of Collisions"],
                       textposition = 'auto',
                       opacity = 0.6,
                       name = 'Number of Collisions')
    total_precip = go.Scatter(x = df_sum[df_sum["Year"]==n]["Month"],
                         y = df_sum[df_sum["Year"]==n]["Total Precip (mm)"],
                         mode = 'lines',
                         name = 'Monthly Total Precip',
                         xaxis = 'x2', 
                         yaxis = 'y2')
    fig = dict(data = [monthCase, total_precip], 
               layout = go.Layout(title = "Precip Analysis (" + str(n) + ")",
                                  xaxis = dict(title='Month', dtick=1),
                                  yaxis = dict(title='# of Collisions', dtick=500, domain = [0,0.5]),
                                  yaxis2 = dict(title='Total Precip (mm)', domain = [0.55,1.0]),
                                  xaxis2 = dict(dtick=1, anchor = 'y2')
                                 )
              )
    iplot(fig)
    
for m in range(2014,2018):
    printPrecipInYear(m)

**Brief Conclusion:**
* No clear relationship between monthly precipitation and collisions

### 2.4 The day in a year with the largest number of collisions

**In 2014:**

In [14]:
maxColl2014 = wt1417[wt1417["Year"]==2014].sort_values(["Number of Collisions"], ascending=[0])
maxColl2014.iloc[[0]]

Unnamed: 0,Date,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Total Rain (mm),Total Snow (cm),Total Precip (mm),Snow on Grnd (cm),Number of Collisions,Year,Month,Day in month,Holiday,Type of holiday,Day_Of_Week
2,2014-01-03,-21.0,-28.1,-24.6,0.0,0.3,0.0,29,151,2014,1,3,the workday after holidays,postHoliday,T5_Fri


**In 2015:**

In [15]:
maxColl2015 = wt1417[wt1417["Year"]==2015].sort_values(["Number of Collisions"], ascending=[0])
maxColl2015.iloc[[0]]

Unnamed: 0,Date,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Total Rain (mm),Total Snow (cm),Total Precip (mm),Snow on Grnd (cm),Number of Collisions,Year,Month,Day in month,Holiday,Type of holiday,Day_Of_Week
397,2015-02-02,-14.3,-20.4,-17.4,0.0,3.0,1.5,14,160,2015,2,2,,,T1_Mon


**In 2016:**

In [16]:
maxColl2016 = wt1417[wt1417["Year"]==2016].sort_values(["Number of Collisions"], ascending=[0])
maxColl2016.iloc[[0]]

Unnamed: 0,Date,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Total Rain (mm),Total Snow (cm),Total Precip (mm),Snow on Grnd (cm),Number of Collisions,Year,Month,Day in month,Holiday,Type of holiday,Day_Of_Week
777,2016-02-17,0.1,-11.2,-5.6,0.0,0.4,0.4,40,152,2016,2,17,,,T3_Wed


**In 2017:**

In [17]:
maxColl2017 = wt1417[wt1417["Year"]==2017].sort_values(["Number of Collisions"], ascending=[0])
maxColl2017.iloc[[0]]

Unnamed: 0,Date,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Total Rain (mm),Total Snow (cm),Total Precip (mm),Snow on Grnd (cm),Number of Collisions,Year,Month,Day in month,Holiday,Type of holiday,Day_Of_Week
1178,2017-03-24,1.0,-4.0,-1.5,6.0,7.0,12.6,17,134,2017,3,24,,,T5_Fri


**Brief Conclusion:**
* The day in a year with the largest number of collisions are in winter or close to winter. 
* low temperature, and with snow

**Hypothesis:**
* It still supports the hypothesis about weather: low temperature and snowy weather leads to collisions

### 2.5 The location of collisions in Ottawa (on the date with the largest number of collisions)

In [18]:
dayOf2014 = pd.to_datetime('2014/01/03', format = '%Y/%m/%d')
dayOf2015 = pd.to_datetime('2015/02/02', format = '%Y/%m/%d')
dayOf2016 = pd.to_datetime('2016/02/17', format = '%Y/%m/%d')
dayOf2017 = pd.to_datetime('2017/03/24', format = '%Y/%m/%d')

def mapOfMaxCollision(dayOf):
    df_day = df_csv1417[df_csv1417["Date"] == dayOf]
    lat_lons = [[row['latitude'], row['longitude']] for index, row in df_day.iterrows()]
    lats = [lat for [lat, lon] in lat_lons]
    lons = [lon for [lat, lon] in lat_lons]

    center = [((np.min(lat) + np.max(lat))/2), ((np.min(lon) + np.max(lon))/2)]

    map = folium.Map(location=center,
                        zoom_start=10,
                        tiles='OpenStreetMap'
                       )

    plugin = folium.plugins.MarkerCluster(locations = lat_lons)
    plugin.add_to(map)
    print(str(dayOf.year) + "/" + str(dayOf.month) + "/" + str(dayOf.day) + ":")
    display(map)

for dt in [dayOf2014, dayOf2015, dayOf2016, dayOf2017]:
    mapOfMaxCollision(dt)

2014/1/3:


2015/2/2:


2016/2/17:


2017/3/24:


### 2.6 2017 Canada 150th anniversary (2017-07-01) 

In [19]:
wt1417[wt1417['Holiday'] == 'Canada Day']

Unnamed: 0,Date,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Total Rain (mm),Total Snow (cm),Total Precip (mm),Snow on Grnd (cm),Number of Collisions,Year,Month,Day in month,Holiday,Type of holiday,Day_Of_Week
181,2014-07-01,31.0,22.0,26.5,0.2,0.0,0.2,0,25,2014,7,1,Canada Day,onHoliday,T2_Tue
546,2015-07-01,22.7,16.7,19.7,16.6,0.0,16.6,0,20,2015,7,1,Canada Day,onHoliday,T3_Wed
912,2016-07-01,27.0,12.0,19.5,12.6,0.0,12.6,0,14,2016,7,1,Canada Day,onHoliday,T5_Fri
1277,2017-07-01,24.5,19.0,21.8,14.6,0.0,14.6,0,28,2017,7,1,Canada Day,onHoliday,T6_Sat


In [20]:
Canada2014 = pd.to_datetime('2014/07/01', format = '%Y/%m/%d')
Canada2015 = pd.to_datetime('2015/07/01', format = '%Y/%m/%d')
Canada2016 = pd.to_datetime('2016/07/01', format = '%Y/%m/%d')
Canada2017 = pd.to_datetime('2017/07/03', format = '%Y/%m/%d')

for cd in [Canada2014, Canada2015, Canada2016, Canada2017]:
    mapOfMaxCollision(cd)

2014/7/1:


2015/7/1:


2016/7/1:


2017/7/3:


### Brief Conclusion: 
* The distribution and number of collisions beside the parliament hill in 2017 is small compared to the values in 2014 and 2015. 

### Hypothesis: 
* Even though the predction is that the 150th anniversary attracts more people to the place of celebration(most of them are beside the parliament hill), the government did more traffic control on Canada Day in 2017. 

## 3. Do analysis about those combinations of variables

### 3.1 Correlation between independent variables

In [21]:
df_vars = df_csv1417[['Environment', 'Road_Surface', 'Traffic_Control', 'Collision_Location', 
                      'Light', 'Collision_Classification', 'Impact_type', 'Year', 'Month', 'Day_Of_Week']]

df_corr = df_vars.apply(lambda x : pd.factorize(x)[0]).corr(method='pearson', min_periods=1)
df_corr

Unnamed: 0,Environment,Road_Surface,Traffic_Control,Collision_Location,Light,Collision_Classification,Impact_type,Year,Month,Day_Of_Week
Environment,1.0,0.407237,0.033659,0.023043,-0.013634,0.032732,-0.065814,0.028877,0.042192,0.025752
Road_Surface,0.407237,1.0,0.052701,0.028649,0.006519,0.062297,-0.066913,0.027234,0.002951,0.026177
Traffic_Control,0.033659,0.052701,1.0,-0.143711,0.026917,0.033407,-0.179868,0.006068,-0.000393,-0.005413
Collision_Location,0.023043,0.028649,-0.143711,1.0,0.042738,0.116057,0.477705,0.00439,-0.002106,0.005891
Light,-0.013634,0.006519,0.026917,0.042738,1.0,0.041628,0.061463,0.016489,-0.009449,0.015724
Collision_Classification,0.032732,0.062297,0.033407,0.116057,0.041628,1.0,0.081291,-0.012735,0.011433,0.002105
Impact_type,-0.065814,-0.066913,-0.179868,0.477705,0.061463,0.081291,1.0,0.00178,-0.012958,0.023057
Year,0.028877,0.027234,0.006068,0.00439,0.016489,-0.012735,0.00178,1.0,0.00746,0.006355
Month,0.042192,0.002951,-0.000393,-0.002106,-0.009449,0.011433,-0.012958,0.00746,1.0,-0.009906
Day_Of_Week,0.025752,0.026177,-0.005413,0.005891,0.015724,0.002105,0.023057,0.006355,-0.009906,1.0


In [22]:
df_inLongList = df_corr.stack(dropna = False).reset_index(name='Correlation')
trace = go.Heatmap(x = df_inLongList['level_0'], y = df_inLongList['level_1'], z = df_inLongList['Correlation'])
iplot([trace])

**Brief Conclusion:**
* Independent variables which are related to correlation might leave some effects on each other if one of them changes. For example, rainy environment -> wet road surface. 
* Then, sometimes it's better to analyze the relationship between the dependent variable and the group of correlated variables together. 
* According to the pattern of the heatmap, there is a slight correlation between 2 groups: **Environment** and **Road_Surface**, **Collision_Location** and **Impact_type**. 

In [23]:
def corrAnalysis(df, var1, var2):
    tmpdf = pd.crosstab(index = df[var1], 
                        columns = df[var2], 
                        dropna = False).stack(dropna = False).reset_index(name='# of Cases')
    trace = go.Heatmap(x = tmpdf[var1], y = tmpdf[var2], z = tmpdf['# of Cases'])
    iplot([trace])

### 3.1.1 Environment and Road_Surface

In [24]:
corrAnalysis(df_csv1417,'Environment','Road_Surface')

### Brief Conclusion: 
* Even though most of cases happens in the combination of dry and clear situation, but since the most common weather in Ottawa is dry and clear, this result is not so representative.

### 3.1.2 Collision_Location and Impact_type

In [25]:
corrAnalysis(df_csv1417,'Collision_Location','Impact_type')

In [26]:
df_locationType = pd.crosstab(index = df_csv1417['Collision_Location'], 
                  columns = df_csv1417['Impact_type'], 
                  dropna = False).stack(dropna = False).reset_index(name='# of Cases').sort_values(['# of Cases'], ascending=[0])

caseNum = df_locationType['# of Cases'].sum()
df_locationType['Proportion of cases(%)'] = ((df_locationType['# of Cases']/caseNum)*100.0).apply(lambda x: format(x, '.2f'))
df_locationType.head(5)

Unnamed: 0,Collision_Location,Impact_type,# of Cases,Proportion of cases(%)
10,02 - Intersection related,03 - Rear end,11825,20.27
6,01 - Non intersection,07 - SMV other,6612,11.34
2,01 - Non intersection,03 - Rear end,6258,10.73
17,03 - At intersection,02 - Angle,4944,8.48
20,03 - At intersection,05 - Turning movement,4295,7.36


### Brief Conclusion: 
* From the heatmap, those situations have more cases of collisions compared with other pairs:
* 1. Intersection related & Rear end
* 2. Non intersection & Rear end
* 3. Non intersection & SMV other
* 4. At intersection & Angle
* 5. At intersection & Turning movement

### Hypothesis:
* If a driver face to these situations:  
* 1. roads related to intersection, when traffic light changes the signal, might lead to an impact because of fault when enter or exit the intersection
* 2. roads like stright lines, might lead to a driver unable to brake on time and cause a rear end or SMV other impact
* 3. roads at intersection, a driver might ignore the traffic light or make a mistake at the point of turning

In [27]:
df_locationType

Unnamed: 0,Collision_Location,Impact_type,# of Cases,Proportion of cases(%)
10,02 - Intersection related,03 - Rear end,11825,20.27
6,01 - Non intersection,07 - SMV other,6612,11.34
2,01 - Non intersection,03 - Rear end,6258,10.73
17,03 - At intersection,02 - Angle,4944,8.48
20,03 - At intersection,05 - Turning movement,4295,7.36
3,01 - Non intersection,04 - Sideswipe,3829,6.56
5,01 - Non intersection,06 - SMV unattended vehicle,3577,6.13
11,02 - Intersection related,04 - Sideswipe,2911,4.99
25,04 - At/near private drive,02 - Angle,2072,3.55
22,03 - At intersection,07 - SMV other,1369,2.35


### 3.2 Fatal Injury

In [28]:
df_fatal = df_csv1417[df_csv1417['Collision_Classification'] == '01 - Fatal injury']

df_fatal.head(3)

Unnamed: 0,Record,Location,Date,Time,Environment,Road_Surface,Traffic_Control,Collision_Location,Light,Collision_Classification,Impact_type,longitude,latitude,Year,Month,Day_Of_Week,Holiday
0,2014000001,RIDEAU ST @ WALLER ST,2014-02-21,06:07:00,02 - Rain,02 - Wet,01 - Traffic signal,03 - At intersection,07 - Dark,01 - Fatal injury,07 - SMV other,-75.688726,45.427533,2014,2,T5_Fri,
1,2014000002,HINES RD btwn INNOVATION DR & SOLANDT RD,2014-08-02,13:55:00,01 - Clear,01 - Dry,10 - No control,04 - At/near private drive,01 - Daylight,01 - Fatal injury,02 - Angle,-75.921033,45.343152,2014,8,T6_Sat,weekend close to holiday
2,2014000003,LOGGERS WAY btwn KINGDON MINE RD & GALETTA SID...,2014-06-20,23:15:00,01 - Clear,01 - Dry,10 - No control,01 - Non intersection,07 - Dark,01 - Fatal injury,07 - SMV other,-76.247045,45.438627,2014,6,T5_Fri,


In [29]:
df_fatal.tail(3)

Unnamed: 0,Record,Location,Date,Time,Environment,Road_Surface,Traffic_Control,Collision_Location,Light,Collision_Classification,Impact_type,longitude,latitude,Year,Month,Day_Of_Week,Holiday
55836,2017011893,REGIONAL ROAD 174 btwn QUIGLEY HILL RD & REGIO...,2017-12-16,22:20:00,01 - Clear,01 - Dry,10 - No control,01 - Non intersection,07 - Dark,01 - Fatal injury,01 - Approaching,-75.475746,45.502894,2017,12,T6_Sat,
56749,2017012806,SNAKE ISLAND RD btwn 5TH LINE RD & GREY'S CREE...,2017-06-04,19:39:00,01 - Clear,01 - Dry,10 - No control,01 - Non intersection,01 - Daylight,01 - Fatal injury,01 - Approaching,-75.522968,45.214922,2017,6,T7_Sun,
58081,2017014138,WEST HUNT CLUB RD WB btwn CEDARVIEW RD & GREEN...,2017-12-22,15:44:00,01 - Clear,05 - Packed snow,10 - No control,01 - Non intersection,01 - Daylight,01 - Fatal injury,01 - Approaching,-75.778142,45.321768,2017,12,T5_Fri,the workday before holidays


In [30]:
caseFtl = len(df_fatal)

def displayOfFatal(var):
    df_ftl = pd.crosstab(index = df_fatal[var],
                        columns = 'count',
                        dropna = False)
    df_ftl['Proportion(%)'] = (df_ftl['count']/caseFtl) * 100
    df_all = pd.crosstab(index = df_csv1417[var],
                            columns = 'count',
                            dropna = False)
    df_all['Proportion(%)'] = (df_all['count']/caseNum) * 100
    fatalPercentage = go.Bar(x = df_ftl.index, 
                             y = df_ftl['Proportion(%)'],
                           text = str(df_ftl['Proportion(%)']) + '%',
                           textposition = 'auto',
                           opacity=0.6, 
                           name = 'Proportion of fatal collisions')
    totalPercentage = go.Bar(x = df_all.index, 
                             y = df_all['Proportion(%)'],
                           text = str(df_all['Proportion(%)']) + '%',
                           textposition = 'auto',
                           opacity=0.6, 
                           name = 'Proportion of all collisions')

    fig = dict(data = [fatalPercentage, totalPercentage], 
               layout = go.Layout(title = var, yaxis=dict(title='Proportion(%)')))
    iplot(fig)
    return df_ftl['Proportion(%)'].corr(df_all['Proportion(%)'])*100
    
ft_sim = pd.DataFrame(['Similarities(%)'])
ft_sim.index.name = 'Variables'

for k in ["Environment", 'Road_Surface', 'Traffic_Control', 
                  'Collision_Location', 'Light', 'Impact_type', 'Day_Of_Week']: 
    ft_sim.loc[k] = displayOfFatal(k)
    
ft_sim

Unnamed: 0_level_0,0
Variables,Unnamed: 1_level_1
0,Similarities(%)
Environment,99.4876
Road_Surface,99.3124
Traffic_Control,95.4215
Collision_Location,80.3912
Light,95.3727
Impact_type,4.39649
Day_Of_Week,13.5305


### Brief Conclusion: 
* The similarities between fatal collisions and all collisions divided by different variables are high, except **Impact_type** and **Day_Of_Week**. 
* Lower similarity implies larger difference. 

### Hypothesis:
* **Impact_type** and **Day_Of_Week** are highly related to fatal collisions. 

In [31]:
ftl_impact = pd.crosstab(index=df_fatal['Day_Of_Week'], 
                         columns=df_fatal['Impact_type'], 
                         dropna=False).stack(dropna = False).reset_index(name='# of Cases')
ftl_impact

Unnamed: 0,Day_Of_Week,Impact_type,# of Cases
0,T1_Mon,01 - Approaching,1
1,T1_Mon,02 - Angle,1
2,T1_Mon,03 - Rear end,0
3,T1_Mon,04 - Sideswipe,2
4,T1_Mon,05 - Turning movement,1
5,T1_Mon,06 - SMV unattended vehicle,0
6,T1_Mon,07 - SMV other,6
7,T1_Mon,99 - Other,0
8,T2_Tue,01 - Approaching,1
9,T2_Tue,02 - Angle,1


In [32]:
trace = go.Heatmap(x = ftl_impact['Day_Of_Week'], y = ftl_impact['Impact_type'], z = ftl_impact['# of Cases'])
iplot([trace])

### Brief Conclusion: 
* Most of fatal collisions are related to SMV other collisions, and the proportion of this kind of accidents increases on Friday and Sunday. 

## 4. To predict the number of collisions

* Firstly, use the dataset from 2014 to 2016 to predict 2017 and find out the best variable set. 
* Then, set up a model with those variables. 

In [85]:
def graphModel(xc, yc, model_LR):
    X_test = np.asarray(df_test[xc])
    y_true = np.asarray(df_test[yc])
    
    y_predict = model_LR.predict(X_test)
    y_residual = y_predict - y_true
    
    print('R^2 = %.4f' % (Rsq(y_true, y_predict)))

    mse = sqrt(MSE(y_true, y_predict))
    print('variance = root of MSE = %.4f' % mse)

    hitrate = sum(abs(y_residual)<mse)*100.0/len(y_residual)
    print('%.4f%% of residuals are in the range of variance' % hitrate)

    trace0 = go.Scatter(x = df_test['Date'], y = y_true, mode = 'lines', name = 'Actual number of daily collisions', yaxis = 'y1')
    trace1 = go.Scatter(x = df_test['Date'], y = y_predict, mode = 'lines', name = 'Predicted number of daily collisions', yaxis = 'y1')
    trace2 = go.Scatter(x = df_test['Date'], y = y_residual, mode = 'lines', name = 'Residual', yaxis = 'y2')
    trace3 = go.Scatter(x = [df_test.Date.min(), df_test.Date.max()], y = [mse, mse], mode = 'lines', name = 'variance(+)', yaxis = 'y2')
    trace4 = go.Scatter(x = [df_test.Date.min(), df_test.Date.max()], y = [-mse, -mse], mode = 'lines', name = 'variance(-)', yaxis = 'y2')

    data = [trace0, trace1, trace2, trace3, trace4]
    layout = dict(title = 'To predict the number of collisions in 2017 from old data', showlegend = True, 
                  yaxis = dict(title='# of collisions', domain=[.4, 1.0] ),
                  yaxis2 = dict(title=('hitrate = %.2f%%' % hitrate), domain=[0, .4] ),
                 )
    fig = dict(data=data, layout=layout)
    iplot(fig)

def makeModel(xc, yc, toGraph):
    X_train = np.asarray(df_train[xc])
    y_train = np.asarray(df_train[yc])
    
    model_LR = linear_model.LinearRegression()
    model_LR.fit(X_train, y_train)
    if(toGraph):
        graphModel(xc, yc, model_LR)
    return model_LR

def printModel(model):
    print("y = " + str(model.intercept_))
    n=1
    for l in model.coef_.tolist():
        if l<0:
            print("    - %f * %s" % (-l, "x"+str(n)))
        else:
            print("    + %f * %s" % (l, "x"+str(n)))
        n += 1

### 4.1 If the weather statistics information about yesterday could be organized and used today

In [86]:
for col in wt1417['Day_Of_Week'].tolist():
    wt1417[col] = wt1417['Day_Of_Week'].apply(lambda x : 1 if x==col else 0)
        
for col in ['Max Temp (°C)', 'Min Temp (°C)', 'Total Rain (mm)', 'Total Snow (cm)', 'Number of Collisions']:
    previous_col = 'previous_'+col
    wt1417.loc[1: ,previous_col] = wt1417.loc[ :wt1417.shape[0]-2, col].tolist()

for col in wt1417['Type of holiday'].tolist():
    wt1417[col] = wt1417['Type of holiday'].apply(lambda x : 1 if x==col else 0)
    
# learn from https://en.wikipedia.org/wiki/Snow_emergency
wt1417['Snow Level'] = wt1417['Total Snow (cm)'].apply(lambda x : 0 if x<5.08 else 1 if ((x>=5.08) & (x<10.16))
                                                      else 2 if ((x>=10.16) & (x<20.32)) else 3)

In [87]:
df_train = wt1417[wt1417.Year < 2017][1:]
df_test = wt1417[wt1417.Year == 2017]

X_column = ['previous_Min Temp (°C)', 'previous_Total Rain (mm)', 'previous_Total Snow (cm)', 
            'T1_Mon', 'T2_Tue', 'T3_Wed', 'T4_Thu', 'T5_Fri', 'T6_Sat', 'T7_Sun', 'onHoliday', 'postHoliday',  
            'previous_Number of Collisions', 'Snow Level']
y_column = 'Number of Collisions'

model_LR = makeModel(X_column,y_column,True)

R^2 = 0.5060
variance = root of MSE = 11.5673
75.0685% of residuals are in the range of variance


### Brief Conclusion: 
* 50.60% of relations could be explained by the model, and the variance is about 12 cases, while 75.0685% of residuals are lie in the boundary of variance. 
* The R^2 is more close to 1 than 0, and it shows that there is a linear relationship between the number of collisions and independent variables, so that this model is appliable to predict the number of collisions today if the related department can get the weather information about yesterday. 

In [88]:
df_train = wt1417[wt1417.Year <= 2017][1:]
model_LR = makeModel(X_column,y_column,False)

### This model is: 

In [89]:
printModel(model_LR)

y = 28.99569759128727
    - 0.268258 * x1
    + 0.044919 * x2
    + 1.181620 * x3
    + 3.884763 * x4
    + 2.846614 * x5
    + 2.997672 * x6
    + 6.040065 * x7
    + 7.389183 * x8
    - 10.284338 * x9
    - 12.873959 * x10
    - 20.578066 * x11
    + 10.529100 * x12
    + 0.266512 * x13
    + 12.700606 * x14


### Where:
* y = the number of collisions
* x1 = minimum temperature of the previous date (in °C)
* x2 = rain amount of the previous date (in mm)
* x3 = snow amount of the previous date (in cm)
* x4 = it is on Monday (0 or 1)
* x5 = it is on Tuesday (0 or 1)
* x6 = it is on Wednesday (0 or 1)
* x7 = it is on Thursday (0 or 1)
* x8 = it is on Friday (0 or 1)
* x9 = it is on Saturday (0 or 1)
* x10 = it is on Sunday (0 or 1)
* x11= it is on a holiday (0 or 1)
* x12= it is the workday after holiday (0 or 1)
* x13= the number of collisions on yesterday
* x14= snow level (a whole number from 0 to 3 depend on snow emergency level)

### 4.2 If the weather statistics information about yesterday could be organized and used tomorrow

In [90]:
for col in ['Max Temp (°C)', 'Min Temp (°C)', 'Total Rain (mm)', 'Total Snow (cm)', 'Number of Collisions']:
    previous2_col = 'previous2_'+col
    wt1417.loc[2: , previous2_col] = wt1417.loc[ :wt1417.shape[0]-3, col].tolist()

In [91]:
df_train = wt1417[wt1417.Year < 2017][2:]
df_test = wt1417[wt1417.Year == 2017]

X_column2 = ['previous2_Min Temp (°C)', 'previous2_Total Rain (mm)', 'previous2_Total Snow (cm)', 
            'T1_Mon', 'T2_Tue', 'T3_Wed', 'T4_Thu', 'T5_Fri', 'T6_Sat', 'T7_Sun', 'onHoliday', 'postHoliday',  
            'previous2_Number of Collisions', 'Snow Level']
y_column2 = 'Number of Collisions'

model_LR2 = makeModel(X_column2,y_column2,True)

R^2 = 0.4321
variance = root of MSE = 12.4026
74.5205% of residuals are in the range of variance


### Brief Conclusion: 
* 43.21% of relations could be explained by the model, and the variance is about 13 cases, while 74.5205% of residuals are lie in the boundary of variance. 
* Even though the R^2 shows that the linear relationship between the number of collisions and independent variables are not so strong, it's still an available model if the transmission of weather information between departments is not so fast. 

In [92]:
df_train = wt1417[wt1417.Year <= 2017][2:]
model_LR2 = makeModel(X_column2,y_column2,False)

### This model is: 

In [93]:
printModel(model_LR2)

y = 31.640682686884762
    - 0.316775 * x1
    - 0.082083 * x2
    + 0.259450 * x3
    + 1.725832 * x4
    + 5.874799 * x5
    + 4.980542 * x6
    + 5.954674 * x7
    + 8.096943 * x8
    - 10.027913 * x9
    - 16.604876 * x10
    - 21.314824 * x11
    + 6.793552 * x12
    + 0.224976 * x13
    + 12.104805 * x14


### Where:
* y = the number of collisions
* x1 = minimum temperature of 2 days ago (in °C)
* x2 = rain amount of 2 days ago (in mm)
* x3 = snow amount of 2 days ago  (in cm)
* x4 = it is on Monday (0 or 1)
* x5 = it is on Tuesday (0 or 1)
* x6 = it is on Wednesday (0 or 1)
* x7 = it is on Thursday (0 or 1)
* x8 = it is on Friday (0 or 1)
* x9= it is on Saturday (0 or 1)
* x10= it is on Sunday (0 or 1)
* x11= it is on a holiday (0 or 1)
* x12= it is the workday after holiday (0 or 1)
* x13= the number of collisions on yesterday
* x14= snow level (a whole number from 0 to 3 depend on snow emergency level)