In [None]:
from random import randint
from csv import writer
from random import randint
import time

import numpy as np
import pandas as pd
from fastai.tabular.all import *
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

First, import pandas and numpy to help us work with the collision data CSV file. Then import fastai and sklearn for the decision tree and random forest machine learning models.

In [None]:
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Load the raw collision records.
raw_collisions = pd.read_csv("/home/alec/Desktop/code/personal_projects/safe-path-finder/data/raw_collisions.csv", low_memory=False)

# Every row in this file is a recorded collision, so the flag is 1 for all of
# them.  Assigning a scalar fills the whole column whatever the row count is;
# the previous hard-coded list of 242131 ones raised ValueError as soon as the
# file had a different number of rows.
raw_collisions["accident"] = 1

# Expand INCDATE into the INCDATEYear / INCDATEMonth / ... columns that the
# column selection in the next cell relies on.
add_datepart(raw_collisions, 'INCDATE')


## Data Preprocessing

Here we read our collisions data file into the program as a DataFrame. We set the pandas option to display the maximum number of columns so pandas doesn't hide any of the columns when we display the DataFrame. Passing `low_memory=False` makes pandas parse the file in a single pass rather than in chunks, which avoids columns being inferred with mixed types. Additionally, we add a column for our dependent variable, accident, which records whether or not there was an accident. Some important columns to look at for the machine learning model are location: ('X', 'Y'), time: 'INCDATE', weather: 'WEATHER', and light conditions: 'LIGHTCOND'. Finally, we'll use the fastai function add_datepart to split up our date column, changing the data from categorical to numerical.

In [None]:
# Columns kept for the first iteration of the model, grouped by meaning.
column_names = [
    # location
    'X', 'Y',
    # label
    'SEVERITYCODE',
    # conditions at the time of the collision
    'WEATHER', 'LIGHTCOND',
    # date parts derived from INCDATE
    'INCDATEYear', 'INCDATEMonth', 'INCDATEWeek',
    'INCDATEDay', 'INCDATEDayofweek', 'INCDATEDayofyear',
]

# Work on an independent copy so later edits never touch raw_collisions.
collisions = raw_collisions.loc[:, column_names].copy()

# Persist the trimmed data set; the row index is not written to the file.
collisions.to_csv(
    "/home/alec/Desktop/code/personal_projects/safe-path-finder/data/collisions.csv",
    index=False,
)

As this is the first iteration of the machine learning model, I will drop the columns that are not significant factors. Note that it's difficult to decide whether or not a factor is significant. However, for simplicity, the columns I will keep are: the location in coordinates, the severity code, the date columns, the weather, and the light conditions. Finally, we save the edited collisions data to a csv file.

In [None]:
# Generate synthetic "no accident" rows and stream them to new_collisions.csv.
#
# Each generated row copies one randomly chosen collision, takes its X/Y from
# a second randomly chosen collision, and is labelled SEVERITYCODE 0.  A
# candidate whose features (every column except SEVERITYCODE) match a recorded
# collision is discarded, so a negative example never coincides with a real
# accident.

# Pull the rows out of the DataFrame once; indexing a list of tuples is far
# cheaper than calling collisions.iloc[...] on every iteration.
records = list(collisions[column_names].itertuples(index=False, name=None))

x_pos = column_names.index('X')
y_pos = column_names.index('Y')
severity_pos = column_names.index('SEVERITYCODE')
feature_positions = [i for i, name in enumerate(column_names) if name != 'SEVERITYCODE']

# Feature tuples of the recorded collisions, for O(1) membership checks.
recorded_features = {tuple(rec[i] for i in feature_positions) for rec in records}

# Three negatives per recorded collision, derived from the data instead of a
# hard-coded row count.
target = len(records) * 3
# Upper bound on random draws so degenerate data (e.g. every draw colliding
# with a recorded row) cannot make the loop spin forever.
max_attempts = target * 20

num = 0        # rows written so far
attempts = 0   # random draws made so far
start = time.perf_counter()

# The loop must run inside the `with` block: the file has to stay open while
# rows are written.  newline='' is what the csv module expects for its files.
with open('/home/alec/Desktop/code/personal_projects/safe-path-finder/data/new_collisions.csv', 'w', newline='') as csv_file:
    csv_writer = writer(csv_file)
    csv_writer.writerow(column_names)

    while num < target and attempts < max_attempts:
        attempts += 1

        # Base row from one random collision, coordinates from another.
        row = list(records[randint(0, len(records) - 1)])
        donor = records[randint(0, len(records) - 1)]
        row[x_pos], row[y_pos] = donor[x_pos], donor[y_pos]
        row[severity_pos] = 0

        # Skip candidates that describe a collision that actually happened.
        if tuple(row[i] for i in feature_positions) in recorded_features:
            continue

        csv_writer.writerow(row)
        num += 1

        # Progress report: percentage done and seconds since the last report.
        if num % 10000 == 0:
            end = time.perf_counter()
            print(num/target*100, '%', end - start)
            start = end


This code generates random "no accident" data. It takes about an hour to run and saves the random data to a new csv file.
The algorithm is inspired by a description of generating data in this article: https://medium.com/geoai/using-machine-learning-to-predict-car-accident-risk-4d92c91a7d57

- Pick a random data row by drawing a random number in the range of rows and then accessing that row.
- Create a copy of the row and modify the coordinates of the copy.
- Check whether the newly generated row is already in the records.
- If it is not, add it to the dataset.
- Repeat these steps 3 times the number of records in the current data set.


In [None]:
# Positive examples: every recorded collision is labelled SEVERITYCODE 1.
collisions = pd.read_csv(
    "/home/alec/Desktop/code/personal_projects/safe-path-finder/data/collisions.csv",
    low_memory=False,
)
print(collisions["WEATHER"].unique())
collisions["SEVERITYCODE"] = 1

# Negative examples: the generated rows, which carry SEVERITYCODE 0.
no_collision = pd.read_csv(
    "/home/alec/Desktop/code/personal_projects/safe-path-finder/data/new_collisions.csv",
    low_memory=False,
)
print(no_collision.head())

# Stack both sets into one frame, then shuffle the rows.
combined = pd.concat([collisions, no_collision], ignore_index=True).sample(frac=1)

# Target column, feature split, and the fastai preprocessing steps.
dep_var = 'SEVERITYCODE'
continuous, categorical = cont_cat_split(combined, 1, dep_var=dep_var)
procs = (Categorify, FillMissing)

# Time-based split: rows from before 2019 train the model, the rest validate it.
# The indices are positions in the shuffled frame.
time_condition = combined.INCDATEYear < 2019
train_idx = np.flatnonzero(time_condition)
valid_idx = np.flatnonzero(~time_condition)
splits = (list(train_idx), list(valid_idx))

to = TabularPandas(
    combined,
    procs=procs,
    cat_names=categorical,
    cont_names=continuous,
    y_names=dep_var,
    splits=splits,
)

# Inspect the encoded LIGHTCOND values and the class list they map to.
print(to["LIGHTCOND"].unique())
to.classes["LIGHTCOND"]

I read the collision data and set the severity code of every recorded collision to 1, as 0 will indicate no accident. I read the no-collision data and combine the two datasets into a single dataframe. I define the dependent variable, which is the severity code. Categorify and FillMissing are TabularProcs. These transform data in place by replacing categorical data with numerical data and replacing missing values with the median of the column, respectively. FillMissing also adds a boolean column to indicate replaced rows. Then, I define the splits for the training set and validation set. The training set will use data from before 2019 and the validation set will use data from 2019 onward. This will replicate the real use of the model: predicting accidents in the future. Finally, I create the TabularPandas object with the previously defined variables.

In [None]:
# TODO: edit all "dark, street lights on" data in the no-accident set to just
# dark / street lights unknown.

# Features and target for the training and validation splits.
train_split, valid_split = to.train, to.valid
indep_vars = train_split.xs
dep_vars = train_split.y
valid_indep_vars = valid_split.xs
valid_dep_vars = valid_split.y
print(indep_vars)

# Baseline model: a single regression tree with min_samples_leaf=20.
tree = DecisionTreeRegressor(min_samples_leaf=20).fit(indep_vars, dep_vars)

# Mean absolute error on the training set and on the validation set.
train_mae = mean_absolute_error(dep_vars, tree.predict(indep_vars))
valid_mae = mean_absolute_error(valid_dep_vars, tree.predict(valid_indep_vars))
train_mae, valid_mae

Here I define the dependent and independent variables for the training set and validation set. Then, I build a decision tree regressor from the dataset. After we fit the tree to the data, we use the mean absolute error metric to measure the accuracy of the model's predictions. The initial mean absolute error I got using a simple decision tree model was around 0.40 (40%).

In [None]:
# Hyperparameters for the random forest, collected in one place.
forest_params = dict(
    n_jobs=-1,
    n_estimators=50,
    max_samples=200000,
    max_features=0.5,
    min_samples_leaf=20,
    oob_score=True,
)
random_forest = RandomForestRegressor(**forest_params)
random_forest.fit(indep_vars, dep_vars)

# Mean absolute error on the training set and on the validation set.
forest_train_mae = mean_absolute_error(dep_vars, random_forest.predict(indep_vars))
forest_valid_mae = mean_absolute_error(valid_dep_vars, random_forest.predict(valid_indep_vars))
forest_train_mae, forest_valid_mae
# next up is out of bag error