In [1]:
import pandas as pd
import requests
from matplotlib import pyplot as plt
import numpy as np
import operator
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import catboost
from catboost import CatBoostRegressor

## Prepare data

In [2]:
df = pd.read_csv('/Users/alex/Github/nyc-crashes/src/data/crashes-20201220-215930.csv', low_memory=False)
df = df.dropna(subset=['ON STREET NAME'])
print(len(df))
df.head(3)

1391117


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
1,07/03/2020,8:00,,,40.74947,-73.7564,"(40.74947, -73.7564)",LONG ISLAND EXPRESSWAY,,,...,Unspecified,,,,4325419,Pick-up Truck,Sedan,,,
2,04/27/2020,10:00,QUEENS,11102.0,40.77677,-73.93453,"(40.77677, -73.93453)",26 AVENUE,2 STREET,,...,,,,,4310759,Station Wagon/Sport Utility Vehicle,,,,
3,07/01/2020,22:53,BROOKLYN,11203.0,40.64213,-73.92936,"(40.64213, -73.92936)",UTICA AVENUE,AVENUE D,,...,Unspecified,,,,4325642,Sedan,Sedan,,,


In [15]:
streets_file = "/Users/alex/Github/nyc-crashes/src/code/utils/streets_for_predictions.txt"

with open(streets_file) as f:
    valid_streets = f.read().split('\n')[:-1]

In [16]:
df_good = df[df['ON STREET NAME'].isin(valid_streets)]
df_good = df_good[['CRASH DATE', 'ON STREET NAME']]
df_good['CRASH DATE'] = pd.to_datetime(df_good['CRASH DATE'])
df_good.reset_index(drop=True, inplace=True)
df_good

Unnamed: 0,CRASH DATE,ON STREET NAME
0,2020-07-03,LONG ISLAND EXPRESSWAY
1,2020-04-27,26 AVENUE
2,2020-07-01,UTICA AVENUE
3,2020-05-29,CROSS BRONX EXPY
4,2020-06-16,HART STREET
...,...,...
1281940,2012-07-14,16 STREET
1281941,2012-07-02,EAST 115 STREET
1281942,2012-07-03,AVENUE J
1281943,2012-07-11,EAST FORDHAM ROAD


In [17]:
uniq_dates = sorted(set(df_good['CRASH DATE']))
date_to_id = {date: idd for idd, date in enumerate(uniq_dates)}
id_to_date = {idd: date for date, idd in date_to_id.items()}
len(id_to_date)

3090

In [18]:
street_to_id = {street: idd for idd, street in enumerate(valid_streets)}
id_to_street = {idd: street for street, idd in street_to_id.items()}
len(id_to_street)

1884

In [20]:
X = np.zeros((len(id_to_street), len(id_to_date)), dtype=np.uint32)
X.shape

(1884, 3090)

In [21]:
for row in tqdm(df_good.iterrows(), total=len(df_good)):
    info = row[1]
    date = info['CRASH DATE']
    street = info['ON STREET NAME']
    
    date_id = date_to_id[date]
    street_id = street_to_id[street]
    
    X[street_id, date_id] += 1

HBox(children=(FloatProgress(value=0.0, max=1281945.0), HTML(value='')))




In [30]:
# divide on batches of 31 days

num_batches = X.shape[1] // 31
days_covered = int(num_batches * 31)
num_batches, days_covered

(99, 3069)

In [31]:
X_cut = X[:, -3069:]
X_cut.shape

(1884, 3069)

In [32]:
# reshape for training

X_cut = X_cut.reshape((1884, 31, -1))
X_cut.shape

(1884, 31, 99)

## Train model

In [36]:
model = CatBoostRegressor(iterations=1500, depth=10, verbose=False)

for i in tqdm(range(num_batches)):
    data = X_cut[...,i] # get batch
    X_train = data[:, :-1] # take 30 days for training
    y_train = data[:, -1] # take 31th day as labels
    
    model.fit(X_train, y_train)


HBox(children=(FloatProgress(value=0.0, max=99.0), HTML(value='')))




In [43]:
model.save_model("catboost_predictor.dump")

## Load model and make predictions

In [38]:
model = CatBoostRegressor()
model.load_model("catboost_predictor.dump")

<catboost.core.CatBoostRegressor at 0x11c2fc550>

In [50]:
last_batch = X_cut[...,-1]
X_last, y_last = last_batch[:, :-1], last_batch[:, -1]

y_pred = model.predict(X_last)

In [51]:
#calc MAE

from sklearn.metrics import mean_absolute_error as mae

mae(y_last, y_pred)

0.02907166180475805

In [49]:
top_10_strets_inds = np.argsort(y_pred)[::-1][:10]
top_10_streets = [id_to_street[ind] for ind in top_10_strets_inds]
top_10_streets

['BELT PARKWAY                    ',
 'MAJOR DEEGAN EXPRESSWAY         ',
 'BROOKLYN QUEENS EXPRESSWAY      ',
 'VAN WYCK EXPWY                  ',
 'LONG ISLAND EXPRESSWAY          ',
 'NORTH CONDUIT AVENUE            ',
 'ATLANTIC AVENUE                 ',
 'FDR DRIVE                       ',
 'WHITESTONE EXPRESSWAY           ',
 'BROADWAY                        ']