In [12]:
import pandas as pd
import requests
from matplotlib import pyplot as plt
import numpy as np
import operator
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import catboost
from catboost import CatBoostRegressor

## Prepare data

In [2]:
df = pd.read_csv('../data/Motor_Vehicle_Collisions_-_Crashes.csv', low_memory=False)
df = df.dropna(subset=['ON STREET NAME'])
print(len(df))
df.head(3)

1389630


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,12/07/2020,19:27,BRONX,10475.0,40.87147,-73.83057,"(40.87147, -73.83057)",ALCOTT PLACE,ASCH LOOP,,...,,,,,4374449,Sedan,,,,
1,12/07/2020,6:00,QUEENS,11372.0,40.755264,-73.88817,"(40.755264, -73.88817)",NORTHERN BOULEVARD,79 STREET,,...,,,,,4374434,Taxi,,,,
2,12/06/2020,14:25,,,,,,VERRAZANO BRIDGE,,,...,Unspecified,,,,4374156,Sedan,Sedan,,,


In [3]:
df['CRASH DATE'].nunique()

3083

In [4]:
distrib = Counter(df['ON STREET NAME'])
print(len(distrib))
threshold = 100

valid_streets = set()

for street_name, num_crashes in distrib.items():
    if num_crashes > threshold:
        valid_streets.add(street_name)
len(valid_streets)

12031


1884

In [5]:
df_good = df[df['ON STREET NAME'].isin(valid_streets)]
df_good = df_good[['CRASH DATE', 'ON STREET NAME']]
df_good['CRASH DATE'] = pd.to_datetime(df_good['CRASH DATE'])
df_good.reset_index(drop=True, inplace=True)
df_good

Unnamed: 0,CRASH DATE,ON STREET NAME
0,2020-12-07,NORTHERN BOULEVARD
1,2020-12-06,VERRAZANO BRIDGE
2,2020-12-06,BROOKLYN BRIDGE
3,2020-12-06,BELT PARKWAY
4,2020-12-05,BELT PARKWAY
...,...,...
1280556,2012-07-10,TILLARY STREET
1280557,2012-07-09,FLATBUSH AVENUE
1280558,2012-07-06,7 AVENUE
1280559,2012-07-11,KINGS HIGHWAY


In [4]:
df_good = pd.read_csv('for_predictions.csv')

In [6]:
uniq_dates = sorted(set(df_good['CRASH DATE']))
date_to_id = {date: idd for idd, date in enumerate(uniq_dates)}
id_to_date = {idd: date for date, idd in date_to_id.items()}
len(id_to_date)

3083

In [7]:
street_distrib = dict(Counter(df_good['ON STREET NAME']))
street_distrib = sorted(street_distrib.items(), key=operator.itemgetter(1), reverse=True)

street_to_id = {street[0]: idd for idd, street in enumerate(street_distrib)}
id_to_street = {idd: street for street, idd in street_to_id.items()}
len(id_to_street)

1884

In [8]:
X = np.zeros((len(id_to_street), len(id_to_date)), dtype=np.uint32)
X.shape

(1884, 3083)

In [16]:
for row in tqdm(df_good.iterrows(), total=len(df_good)):
    info = row[1]
    date = info['CRASH DATE']
    street = info['ON STREET NAME']
    
    date_id = date_to_id[date]
    street_id = street_to_id[street]
    
    X[street_id, date_id] += 1

HBox(children=(FloatProgress(value=0.0, max=1280561.0), HTML(value='')))




In [17]:
3083 / 31

99.45161290322581

In [18]:
31*99

3069

In [19]:
# divide on batches of 31 days

X_r = X[:, -3069:]
X_r.shape

(1884, 3069)

In [20]:
X_r = X_r.reshape((1884, 31, -1))
X_r.shape

(1884, 31, 99)

## Train model

In [None]:
model = CatBoostRegressor(iterations=1500, depth=10, metric_period=500)

for i in tqdm(range(99)):
    data = X_r[...,i]
    X_train = data[:, :-1]
    y_train = data[:, -1]
    
    model.fit(X_train, y_train)


In [29]:
model.save_model("predictor.dump")

## Load model

In [15]:
model = CatBoostRegressor()
model.load_model("predictor.dump")

<catboost.core.CatBoostRegressor at 0x12ee23518>

In [25]:
d = X_r[...,-1]
x, y = d[:, :-1], d[:, -1]

y_pred = model.predict(x)

In [26]:
from sklearn.metrics import mean_absolute_error as mae

mae(y, y_pred)

0.026174722866557754