In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from math import sqrt
import shap
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [77]:
def random_forest_forecast(train, X_test):
    y_train = train['num_crimes']
    train_dates = train['date']
    X_train = train.drop(columns=['num_crimes', 'date'])
    # fit model
    model = RandomForestRegressor(n_estimators=100)
    model.fit(X_train, y_train)
    # make a one-step prediction
    yhat = model.predict(pd.DataFrame(X_test))
    return yhat[0]


def walk_forward_validation(train_data, test_data):
    predictions = []

    train = train_data
    test = test_data

    history = train

    # step over each time-step in the test set
    for i in range(len(test)):
        # split test row into input and output columns
        y_test = test['num_crimes'].iloc[i]
        X_test = test.iloc[i]
        X_test = X_test.drop(['num_crimes', 'date'])
        X_test = pd.DataFrame(X_test).transpose()
        # fit model on history and make a prediction
        yhat = random_forest_forecast(history, X_test)
        # store forecast in list of predictions
        predictions.append(yhat)
        # add actual observation to history for the next loop
        X_test['num_crimes'] = yhat
        history = pd.concat([history, X_test])
        # summarize progress
        print('>expected=%.1f, predicted=%.1f' % (y_test, yhat))
    # estimate prediction error
    error = mean_squared_error(test['num_crimes'], predictions)
    return error, test['num_crimes'], predictions

In [78]:
day_data = pd.read_csv('../day_data.csv')
day_data['date'] = pd.to_datetime(day_data['date'], format="%Y-%m-%d")
day_data['holiday'] = day_data['holiday'].fillna('0')

onehot = OneHotEncoder(categories='auto', drop=None, sparse_output=False)
holidays = onehot.fit_transform(day_data[['holiday']])
holidays = pd.DataFrame(holidays, columns=onehot.categories_)

onehot = OneHotEncoder(categories='auto', drop='first', sparse_output=False)
wd = onehot.fit_transform(day_data[['wd']])
wd = pd.DataFrame(wd, columns=onehot.get_feature_names_out(['wd']))

day_data = pd.concat([day_data, holidays, wd], axis=1)
day_data = day_data.drop(columns=['holiday', ('0',), 'wd'])
day_data.columns = day_data.columns.astype(str)

cutoff_date = '2024-01-01'

In [79]:
train_data = day_data[day_data['date'] < cutoff_date]
train_data['x1'] = train_data['num_crimes'].shift(1)
train_data['x2'] = train_data['num_crimes'].shift(2)
train_data['x3'] = train_data['num_crimes'].shift(3)
train_data = train_data.ffill().bfill().ffill()
train_dates = train_data['date']

y_train = train_data['num_crimes']
X_train = train_data.drop(columns=['num_crimes', 'date'])


test_data = day_data[(day_data['date'] >= cutoff_date) & (day_data['date'] <= '2024-01-31')]
test_data['x1'] = test_data['num_crimes'].shift(1)
test_data['x2'] = test_data['num_crimes'].shift(2)
test_data['x3'] = test_data['num_crimes'].shift(3)
test_data = test_data.ffill().bfill().ffill()
y_test = test_data['num_crimes']
test_dates = test_data['date']
X_test = test_data.drop(columns=['num_crimes', 'date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['x1'] = train_data['num_crimes'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['x2'] = train_data['num_crimes'].shift(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['x3'] = train_data['num_crimes'].shift(3)
A value is trying to be set on a copy of a sli

In [80]:
mse, ytrue, yhat = walk_forward_validation(train_data, test_data)

In [58]:
# # step over each time-step in the test set
# for i in range(len(test)):
#     # split test row into input and output columns
#     y_test = test['num_crimes'].iloc[i]
#     X_test = test.iloc[i]
#     X_test = X_test.drop(['num_crimes', 'date'])
#     X_test = pd.DataFrame(X_test).transpose()
#     # fit model on history and make a prediction
#     yhat = random_forest_forecast(history, X_test)
#     # store forecast in list of predictions
#     predictions.append(yhat)
#     # add actual observation to history for the next loop
#     X_test['num_crimes'] = yhat
#     history = pd.concat([history, X_test])
#     # summarize progress
#     print('>expected=%.1f, predicted=%.1f' % (y_test, yhat))
# # estimate prediction error
# # error = mean_absolute_error(test[:, -1], predictions)

>expected=49.0, predicted=43.6
>expected=35.0, predicted=42.4
>expected=43.0, predicted=39.8
>expected=44.0, predicted=37.6
>expected=43.0, predicted=39.1
>expected=44.0, predicted=39.4
>expected=38.0, predicted=38.5
>expected=40.0, predicted=39.5
>expected=29.0, predicted=39.0
>expected=32.0, predicted=38.3
>expected=36.0, predicted=33.3
>expected=30.0, predicted=32.2
>expected=33.0, predicted=29.9
>expected=23.0, predicted=29.2
>expected=29.0, predicted=32.4
>expected=35.0, predicted=31.0
>expected=29.0, predicted=31.1
>expected=35.0, predicted=34.7
>expected=32.0, predicted=35.0
>expected=26.0, predicted=35.8
>expected=29.0, predicted=33.1
>expected=28.0, predicted=31.8
>expected=39.0, predicted=34.0
>expected=24.0, predicted=37.8
>expected=28.0, predicted=35.3
>expected=36.0, predicted=31.9
>expected=26.0, predicted=35.9
>expected=30.0, predicted=32.3
>expected=41.0, predicted=32.9
>expected=37.0, predicted=36.6
>expected=30.0, predicted=37.8
