# This project is an example of multistep time series forecasting using a random forest regressor and tsfresh features

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from tsfresh import extract_relevant_features

## Load and prepare data:

In [None]:
# Read the semicolon-delimited CSV and give its seven columns short names.
df = pd.read_csv('data/data.csv', delimiter=';')
df.columns = ["date", "year", "month", "week", "day", "cnt", "average_sum"]

# "average_sum" arrives as strings using a decimal comma; swap the comma for
# a dot so each value can be parsed as a float.
df["average_sum"] = df["average_sum"].map(lambda text: float(text.replace(",", ".")))

# Index by date and order the rows by the calendar columns.
df = df.set_index("date").sort_values(["year", "month", "week", "day"])

In [None]:
def create_dataset(dataset, look_back=1):
    """Build lagged input windows and next-step targets from column 0.

    For every start row ``s`` in ``range(len(dataset) - look_back)`` the
    input is ``dataset[s:s + look_back, 0]`` and the target is
    ``dataset[s + look_back, 0]``.

    Args:
        dataset: Array supporting ``dataset[rows, column]`` indexing.
        look_back: Number of consecutive rows in each input window.

    Returns:
        Tuple ``(data_x, data_y)`` of NumPy arrays holding the windows and
        their matching targets.
    """
    n_windows = len(dataset) - look_back
    windows = [dataset[start:start + look_back, 0] for start in range(n_windows)]
    targets = [dataset[start + look_back, 0] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

## Convert timeseries into supervised form:

In [None]:
# Take the "cnt" column as a float32 column vector of shape (n, 1), which is
# the 2-D layout create_dataset indexes into.
values = df['cnt'].values.astype('float32').reshape(-1, 1)

# Each sample is a window of 15 consecutive values; its target is the value
# that follows the window.
data_x, data_y = create_dataset(values, look_back=15)

## Extract relevant features with tsfresh:

In [None]:
# Reshape the (samples, lags) matrix into long format: one row per
# (sample id, position in window) pair, with the value in column 0.
data_x = (
    pd.DataFrame.from_records(data_x)
    .stack()
    .rename_axis(['id', 'time'])
    .reset_index()
)

In [None]:
# tsfresh expects the target `y` as a pandas Series whose index matches the
# values of the `column_id` column; a bare NumPy array is rejected by current
# tsfresh releases. pd.Series(data_y) gets a default 0..n-1 index, which is
# assumed to line up with the 'id' values (the default row labels of the
# frame data_x was stacked from).
data_filtered = extract_relevant_features(
    data_x, y=pd.Series(data_y), column_id='id', column_sort='time'
)

## Split dataset into train and test sets:

In [None]:
# Chronological split (no shuffling): features and targets are split in one
# call so both are cut at the same row.
train_x, test_x, train_y, test_y = train_test_split(
    data_filtered.values, data_y, shuffle=False
)

## Normalize the data and train a random forest regressor:

In [None]:
# Scale every feature to [0, 1], then fit a random forest with a fixed seed
# so repeated runs give the same model.
pipeline = make_pipeline(
    MinMaxScaler(),
    RandomForestRegressor(random_state=42),
).fit(train_x, train_y)

## Calculate MAPE:

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are coerced with ``np.asarray`` so that plain Python lists,
    tuples and scalars are accepted in addition to NumPy arrays.

    Args:
        y_true: Observed value(s); scalar or array-like.
        y_pred: Predicted value(s), broadcastable against ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` as a NumPy float.

    Note:
        Zeros in ``y_true`` are not special-cased; the division then follows
        NumPy semantics and the result is ``inf`` or ``nan``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
# Predict the held-out test rows.
preds_pipeline = pipeline.predict(test_x)

# MAPE averaged over the individual test samples.
per_sample_mape = mean_absolute_percentage_error(test_y, preds_pipeline)
print(per_sample_mape)

# MAPE of the summed prediction against the summed actuals.
aggregate_mape = mean_absolute_percentage_error(sum(test_y), sum(preds_pipeline))
print(aggregate_mape)

## Forecast multiple steps ahead:

In [None]:
# Number of future steps to forecast recursively.
n_steps = 23
# Read the model input width from the data instead of hard-coding it, so the
# loop keeps working when the number of selected features changes.
n_features = test_x.shape[1]

future = []
# Seed the recursion with the last test row, kept in a (1, 1, n_features) layout.
data = test_x[-1].reshape(1, 1, n_features)

# NOTE(review): each step drops the first entry of the input vector and
# appends the newest prediction. That treats the vector as a sliding window
# of past values, but test_x comes from the tsfresh feature matrix — confirm
# this is the intended way to roll the input forward.
for step in range(n_steps):
    # The pipeline expects a 2-D (1, n_features) input.
    forecast = pipeline.predict(data.reshape(1, n_features))
    future.append(forecast[0])
    data = np.append(data[0, 0, 1:], forecast).reshape(1, 1, n_features)