# Feature Extraction

### In this notebook, we roll the datasets in preparation for the later feature extraction.

### To achieve this, we use the tsfresh library.

### Import packages

In [1]:
%matplotlib inline

import os
import pandas as pd, numpy as np
import random
from natsort import natsorted
import matplotlib.pylab as plt

import datetime
from datetime import date, timedelta
import time

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import (
    impute,
    make_forecasting_frame,
    roll_time_series,
)
from tsfresh.feature_extraction import (
    ComprehensiveFCParameters,
    EfficientFCParameters,
    MinimalFCParameters,
    settings,
)

from sklearn.model_selection import train_test_split

from joblib import Parallel, delayed, parallel_backend
import multiprocessing

import pickle

import warnings

warnings.filterwarnings("ignore")

In [2]:
# Keep two cores free for the rest of the system, but never go below one
# worker: on a 2-core machine the plain subtraction yields 0, which joblib's
# Parallel rejects as an n_jobs value.
num_cpus = max(1, multiprocessing.cpu_count() - 2)

print(f"Number of available cpus: {multiprocessing.cpu_count()}\n")
print(f"Number of cpus to use: {num_cpus}")

Number of available cpus: 16

Number of cpus to use: 14


### Global variables

In [3]:
# Directory the input CSV is read from (relative to the notebook's working directory).
DATA = "../data"
# Directory the pickled rolled datasets are written to.
RESULTS = "../results/rolled-dataset"

### Helper functions

In [4]:
def create_forecasting_frame(dataframe, col, max_timeshift, rolling_direction):
    """Roll one asset column, together with the dates, into a forecasting frame.

    The "Date" column and the ``col`` column of ``dataframe`` are each passed
    to tsfresh's ``make_forecasting_frame`` with the same rolling settings,
    and the two rolled frames are joined on ``id`` and ``time``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding a "Date" column and the column named by ``col``.
    col : str
        Column to roll; it is also used as the ``kind`` label, which ends up
        in the ``asset_code`` column of the result.
    max_timeshift : int
        Forwarded unchanged to ``make_forecasting_frame``.
    rolling_direction : int
        Forwarded unchanged to ``make_forecasting_frame``.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` has the columns ``id``, ``time``, ``asset_value``,
        ``date`` and ``asset_code``; ``y`` is the target returned by
        ``make_forecasting_frame`` turned into a frame whose ``index`` and
        ``value`` columns are renamed to ``id`` and ``target``.
    """
    roll_settings = {
        "max_timeshift": max_timeshift,
        "rolling_direction": rolling_direction,
    }

    # The target produced while rolling the dates is not needed.
    rolled_dates, _ = make_forecasting_frame(
        dataframe["Date"], kind="Date", **roll_settings
    )
    rolled_values, targets = make_forecasting_frame(
        dataframe[col], kind=col, **roll_settings
    )

    # Values are the left side of the join, so their columns get the "_x"
    # suffix and the date columns get "_y".
    joined = rolled_values.merge(rolled_dates, how="inner", on=["id", "time"])
    column_names = {
        "value_x": "asset_value",
        "value_y": "date",
        "kind_x": "asset_code",
    }
    X = joined.rename(columns=column_names)[
        ["id", "time", "asset_value", "date", "asset_code"]
    ]

    y = (
        pd.DataFrame(targets)
        .reset_index()
        .rename(columns={"index": "id", "value": "target"})
    )

    return X, y


def create_and_save_dataset(forecasting_frame_tuple, path, horizon, split):
    """Shift the target to the requested horizon and pickle the rolled dataset.

    Parameters
    ----------
    forecasting_frame_tuple : tuple
        ``(X, y)`` pair; ``X`` must have an ``asset_code`` column and ``y`` a
        ``target`` column.
    path : str
        Output directory. It is created if it does not exist yet.
    horizon : int
        Forecasting horizon, at least 1. The target column is shifted by
        ``1 - horizon`` rows, so for ``horizon > 1`` the last ``horizon - 1``
        targets become NaN.
    split : str
        Label appended to the file name (the notebook uses "train" / "test").

    Raises
    ------
    ValueError
        If ``horizon`` is smaller than 1; such a value would shift the targets
        in the opposite direction instead of further ahead.

    Notes
    -----
    The file written is
    ``rolled-dataset-<horizon>-day(s)-horizon--<asset>-<split>.pkl`` and holds
    the dict ``{"X": X, "y": y_shifted}``. The double dash before the asset
    code is kept on purpose so files keep the names produced so far.
    The input ``y`` is left unmodified (a copy is shifted).
    """
    if horizon < 1:
        raise ValueError(f"horizon must be >= 1, got {horizon}")

    X, y = forecasting_frame_tuple
    y_shifted = y.copy()
    y_shifted["target"] = y_shifted["target"].shift(1 - horizon)
    dataset_name = np.unique(X["asset_code"])[0]

    # Use a separate name for the label instead of rebinding the integer
    # `horizon` parameter to a string.
    if horizon > 1:
        horizon_label = str(horizon) + "-days-horizon-"
    else:
        horizon_label = str(horizon) + "-day-horizon-"

    filename = os.path.join(
        path,
        "rolled-dataset-" + horizon_label + "-" + dataset_name + "-" + split + ".pkl",
    )

    # A fresh checkout may not contain the results directory yet; exist_ok
    # keeps this safe when several workers reach this line at the same time.
    os.makedirs(path, exist_ok=True)

    with open(filename, "wb") as fOut:
        pickle.dump({"X": X, "y": y_shifted}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

# STEP 0: Load the data

In [5]:
# Read the asset price table from the DATA directory.
df = pd.read_csv(os.path.join(DATA, "assets-prices-no-missing-values.csv"))

# Report (rows, columns), then preview the first rows as the cell output.
print(f"Size of the data: {df.shape}\n\n")
df.head()

Size of the data: (1491, 9)




Unnamed: 0,Date,AC.PA,BNP.PA,CAP.PA,ENGI.PA,G.MI,RACE.MI,SAN.PA,TIT.MI
0,2018-01-02,43.48,62.09,99.0,14.23,15.02,87.300003,71.760002,0.7255
1,2018-01-03,43.310001,62.639999,101.0,14.29,14.89,88.800003,72.07,0.725
2,2018-01-04,43.599998,63.77,101.349998,14.515,15.0,92.5,73.0,0.734
3,2018-01-05,43.77,63.889999,102.5,14.595,15.2,93.349998,74.360001,0.7385
4,2018-01-06,43.973334,64.093333,102.466667,14.626667,15.203333,93.783333,74.356667,0.743167


# STEP 1: Train / test split 
#### We split the data based on the dates.
#### We consider the earlier data points to train the model, and the later ones to test it.

In [6]:
# we consider 80% of the data for training
# we choose the earlier dates
all_dates = list(df.Date)
train_dates = all_dates[: int(0.8 * len(df))]
# Every remaining date goes to the test split. Membership is checked against a
# set so the pass over the dates is linear instead of quadratic; the resulting
# list is the same as when testing against the train_dates list directly.
train_dates_lookup = set(train_dates)
test_dates = [d for d in all_dates if d not in train_dates_lookup]

In [7]:
# Report how many dates ended up in each split.
print(
    f"Number of training points: {len(train_dates)}\nNumber of testing points: {len(test_dates)}"
)

Number of training points: 1192
Number of testing points: 299


### Split the data

In [8]:
# Select the rows of each split by date membership; the original row index of
# df is kept in both frames.
train = df[df.Date.isin(train_dates)]
test = df[df.Date.isin(test_dates)]

In [9]:
# Show the training split as the cell output.
train

Unnamed: 0,Date,AC.PA,BNP.PA,CAP.PA,ENGI.PA,G.MI,RACE.MI,SAN.PA,TIT.MI
0,2018-01-02,43.480000,62.090000,99.000000,14.230000,15.020000,87.300003,71.760002,0.725500
1,2018-01-03,43.310001,62.639999,101.000000,14.290000,14.890000,88.800003,72.070000,0.725000
2,2018-01-04,43.599998,63.770000,101.349998,14.515000,15.000000,92.500000,73.000000,0.734000
3,2018-01-05,43.770000,63.889999,102.500000,14.595000,15.200000,93.349998,74.360001,0.738500
4,2018-01-06,43.973334,64.093333,102.466667,14.626667,15.203333,93.783333,74.356667,0.743167
...,...,...,...,...,...,...,...,...,...
1187,2021-04-03,33.210000,52.070001,148.400000,12.134800,17.072001,177.350003,84.259998,0.457240
1188,2021-04-04,33.470000,52.030001,148.600000,12.141200,17.093000,177.000003,84.289998,0.456160
1189,2021-04-05,33.730001,51.990001,148.800000,12.147600,17.114000,176.650003,84.319998,0.455080
1190,2021-04-06,33.990002,51.950001,149.000000,12.154000,17.135000,176.300003,84.349998,0.454000


In [10]:
# Show the test split as the cell output.
test

Unnamed: 0,Date,AC.PA,BNP.PA,CAP.PA,ENGI.PA,G.MI,RACE.MI,SAN.PA,TIT.MI
1192,2021-04-08,33.180000,52.000000,150.050003,12.290000,17.105000,175.199997,84.709999,0.445300
1193,2021-04-09,33.040001,51.459999,151.000000,12.326000,16.965000,174.000000,85.059998,0.433900
1194,2021-04-10,32.890001,51.446665,151.216665,12.368000,17.000000,174.700002,84.936666,0.435233
1195,2021-04-11,32.740000,51.433332,151.433329,12.410000,17.035000,175.400004,84.813334,0.436567
1196,2021-04-12,32.590000,51.419998,151.649994,12.452000,17.070000,176.100006,84.690002,0.437900
...,...,...,...,...,...,...,...,...,...
1486,2022-01-27,32.160000,64.500000,193.800003,13.672000,18.280001,200.399994,94.889999,0.408100
1487,2022-01-28,32.029999,62.779999,193.000000,13.558000,18.264999,200.500000,94.260002,0.406900
1488,2022-01-29,32.139999,62.853333,194.483332,13.556666,18.370000,201.299998,93.756668,0.410000
1489,2022-01-30,32.250000,62.926666,195.966665,13.555333,18.475000,202.099996,93.253334,0.413100


# STEP 2: Create the forecasting dataframe

#### We roll the data and create the target at the same time using the ***make_forecasting_frame()*** function available in tsfresh.
#### As required by this function (cf tsfresh's documentation), this will be done for each time series separately.

#### In addition, we create the target for three scenarios corresponding to different forecasting horizons:
#### **- 1-day horizon**
#### **- 3-days horizon**
#### **- 7-days horizon**

In [11]:
# Columns to roll: every column of df after the first one.
# NOTE(review): this relies on "Date" being the first column of df - confirm
# if the CSV layout ever changes.
cols = list(df.columns[1:])
cols

['AC.PA', 'BNP.PA', 'CAP.PA', 'ENGI.PA', 'G.MI', 'RACE.MI', 'SAN.PA', 'TIT.MI']

#### Roll the datasets

In [12]:
# Rolling settings forwarded to tsfresh's make_forecasting_frame through
# create_forecasting_frame.
# NOTE(review): presumably a window of at most 14 past observations, rolled
# forward in time - confirm against the tsfresh documentation.
max_timeshift = 14
rolling_direction = 1

In [13]:
%%time
# Roll the training split: one joblib task per asset column, dispatched on
# threads. The result is a list of (X, y) tuples in the order of `cols`.
result_train = Parallel(n_jobs=num_cpus, prefer="threads")(
    delayed(create_forecasting_frame)(train, col, max_timeshift, rolling_direction)
    for col in cols
)

Rolling:   0%|                                                                                                                  | 0/40 [00:00<?, ?it/s]
Rolling:   5%|█████▎                                                                                                    | 2/40 [00:00<00:08,  4.55it/s][A
Rolling:   8%|███████▉                                                                                                  | 3/40 [00:02<00:09,  4.07it/s][A

Rolling:   0%|                                                                                                                  | 0/40 [00:00<?, ?it/s][A[A


Rolling:   0%|                                                                                                                  | 0/40 [00:00<?, ?it/s][A[A[A
Rolling:   8%|███████▉                                                                                                  | 3/40 [00:04<00:30,  1.23it/s][A
Rolling:  10%|██████████▌                                    

CPU times: user 18.8 s, sys: 3.55 s, total: 22.4 s
Wall time: 18.9 s


In [14]:
%%time
# Roll the test split: one joblib task per asset column, dispatched on
# threads. The result is a list of (X, y) tuples in the order of `cols`.
result_test = Parallel(n_jobs=num_cpus, prefer="threads")(
    delayed(create_forecasting_frame)(test, col, max_timeshift, rolling_direction)
    for col in cols
)

Rolling:   8%|████████▎                                                                                                 | 3/38 [00:00<00:01, 24.04it/s]
Rolling:  16%|████████████████▋                                                                                         | 6/38 [00:00<00:01, 27.04it/s][A

Rolling:   0%|                                                                                                                  | 0/38 [00:00<?, ?it/s][A[A


Rolling:   0%|                                                                                                                  | 0/38 [00:00<?, ?it/s][A[A[A
Rolling:  24%|█████████████████████████                                                                                 | 9/38 [00:01<00:02, 10.58it/s][A



Rolling:   0%|                                                                                                                  | 0/38 [00:00<?, ?it/s][A[A[A[A

Rolling:  55%|██████████████████████████████████

CPU times: user 5.82 s, sys: 2.28 s, total: 8.11 s
Wall time: 6.75 s


### 1-day horizon

In [15]:
# Horizon read by the two save cells that follow (global notebook state).
horizon = 1

#### Save the rolled datasets

In [16]:
%%time
# Pickle one file per asset for the training split, using the `horizon` value
# set in the preceding cell. Each task returns None, hence the list of Nones.
Parallel(n_jobs=num_cpus, prefer="threads")(
    delayed(create_and_save_dataset)(forecasting_frame, RESULTS, horizon, "train")
    for forecasting_frame in result_train
)

CPU times: user 128 ms, sys: 26.4 ms, total: 154 ms
Wall time: 138 ms


[None, None, None, None, None, None, None, None]

In [17]:
%%time
# Pickle one file per asset for the test split, using the `horizon` value
# set in the preceding horizon cell. Each task returns None.
Parallel(n_jobs=num_cpus, prefer="threads")(
    delayed(create_and_save_dataset)(forecasting_frame, RESULTS, horizon, "test")
    for forecasting_frame in result_test
)

CPU times: user 43.3 ms, sys: 414 µs, total: 43.8 ms
Wall time: 34.5 ms


[None, None, None, None, None, None, None, None]

### 3-days horizon

In [18]:
# Horizon read by the two save cells that follow (global notebook state).
horizon = 3

#### Save the rolled datasets

In [19]:
%%time
# Pickle one file per asset for the training split, using the `horizon` value
# set in the preceding cell. Each task returns None, hence the list of Nones.
Parallel(n_jobs=num_cpus, prefer="threads")(
    delayed(create_and_save_dataset)(forecasting_frame, RESULTS, horizon, "train")
    for forecasting_frame in result_train
)

CPU times: user 132 ms, sys: 11.9 ms, total: 144 ms
Wall time: 130 ms


[None, None, None, None, None, None, None, None]

In [20]:
%%time
# Pickle one file per asset for the test split, using the `horizon` value
# set in the preceding horizon cell. Each task returns None.
Parallel(n_jobs=num_cpus, prefer="threads")(
    delayed(create_and_save_dataset)(forecasting_frame, RESULTS, horizon, "test")
    for forecasting_frame in result_test
)

CPU times: user 44 ms, sys: 970 µs, total: 44.9 ms
Wall time: 36.5 ms


[None, None, None, None, None, None, None, None]

### 7-days horizon

In [21]:
# Horizon read by the two save cells that follow (global notebook state).
horizon = 7

#### Save the rolled datasets

In [22]:
%%time
# Pickle one file per asset for the training split, using the `horizon` value
# set in the preceding cell. Each task returns None, hence the list of Nones.
Parallel(n_jobs=num_cpus, prefer="threads")(
    delayed(create_and_save_dataset)(forecasting_frame, RESULTS, horizon, "train")
    for forecasting_frame in result_train
)

CPU times: user 141 ms, sys: 7.49 ms, total: 149 ms
Wall time: 134 ms


[None, None, None, None, None, None, None, None]

In [23]:
%%time
# Pickle one file per asset for the test split, using the `horizon` value
# set in the preceding horizon cell. Each task returns None.
Parallel(n_jobs=num_cpus, prefer="threads")(
    delayed(create_and_save_dataset)(forecasting_frame, RESULTS, horizon, "test")
    for forecasting_frame in result_test
)

CPU times: user 36.1 ms, sys: 8.95 ms, total: 45.1 ms
Wall time: 36.1 ms


[None, None, None, None, None, None, None, None]