# Feature Extraction

### In this notebook, we extract time-series features from the rolled datasets with the `tsfresh` package, standardize them, and keep only the features that are relevant to the target.

### Import packages

In [1]:
%matplotlib inline

import os, re
import pandas as pd, numpy as np
import random
from natsort import natsorted
import matplotlib.pylab as plt

import time

from tsfresh import extract_features, select_features, extract_relevant_features
from tsfresh.utilities.dataframe_functions import (
    impute,
    make_forecasting_frame,
    roll_time_series,
)
from tsfresh.feature_extraction import (
    ComprehensiveFCParameters,
    EfficientFCParameters,
    MinimalFCParameters,
    settings,
)
from sklearn.preprocessing import StandardScaler

import multiprocessing

import pickle

import warnings

warnings.filterwarnings("ignore")

In [2]:
# Leave two cores free for the rest of the system, but never drop below one
# worker: cpu_count() - 2 would be 0 or negative on machines with <= 2 cores.
available_cpus = multiprocessing.cpu_count()
num_cpus = max(1, available_cpus - 2)

print(f"Number of available cpus: {available_cpus}\n")
print(f"Number of cpus to use: {num_cpus}")

Number of available cpus: 16

Number of cpus to use: 14


### Global variables

In [3]:
# Directory the rolled-dataset pickles are read from (passed as `input_path`).
ROLLED_DATAFRAMES_PATH = "../results/rolled-dataset"
# Directory the feature CSVs and fitted scalers are written to (passed as `output_path`).
FEATURES_DATAFRAMES_PATH = "../results/features"

### Helper function

In [4]:
def _load_rolled_dataset(path):
    """Load one rolled-dataset pickle and return its ``"X"`` and ``"y"`` entries."""
    # SECURITY: pickle.load can execute arbitrary code; only open files that
    # come from a trusted source.
    with open(path, "rb") as fIn:
        stored_data = pickle.load(fIn)
    return stored_data["X"], stored_data["y"]


def _last_date_per_id(X, date_position=3):
    """Return a one-column (``"date"``) dataframe with one row per id, sorted by id.

    For every id the value is taken from the last row of that id, at the
    positional column ``date_position``.
    NOTE(review): position 3 is presumably the date column of the rolled
    dataframes -- confirm against the notebook that builds them.
    """
    # A single groupby pass replaces re-filtering the whole frame for every id.
    dates = [
        window.iloc[-1, date_position] for _, window in X.groupby("id", sort=True)
    ]
    return pd.DataFrame(dates, columns=["date"])


def _extract_tsfresh_features(X):
    """Run tsfresh feature extraction on ``X`` and return it with a 0..n-1 index."""
    features = extract_features(
        X[["id", "time", "asset_value"]],
        # we could use also: MinimalFCParameters(), EfficientFCParameters()
        default_fc_parameters=ComprehensiveFCParameters(),
        column_id="id",
        column_sort="time",
        impute_function=impute,
        n_jobs=num_cpus,
    )
    return features.reset_index(drop=True)


def _drop_rows_without_target(features, y, dates):
    """Drop, by position, the rows whose ``y["target"]`` is NaN.

    All three frames are returned with a fresh 0..n-1 index so that later
    index-based operations (column assignment, ``select_features``, ``merge``)
    stay aligned wherever the NaN targets are located.
    """
    keep = np.flatnonzero(y["target"].notna().to_numpy())
    return (
        features.iloc[keep].reset_index(drop=True),
        y.iloc[keep].reset_index(drop=True),
        dates.iloc[keep].reset_index(drop=True),
    )


def _add_calendar_features(features, dates):
    """Add ``day``, ``week`` and ``month`` columns derived from ``dates["date"]``."""
    timestamps = pd.to_datetime(dates["date"])
    if hasattr(timestamps.dt, "isocalendar"):
        # `.dt.week` was removed in pandas 2.0; reproduce its result (int64, or
        # float64 when dates are missing) from the ISO calendar.
        iso_week = timestamps.dt.isocalendar().week
        week = iso_week.astype("float64" if iso_week.isna().any() else "int64")
    else:  # pandas < 1.1 has no isocalendar()
        week = timestamps.dt.week
    # Assign by position: `features` and `dates` share the same row order.
    features["day"] = timestamps.dt.day.to_numpy()
    features["week"] = week.to_numpy()
    features["month"] = timestamps.dt.month.to_numpy()
    return features


def generate_features_dataframes(input_path, output_path):
    """Extract, scale, select and export tsfresh features for every rolled dataset.

    Files in ``input_path`` whose name contains "train" / "test" are naturally
    sorted and paired one-to-one. For each pair the function:

    1. loads the ``X`` / ``y`` frames from the pickles,
    2. extracts tsfresh features from the ``id``, ``time`` and ``asset_value``
       columns,
    3. drops the rows whose target is NaN,
    4. standardizes the features with a ``StandardScaler`` fitted on the train
       split only,
    5. adds ``day`` / ``week`` / ``month`` columns,
    6. keeps the features chosen by ``select_features`` on the train split,
    7. writes the train/test CSVs and the pickled scaler to ``output_path``.

    A summary is printed at the end; nothing is returned.

    Parameters
    ----------
    input_path : str
        Directory containing the rolled-dataset pickles.
    output_path : str
        Directory the CSVs and scalers are written to (created if missing).

    Raises
    ------
    ValueError
        If the number of train and test files differs.
    """
    # list all the rolled dataset
    all_files = os.listdir(input_path)
    rolled_datasets_train = natsorted([d for d in all_files if "train" in d])
    rolled_datasets_test = natsorted([d for d in all_files if "test" in d])
    if len(rolled_datasets_train) != len(rolled_datasets_test):
        raise ValueError(
            f"Found {len(rolled_datasets_train)} train and "
            f"{len(rolled_datasets_test)} test rolled datasets in {input_path!r}; "
            "expected exactly one test file per train file."
        )

    os.makedirs(output_path, exist_ok=True)
    error_tracker = []

    # loop over the rolled datasets
    # to extract the features
    print("Starting features extraction...\n\n")
    for train, test in zip(rolled_datasets_train, rolled_datasets_test):

        # load the data
        X_train, y_train = _load_rolled_dataset(os.path.join(input_path, train))
        X_test, y_test = _load_rolled_dataset(os.path.join(input_path, test))

        # extract the dates
        dates_train = _last_date_per_id(X_train)
        dates_test = _last_date_per_id(X_test)

        # extract the features
        features_train = _extract_tsfresh_features(X_train)
        features_test = _extract_tsfresh_features(X_test)

        # remove the rows whose target values are NaNs
        features_train, y_train, dates_train = _drop_rows_without_target(
            features_train, y_train, dates_train
        )
        features_test, y_test, dates_test = _drop_rows_without_target(
            features_test, y_test, dates_test
        )

        # normalize the features (the scaler is fitted on the train split only)
        feature_names = features_train.columns
        scaler = StandardScaler()
        features_train = pd.DataFrame(
            scaler.fit_transform(features_train), columns=feature_names
        )
        features_test = pd.DataFrame(
            scaler.transform(features_test), columns=feature_names
        )

        # add the dates
        features_train = _add_calendar_features(features_train, dates_train)
        features_test = _add_calendar_features(features_test, dates_test)

        # filter the features
        features_selected_train = select_features(features_train, y_train.target)
        features_selected_test = features_test[features_selected_train.columns]

        # generate the final dataframe
        # containing the filtered features and the target
        df_train = features_selected_train.merge(
            y_train.target, left_index=True, right_index=True
        )
        df_test = features_selected_test.merge(
            y_test.target, left_index=True, right_index=True
        )

        # add dates index
        df_train = df_train.set_index(pd.Series(list(dates_train["date"])))
        df_test = df_test.set_index(pd.Series(list(dates_test["date"])))

        # export to csv and pickle
        # (the dot is escaped so only a real ".pkl" extension is rewritten)
        filename_train = re.sub(
            r"\.pkl$", ".csv", train.replace("rolled-dataset", "features")
        )
        filename_test = re.sub(
            r"\.pkl$", ".csv", test.replace("rolled-dataset", "features")
        )
        filename_scaler = train.replace("rolled-dataset", "scaler").replace(
            "-train", ""
        )

        df_train.to_csv(os.path.join(output_path, filename_train))
        df_test.to_csv(os.path.join(output_path, filename_test))
        with open(os.path.join(output_path, filename_scaler), "wb") as fOut:
            pickle.dump(scaler, fOut, protocol=pickle.HIGHEST_PROTOCOL)

        # sanity check: train and test must expose the same columns, in order
        if list(df_test.columns) != list(df_train.columns):
            error_tracker.append((filename_test, filename_train))

    if not error_tracker:
        print("\n\n...Features extraction completed, all the files are OK!!!\n\n")
    else:
        print("\n\n...The following pairs of files are not matching:")
        for el in error_tracker:
            print(el)
        print()

# STEP 1: Generate Features Dataframes

In [5]:
%%time
# Run the whole pipeline: read every rolled dataset from ROLLED_DATAFRAMES_PATH
# and write the feature CSVs and scalers to FEATURES_DATAFRAMES_PATH.
generate_features_dataframes(ROLLED_DATAFRAMES_PATH, FEATURES_DATAFRAMES_PATH)

Starting features extraction...




Feature Extraction: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:05<00:00, 11.28it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 60/60 [00:01<00:00, 38.43it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:06<00:00, 10.87it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 60/60 [00:01<00:00, 37.16it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:05<00:00, 11.29it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 60/60 [00:01<00:00, 39.48it/s]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████



...Features extraction completed, all the files are OK!!!


CPU times: user 2min 29s, sys: 12.4 s, total: 2min 41s
Wall time: 5min 7s
