# Feature Extraction

### In this notebook, we perform feature extraction from our dataset using the tsfresh package.

#### We consider a binary classification problem with the following classes:

#### - Class -1: Non-critical fire (burns less than 10 000 acres)
#### - Class 1: Critical fire (burns more than 10 000 acres)

#### Also, we consider the scenario where we want to predict if the fire is going to grow critically within the next 6 hours.

#### This prediction horizon of 6 hours is arbitrary (you can change it); it corresponds to the delay that could be needed to mobilise the appropriate resources to deal with the fire before it becomes critical.

### Install some required packages
#### Only install those that are not installed yet

In [None]:
!pip install natsort
!pip install tsfresh
!pip install -U pandas

### Import packages

In [None]:
%matplotlib inline

import os
import pandas as pd, numpy as np
import random
from natsort import natsorted
import matplotlib.pylab as plt

import datetime
from datetime import date, timedelta
import time

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import (
    impute,
    make_forecasting_frame,
    roll_time_series,
)
from tsfresh.feature_extraction import (
    ComprehensiveFCParameters,
    EfficientFCParameters,
    MinimalFCParameters,
    settings,
)

import ast

import multiprocessing

import warnings

warnings.filterwarnings("ignore")

In [None]:
# Query the CPU count once and reuse it for both messages.
available_cpus = multiprocessing.cpu_count()

# By default every available CPU is handed to the feature extraction.
num_cpus = available_cpus

print(f"Number of available cpus: {available_cpus}\n")
print(f"Number of cpus to use: {num_cpus}")

### Global variables

In [None]:
# Directory (relative path) that the CSV files of this notebook are read from
# and written to.
RESULTS = "../results"

### Utility function

In [None]:
def make_sorter(l):
    """
    Build a sort key that orders values by their position in ``l``.

    Each element of ``l`` is associated with its position (0 .. len(l) - 1).
    The returned callable takes a pandas Series and returns a Series holding
    the position of every value, which makes it usable as the ``key``
    argument of ``sort_values``.
    A value that is not present in ``l`` raises a ``KeyError``.
    """
    position_of = dict(zip(l, range(len(l))))

    def sorter(s):
        # Explicit dict lookup (not ``s.map(position_of)``) so that unknown
        # values fail loudly instead of silently becoming NaN.
        return s.map(lambda value: position_of[value])

    return sorter

# Load the data

### Rolled dataset 

In [None]:
# Location of the rolled test set inside the results directory.
rolled_test_csv = os.path.join(
    RESULTS, "fires-stations-final-dataset-flat-format-rolled-test.csv"
)

# Read the rolled test set and display its (rows, columns) shape.
df_rolled_test = pd.read_csv(rolled_test_csv)
df_rolled_test.shape

In [None]:
# Preview the first rows of the rolled test set.
df_rolled_test.head()

In [None]:
# Measurements available for each station, in the order in which their
# columns are listed.
_STATION_MEASUREMENTS = (
    "ghi",
    "dni",
    "wind_speed",
    "wind_direction",
    "dhi",
    "air_temperature",
    "solar_zenith_angle",
)

# Columns handed to the feature extraction: the id and sort columns, then
# every measurement of station 1 followed by every measurement of station 2,
# and finally the duration column.
X_cols = (
    ["id", "time_step"]
    + [
        f"{measurement}_station_{station}"
        for station in (1, 2)
        for measurement in _STATION_MEASUREMENTS
    ]
    + ["duration_in_hours"]
)

# Name of the label column, kept as a one-element list.
y_col = ["category"]

#### Running this cell could take a very long time
#### This could take up to a few hours
#### Please consider using distributed calculation, e.g. with Dask or PySpark

In [None]:
%%time
# Feature calculators to run. ComprehensiveFCParameters is used here;
# MinimalFCParameters() or EfficientFCParameters() could be used instead.
extraction_settings = ComprehensiveFCParameters()

# One row of features per "id"; the rows of each id are ordered by
# "time_step", the result is passed through tsfresh's impute function and the
# work is spread over num_cpus processes.
X_test = extract_features(
    df_rolled_test[X_cols],
    default_fc_parameters=extraction_settings,
    column_id="id",
    column_sort="time_step",
    n_jobs=num_cpus,
    impute_function=impute,
)
X_test.shape

In [None]:
# Preview the first rows of the extracted feature matrix.
X_test.head()

In [None]:
# Persist the raw extracted features (no target column yet) in the results
# directory.
features_without_target_csv = os.path.join(
    RESULTS,
    "extracted-features-ComprehensiveFCParameters-full--without-target-horizon-test.csv",
)
X_test.to_csv(features_without_target_csv)

# STEP 1: Feature Selection

### Hypothesis:
### Here, we want to predict the evolution of the considered fire in the near future, given the parameters provided by the surrounding stations over the last couple of hours.

### Set the horizons of prediction

In [None]:
# Prediction horizon of the 6-hour scenario described at the top of the notebook.
# NOTE(review): further down it is used as a number of positions in the sorted
# list of ids, so it equals a number of hours only if consecutive ids are one
# hour apart — confirm against the rolled dataset.
horizon_6h = 6
# Other horizons that could be explored (currently disabled):
# horizon_12h = 12
# horizon_18h = 18

### Create the target vectors

#### Scenario 1: Horizon of prediction of 6 hours

In [None]:
def _parse_id(raw_id):
    """Return raw_id unchanged, unless it is a string: then evaluate it as a Python literal."""
    return ast.literal_eval(raw_id) if isinstance(raw_id, str) else raw_id


if horizon_6h < 0:
    raise ValueError("horizon_6h must be a non-negative number of positions")

# Ids that were read back from a CSV file are strings; parse them so that they
# compare equal to the ids carried by the feature matrix. Non-string ids pass
# through untouched, so re-running this cell is harmless.
df_rolled_test["id"] = df_rolled_test["id"].apply(_parse_id)

# Sorted list of the distinct ids: this is the reference order for the shift.
test_ids = list(np.unique(df_rolled_test["id"]))

# Move the index of the feature matrix into an "id" column, parse it the same
# way, order the rows like test_ids and restore an unnamed index.
X_test_6h = X_test.reset_index()
X_test_6h = X_test_6h.rename(columns={X_test_6h.columns[0]: "id"})
X_test_6h["id"] = X_test_6h["id"].apply(_parse_id)
X_test_6h = (
    X_test_6h.sort_values("id", key=make_sorter(test_ids))
    .set_index("id")
    .rename_axis(index=None)
)

# The last `horizon_6h` ids have no id `horizon_6h` positions ahead of them,
# hence no label: drop them. An explicit end position is used because
# `[:-0]` would return an empty frame for a horizon of 0.
n_labelled = max(len(X_test_6h) - horizon_6h, 0)
X_test_6h = X_test_6h.iloc[:n_labelled]

# Category of the last row of every id, built in a single pass: later rows
# overwrite earlier ones, so each id ends up with the category of its last row.
last_category_by_id = dict(
    zip(df_rolled_test["id"], df_rolled_test["category"].to_numpy())
)

# Label of an id = category of the last row of the id located `horizon_6h`
# positions further in test_ids.
# NOTE(review): the shift is purely positional in the sorted id list. If the
# ids of several fires follow each other in that list, the last `horizon_6h`
# ids of one fire receive their label from another fire — confirm this cannot
# happen with this dataset.
test_target_dic_6h = {
    current_id: last_category_by_id[future_id]
    for current_id, future_id in zip(test_ids, test_ids[horizon_6h:])
}

# Labels aligned with the rows of X_test_6h (its index was parsed above).
test_target_6h = [test_target_dic_6h[row_id] for row_id in X_test_6h.index]

In [None]:
# New frame made of the features plus a "target" column holding the labels;
# X_test_6h itself is left unchanged.
test_df_6h = X_test_6h.assign(target=test_target_6h)
test_df_6h.shape

In [None]:
# Preview the first rows of the features together with their target.
test_df_6h.head()

### Save the full dataset of extracted features

In [None]:
# Persist the full feature set with its target column in the results directory.
full_features_6h_csv = os.path.join(
    RESULTS,
    "extracted-features-ComprehensiveFCParameters-full-target-horizon-6h-test.csv",
)
test_df_6h.to_csv(full_features_6h_csv)

### Filter the descriptor columns
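### We use the ***select_features*** function of tsfresh to retain only the most relevant descriptors of the X dataframe w.r.t the target. For the test set, the selection is not run again: we keep exactly the columns that were retained for the training set.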

In [None]:
# Filtered training set saved in the results directory; its first CSV column
# is used as the index.
filtered_train_6h_csv = os.path.join(
    RESULTS,
    "extracted-features-ComprehensiveFCParameters-filtered-target-horizon-6h-train.csv",
)
train_df_filtered_6h = pd.read_csv(filtered_train_6h_csv, index_col=0)
train_df_filtered_6h.shape

In [None]:
%%time
# Keep, in the same order, exactly the columns present in the filtered
# training set (a KeyError is raised if one of them is missing from the test set).
selected_columns = list(train_df_filtered_6h.columns)
test_df_filtered_6h = test_df_6h[selected_columns]
test_df_filtered_6h.shape

In [None]:
# Preview the first two rows of the filtered test set.
test_df_filtered_6h.head(2)

### Save the filtered dataset of extracted features

In [None]:
# Persist the filtered test set in the results directory.
filtered_test_6h_csv = os.path.join(
    RESULTS,
    "extracted-features-ComprehensiveFCParameters-filtered-target-horizon-6h-test.csv",
)
test_df_filtered_6h.to_csv(filtered_test_6h_csv)