# Feature Extraction

### In this notebook, we roll the datasets in preparation for the later feature extraction.

### To achieve this, we use the tsfresh library.

### Install some required packages
#### Please install just those that are not yet installed

In [None]:
!pip install natsort
!pip install tsfresh
!pip install pandas-profiling
!pip install tabulate

### Import packages

In [None]:
%matplotlib inline

import os
import pandas as pd, numpy as np
import random
from natsort import natsorted
import matplotlib.pylab as plt

import datetime
from datetime import date, timedelta
import time

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import (
    impute,
    make_forecasting_frame,
    roll_time_series,
)
from tsfresh.feature_extraction import (
    ComprehensiveFCParameters,
    EfficientFCParameters,
    MinimalFCParameters,
    settings,
)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from pandas_profiling import ProfileReport
from collections import Counter
import ast

import multiprocessing

# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# Number of worker processes handed to tsfresh (n_jobs): every CPU reported by the OS.
num_cpus = multiprocessing.cpu_count()

for message in (
    f"Number of available cpus: {multiprocessing.cpu_count()}\n",
    f"Number of cpus to use: {num_cpus}",
):
    print(message)

### Global variables

In [None]:
# Directory (relative to this notebook) from which the input CSV is read and
# into which every dataset produced below is written.
RESULTS = "../results"

### Utility functions

In [None]:
def transform_to_input_format_1(input_dataset, reference_station="station_1"):
    """
    Change the format of the data to tsfresh input format 1 (flat / wide).

    For every fire, the rows of each station are sorted by ``datetime`` and
    placed side by side, so that one output row holds the measurements of all
    stations for one time step of one fire.

    Arguments:
        input_dataset: the long-format dataframe to be transformed. It must
            contain the columns ``fire``, ``station``, ``datetime``,
            ``acres_burnt``, ``duration_in_hours`` and ``category``; every
            remaining column is treated as a per-station measurement.
        reference_station: the station whose rows provide the fire-level
            columns (``time_step``, ``acres_burnt``, ``duration_in_hours``,
            ``category``). Defaults to ``"station_1"``.

    Return:
        The dataframe transformed to tsfresh input format 1, with the columns
        ``fire``, ``time_step``, the per-station measurement columns (suffixed
        with ``_<station>``), then ``acres_burnt``, ``duration_in_hours`` and
        ``category``. An empty dataframe is returned for an empty input.

    Raises:
        KeyError: if ``reference_station`` does not occur in ``input_dataset``.
    """

    fire_level_cols = ["acres_burnt", "duration_in_hours", "category"]
    # Columns that are not per-station measurements.
    excluded_cols = {"station", "fire", "datetime", *fire_level_cols}
    # Measurement columns that receive a "_<station>" suffix.
    measurement_cols = [
        "ghi",
        "dni",
        "wind_speed",
        "wind_direction",
        "dhi",
        "air_temperature",
        "solar_zenith_angle",
    ]

    fires = natsorted(list(np.unique(input_dataset.fire)))
    stations = natsorted(list(np.unique(input_dataset.station)))

    if fires and reference_station not in stations:
        raise KeyError(
            f"reference station {reference_station!r} not found in the dataset"
        )

    # Collect one wide frame per fire and concatenate once at the end
    # (growing a dataframe inside the loop is quadratic).
    flat_frames = []
    for fire in fires:
        # Filter the function argument, not a notebook-level global.
        fire_rows = input_dataset[input_dataset["fire"] == fire]
        fire_info = None
        station_frames = []

        for station in stations:
            station_rows = (
                fire_rows[fire_rows["station"] == station]
                .sort_values("datetime")
                .reset_index(drop=True)
            )

            if station == reference_station:
                # The time step of a row is its duration_in_hours value.
                fire_info = pd.DataFrame(
                    {
                        "fire": [fire] * len(station_rows),
                        "time_step": station_rows["duration_in_hours"],
                        **{col: station_rows[col] for col in fire_level_cols},
                    }
                )

            kept_cols = [c for c in station_rows.columns if c not in excluded_cols]
            station_frames.append(
                station_rows[kept_cols].rename(
                    columns={col: f"{col}_{station}" for col in measurement_cols}
                )
            )

        # Rows are aligned on the positional index created by reset_index above.
        flat_frames.append(
            pd.concat(
                [
                    fire_info[["fire", "time_step"]],
                    *station_frames,
                    fire_info[fire_level_cols],
                ],
                axis=1,
            )
        )

    if not flat_frames:
        return pd.DataFrame()

    return pd.concat(flat_frames, ignore_index=True)

# Load the data

In [None]:
# Read the fires/stations dataset from the results directory.
data = pd.read_csv(os.path.join(RESULTS, "fires-stations-final-dataset.csv"))

print(f"Size of the data: {data.shape}\n\n")
data.head()

# STEP 0: Prepare the data

### We reformat the data to put it into tsfresh data format 1. This is necessary for the later creation of the forecasting dataframe using the tsfresh native function *make_forecasting_frame()*.

In [None]:
%%time
data = transform_to_input_format_1(data)

print(f"Size of the data: {data.shape}\n\n")
data.head()

### Save the dataset

In [None]:
# Persist the flat-format dataset (without the index column) in the results directory.
data.to_csv(
    os.path.join(RESULTS, "fires-stations-final-dataset-flat-format.csv"), index=False
)

# STEP 1: Quick EDA

#### Running this cell could take many minutes...

In [None]:
# Build the pandas-profiling report for the flat dataset; ending the cell with
# the report object makes the notebook render it.
profile = ProfileReport(data, title="Data exploration - Pandas Profiling Report")
profile

# STEP 2: Train / val / test split 
### We split the data based on their date of occurrence, as follows:
### - We consider the earliest dated fires to train the model, and the most recent ones to test it

#### Load the fires

In [None]:
# Load the fire records from a remote feather file.
# NOTE(review): reading an s3:// URL needs network access and presumably the
# s3fs package — confirm it is installed in the environment.
fire_data = pd.read_feather("s3://data.atoti.io/notebooks/ca-solar/fire_data.feather")
# Parse the start/end dates element by element, then attach the UTC timezone
# (tz_localize labels the values as UTC, it does not convert them).
fire_data["StartedDate"] = (
    fire_data["StartedDate"].apply(pd.to_datetime).dt.tz_localize("UTC")
)
fire_data["EndedDate"] = (
    fire_data["EndedDate"].apply(pd.to_datetime).dt.tz_localize("UTC")
)

print(f"Data size: {len(fire_data)}\n\n")
fire_data.head()

In [None]:
# Keep only the fires that occur in the flat dataset and order them
# chronologically by the date on which they ended.
fire_data = (
    fire_data.loc[fire_data["Fire"].isin(data["fire"].tolist())]
    .sort_values(by="EndedDate")
    .reset_index(drop=True)
)

print(f"Data size: {len(fire_data)}\n\n")
fire_data.head()

In [None]:
# Show the last rows of the fire table, i.e. the fires with the latest EndedDate.
fire_data.tail()

In [None]:
# we consider 85% of the fires for training
# we choose the earlier dates (fire_data is sorted by EndedDate)
all_fires = list(fire_data.Fire)
train_fires = all_fires[: int(0.85 * len(all_fires))]
# set lookups keep the membership tests linear instead of quadratic
train_lookup = set(train_fires)
valtest_fires = [fire for fire in all_fires if fire not in train_lookup]
# we keep the remaining 15% of the fires for validation and testing (the most recent dates)
# then, we consider a 50%-50% distribution for validation and testing (hold-out) respectively
val_fires = valtest_fires[: int(0.5 * len(valtest_fires))]
val_lookup = set(val_fires)
test_fires = [fire for fire in valtest_fires if fire not in val_lookup]

In [None]:
# Display the fire records assigned to the training set.
fire_data[fire_data["Fire"].isin(train_fires)]

In [None]:
# Display the fire records assigned to the validation set.
fire_data[fire_data["Fire"].isin(val_fires)]

In [None]:
# Display the fire records assigned to the test (hold-out) set.
fire_data[fire_data["Fire"].isin(test_fires)]

### Split the data

In [None]:
# Rows of the flat dataset that belong to the training fires.
data_train = data.loc[data["fire"].isin(train_fires)].reset_index(drop=True)

print(f"Size of the train data: {len(data_train)}\n\n")
data_train.head()

In [None]:
# Rows of the flat dataset that belong to the validation fires.
data_val = data[data["fire"].isin(val_fires)].reset_index(drop=True)

# The message previously said "train" although it reports the validation set.
print(f"Size of the validation data: {len(data_val)}\n\n")
data_val.head()

In [None]:
# Rows of the flat dataset that belong to the test (hold-out) fires.
data_test = data[data["fire"].isin(test_fires)].reset_index(drop=True)

# The message previously said "train" although it reports the test set.
print(f"Size of the test data: {len(data_test)}\n\n")
data_test.head()

### Save the raw dataset

In [None]:
# Write the three splits (without the index column) to the results directory,
# one CSV file per split.
data_train.to_csv(
    os.path.join(RESULTS, "fires-stations-final-dataset-flat-format_train.csv"),
    index=False,
)
data_val.to_csv(
    os.path.join(RESULTS, "fires-stations-final-dataset-flat-format_val.csv"),
    index=False,
)
data_test.to_csv(
    os.path.join(RESULTS, "fires-stations-final-dataset-flat-format_test.csv"),
    index=False,
)

### Check the distribution of the classes in the datasets

In [None]:
# Relative frequency of each category among the training rows.
data_train.category.value_counts(normalize=True)

In [None]:
# Relative frequency of each category among the validation rows.
data_val.category.value_counts(normalize=True)

In [None]:
# Relative frequency of each category among the test rows.
data_test.category.value_counts(normalize=True)

### We can observe that the critical fires (class 1) happened more frequently (in proportion) in the recent period, between Aug 2020 and Jan 2021.

# STEP 3: Roll the datasets

In [None]:
%%time
# Roll the training data: one group of windows per fire, ordered by time_step,
# computed with num_cpus parallel jobs.
# NOTE(review): with max_timeshift=11 and min_timeshift=3 the windows presumably
# span 4 to 12 consecutive time steps — confirm against the tsfresh documentation.
df_rolled_train = roll_time_series(
    data_train,
    column_id="fire",
    column_sort="time_step",
    rolling_direction=1,
    max_timeshift=11,
    min_timeshift=3,
    n_jobs=num_cpus,
)
df_rolled_train.shape

In [None]:
# Preview the first rows of the rolled training data.
df_rolled_train.head()

In [None]:
%%time
# Roll the validation data with the same settings as the training data:
# one group of windows per fire, ordered by time_step.
# NOTE(review): with max_timeshift=11 and min_timeshift=3 the windows presumably
# span 4 to 12 consecutive time steps — confirm against the tsfresh documentation.
df_rolled_val = roll_time_series(
    data_val,
    column_id="fire",
    column_sort="time_step",
    rolling_direction=1,
    max_timeshift=11,
    min_timeshift=3,
    n_jobs=num_cpus,
)
df_rolled_val.shape

In [None]:
# Preview the first rows of the rolled validation data.
df_rolled_val.head()

In [None]:
%%time
# Roll the test data with the same settings as the training data:
# one group of windows per fire, ordered by time_step.
# NOTE(review): with max_timeshift=11 and min_timeshift=3 the windows presumably
# span 4 to 12 consecutive time steps — confirm against the tsfresh documentation.
df_rolled_test = roll_time_series(
    data_test,
    column_id="fire",
    column_sort="time_step",
    rolling_direction=1,
    max_timeshift=11,
    min_timeshift=3,
    n_jobs=num_cpus,
)
df_rolled_test.shape

In [None]:
# Preview the first rows of the rolled test data.
df_rolled_test.head()

### Save the rolled datasets

In [None]:
# Write the three rolled datasets (without the index column) to the results
# directory, one CSV file per split.
df_rolled_train.to_csv(
    os.path.join(RESULTS, "fires-stations-final-dataset-flat-format-rolled-train.csv"),
    index=False,
)
# The validation file name was missing its ".csv" extension, unlike the
# train and test files.
df_rolled_val.to_csv(
    os.path.join(RESULTS, "fires-stations-final-dataset-flat-format-rolled-val.csv"),
    index=False,
)
df_rolled_test.to_csv(
    os.path.join(RESULTS, "fires-stations-final-dataset-flat-format-rolled-test.csv"),
    index=False,
)