# IMPORTS

In [58]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
# data
import numpy as np
import pandas as pd

# graphics
import plotly.express as px
import matplotlib.pyplot as plt

# stats
from sktime.split import temporal_train_test_split
from sklearn.preprocessing import Normalizer

# src
from src.utilities import from_pickle, to_pickle
from src.data.transformations import split_train_test_ts

# PATHS & NAMES

In [60]:
# --- folders (relative to the notebook's working directory) ---
RAWFILE_FOLDER = "../data/raw"
INTERIM_FOLDER = "../data/interim"
MODELS_FOLDER = "../models"
REPORT_FOLDER = "../reports"

# --- bare file names ---
RAW_FILENAME = "dataset.parquet"
INTERIM_FILENAME = "transformed.pkl"
DEVSET_FILENAME = "devset.pkl"
TESTSET_FILENAME = "testset.pkl"
FITTED_NORMALIZER = "normalizer.pkl"

# --- full paths: "<folder>/<file name>" ---
RAW_FILEPATH = "/".join((RAWFILE_FOLDER, RAW_FILENAME))
INTERIM_FILEPATH = "/".join((INTERIM_FOLDER, INTERIM_FILENAME))
DEVSET_FILEPATH = "/".join((INTERIM_FOLDER, DEVSET_FILENAME))
TESTSET_FILEPATH = "/".join((INTERIM_FOLDER, TESTSET_FILENAME))
FITTED_NORMALIZER_FILEPATH = "/".join((MODELS_FOLDER, FITTED_NORMALIZER))

# --- column names: the target and its lagged counterpart ---
TARGET = "y"
LAG_TARGET = "x_y_lagged"

# --- figure dimensions ---
# NOTE(review): units are not stated and these are unused in the cells shown here
# (presumably inches for matplotlib -- confirm).
FIG_WIDTH = 15
FIG_HEIGHT = 3

# DATA

In [61]:
# Read the raw parquet file into a DataFrame, explicitly using the pyarrow engine.
df = pd.read_parquet(RAW_FILEPATH, engine='pyarrow')

In [62]:
# fix indices: drop the axis labels so neither the rows nor the columns carry a name
df.columns.name = None
df.index.name = None

# fix date: expose the index timestamps as a regular `date` column with the
# timezone removed (local wall-clock values are kept, only the offset is dropped)
df["date"] = df.index.tz_localize(None)

In [63]:
# Preview the first rows; the new `date` column should show timezone-naive timestamps.
df.head()

Unnamed: 0,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,...,x26,x27,x28,x29,x30,x_y_lagged,x_z_lagged,y,z,date
2023-01-16 01:00:00+01:00,4277.0,15501.0,2512.0,0.0,0.0,6197.0,140.0,1617.0,87.885115,60.010055,...,37.11,0.0,50.0,5845.0,15046.0,95.97,25.24,66.99,60.01,2023-01-16 01:00:00
2023-01-16 02:00:00+01:00,4130.0,15089.0,2422.0,0.0,0.0,5657.0,125.0,1587.0,90.430952,62.721417,...,32.79,0.0,44.71,6259.0,16741.0,95.0,45.86,63.03,60.01,2023-01-16 02:00:00
2023-01-16 03:00:00+01:00,3999.0,14962.0,2438.0,0.0,0.0,5135.0,110.0,1548.0,89.991216,56.103107,...,40.0,0.0,51.0,6176.0,17814.0,95.0,21.12,66.85,62.72,2023-01-16 03:00:00
2023-01-16 04:00:00+01:00,3965.0,15026.0,2426.0,0.0,0.0,4735.0,98.0,1552.0,90.925069,63.304305,...,39.69,0.0,56.12,6632.0,19696.0,87.05,5.42,64.96,56.1,2023-01-16 04:00:00
2023-01-16 05:00:00+01:00,4104.0,15470.0,2519.0,0.0,0.0,4420.0,94.0,1629.0,94.905936,87.005404,...,42.24,0.0,55.47,6753.0,21700.0,93.27,4.18,63.42,63.3,2023-01-16 05:00:00


In [64]:
# Check dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6048 entries, 2023-01-16 01:00:00+01:00 to 2023-09-25 00:00:00+02:00
Data columns (total 35 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   x01         6047 non-null   float64       
 1   x02         6047 non-null   float64       
 2   x03         6047 non-null   float64       
 3   x04         6047 non-null   float64       
 4   x05         6047 non-null   float64       
 5   x06         6047 non-null   float64       
 6   x07         6047 non-null   float64       
 7   x08         6047 non-null   float64       
 8   x09         6047 non-null   float64       
 9   x10         6047 non-null   float64       
 10  x11         6047 non-null   float64       
 11  x12         6047 non-null   float64       
 12  x13         6047 non-null   float64       
 13  x14         6047 non-null   float64       
 14  x15         6047 non-null   float64       
 15  x16         6047 non-nul

# SPLIT

Because the imputation strategy, other post-processing, and feature engineering must not leak information into the test data, the split has to be done first.

Because the data represent a time series, the split must take the time aspect into account as well.

In [65]:
# Split before imputing or normalizing, so that the normalizer below is fitted on
# the development set only.
# NOTE(review): the split ratio and ordering are decided inside
# `split_train_test_ts` (src.data.transformations); its defaults are not visible
# in this notebook -- confirm they respect chronological order.
devset, testset = split_train_test_ts(df)

# TRANSFORM

## treat missings

Use LOCF (Last Observation Carried Forward), as the gaps aren't too big.

In [66]:
# LOCF: forward-fill each split on its own, so values are only carried forward
# within a split. A missing value in the very first row of a split has nothing
# before it and therefore stays missing.
nomiss, nomiss_test = (part.ffill() for part in (devset, testset))

## normalize

In [67]:
# Columns to normalize are picked by their "x" prefix. This also selects the
# lagged columns `x_y_lagged` and `x_z_lagged`, but not `y`, `z` or `date`.
cols_to_norm = list(filter(lambda name: name.startswith("x"), nomiss.columns))

In [68]:
# NOTE(review): scikit-learn's `Normalizer` rescales every row (sample) to unit
# norm and learns nothing during `fit`. If per-column scaling was intended, a
# scaler such as `StandardScaler` or `MinMaxScaler` is the usual choice -- confirm.
dev_features = nomiss.loc[:, cols_to_norm]
fitted_normalizer = Normalizer()
fitted_normalizer.fit(dev_features)

# Persist the fitted transformer; the call's return value is the cell output.
to_pickle(fitted_normalizer, FITTED_NORMALIZER_FILEPATH)

True

In [69]:
# Work on an explicit copy. A plain `normed = nomiss` only binds a second name
# to the same DataFrame, so the in-place `.loc` assignment below would also
# overwrite `nomiss` and leave no un-normalized frame to inspect or re-run from.
normed = nomiss.copy()

# Replace the feature columns with their normalized values; all other columns
# are left untouched.
normed.loc[:, cols_to_norm] = fitted_normalizer.transform(nomiss.loc[:, cols_to_norm])

# Persist the normalized development set; the call's return value is the cell output.
to_pickle(normed, DEVSET_FILEPATH)

True

In [70]:
# Work on an explicit copy. A plain `normed_test = nomiss_test` only binds a
# second name to the same DataFrame, so the in-place `.loc` assignment below
# would also overwrite `nomiss_test`.
normed_test = nomiss_test.copy()

# Apply the normalizer that was fitted on the development set to the test
# features; all other columns are left untouched.
normed_test.loc[:, cols_to_norm] = fitted_normalizer.transform(nomiss_test.loc[:, cols_to_norm])

# Persist the normalized test set; the call's return value is the cell output.
to_pickle(normed_test, TESTSET_FILEPATH)

True