# IMPORTS

In [13]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the project's `src` package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [35]:
# data
import numpy as np
import pandas as pd

# graphics
import plotly.express as px
import matplotlib.pyplot as plt

# stats
from sktime.split import temporal_train_test_split

# src
from src.utilities import from_pickle, to_pickle
from src.data.transformations import split_train_test_ts

# PATHS & NAMES

In [3]:
# --- raw input (relative to the notebook's working directory) ---
RAWFILE_FOLDER = "../data/raw"
RAW_FILENAME = "dataset.parquet"
RAW_FILEPATH = "/".join((RAWFILE_FOLDER, RAW_FILENAME))

# --- interim artifacts ---
INTERIM_FOLDER = "../data/interim"
INTERIM_FILENAME = "transformed.pkl"
INTERIM_FILEPATH = "/".join((INTERIM_FOLDER, INTERIM_FILENAME))

# --- reports ---
REPORT_FOLDER = "../reports"

# --- column names present in the dataset ---
TARGET = "y"
LAG_TARGET = "x_y_lagged"

# --- default figure dimensions (units depend on the plotting call that uses them) ---
FIG_WIDTH, FIG_HEIGHT = 15, 3

# DATA

In [4]:
# load the raw dataset from parquet, forcing the pyarrow reader
df = pd.read_parquet(path=RAW_FILEPATH, engine="pyarrow")

In [24]:
# fix indices: clear the axis names so neither axis carries a label
df.index.name = df.columns.name = None

# fix date: expose the timestamp index as a regular column with the UTC offset
# stripped (tz_localize(None) keeps the local wall-clock time)
df['date'] = df.index.tz_localize(None)

In [25]:
# preview the first rows; the output includes the tz-naive `date` column next to the tz-aware index
df.head()

Unnamed: 0,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x_y_lagged,x_z_lagged,y,z,date
2023-01-16 01:00:00+01:00,4277.0,15501.0,2512.0,0.0,0.0,6197.0,140.0,1617.0,87.885115,60.010055,87.893539,87.881423,88.655647,8.528268,83.012341,89.737316,154.714047,82.558968,84.249291,82.558968,150.4,40.779765,88.979332,78.34,0.0,37.11,0.0,50.0,5845.0,15046.0,95.97,25.24,66.99,60.01,2023-01-16 01:00:00
2023-01-16 02:00:00+01:00,4130.0,15089.0,2422.0,0.0,0.0,5657.0,125.0,1587.0,90.430952,62.721417,90.436825,90.427957,81.50881,8.528268,67.401068,74.352916,129.550859,75.41213,84.249291,72.553146,157.42,55.033883,89.999133,83.01,0.05,32.79,0.0,44.71,6259.0,16741.0,95.0,45.86,63.03,60.01,2023-01-16 02:00:00
2023-01-16 03:00:00+01:00,3999.0,14962.0,2438.0,0.0,0.0,5135.0,110.0,1548.0,89.991216,56.103107,89.997746,89.988239,71.807424,8.528268,67.401068,73.199016,98.920831,65.710744,84.249291,65.710744,187.14,15.116841,179.289454,83.82,0.04,40.0,0.0,51.0,6176.0,17814.0,95.0,21.12,66.85,62.72,2023-01-16 03:00:00
2023-01-16 04:00:00+01:00,3965.0,15026.0,2426.0,0.0,0.0,4735.0,98.0,1552.0,90.925069,63.304305,90.931215,90.922103,90.494243,8.528268,84.249291,87.867262,98.920831,84.397563,86.087887,84.397563,110.83,56.458657,190.179037,77.69,0.04,39.69,0.0,56.12,6632.0,19696.0,87.05,5.42,64.96,56.1,2023-01-16 04:00:00
2023-01-16 05:00:00+01:00,4104.0,15470.0,2519.0,0.0,0.0,4420.0,94.0,1629.0,94.905936,87.005404,94.908659,94.901828,109.213495,109.291957,102.184451,107.044765,159.211021,103.116816,99.26449,103.116816,110.04,16.695218,47.217063,83.81,0.04,42.24,0.0,55.47,6753.0,21700.0,93.27,4.18,63.42,63.3,2023-01-16 05:00:00


In [21]:
# column dtypes and non-null counts; the output below reports 6047 non-null values
# out of 6048 rows for the listed columns, i.e. one missing value each
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6048 entries, 2023-01-16 01:00:00+01:00 to 2023-09-25 00:00:00+02:00
Data columns (total 35 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   x01         6047 non-null   float64       
 1   x02         6047 non-null   float64       
 2   x03         6047 non-null   float64       
 3   x04         6047 non-null   float64       
 4   x05         6047 non-null   float64       
 5   x06         6047 non-null   float64       
 6   x07         6047 non-null   float64       
 7   x08         6047 non-null   float64       
 8   x09         6047 non-null   float64       
 9   x10         6047 non-null   float64       
 10  x11         6047 non-null   float64       
 11  x12         6047 non-null   float64       
 12  x13         6047 non-null   float64       
 13  x14         6047 non-null   float64       
 14  x15         6047 non-null   float64       
 15  x16         6047 non-nul

# SPLIT

Because the imputation strategy, as well as any other post-processing and feature engineering, must not leak information between the training and test data, the split has to be done first.

Because the data represent a time series, the split must also take the time aspect into account.

In [34]:
# where the two partitions are persisted
devset_path = f"{INTERIM_FOLDER}/devset.pkl"
testset_path = f"{INTERIM_FOLDER}/testset.pkl"

# split with the project's time-series helper, then pickle each part;
# the final call is left as a bare expression so its return value is displayed
devset, testset = split_train_test_ts(df)
to_pickle(devset, devset_path)
to_pickle(testset, testset_path)

True

# TRANSFORM