In [12]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import torch
import torch.optim as optim
import scipy
import seaborn as sns
from datetime import datetime, timedelta

from tqdm import tqdm

from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import TimeSeriesSplit
import preprocessing as pp

from rnn import RNN
from lstm import LSTM

In [55]:
raw_data = pd.read_csv('consumption_and_temperatures.csv')
raw_data['timestamp'] = pd.to_datetime(raw_data['timestamp'])

In [16]:
raw_data.rolling(window=5)

Rolling [window=5,center=False,axis=0,method=single]

In [57]:
def change_timestamp_to_int(df: pd.DataFrame) -> pd.DataFrame:
    print('here')
    df['timestamp_int'] = pd.to_datetime(df['timestamp']).astype(int) // (3600 * 10 ** 9) - 414888
    return df

In [72]:
def change_timestamp_to_sin(df: pd.DataFrame) -> pd.DataFrame:
    print("sin")
    df['timestamp_sin'] = np.sin((pd.to_datetime(df['timestamp']).astype(int) // (3600 * 10 ** 9) - 414888) / ( 365 * 24 ))
    return df

In [19]:
df2 = change_timestamp_to_sin(raw_data)

In [20]:
def pick_location_data(df: pd.DataFrame, loc: int | list[int] = 1) -> pd.DataFrame:
    df = df.copy()
    if type(loc) == int:
        loc = [loc]
    
    columns_to_drop = []
    for k in range(1, 6):
        if k not in loc: 
            columns_to_drop.append(f"NO{k}_consumption")
            columns_to_drop.append(f"NO{k}_temperature")
        
    return df.drop(columns=columns_to_drop)


In [21]:
df_NO1 = pick_location_data(raw_data, 1)

In [22]:
def shift_x_data_upon(df: pd.DataFrame, n: int = 1, loc: int = 1) -> pd.DataFrame:
    data_shifted = df.copy()
    for k in range(n):
        data_shifted[f'previous_{k+1}_x{loc}'] = data_shifted['NO{loc}_temperature'].shift(k)
    return data_shifted

In [62]:
def split_dataset_by_proportions(df_input: pd.DataFrame, df_output: pd.DataFrame, train_size: int = 80, seq_len: int = 0, test_len: int = 24):
    assert len(df_input) == len(df_output), "Sizes should be the same"

    train_size = int(train_size * len(df_input) / 100)
    # test_size = int((100 - train_size) * len(df_input) / 100)

    X_train = df_input[:train_size]
    Y_train = df_output[:train_size]

    X_val = df_input[train_size - seq_len:]
    Y_val = df_output[train_size - seq_len:]

    X_test = df_input[len(df_input) - test_len - seq_len:]
    Y_test = df_output[len(df_output) - test_len - seq_len:]
    
    return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)


In [24]:
def normalize_inputs(scalerMethod: object, X_train: pd.DataFrame, X_test: pd.DataFrame = pd.DataFrame([]), X_val: pd.DataFrame = pd.DataFrame([])):
    X_train_scaled = scalerMethod.fit_transform(X_train)

    X_test_scaled = scalerMethod.transform(X_test)
    X_val_scaled = scalerMethod.transform(X_val)

    return scalerMethod, X_train_scaled, X_test_scaled, X_val_scaled

In [25]:
def normalize_ouputs(scalerMethods: list | object, targetsByLocation = list[pd.DataFrame] | pd.DataFrame):
    pass

In [None]:
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = split_dataset_by_proportions(raw_data)

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
train_test_split(df_NO1, test_size=0.33, random_state=42)

[                      timestamp  NO1_consumption  NO1_temperature  \
 54259 2023-07-09 19:00:00+00:00      2317.553592             23.4   
 15785 2019-02-17 17:00:00+00:00      5297.901999              2.6   
 18480 2019-06-10 00:00:00+00:00      2224.803914              9.7   
 56778 2023-10-22 18:00:00+00:00      4395.023508              3.9   
 9103  2018-05-15 07:00:00+00:00      3335.218999             16.8   
 ...                         ...              ...              ...   
 54343 2023-07-13 07:00:00+00:00      2585.745524             15.1   
 38158 2021-09-06 22:00:00+00:00      2557.732439             11.5   
 860   2017-06-05 20:00:00+00:00      2931.047004             14.1   
 15795 2019-02-18 03:00:00+00:00      4161.751991              2.8   
 56422 2023-10-07 22:00:00+00:00      3239.189512              2.5   
 
        timestamp_sin  
 54259      -0.089117  
 15785       0.973405  
 18480       0.858329  
 56778       0.197024  
 9103        0.861976  
 ...          

In [42]:
X_train

Unnamed: 0,timestamp,NO1_consumption,NO1_temperature,NO2_consumption,NO2_temperature,NO3_consumption,NO3_temperature,NO4_consumption,NO4_temperature,NO5_consumption,NO5_temperature,timestamp_sin
0,2017-05-01 00:00:00+00:00,3325.431995,2.1,3534.588000,5.0,2674.838000,5.5,2061.209000,0.2,1609.089000,3.9,0.000000
1,2017-05-01 01:00:00+00:00,3344.690998,1.8,3560.209000,4.1,2678.188000,4.0,2045.436000,0.1,1623.485000,3.2,0.000114
2,2017-05-01 02:00:00+00:00,3398.359002,3.2,3596.149000,3.1,2678.341000,3.7,2020.007000,0.1,1634.128000,2.7,0.000228
3,2017-05-01 03:00:00+00:00,3430.220001,3.6,3594.042000,2.3,2684.172000,3.2,2021.920000,0.1,1647.906000,2.3,0.000342
4,2017-05-01 04:00:00+00:00,3606.750000,3.4,3665.091000,2.6,2733.345000,2.7,2045.913000,0.1,1679.176000,2.3,0.000457
...,...,...,...,...,...,...,...,...,...,...,...,...
40919,2021-12-30 23:00:00+00:00,4740.969005,0.2,4447.866217,6.7,3532.040874,-1.9,2412.324320,-2.8,1962.144176,7.8,-0.999149
40920,2021-12-31 00:00:00+00:00,4619.599391,0.2,4398.041722,6.5,3471.478033,-1.7,2383.037948,-2.5,1927.205302,7.7,-0.999153
40921,2021-12-31 01:00:00+00:00,4536.957117,0.3,4352.930463,6.9,3428.159822,-1.4,2350.644249,-3.0,1907.046405,7.8,-0.999158
40922,2021-12-31 02:00:00+00:00,4507.843045,0.2,4311.489138,6.9,3416.870403,-2.4,2342.276147,-4.5,1892.347201,7.9,-0.999163


In [69]:
from torch.nn.utils.rnn import pad_sequence

def make_sequences(x, y, seq_len=9, dt = 1):
    num_samples = x.shape[0]

    num_sequences = num_samples - seq_len + 1

    sequences = []
    targets = []

    for i in range(num_sequences):
        seq = x[i:i+seq_len]
        target = y[i+dt:i+seq_len+dt]
        sequences.append(seq)
        targets.append(target)

    sequences_padded = pad_sequence(sequences, batch_first=True)
    targets_padded = pad_sequence(targets, batch_first=True)

    sequences_tensor = torch.tensor(sequences_padded, dtype=torch.float32)
    targets_tensor = torch.tensor(targets_padded, dtype=torch.float32)
    return sequences_tensor, targets_tensor

In [76]:
def general_preprocessing(
        df_raw: pd.DataFrame,
        train_size: int = 80,
        target_column: str = 'NO1_consumption',
        features_to_add: list = [change_timestamp_to_sin], 
        # model_type: str, 
        split_locations: bool = True,
        # seq_len: int = 0,
        scalerInputMethod: object = StandardScaler(),
        scalerOutputMethod: object = StandardScaler(),
        scale_output: bool = False,
        locationsKeeped: list[int] | int = 1,
        test_len: int = 24,
        seq_len: int = 48,
        ):

    df = df_raw.copy()

    for f in features_to_add:
        df = f(df)
        
    if split_locations: 
        df = pick_location_data(df, locationsKeeped)

    # SPLIT TARGET AND INPUT
    df_X = df.drop(columns=target_column)
    df_y = df[['timestamp', target_column]]

    df_X = df_X.drop(columns='timestamp')
    df_y = df_y.drop(columns='timestamp')


    # SPLIT DATASET
    (X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = split_dataset_by_proportions(df_X, df_y, train_size=train_size, seq_len=seq_len, test_len=test_len)

    print("FEATURES:", X_train.columns)
    # SCALE INPUT VALUES
    scaled_X_train = scalerInputMethod.fit_transform(X_train)
    scaled_X_val = scalerInputMethod.transform(X_val)
    scaled_X_test = scalerInputMethod.transform(X_test)

    # SCALE OUTPUT VALUES
    if scale_output:
        Y_train = scalerOutputMethod.fit_transform(Y_train)
        Y_test = scalerOutputMethod.transform(Y_test)
        Y_val = scalerOutputMethod.transform(Y_val)

    # MANAGE MIN-MAX VALUES

    # TRANSFORM DataFrames TO torch.Tensor
    X_train = torch.tensor(scaled_X_train, dtype=torch.float32)
    X_val = torch.tensor(scaled_X_val, dtype=torch.float32)
    X_test = torch.tensor(scaled_X_test, dtype=torch.float32)

    Y_train = torch.tensor(Y_train.values, dtype=torch.float32)
    Y_test = torch.tensor(Y_test.values, dtype=torch.float32)
    Y_val = torch.tensor(Y_val.values, dtype=torch.float32)

    # MAKE SEQUENCES

    if seq_len > 0:
        X_train, y_train = make_sequences(X_train, Y_train, seq_len=seq_len)
        X_test, y_test = make_sequences(X_test, Y_test, seq_len=seq_len)
        X_val, y_val = make_sequences(X_val, Y_val, seq_len=seq_len)

    return (X_train, y_train), (X_val, y_val), (X_test, y_test), (scalerInputMethod, scalerOutputMethod)

In [77]:
(X_train, y_train), (X_val, y_val), (X_test, y_test), (scalerInputMethod, scalerOutputMethod) = general_preprocessing(raw_data, locationsKeeped=[1, 2])

sin
FEATURES: Index(['NO1_temperature', 'NO2_consumption', 'NO2_temperature',
       'timestamp_sin'],
      dtype='object')


  sequences_tensor = torch.tensor(sequences_padded, dtype=torch.float32)
  targets_tensor = torch.tensor(targets_padded, dtype=torch.float32)


In [71]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

(torch.Size([46724, 48, 4]),
 torch.Size([46724, 48, 1]),
 torch.Size([11694, 48, 4]),
 torch.Size([11694, 48, 1]),
 torch.Size([25, 48, 4]),
 torch.Size([25, 48, 1]))