# Machine Learning

## 1. Initial Setup

- Set variables and hyperparameters
- Import libraries
- Load dataset

In [1]:
# Hyperparameters, grouped by the model-name prefix of each constant.
# NOTE(review): none of these constants is referenced in the cells visible in
# this review; in particular, the TimeSeriesDataset instances built in section 3
# use that class's default window of 24 rather than any *_WINDOW_LENGTH value.
# Confirm that later cells rebuild the datasets with the intended window.

# MLP settings (presumably a multilayer perceptron - confirm against the model cell).
MLP_WINDOW_LENGTH = 10
MLP_BATCH_SIZE = 32
MLP_EPOCHS = 10
MLP_LEARNING_RATE = 0.001
MLP_HIDDEN_LAYERS = [64, 32]  # presumably one entry per hidden layer (its width) - TODO confirm

# LSTM settings.
LSTM_WINDOW_LENGTH = 20
LSTM_BATCH_SIZE = 64
LSTM_EPOCHS = 20
LSTM_LEARNING_RATE = 0.0005
LSTM_HIDDEN_SIZE = 128
LSTM_NUM_LAYERS = 2

# GRU settings (currently identical, value for value, to the LSTM settings above).
GRU_WINDOW_LENGTH = 20
GRU_BATCH_SIZE = 64
GRU_EPOCHS = 20
GRU_LEARNING_RATE = 0.0005
GRU_HIDDEN_SIZE = 128
GRU_NUM_LAYERS = 2

In [2]:
# Location of the processed dataset: a semicolon-separated CSV with a "date"
# column, as read by the loading cell below.
# NOTE(review): the path is relative, so it resolves against the working
# directory the notebook kernel was started in.
PROCESSED_PATH = "../data/ANA HIDROWEB/RIO MEIA PONTE/processed.csv"

In [3]:
from typing import List
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

import torch
from torch.utils.data import Dataset, DataLoader



In [4]:
# Read the processed CSV and index the rows by their "date" column.
df = pd.read_csv(
    PROCESSED_PATH,
    sep=";",
    parse_dates=["date"],
    dayfirst=True,
).set_index("date")

# Convert the index to datetimes once more; presumably a safeguard for the case
# where read_csv could not parse the column and left it as plain strings.
df.index = pd.to_datetime(df.index)

df.head()

Unnamed: 0_level_0,rain_upstream_mean,rain_upstream_max,rain_upstream_min,rain_upstream_q25,rain_upstream_q75,level_upstream_mean,level_upstream_max,level_upstream_min,level_upstream_q25,level_upstream_q75,...,flow_after_max,flow_after_min,flow_after_q25,flow_after_q75,rain_upstream_acc_2_days,rain_downstream_acc_2_days,rain_after_acc_2_days,rain_upstream_acc_3_days,rain_downstream_acc_3_days,rain_after_acc_3_days
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,0.0,0.0,0.0,0.0,0.0,219.083333,222.0,216.0,217.0,221.0,...,70.8,65.4,66.6,69.6,0.0,0.008696,0.0,0.0,0.026087,0.0
2014-01-02,0.008696,0.2,0.0,0.0,0.0,222.956522,226.0,219.0,221.5,224.0,...,85.3,67.8,73.9,78.1,0.008696,0.217391,0.0,0.008696,0.226087,0.0
2014-01-03,0.0,0.0,0.0,0.0,0.0,225.318182,231.0,219.0,220.5,230.0,...,78.7,63.0,67.95,76.3,0.008696,0.217391,0.0,0.008696,0.217391,0.0
2014-01-04,0.0,0.0,0.0,0.0,0.0,213.863636,221.0,208.0,212.0,215.75,...,63.0,58.2,61.05,62.85,0.0,0.0,0.0,0.008696,0.217391,0.0
2014-01-05,0.0,0.0,0.0,0.0,0.0,204.904762,208.0,201.0,204.0,206.0,...,58.2,53.4,55.8,57.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. Data Preprocessing

- Check that there are no missing values
- Drop the flow columns, since they are highly correlated with the level columns
- Normalize the data

In [5]:
def print_missing_values(data):
    """Print the number of missing values for every column that has at least one.

    Args:
        data: DataFrame to inspect.

    Returns:
        None. A header line is printed, followed by a Series mapping each
        affected column to its count of nulls (an empty Series when no column
        has missing values).
    """
    null_counts = data.isnull().sum()
    has_missing = null_counts > 0
    print("Missing values in each column:")
    print(null_counts[has_missing])

# Report which columns of the loaded frame contain nulls; an empty Series in
# the output means no column has any.
print_missing_values(df)

Missing values in each column:
Series([], dtype: int64)


In [6]:
# Keep every column whose name does not contain 'flow'; the section notes above
# give the reason (flow is highly correlated with level).
is_flow_column = df.columns.str.contains('flow')
df = df.loc[:, ~is_flow_column]
df.head()

Unnamed: 0_level_0,rain_upstream_mean,rain_upstream_max,rain_upstream_min,rain_upstream_q25,rain_upstream_q75,level_upstream_mean,level_upstream_max,level_upstream_min,level_upstream_q25,level_upstream_q75,...,level_after_max,level_after_min,level_after_q25,level_after_q75,rain_upstream_acc_2_days,rain_downstream_acc_2_days,rain_after_acc_2_days,rain_upstream_acc_3_days,rain_downstream_acc_3_days,rain_after_acc_3_days
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,0.0,0.0,0.0,0.0,0.0,219.083333,222.0,216.0,217.0,221.0,...,293.0,284.0,286.0,291.0,0.0,0.008696,0.0,0.0,0.026087,0.0
2014-01-02,0.008696,0.2,0.0,0.0,0.0,222.956522,226.0,219.0,221.5,224.0,...,326.0,288.0,298.0,309.0,0.008696,0.217391,0.0,0.008696,0.226087,0.0
2014-01-03,0.0,0.0,0.0,0.0,0.0,225.318182,231.0,219.0,220.5,230.0,...,306.0,280.0,288.25,302.0,0.008696,0.217391,0.0,0.008696,0.217391,0.0
2014-01-04,0.0,0.0,0.0,0.0,0.0,213.863636,221.0,208.0,212.0,215.75,...,280.0,271.0,276.75,279.0,0.0,0.0,0.0,0.008696,0.217391,0.0
2014-01-05,0.0,0.0,0.0,0.0,0.0,204.904762,208.0,201.0,204.0,206.0,...,272.0,264.0,268.0,270.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Standardize every column with scikit-learn's StandardScaler and wrap the
# resulting array back into a DataFrame with the original index and columns.
# NOTE(review): the scaler is fitted on every row of df, before the
# train/validation/test split performed in section 3, so validation and test
# rows contribute to the statistics used for scaling (data leakage). Consider
# fitting on the training rows only and calling transform() on the rest.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df)
df_normalized = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)
df_normalized.head()

Unnamed: 0_level_0,rain_upstream_mean,rain_upstream_max,rain_upstream_min,rain_upstream_q25,rain_upstream_q75,level_upstream_mean,level_upstream_max,level_upstream_min,level_upstream_q25,level_upstream_q75,...,level_after_max,level_after_min,level_after_q25,level_after_q75,rain_upstream_acc_2_days,rain_downstream_acc_2_days,rain_after_acc_2_days,rain_upstream_acc_3_days,rain_downstream_acc_3_days,rain_after_acc_3_days
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,-0.28545,-0.287747,0.0,-0.022316,-0.115149,0.245701,0.105763,0.354992,0.294064,0.201384,...,0.616501,0.835205,0.7819,0.66371,-0.371786,-0.341985,-0.334039,-0.428193,-0.344398,-0.383211
2014-01-02,-0.20383,-0.200634,0.0,-0.022316,-0.115149,0.301341,0.157947,0.400498,0.360028,0.243211,...,0.911162,0.875776,0.900001,0.829041,-0.318631,0.690675,-0.334039,-0.387379,0.416068,-0.383211
2014-01-03,-0.28545,-0.287747,0.0,-0.022316,-0.115149,0.335268,0.223177,0.400498,0.345369,0.326865,...,0.73258,0.794635,0.804044,0.764746,-0.318631,0.690675,-0.334039,-0.387379,0.383004,-0.383211
2014-01-04,-0.28545,-0.287747,0.0,-0.022316,-0.115149,0.170716,0.092717,0.233644,0.220771,0.128186,...,0.500423,0.70335,0.690865,0.553489,-0.371786,-0.385012,-0.334039,-0.387379,0.383004,-0.383211
2014-01-05,-0.28545,-0.287747,0.0,-0.022316,-0.115149,0.042016,-0.076881,0.127465,0.103503,-0.007752,...,0.42899,0.632351,0.60475,0.470823,-0.371786,-0.385012,-0.334039,-0.428193,-0.443589,-0.383211


## 3. Dataset Creation

- Create the time-series dataset
- Split the dataset into training, validation, and test sets

In [8]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over the rows of a DataFrame.

    Sample ``i`` is the pair ``(x, y)`` where ``x`` holds rows
    ``i .. i + windows_length - 1`` and ``y`` is the single row that follows the
    window. Both are ``float32`` tensors containing every column of ``data``.

    Args:
        data: DataFrame whose rows are consecutive time steps; every column must
            be convertible to ``float32``.
        windows_length: Number of rows in each input window (at least 1).

    Raises:
        ValueError: If ``windows_length`` is smaller than 1.
    """

    def __init__(self, data, windows_length=24):
        if windows_length < 1:
            raise ValueError(f"windows_length must be >= 1, got {windows_length}")
        self.data = data
        self.sequence_length = windows_length
        # Convert the frame once instead of slicing it with .iloc on every item
        # access. The tensor is a snapshot: later changes to `data` are not seen.
        self._values = torch.tensor(data.values, dtype=torch.float32)

    def __len__(self):
        # A frame with no more rows than the window has no (window, next row)
        # pair; clamp at 0 because len() rejects negative values.
        return max(0, self._values.shape[0] - self.sequence_length)

    def __getitem__(self, idx):
        """Return the ``(window, next_row)`` pair at position ``idx``.

        Negative indices count from the end, as with a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError("Index out of range")
        target_row = idx + self.sequence_length
        # clone() hands back independent tensors, so in-place edits by the
        # caller cannot corrupt the cached values.
        return self._values[idx:target_row].clone(), self._values[target_row].clone()


class NonOverlappingConcatDataset(Dataset):
    """Concatenate several datasets end to end.

    Each global index maps to exactly one sample of exactly one underlying
    dataset, so no sample is ever assembled from two different datasets.

    Args:
        datasets: Iterable of objects supporting ``len()`` and integer indexing.

    Attributes set in ``__init__``:
        datasets: The underlying datasets, in concatenation order.
        cumulative_lengths: Global index of the first sample of each dataset.
        total_length: Total number of samples across all datasets.
    """

    def __init__(self, datasets):
        # Copy into a private list so the offsets computed below cannot go stale
        # if the caller later appends to the list it passed in.
        self.datasets = list(datasets)
        self.cumulative_lengths = []
        total = 0
        for d in self.datasets:
            self.cumulative_lengths.append(total)
            total += len(d)
        self.total_length = total

    def __len__(self):
        return self.total_length

    def __getitem__(self, idx):
        """Return the sample at global position ``idx``.

        Negative indices count from the end of the concatenation.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        if idx < 0:
            idx += self.total_length
        if not 0 <= idx < self.total_length:
            raise IndexError("Index out of range")
        # Find the dataset whose index range contains idx.
        for start, dataset in zip(self.cumulative_lengths, self.datasets):
            if idx < start + len(dataset):
                return dataset[idx - start]
        # Only reachable if a dataset shrank after construction.
        raise IndexError("Index out of range")


def split_train_validation_test(data: pd.DataFrame, train_size: float = 0.8, val_size: float = 0.1):
    """Split ``data`` into contiguous train, validation and test slices, in row order.

    The first ``train_size`` fraction of the rows forms the training slice, the
    next ``val_size`` fraction the validation slice, and the remaining rows the
    test slice. Rows are never shuffled. Cut points are rounded down.

    Args:
        data: Frame to split.
        train_size: Fraction of rows for training, in ``[0, 1]``.
        val_size: Fraction of rows for validation, in ``[0, 1]``.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of ``iloc`` slices of ``data``.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more than 1.
    """
    # Absorbs floating-point error without moving any genuinely fractional cut.
    tolerance = 1e-9
    if train_size < 0 or val_size < 0 or train_size + val_size > 1 + tolerance:
        raise ValueError(
            "train_size and val_size must be non-negative and sum to at most 1, "
            f"got train_size={train_size}, val_size={val_size}"
        )

    n_rows = len(data)
    # Plain int() truncation can land one row short: 0.7 + 0.2 evaluates to
    # 0.8999999999999999, and 100 * that is 89.99999999999999, which truncates
    # to 89 instead of 90. Nudging by the tolerance before truncating fixes it.
    train_end = min(n_rows, int(n_rows * train_size + tolerance))
    val_end = min(n_rows, int(n_rows * (train_size + val_size) + tolerance))

    train_data = data.iloc[:train_end]
    val_data = data.iloc[train_end:val_end]
    test_data = data.iloc[val_end:]

    return train_data, val_data, test_data


# Build one windowed dataset per calendar year and per split, so that no window
# spans a year boundary or a train/validation/test boundary, then concatenate
# the per-year datasets of each split.
# NOTE(review): TimeSeriesDataset is built with its default window (24) here,
# not with any of the *_WINDOW_LENGTH constants from the setup cell.
datasets_train: List[TimeSeriesDataset] = []
datasets_validation: List[TimeSeriesDataset] = []
datasets_test: List[TimeSeriesDataset] = []

row_years = df_normalized.index.year
for year in range(2010, 2024):
    year_data = df_normalized[row_years == year]
    if year_data.empty:
        continue  # no rows for this year
    train_data, val_data, test_data = split_train_validation_test(year_data)
    for bucket, part in (
        (datasets_train, train_data),
        (datasets_validation, val_data),
        (datasets_test, test_data),
    ):
        bucket.append(TimeSeriesDataset(part))

train_dataset = NonOverlappingConcatDataset(datasets_train)
validation_dataset = NonOverlappingConcatDataset(datasets_validation)
test_dataset = NonOverlappingConcatDataset(datasets_test)


In [9]:
# Smoke tests for the two dataset classes. Each sample is printed for visual
# inspection and also asserted, so a regression fails the cell under
# Restart & Run All instead of relying on someone reading the output.

# Test for TimeSeriesDataset

# Create a simple DataFrame with increasing integers
test_df = pd.DataFrame({'A': np.arange(10)})

# Window length = 3
ts_dataset = TimeSeriesDataset(test_df, windows_length=3)

print("Testing TimeSeriesDataset:")
assert len(ts_dataset) == 7  # 10 rows - window of 3
for i in range(len(ts_dataset)):
    x, y = ts_dataset[i]
    print(f"Index {i}: x = {x.squeeze().numpy()}, y = {y.numpy()}")
    # Window i covers values i..i+2 and the target is the value right after it.
    assert x.squeeze().tolist() == [i, i + 1, i + 2]
    assert y.tolist() == [i + 3]

# Test for NonOverlappingConcatDataset
# Create two small TimeSeriesDatasets
df1 = pd.DataFrame({'A': np.arange(5)})
df2 = pd.DataFrame({'A': np.arange(10, 15)})

ds1 = TimeSeriesDataset(df1, windows_length=2)
ds2 = TimeSeriesDataset(df2, windows_length=2)

concat_ds = NonOverlappingConcatDataset([ds1, ds2])

# No expected window mixes values from df1 (0..4) with values from df2 (10..14).
expected_windows = [[0, 1], [1, 2], [2, 3], [10, 11], [11, 12], [12, 13]]
expected_targets = [2, 3, 4, 12, 13, 14]

print("\nTesting NonOverlappingConcatDataset")
print("Should not overlap and should concatenate correctly:")
assert len(concat_ds) == len(expected_targets)
for i in range(len(concat_ds)):
    x, y = concat_ds[i]
    print(f"Index {i}: x = {x.squeeze().numpy()}, y = {y.numpy()}")
    assert x.squeeze().tolist() == expected_windows[i]
    assert y.tolist() == [expected_targets[i]]


Testing TimeSeriesDataset:
Index 0: x = [0. 1. 2.], y = [3.]
Index 1: x = [1. 2. 3.], y = [4.]
Index 2: x = [2. 3. 4.], y = [5.]
Index 3: x = [3. 4. 5.], y = [6.]
Index 4: x = [4. 5. 6.], y = [7.]
Index 5: x = [5. 6. 7.], y = [8.]
Index 6: x = [6. 7. 8.], y = [9.]

Testing NonOverlappingConcatDataset
Should not overlap and should concatenate correctly:
Index 0: x = [0. 1.], y = [2.]
Index 1: x = [1. 2.], y = [3.]
Index 2: x = [2. 3.], y = [4.]
Index 3: x = [10. 11.], y = [12.]
Index 4: x = [11. 12.], y = [13.]
Index 5: x = [12. 13.], y = [14.]
