# LSTM Baseline Model Testing

## 0 Imports & Constants

In [30]:
import sys
import os

# Put the parent of the current working directory at the front of sys.path so
# that it is searched first when resolving imports.
# NOTE(review): presumably the project modules imported by this notebook live
# there -- confirm against the repository layout.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)

In [31]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import pandas as pd
from pathlib import Path
import itertools
from tqdm import tqdm
import json
from datetime import datetime
import re

from TimeSeriesDataset import TimeSeriesDataset
from utils import load_time_series
from LSTM import add_lagged_data, scale_data, train_test_split_to_tensor
from baseline_model.LSTM import LSTM, train_model

In [32]:
# Use the GPU when torch reports that CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Bare expression: shows the chosen device as the notebook cell output.
device

'cuda'

In [33]:
# Root of the data directory, given relative to the current working directory.
DATA_FOLDER = Path("../data")
# Sub-folders for the real and the synthetic time series.
REAL_DATA_FOLDER = DATA_FOLDER.joinpath("real")
SYNTHETIC_DATA_FOLDER = DATA_FOLDER.joinpath("synthetic")
# Switch for the benchmark cell: the grid search only runs when this is True.
BENCHMARK = True

## 1 Data

### Data Loading

In [34]:
# Read the real AAPL time series from its csv file.
# NOTE(review): the original note states that load_time_series converts the
# Date column to datetime -- that helper is not visible here, please confirm.
real_csv_path = f'{REAL_DATA_FOLDER}/AAPL_10_24_real.csv'
data = load_time_series(real_csv_path)

## 2 Benchmark Loop

In [35]:
# Search grid for the benchmark: each key maps to the list of values to try.
#   lag         -> passed to add_lagged_data
#   lr          -> learning rate of the Adam optimizer
#   hidden_size -> hidden_size of the LSTM
#   num_layers  -> num_stacked_layers of the LSTM
#   batch_size  -> batch size of the train and test DataLoader
hyperparameters = dict(
    lag=[7, 14, 21],
    lr=[0.002, 0.001],
    hidden_size=[2, 4, 6, 8, 12],
    num_layers=[1, 2],
    batch_size=[8, 16, 32, 64],
)

In [36]:
# Expand the search grid into its cartesian product: one dict per combination,
# mapping every hyperparameter name to a single concrete value.
keys = tuple(hyperparameters)
values = tuple(hyperparameters.values())
possible_hyperparameters = [
    dict(zip(keys, combination))
    for combination in itertools.product(*values)
]

In [37]:
# Column sets to benchmark. The benchmark loop adds 'Date' to each set itself
# and uses the first letters of the column names as acronym in the result keys.
possible_features = [
    list(columns)
    for columns in (
        ('Close', 'Volume'),
        ('Close', 'Open', 'Volume'),
        ('Close', 'Open', 'High', 'Low', 'Volume'),
    )
]

In [38]:
# Grid-search benchmark: for every feature set and every hyperparameter
# combination the data is preprocessed, an LSTM is trained twice, and the lower
# of the two validation losses is stored in RESULTS under a key of the form
# '<acronym>_lag<..>_lr<..>_bs<..>_hs<..>_nl<..>'. At the end RESULTS is written
# to a timestamped json file in ./benchmark_results.
RESULTS = {}
if BENCHMARK:
    # Create the output folder up front: otherwise a missing directory would
    # only surface as a FileNotFoundError after all training runs are done.
    results_dir = Path('./benchmark_results')
    results_dir.mkdir(parents=True, exist_ok=True)

    for features in possible_features:

        # The loop variable is named `params` (not `hyperparameters`) so that
        # the module-level search grid is not overwritten by a single
        # combination, which would break re-running the grid expansion cell.
        for params in tqdm(possible_hyperparameters):

            print('*'*50)
            print('*'*50)
            print('STARTING NEW BENCHMARK RUN:')
            print(f"Features: {features}")
            print(f"Hyperparameters: {params}")
            print('*'*50)
            print('*'*50)

            ### Select features
            features_incl_date = features+['Date']
            data_only_features = data[features_incl_date]

            ### Data Preprocessing
            data_lagged = add_lagged_data(data_only_features, params['lag'], features)
            # scaler_close is not used further in this cell
            data_lagged_scaled, scaler_close = scale_data(data_lagged)
            X_train, y_train, X_test, y_test = train_test_split_to_tensor(data_lagged_scaled)

            ### Create datasets and DataLoaders
            train_dataset = TimeSeriesDataset(X_train, y_train)
            test_dataset = TimeSeriesDataset(X_test, y_test)
            train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

            ### Train model
            validation_losses = [] # reset validation losses
            for attempt in range(2): # train 2 times because sometimes the model converges to a local minimum
                ### Instantiate model (fresh weights for every attempt)
                model = LSTM(
                    device=device,
                    input_size=len(features),
                    hidden_size=params['hidden_size'],
                    num_stacked_layers=params['num_layers']
                ).to(device)

                ### Optimizer, Criterion
                optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
                criterion = nn.MSELoss()

                validation_loss, model = train_model(
                    model=model,
                    train_loader=train_loader,
                    test_loader=test_loader,
                    optimizer=optimizer,
                    criterion=criterion,
                    device=device)

                # save loss for each run
                validation_losses.append(validation_loss)

            ### Save results to dict
            # acronym = first letter of every feature name, e.g. ['Close', 'Volume'] -> 'CV'
            feature_acronym = ''.join(feature[0] for feature in features)
            run_key = (
                f'{feature_acronym}_lag{params["lag"]}_lr{params["lr"]}'
                f'_bs{params["batch_size"]}_hs{params["hidden_size"]}_nl{params["num_layers"]}'
            )
            RESULTS[run_key] = min(validation_losses) # only save min loss

    ### Save results to json
    print('Saving results to json...')
    results_file = results_dir / f'LSTM_benchmark_results_{datetime.now().strftime("%Y_%m_%d_%H%M%S")}.json'
    with open(results_file, 'w') as json_file:
        json.dump(RESULTS, json_file, indent=4)
    print('Results saved!')

    print('#'*50)
    print('#'*50)
    print('#'*50)
    print('TRAINING FINISHED')
    print('#'*50)
    print('#'*50)
    print('#'*50)

  0%|          | 0/240 [00:00<?, ?it/s]

**************************************************
**************************************************
STARTING NEW BENCHMARK RUN:
Features: ['Close', 'Volume']
Hyperparameters: {'lag': 7, 'lr': 0.002, 'hidden_size': 2, 'num_layers': 1, 'batch_size': 8}
**************************************************
**************************************************
Adding lagged data for columns: ['Close', 'Volume']
Shape of the numpy array wit lagged data: (3611, 8, 2)
Shape of X_train: torch.Size([3430, 7, 2]) 
 Shape of y_train: torch.Size([3430, 1]) 
 Shape of X_test: torch.Size([181, 7, 2]) 
 Shape of y_test: torch.Size([181, 1])
Epoch: 1
Validation Loss: 0.23082415111686871
**************************************************
Epoch: 2
Validation Loss: 0.012904002432427977
**************************************************
Epoch: 3
Validation Loss: 0.00529647696479831
**************************************************
Epoch: 4
Validation Loss: 0.002451465474618801
*******************************

  0%|          | 0/240 [00:26<?, ?it/s]


KeyboardInterrupt: 

## 3 Evaluate Results

In [None]:
# Load the stored benchmark results of one specific (hard-coded) run.
results_path = './benchmark_results/LSTM_benchmark_results_2024_06_19_205109.json'
with open(results_path) as results_file:
    results = json.load(results_file)

In [None]:
# Turn the flat {run_key: error} mapping into a list of dicts, one per run.
# The hyperparameters were only encoded in the key when the results were saved
# (e.g. 'CV_lag7_lr0.002_bs8_hs2_nl1'), so they are extracted again here.
# All extracted values are kept as strings.

# One pattern per hyperparameter: the name followed by an integer or decimal
# number, which is captured as group 1. Compiled once instead of once per key.
hyperparameter_patterns = {
    name: re.compile(rf"{name}(\d+\.?\d*)")
    for name in ['lag', 'lr', 'bs', 'hs', 'nl']
}

result_array = []
for key, value in results.items():
    result_dict = {
        'error': value,
        'features': key.split('_')[0],  # feature acronym is the part before the first '_'
    }
    for name, pattern in hyperparameter_patterns.items():
        match = pattern.search(key)
        if match is None:
            # fail with a clear message instead of an AttributeError on None
            raise ValueError(f"Result key {key!r} contains no value for '{name}'")
        result_dict[name] = match.group(1)
    result_array.append(result_dict)

In [None]:
# Build a table with one row per run, order it by error (smallest first) and
# keep the 20 rows with the lowest error.
result_df = pd.DataFrame(data=result_array)
result_df_sorted = result_df.sort_values('error')
best_20 = result_df_sorted.iloc[:20]

In [None]:
# For every column pick the value that occurs most often among the 20 best
# runs. Each column is evaluated on its own, so the resulting combination is
# not necessarily one of the runs that was actually trained.
best_hyperparameters = {
    column: best_20[column].value_counts().idxmax()
    for column in ['features', 'lag', 'lr', 'bs', 'hs', 'nl']
}

In [None]:
# Bare expression: shows the selected values as the notebook cell output.
best_hyperparameters

{'features': 'CV',
 'lag': '14',
 'lr': '0.001',
 'bs': '32',
 'hs': '12',
 'nl': '1'}