In [1]:
import sys
import os
import pathlib
import glob
from typing import List
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from tqdm.auto import tqdm
from dateutil import parser
from datetime import datetime
from urllib.parse import urlparse

import matplotlib.pyplot as plt
from flexitext import flexitext
# import seaborn as sns
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import lightning.pytorch as pl
import mlflow
import mlflow.pytorch



In [2]:
%load_ext autoreload
%autoreload 2

from helper_functions import epa_taiwan_data_pipeline, engine
from models import lstnet_gokul, lstnet_laigoukun

In [3]:
# Fix the global random seed so repeated runs of the notebook are comparable.
SEED = 420
pl.seed_everything(SEED)

# Everything below runs on the CPU. To pick the GPU when one is present, use:
# device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"

Global seed set to 420


In [4]:
# The project root is the parent of the current working directory;
# every other location used by the notebook is derived from it.
root_dir = pathlib.Path.cwd().parent

# Input data: untouched downloads vs. cleaned tables.
raw_data_dir = root_dir.joinpath("data", "0_raw")
processed_data_dir = root_dir.joinpath("data", "1_processed")

# Output location for experiment logs.
experiment_dir = root_dir.joinpath("experiment")

# Data Loading and Preprocessing

- Import the data
- Feature engineering
- Turn the data into tensors

## Import the data

In [5]:
# Which monitoring site / year to load, and which columns to keep.
site_name = "Banqiao"
year = 2018
columns = ["SiteEngName", "PM2.5", "AMB_TEMP", "CH4", "CO", "NMHC", "read_time"]

# Load the requested site and year from the project pipeline.
raw_site_df = epa_taiwan_data_pipeline.import_epa_data(site_name=site_name, year=year)

# Keep only the selected columns, then apply the pipeline's basic preprocessing.
# NOTE(review): later outputs show lower-cased column names, so standardize_df
# presumably normalizes the headers at least - confirm in helper_functions.
pm25_df = epa_taiwan_data_pipeline.standardize_df(raw_site_df[columns])

## Feature engineering

In [6]:
# train_split: fraction of rows (taken from the start of the frame) used for training;
#              the remaining rows form the test set.
# history_len: value passed as `history_len` to AqiDataset when building the datasets.
train_split, history_len = 0.6, 256

In [7]:
# Positional split: rows are taken in frame order (no shuffling); the first
# `train_split` fraction of rows becomes the training set.
train_end_idx = int(len(pm25_df) * train_split)

# .copy() gives the split its own data, so column assignments made on it later
# (e.g. by the normalization step) never alias the parent pm25_df slice. The
# notebook silences pandas' chained-assignment warning globally, so this kind of
# aliasing would otherwise go unnoticed.
train_data = pm25_df.iloc[:train_end_idx, :].copy()

print(f"All data length:{len(pm25_df)} \nTrain data length:{len(train_data)}")
train_data.tail(2)

All data length:8760 
Train data length:5256


Unnamed: 0,siteengname,pm2.5,amb_temp,ch4,co,nmhc,read_time
399305,Banqiao,8.0,30.2,1.8,0.3,0.08,2018-08-07 22:00:00
399381,Banqiao,9.0,29.9,1.8,0.23,0.06,2018-08-07 23:00:00


In [8]:
# Positional split: everything after the first `train_split` fraction of rows
# (in frame order, no shuffling) becomes the test set.
test_start_idx = int(len(pm25_df) * train_split)

# .copy() gives the split its own data, so column assignments made on it later
# (e.g. by the normalization step) never alias the parent pm25_df slice. The
# notebook silences pandas' chained-assignment warning globally, so this kind of
# aliasing would otherwise go unnoticed.
test_data = pm25_df.iloc[test_start_idx:, :].copy()

print(f"All data length:{len(pm25_df)} \nTest data length:{len(test_data)}")
test_data.tail(2)

All data length:8760 
Test data length:3504


Unnamed: 0,siteengname,pm2.5,amb_temp,ch4,co,nmhc,read_time
665609,Banqiao,4.0,16.6,1.9,0.34,0.08,2018-12-31 22:00:00
665685,Banqiao,4.0,16.6,1.9,0.31,0.07,2018-12-31 23:00:00


# Experimentation

- Prepare different scenarios
    - [x] number of epochs --> [50, 100, 200]
    - [x] lookback periods --> [24, 24x2, 24x7, 24x30] (history_len)
    - [x] batch size --> [16, 64, 128]
    - [x] loss function --> [MSE (nn.MSELoss()), MAE (nn.L1Loss()), Huber Loss (nn.SmoothL1Loss())]
- Log the experiment
- Monitor the result with MLFlow or ~~tensorboard~~

In [9]:
# Optimizer settings shared by every run of the sweep.
lr, weight_decay = 1e-3, 0.01

# Sweep grid. Lookbacks are expressed in rows: half a day, then 1, 2 and 7 days
# of 24 rows each (the readings displayed above are hourly).
epochs = [10, 20, 50]
lookback_periods = [24 // 2] + [24 * days for days in (1, 2, 7)]
batch_sizes = [16, 32, 64]
# loss_functions = [nn.MSELoss(), nn.SmoothL1Loss()]

## Manual

In [21]:
# Directory layout for manually tracked runs:
#   <root_dir>/experiment/manual/<YYYY_MM_DD>/
experiment_dir = root_dir / "experiment"
manual_exp_dir = experiment_dir / "manual"

# Underscore-separated current date, e.g. "2023_06_30" (format "%Y_%m_%d").
timestamp = datetime.now().strftime("%Y_%m_%d")
current_manual_exp_dir = manual_exp_dir / timestamp

# One call creates the whole chain: parents=True builds experiment/ and manual/
# when missing, exist_ok=True makes re-running the cell on the same day a no-op
# (and removes the check-then-create race of `os.path.exists` + `os.mkdir`).
current_manual_exp_dir.mkdir(parents=True, exist_ok=True)

In [23]:
# File names (inside each run's folder) that engine.train receives as tracker paths.
train_loss_tracker_name = "train_epoch_loss.txt"
test_loss_tracker_name = "test_epoch_loss.txt"

# --- Loop-invariant preprocessing --------------------------------------------
# Normalize each split exactly once and bind the result to NEW names. Doing this
# inside the sweep (and assigning back to train_data / test_data) re-normalized
# already-normalized frames on every iteration and left the raw splits
# overwritten, so re-running the cell started from different data.
train_norm_df, normalized_column_names = epa_taiwan_data_pipeline.min_max_df_norm(train_data)
# NOTE(review): the test split goes through its own min_max_df_norm call. If that
# helper derives min/max from the frame it is given, train and test end up on
# different scales - confirm, and reuse the training statistics if so.
test_norm_df, _ = epa_taiwan_data_pipeline.min_max_df_norm(test_data)

# Univariate setup: only the first normalized column is used (the model is built
# with num_features=1).
target_columns = [normalized_column_names[0]]

# The datasets depend on neither epoch, batch size nor lookback, so build them once.
# NOTE(review): the window length here is the fixed `history_len`, while the sweep
# only varies the model's `ar_window_size`. The "Experimentation" notes describe
# the lookback periods as history_len - confirm which of the two is intended.
train_dataset = epa_taiwan_data_pipeline.AqiDataset(
    train_norm_df,
    history_len=history_len,
    col_names=target_columns,
    device=device)

test_dataset = epa_taiwan_data_pipeline.AqiDataset(
    test_norm_df,
    history_len=history_len,
    col_names=target_columns,
    device=device)

# Full grid: every (epochs, batch size, lookback) triple.
combinations = [
    (epoch, batch_size, lookback)
    for epoch in epochs
    for batch_size in batch_sizes
    for lookback in lookback_periods
]
# Smoke test: iterate over [(2, 4, 12)] instead of `combinations`.

for epoch, batch_size, lookback in combinations:
    # Only the loaders depend on the batch size.
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_data_loader = DataLoader(test_dataset, batch_size=batch_size)

    # One folder per configuration, named <epochs>E_<lookback>W_<batch size>B.
    model_name = f"{epoch}E_{lookback}W_{batch_size}B"
    tracker_dir = current_manual_exp_dir / model_name
    tracker_dir.mkdir(parents=True, exist_ok=True)

    # Fresh model, loss and optimizer for every configuration.
    model = lstnet_gokul.LSTNet(
        ar_window_size=lookback,
        num_features=1,
        recc1_out_channels=64,
        conv1_out_channels=32).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Train; engine.train is handed the two tracker file paths for this run.
    epoch_avg_train_loss, epoch_avg_test_loss = engine.train(
        model=model,
        train_dataloader=train_data_loader,
        test_dataloader=test_data_loader,
        optimizer=optimizer,
        loss_fn=criterion,
        epochs=epoch,
        device=device,
        train_tracker_dir=str(tracker_dir / train_loss_tracker_name),
        test_tracker_dir=str(tracker_dir / test_loss_tracker_name)
    )

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 0.0176 | test_loss: 0.0257 | 
Epoch: 2 | train_loss: 0.0056 | test_loss: 0.0104 | 


In [54]:
# tracked_files = list((current_manual_exp_dir / os.listdir(current_manual_exp_dir)[0]).glob("*.txt"))

# train_loss = pd.read_csv(tracked_files[1], delimiter = "\t", header=None)
# test_loss = pd.read_csv(tracked_files[0], delimiter = "\t", header=None)

In [60]:
# fig, ax = plt.subplots()

# train_loss.plot(ax=ax, label="training")
# test_loss.plot(ax=ax, label="testing")

## MLFlow

In [14]:
# # Set the experiment name
# experiment_name = datetime.now().strftime("%Y_%m_%d") # current date as YYYY_MM_DD (underscore-separated)

# # Check if the experiment exists, and if not, create it
# if not mlflow.get_experiment_by_name(experiment_name):
#     mlflow.create_experiment(experiment_name)

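# # SECURITY: export MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD in the shell
# # before starting Jupyter instead of hardcoding them in the notebook. The access
# # token that used to be written on this line is exposed in version history and
# # should be revoked and regenerated.
# assert "MLFLOW_TRACKING_USERNAME" in os.environ and "MLFLOW_TRACKING_PASSWORD" in os.environ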

# normalized_columns = ['pm2.5', 'amb_temp', 'ch4', 'co', 'nmhc']
# combinations = [(epoch, batch_size, lookback) for epoch in epochs for batch_size in batch_sizes for lookback in lookback_periods]

# mlflow.end_run()
# with mlflow.start_run(experiment_id=mlflow.get_experiment_by_name(experiment_name).experiment_id) as run:
#     ## For Remote server only (DAGShub)
#     remote_server_uri="https://dagshub.com/amrirasyidi/air_quality_forecasting.mlflow"
#     mlflow.set_tracking_uri(remote_server_uri)
#     tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

#     for epoch, batch_size, lookback in combinations:
#         # train data preprocessing
#         train_data, normalized_column_names = epa_taiwan_data_pipeline.min_max_df_norm(train_data)

#         train_dataset = epa_taiwan_data_pipeline.AqiDataset(
#             train_data,
#             history_len=history_len,
#             col_names=[normalized_column_names[0]], 
#             device=device)

#         train_data_loader = DataLoader(train_dataset, batch_size=batch_size)

#         # test data preprocessing
#         test_data, _ = epa_taiwan_data_pipeline.min_max_df_norm(test_data)

#         test_dataset = epa_taiwan_data_pipeline.AqiDataset(
#             test_data,
#             history_len=history_len,
#             col_names=[normalized_column_names[0]], 
#             device=device)

#         test_data_loader = DataLoader(test_dataset, batch_size=batch_size)

#         # model preparation
#         model_name = f"{epoch}E_{lookback}W_{batch_size}B"
#         model = lstnet_gokul.LSTNet(
#             ar_window_size=lookback,
#             num_features=1,
#             recc1_out_channels=64,
#             conv1_out_channels=32).to(device)

#         criterion = nn.MSELoss()
#         optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
#         # Define your training loop
#         epoch_avg_train_loss, epoch_avg_test_loss = engine.train(
#             model=model,
#             train_dataloader=train_data_loader,
#             test_dataloader=test_data_loader,
#             optimizer=optimizer,
#             loss_fn=criterion,
#             epochs=epoch,
#             device=device,
#         )

#         # print("LSTNET model (learning_rate={:f}, batch_size={:f}):".format(lr, batch_size))

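#         # Log hyperparameters
#         # FIXME: these keys are logged again on every iteration of the SAME run, which is
#         # what raises MlflowException (INVALID_PARAMETER_VALUE) on the second combination.
#         # Wrap the body of this loop in `with mlflow.start_run(nested=True):` so that each
#         # (epoch, batch_size, lookback) combination gets its own child run.
#         mlflow.log_param("epoch",epoch)
#         mlflow.log_param("batch_size",batch_size)
#         mlflow.log_param("lookback",lookback)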

#         # Log metrics during training
#         mlflow.log_metrics(
#             {
#                 "train_loss": epoch_avg_train_loss[0], 
#                 "test_loss": epoch_avg_test_loss[0]
#             },
#             step=epoch
#         )

#         # # Log additional artifacts
#         # mlflow.log_artifact("path/to/your/training_plots.png")

#         # # Model registry does not work with file store
#         # if tracking_url_type_store != "file":
#         #     # Register the model
#         #     # There are other ways to use the Model Registry, which depends on the use case,
#         #     # please refer to the doc for more information:
#         #     # https://mlflow.org/docs/latest/model-registry.html#api-workflow
#         #     mlflow.pytorch.log_model(
#         #         model, model_name, registered_model_name=model_name
#         #     )
#         # else:
#         #     mlflow.pytorch.log_model(model, model_name)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 0.0380 | test_loss: 0.0330 | 
Epoch: 2 | train_loss: 0.0177 | test_loss: 0.0242 | 
Epoch: 3 | train_loss: 0.0125 | test_loss: 0.0179 | 
Epoch: 4 | train_loss: 0.0092 | test_loss: 0.0138 | 
Epoch: 5 | train_loss: 0.0072 | test_loss: 0.0112 | 
Epoch: 6 | train_loss: 0.0060 | test_loss: 0.0094 | 
Epoch: 7 | train_loss: 0.0052 | test_loss: 0.0083 | 
Epoch: 8 | train_loss: 0.0046 | test_loss: 0.0075 | 
Epoch: 9 | train_loss: 0.0042 | test_loss: 0.0069 | 
Epoch: 10 | train_loss: 0.0039 | test_loss: 0.0065 | 


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 0.0129 | test_loss: 0.0089 | 
Epoch: 2 | train_loss: 0.0059 | test_loss: 0.0067 | 
Epoch: 3 | train_loss: 0.0046 | test_loss: 0.0061 | 
Epoch: 4 | train_loss: 0.0040 | test_loss: 0.0061 | 
Epoch: 5 | train_loss: 0.0037 | test_loss: 0.0058 | 
Epoch: 6 | train_loss: 0.0036 | test_loss: 0.0058 | 
Epoch: 7 | train_loss: 0.0035 | test_loss: 0.0059 | 
Epoch: 8 | train_loss: 0.0034 | test_loss: 0.0058 | 
Epoch: 9 | train_loss: 0.0033 | test_loss: 0.0057 | 
Epoch: 10 | train_loss: 0.0033 | test_loss: 0.0058 | 


MlflowException: INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}

The cause of this error is typically due to repeated calls
to an individual run_id event logging.

Incorrect Example:
---------------------------------------
with mlflow.start_run():
    mlflow.log_param("depth", 3)
    mlflow.log_param("depth", 5)
---------------------------------------

Which will throw an MlflowException for overwriting a
logged parameter.

Correct Example:
---------------------------------------
with mlflow.start_run():
    with mlflow.start_run(nested=True):
        mlflow.log_param("depth", 3)
    with mlflow.start_run(nested=True):
        mlflow.log_param("depth", 5)
---------------------------------------

Which will create a new nested run for each individual
model and prevent parameter key collisions within the
tracking store.

# Inferencing

- Load the best model
- Prepare the test data
- Save the result

# Deployment