In [106]:
import sys
import os
import pathlib
import glob
from typing import List
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from tqdm.auto import tqdm
from dateutil import parser
from datetime import datetime
from urllib.parse import urlparse

import matplotlib.pyplot as plt
from flexitext import flexitext
# import seaborn as sns
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import lightning.pytorch as pl
import mlflow
import mlflow.pytorch

In [107]:
%load_ext autoreload
%autoreload 2

from helper_functions import epa_taiwan_data_pipeline, engine
from models import lstnet_gokul, lstnet_laigoukun

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [108]:
# Seed the global RNGs through Lightning so repeated runs are comparable.
SEED = 420
pl.seed_everything(SEED)

# Everything in this notebook is pinned to the CPU.
# To opt into a GPU when one is available, use instead (note the call parentheses):
# device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"

Global seed set to 420


In [109]:
# Project layout, resolved relative to the parent of the current working
# directory (the notebook is expected to be run from a sub-folder of the repo).
root_dir = pathlib.Path.cwd().parent
data_dir = root_dir / "data"
raw_data_dir = data_dir / "0_raw"
processed_data_dir = data_dir / "1_processed"
experiment_dir = root_dir / "experiment"

# Data Loading and Preprocessing

- Import the data
- Feature engineering
- Turn the data into tensors

## Import the data

In [110]:
# Station, year and raw columns to load.
site_name = "Banqiao"
year = 2018
columns = ["SiteEngName", "PM2.5", "AMB_TEMP", "CH4", "CO", "NMHC", "read_time"]

# Load the raw EPA records for the chosen station/year, keeping only `columns`.
pm25_raw_df = epa_taiwan_data_pipeline.import_epa_data(year=year, site_name=site_name)[columns]

# Basic preprocessing via the project helper.
# NOTE(review): judging by the frames displayed further down, this appears to
# lower-case the column names — confirm against `standardize_df`.
pm25_df = epa_taiwan_data_pipeline.standardize_df(pm25_raw_df)

## Feature engineering

In [115]:
# Share of pm25_df rows (counted from the top) that goes into the training set;
# the remaining rows form the test set.
train_split = 0.6
# Window length handed to AqiDataset as `history_len` in the MLflow cell.
# NOTE(review): this stays fixed while `lookback_periods` is varied and passed
# to the model as `ar_window_size` — confirm that is the intended design.
history_len = 256

In [112]:
# Positional split: the first `train_split` share of the rows is the training set.
n_train_rows = int(len(pm25_df) * train_split)
train_data = pm25_df.iloc[:n_train_rows]

print(f"All data length:{len(pm25_df)} \nTrain data length:{len(train_data)}")
train_data.tail(2)

All data length:8760 
Train data length:5256


Unnamed: 0,siteengname,pm2.5,amb_temp,ch4,co,nmhc,read_time
399305,Banqiao,8.0,30.2,1.8,0.3,0.08,2018-08-07 22:00:00
399381,Banqiao,9.0,29.9,1.8,0.23,0.06,2018-08-07 23:00:00


In [114]:
# Positional split: every row after the first `train_split` share is the test set.
test_start_row = int(len(pm25_df) * train_split)
test_data = pm25_df.iloc[test_start_row:]

print(f"All data length:{len(pm25_df)} \nTest data length:{len(test_data)}")
test_data.tail(2)

All data length:8760 
Test data length:3504


Unnamed: 0,siteengname,pm2.5,amb_temp,ch4,co,nmhc,read_time
665609,Banqiao,4.0,16.6,1.9,0.34,0.08,2018-12-31 22:00:00
665685,Banqiao,4.0,16.6,1.9,0.31,0.07,2018-12-31 23:00:00


# Experimentation

- Prepare different scenarios
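    - [x] number of epochs --> [50, 100, 200] (the grid cell below currently uses [10, 20, 50])
    - [x] lookback periods --> [24, 24x2, 24x7, 24x30] (history_len) (the grid cell below currently uses [24/2, 24, 24x2, 24x7] and passes them to the model as `ar_window_size`; the dataset `history_len` is fixed)
    - [x] batch size --> [16, 64, 128] (the grid cell below currently uses [16, 32, 64])
    - [ ] loss function --> [MSE (nn.MSELoss()), MAE (nn.L1Loss()), Huber Loss (nn.SmoothL1Loss())] (only MSE is used so far)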
- Log the experiment
- Monitor the results with MLflow ~~or TensorBoard~~

In [104]:
# Optimiser settings shared by every run.
lr = 1e-3
weight_decay = 0.01

# Search grid. The data is hourly, so lookbacks are expressed in hours:
# half a day, one day, two days and one week.
HOURS_PER_DAY = 24
epochs = [10, 20, 50]
lookback_periods = [
    HOURS_PER_DAY // 2,
    HOURS_PER_DAY,
    HOURS_PER_DAY * 2,
    HOURS_PER_DAY * 7,
]
batch_sizes = [16, 32, 64]
# Loss functions are not part of the grid yet:
# loss_functions = [nn.MSELoss(), nn.SmoothL1Loss()]

## Manual

In [14]:
# NOTE: disabled legacy cell. It created the output folders for the manual
# (non-MLflow) experiment loop. `experiment_dir` is already defined in the
# paths cell near the top of the notebook. If this is ever revived, each
# exists()/mkdir() pair can be replaced by
# `path.mkdir(parents=True, exist_ok=True)`; otherwise delete the cell.
# experiment_dir = root_dir / "experiment"
# if not os.path.exists(experiment_dir):
#     os.mkdir(experiment_dir)
    
# manual_exp_dir = experiment_dir / "manual"
# if not os.path.exists(manual_exp_dir):
#     os.mkdir(manual_exp_dir)

# timestamp = datetime.now().strftime("%Y_%m_%d") # returns current date in YYYY_MM_DD format
# current_manual_exp_dir = manual_exp_dir / str(timestamp)
# if not os.path.exists(current_manual_exp_dir):
#     os.mkdir(current_manual_exp_dir)
    
# lstnet_gokul_exp_dir = current_manual_exp_dir / "LSTNET_UNI_GOKUL"
# if not os.path.exists(lstnet_gokul_exp_dir):
#     os.mkdir(lstnet_gokul_exp_dir)

In [4]:
# NOTE: disabled legacy cell — the manual training loop that wrote losses to
# text files. It would not run as written if re-enabled:
#   * `range(epochs)` is called on `epochs`, which is a list in the grid cell;
#   * the inner `for epoch in ...` shadows the outer loop variable `epoch`;
#   * `temp_train_data_loader`, `temp_model`, `optimizer` and `criterion` are
#     not defined in any cell shown in this notebook;
#   * `model_name`, `batch_size` and `lookback` are computed but never used to
#     build a model or loader;
#   * both loss files are removed at the start of every combination and their
#     names do not include `model_name`, so only the last combination's losses
#     would be left on disk.
# running_loss_tracker_name = "running_loss.txt"
# epoch_loss_tracker_name = "epoch_loss.txt"

# for epoch in epochs:
#     for batch_size in batch_sizes:
#         for lookback in lookback_periods:
#             model_name = f"{epoch}E_{lookback}W_{batch_size}B"

#             train_loss_list = []

#             if os.path.exists(lstnet_gokul_exp_dir / running_loss_tracker_name):
#                 os.remove(lstnet_gokul_exp_dir / running_loss_tracker_name)

#             if os.path.exists(lstnet_gokul_exp_dir / epoch_loss_tracker_name):
#                 os.remove(lstnet_gokul_exp_dir / epoch_loss_tracker_name)
                    
#             for epoch in tqdm(range(epochs)):
#                 epoch_loss_train = 0
#                 for batch_no, (X, Y) in enumerate(temp_train_data_loader, start=1):
#                     X, Y = X.to(device), Y.to(device)
                    
#                     optimizer.zero_grad()
                    
#                     Y_pred = temp_model(X)
                    
#                     loss = criterion(Y_pred, Y)
#                     loss.backward()
                    
#                     optimizer.step()

#                     with open(lstnet_gokul_exp_dir / running_loss_tracker_name, 'a+') as file:
#                         file.write(f'{loss.item()}\n')

#                     epoch_loss_train += loss.item()

#                 epoch_loss_train = epoch_loss_train / len(temp_train_data_loader)
#                 train_loss_list.append(epoch_loss_train)

#                 with open(lstnet_gokul_exp_dir / epoch_loss_tracker_name, 'a+') as file:
#                     file.write(f'{epoch_loss_train}\n')
                        

## MLflow

In [None]:
# Grid-search LSTNet hyperparameters and track every run in MLflow.
#
# For each (epochs, batch size, lookback) combination this cell builds a fresh
# LSTNet, trains it through `engine.train` on the normalised training split
# with the normalised test split as the evaluation loader, and logs the
# hyperparameters, the loss history and the model to the MLflow server.

# --- Tracking setup -----------------------------------------------------------
# Point MLflow at the remote server (DagsHub) BEFORE looking up the experiment
# or opening a run; otherwise the experiment and the run are created in the
# default store and the URI switch happens mid-run.
remote_server_uri = "https://dagshub.com/amrirasyidi/air_quality_forecasting.mlflow"
mlflow.set_tracking_uri(remote_server_uri)
tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

# Credentials come from the environment, never from the notebook.
# SECURITY: an access token used to be hardcoded in this cell. Treat that token
# as leaked and revoke it on the tracking server.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "amrirasyidi")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    raise RuntimeError(
        "MLFLOW_TRACKING_PASSWORD is not set. Export your tracking-server access "
        "token in the environment before running this cell."
    )

# One experiment per day; the name is the current date as YYYY_MM_DD.
experiment_name = datetime.now().strftime("%Y_%m_%d")
experiment = mlflow.get_experiment_by_name(experiment_name)
# create_experiment returns the id of the newly created experiment.
experiment_id = (
    experiment.experiment_id if experiment else mlflow.create_experiment(experiment_name)
)

# --- Data preparation (independent of the grid, so done once) -----------------
normalized_columns = ['pm2.5', 'amb_temp', 'ch4', 'co', 'nmhc']

# Normalise copies into new names so `train_data` / `test_data` keep their raw
# values and re-running this cell gives the same result. Previously both frames
# were re-normalised and overwritten on every loop iteration.
# NOTE(review): the test split is scaled with its own min/max rather than the
# training statistics — confirm whether `min_max_df_norm` can reuse train stats.
train_data_norm, normalized_column_names = epa_taiwan_data_pipeline.min_max_df_norm(
    train_data.copy()
)
test_data_norm, _ = epa_taiwan_data_pipeline.min_max_df_norm(test_data.copy())

# Univariate setup: only the first normalised column is fed to the model
# (presumably PM2.5 — verify against `normalized_column_names`).
target_columns = [normalized_column_names[0]]

train_dataset = epa_taiwan_data_pipeline.AqiDataset(
    train_data_norm,
    history_len=history_len,
    col_names=target_columns,
    device=device)

# Bug fix: the test dataset used to be built from the training frame, so the
# reported "test" loss was really a second training loss.
test_dataset = epa_taiwan_data_pipeline.AqiDataset(
    test_data_norm,
    history_len=history_len,
    col_names=target_columns,
    device=device)

# --- Grid search ---------------------------------------------------------------
combinations = [
    (n_epochs, batch_size, lookback)
    for n_epochs in epochs
    for batch_size in batch_sizes
    for lookback in lookback_periods
]

for n_epochs, batch_size, lookback in combinations:
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_data_loader = DataLoader(test_dataset, batch_size=batch_size)

    # model preparation
    # NOTE(review): `lookback` only sets the model's `ar_window_size`; the
    # dataset window stays at `history_len` — confirm this is intended.
    model_name = f"{n_epochs}E_{lookback}W_{batch_size}B"
    model = lstnet_gokul.LSTNet(
        ar_window_size=lookback,
        num_features=1,
        recc1_out_channels=64,
        conv1_out_channels=32).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    with mlflow.start_run(experiment_id=experiment_id, run_name=model_name):
        # Log hyperparameters first so that a run that crashes during training
        # still records its configuration.
        mlflow.log_params({
            "learning_rate": lr,
            "weight_decay": weight_decay,
            "history_len": history_len,
            "batch_size": batch_size,
            "epoch": n_epochs,
            "lookback": lookback
            })

        epoch_avg_train_loss, epoch_avg_test_loss = engine.train(
            model=model,
            train_dataloader=train_data_loader,
            test_dataloader=test_data_loader,
            optimizer=optimizer,
            loss_fn=criterion,
            epochs=n_epochs,
            device=device,
        )

        # Log the whole loss history, one MLflow step per entry. Previously only
        # element [0] was logged, at step=<total number of epochs>.
        # Assumes engine.train returns one value per epoch in each sequence —
        # TODO confirm against engine.train.
        for step, (train_loss, test_loss) in enumerate(
            zip(epoch_avg_train_loss, epoch_avg_test_loss), start=1
        ):
            mlflow.log_metrics(
                {
                    "train_loss": float(train_loss),
                    "test_loss": float(test_loss)
                },
                step=step
            )

        # # Log additional artifacts
        # mlflow.log_artifact("path/to/your/training_plots.png")

        # Model registry does not work with file store
        if tracking_url_type_store != "file":
            # Register the model
            # There are other ways to use the Model Registry, which depends on the use case,
            # please refer to the doc for more information:
            # https://mlflow.org/docs/latest/model-registry.html#api-workflow
            mlflow.pytorch.log_model(
                model, model_name, registered_model_name=model_name
            )
        else:
            mlflow.pytorch.log_model(model, model_name)

# Inferencing

- Load the best model
- Prepare the test data
- Save the result

# Deployment