In [None]:
import sys
import os
import pathlib
import glob
from typing import List, Tuple
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from tqdm.auto import tqdm
from dateutil import parser
from datetime import datetime
from urllib.parse import urlparse

import matplotlib.pyplot as plt
from flexitext import flexitext
# import seaborn as sns
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import lightning.pytorch as pl
import mlflow
import mlflow.pytorch

In [None]:
%load_ext autoreload
%autoreload 2

from helper_functions import epa_taiwan_data_pipeline, engine
from models import lstnet_gokul, lstnet_laigoukun

In [None]:
# Seed the random number generators through Lightning so reruns are repeatable.
pl.seed_everything(seed=420)

# The notebook currently runs everything on the CPU.
device = "cpu"
# To opt in to the GPU when one is present (note the call parentheses):
# device = "cuda" if torch.cuda.is_available() else "cpu"
# device

In [None]:
# Project folders, all resolved relative to the parent of the working directory.
root_dir = pathlib.Path.cwd().parent
raw_data_dir = root_dir.joinpath("data", "0_raw")
processed_data_dir = root_dir.joinpath("data", "1_processed")
experiment_dir = root_dir.joinpath("experiment")

# Data Loading and Preprocessing

- Import the data
- Feature engineering
- Convert the data to tensors

## Import the data

In [None]:
# Monitoring site and year to load, plus the raw columns kept for modelling.
year = 2018
site_name = "Banqiao"
columns = ["SiteEngName", "PM2.5", "AMB_TEMP", "CH4", "CO", "NMHC", "read_time"]

# Load the site's records, keep the selected columns, then run the pipeline's
# basic preprocessing step on the result.
pm25_df = epa_taiwan_data_pipeline.standardize_df(
    epa_taiwan_data_pipeline.import_epa_data(site_name=site_name, year=year)[columns]
)

## Feature engineering

In [None]:
def min_max_df_norm(
    df:pd.DataFrame,
    target:str='pm2.5',
    cols:List=['pm2.5', 'amb_temp', 'ch4', 'co', 'nmhc']
    ) -> Tuple[pd.DataFrame, List[str]]:
    """Min-max scale the selected columns of a dataframe.

    For every column in ``cols`` a new ``<column>_normalized`` column is added,
    computed as ``(x - min) / (max - min)`` using that column's own min and max.
    The input dataframe is left untouched; a copy is returned.

    Args:
        df (pd.DataFrame): the dataframe to be normalized.
        target (str, optional): the target to be predicted later. Currently
            unused; kept for backward compatibility. Defaults to 'pm2.5'.
        cols (List, optional): columns that will be normalized.
            Defaults to ['pm2.5', 'amb_temp', 'ch4', 'co', 'nmhc'].

    Returns:
        Tuple[pd.DataFrame, List[str]]: a copy of ``df`` with the normalized
        columns appended, and the names of those new columns in the same
        order as ``cols``.
    """
    # Work on a copy so a slice of another frame passed by the caller is never
    # modified in place (this is what triggers SettingWithCopyWarning).
    normalized_df = df.copy()
    normalized_column_names = []
    for column in cols:
        normalized_column_name = column + '_normalized'
        normalized_column_names.append(normalized_column_name)

        column_min = normalized_df[column].min()
        column_range = normalized_df[column].max() - column_min
        if column_range == 0:
            # A constant column has no spread; map it to 0.0 instead of
            # dividing by zero and filling the column with NaN.
            normalized_df[normalized_column_name] = 0.0
        else:
            normalized_df[normalized_column_name] = (
                normalized_df[column] - column_min
            ) / column_range

    return normalized_df, normalized_column_names

In [None]:
# The leading `train_split` fraction of the rows forms the training set.
train_split = 0.6

train_data = pm25_df.iloc[: int(len(pm25_df) * train_split)]
print(f"All data length:{len(pm25_df)} \nTrain data length:{len(train_data)}")
train_data.tail(2)

All data length:8760 
Train data length:5256


Unnamed: 0,siteengname,pm2.5,amb_temp,ch4,co,nmhc,read_time
399305,Banqiao,8.0,30.2,1.8,0.3,0.08,2018-08-07 22:00:00
399381,Banqiao,9.0,29.9,1.8,0.23,0.06,2018-08-07 23:00:00


In [None]:
# Columns to min-max scale. The list is passed explicitly so that editing it
# actually changes what gets normalized (it was previously defined but unused).
normalized_columns = ['pm2.5', 'amb_temp', 'ch4', 'co', 'nmhc']

train_data, normalized_column_names = min_max_df_norm(train_data, cols=normalized_columns)
train_data.head(2)

Unnamed: 0,siteengname,pm2.5,amb_temp,ch4,co,nmhc,read_time,pm2.5_normalized,amb_temp_normalized,ch4_normalized,co_normalized,nmhc_normalized
1,Banqiao,20.0,16.1,1.9,0.37,0.07,2018-01-01 00:00:00,0.22619,0.273973,0.214286,0.080605,0.04902
77,Banqiao,19.0,16.2,1.9,0.37,0.08,2018-01-01 01:00:00,0.214286,0.277397,0.214286,0.080605,0.053922


## Convert to tensor

In [None]:
# verify dataset instances
temp_train_dataset = epa_taiwan_data_pipeline.AqiDataset(
    train_data,
    history_len=48,
    col_names=[normalized_column_names[0]],
    device=device
)
print(len(temp_train_dataset))
x, y = temp_train_dataset[0]
print(x.shape, y.shape)

5208
torch.Size([48, 1]) torch.Size([1])


In [None]:
# train data_loader
temp_train_data_loader = DataLoader(temp_train_dataset, batch_size=4)
X, Y = next(iter(temp_train_data_loader))
print(X.shape, Y.shape)
print(X.is_cuda, Y.is_cuda)

torch.Size([4, 48, 1]) torch.Size([4, 1])
False False


# Training Pipeline

## Test the dummy model

In [None]:
# test the model
temp_model = lstnet_gokul.LSTNet(
    ar_window_size=48,
    num_features=1,
    recc1_out_channels=64,
    conv1_out_channels=32
)
# temp_model.to(device)

In [None]:
# Push only the first mini-batch (if there is one) through the model and
# compare the target shape with the prediction shape.
first_batch = next(iter(temp_train_data_loader), None)
if first_batch is not None:
    X, Y = first_batch
    print(X.shape)
    out = temp_model(X.to(device))
    print(Y.shape, out.shape)

torch.Size([4, 48, 1])
torch.Size([4, 1]) torch.Size([4, 1])


## MLflow experiment testing

### Data Prep

In [None]:
# Fraction of the rows assigned to the training split.
train_split = 0.6

# Length of each input window and number of samples per mini-batch.
history_len, batch_size = 48, 8

In [None]:
# Columns to min-max scale. The list is passed explicitly so that editing it
# actually changes what gets normalized (it was previously defined but unused).
normalized_columns = ['pm2.5', 'amb_temp', 'ch4', 'co', 'nmhc']

train_data, normalized_column_names = min_max_df_norm(train_data, cols=normalized_columns)

# Univariate setup: only the first normalized column is fed to the dataset.
train_dataset = epa_taiwan_data_pipeline.AqiDataset(
    train_data, 
    history_len=history_len, 
    col_names=[normalized_column_names[0]], 
    device=None)

# train data_loader
train_data_loader = DataLoader(train_dataset, batch_size=batch_size)

# Sanity-check the number of batches, the batch shapes and the tensor device.
X, Y = next(iter(train_data_loader))
print(len(train_data_loader))
print(X.shape, Y.shape)
print(X.is_cuda, Y.is_cuda)

651
torch.Size([8, 48, 1]) torch.Size([8, 1])
False False


In [None]:
# Remaining rows after the training split form the test set. Copy the slice so
# the new columns are written to an independent frame.
test_data = pm25_df.iloc[int(len(pm25_df)*train_split):,:].copy()

# Scale the test split with the TRAINING min/max rather than its own: the model
# was fitted on values scaled by the training statistics, and using test-set
# statistics would put the two splits on different scales and leak information
# about the test distribution. Scaled test values may fall outside [0, 1].
for column in normalized_columns:
    train_min = train_data[column].min()
    train_range = train_data[column].max() - train_min
    if train_range == 0:
        # Constant training column: no spread to scale by, so map to 0.0.
        test_data[column + '_normalized'] = 0.0
    else:
        test_data[column + '_normalized'] = (test_data[column] - train_min) / train_range

test_dataset = epa_taiwan_data_pipeline.AqiDataset(
    test_data,
    history_len=history_len,
    col_names=[normalized_column_names[0]],
    device=None)

# Keep the evaluation order fixed.
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
print(len(test_data_loader))

432


### Model prep

In [None]:
# Single-feature LSTNet that is trained in the experiment run.
# NOTE(review): the smoke-test model above uses ar_window_size=48 while this
# one uses 24 - confirm the difference is intended.
model = lstnet_gokul.LSTNet(
    num_features=1,
    ar_window_size=24,
    conv1_out_channels=32,
    recc1_out_channels=64,
)
model = model.to(device)

In [None]:
# Number of passes over the training data.
epochs = 2

# Adam step size and weight-decay coefficient.
lr = 1e-3
weight_decay = 0.01

criterion = nn.MSELoss()
# The optimizer must hold the parameters of `model`, the network that is
# actually trained. Building it from `temp_model` would leave `model`'s
# weights untouched by every optimizer step.
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

In [None]:
# # Set the experiment name
# timestamp = datetime.now().strftime("%Y_%m_%d") # returns current date in YYYY-MM-DD format

# try:
#     mlflow.set_experiment(f"{timestamp}")
# except:
#     os.mkdir("mlruns")
#     mlflow.set_experiment(f"{timestamp}")

### Experiment prep

In [None]:
# Name the experiment after today's date, formatted as YYYY_MM_DD.
experiment_name = datetime.now().strftime("%Y_%m_%d")

# MLflow tracking server hosted on DagsHub for this project.
remote_server_uri = "https://dagshub.com/amrirasyidi/air_quality_forecasting.mlflow"

# Credentials are read from the environment and must never be written into the
# notebook. Export MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD before
# starting Jupyter. The access token that used to be hard-coded in this cell
# has been exposed and should be revoked and replaced.
has_remote_credentials = all(
    os.environ.get(variable)
    for variable in ("MLFLOW_TRACKING_USERNAME", "MLFLOW_TRACKING_PASSWORD")
)
if has_remote_credentials:
    # Select the tracking server BEFORE looking up the experiment and starting
    # the run, so the run, its metrics and the logged model share one store.
    mlflow.set_tracking_uri(remote_server_uri)
else:
    print(
        "MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD are not set; "
        "logging to the default MLflow tracking location instead of DagsHub."
    )

# Reuse the experiment if it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    experiment.experiment_id
    if experiment is not None
    else mlflow.create_experiment(experiment_name)
)

with mlflow.start_run(experiment_id=experiment_id):
    # Train the model and collect the average train/test loss values.
    epoch_avg_train_loss, epoch_avg_test_loss = engine.train(
        model=model,
        train_dataloader=train_data_loader,
        test_dataloader=test_data_loader,
        optimizer=optimizer,
        loss_fn=criterion,
        epochs=epochs,
        device=device,
    )

    # batch_size is an integer, so format it with {:d} rather than {:f}.
    print("LSTNET model (learning_rate={:f}, batch_size={:d}):".format(lr, batch_size))
    print("  Epoch average training loss: %s" % epoch_avg_train_loss)
    print("  Epoch average test loss: %s" % epoch_avg_test_loss)

    # Log hyperparameters
    mlflow.log_params({"learning_rate": lr, "batch_size": batch_size})

    # Log one metric point per epoch instead of only the first epoch.
    # NOTE(review): assumes engine.train returns two equal-length sequences
    # with one loss value per epoch - confirm against engine.train.
    for epoch, (train_loss, test_loss) in enumerate(
        zip(epoch_avg_train_loss, epoch_avg_test_loss)
    ):
        mlflow.log_metrics(
            {"train_loss": train_loss, "test_loss": test_loss},
            step=epoch,
        )

    # # Log additional artifacts
    # mlflow.log_artifact("path/to/your/training_plots.png")

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # Model registry does not work with file store
    if tracking_url_type_store != "file":
        # Register the model
        # There are other ways to use the Model Registry, which depends on the use case,
        # please refer to the doc for more information:
        # https://mlflow.org/docs/latest/model-registry.html#api-workflow
        mlflow.pytorch.log_model(
            model, "temp_model", registered_model_name="test_model"
        )
    else:
        mlflow.pytorch.log_model(model, "temp_model")