In [2]:
import sys
import os
import pathlib
import glob
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from tqdm.auto import tqdm
from dateutil import parser
# from datetime import datetime, timedelta

import matplotlib.pyplot as plt
# import seaborn as sns
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
%load_ext autoreload
%autoreload 2

from helper_functions import epa_taiwan_data_pipeline
from models import lstnet_gokul, lstnet_laigoukun

In [4]:
# Pick the compute device. `torch.cuda.is_available` is a function and has to be
# called: the bare function object is always truthy, which selected "cuda" even
# on machines without a usable GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
# Directory layout, resolved relative to the parent of the current working
# directory.
root_dir = pathlib.Path.cwd().parent
raw_data_dir = root_dir.joinpath("data", "0_raw")
processed_data_dir = root_dir.joinpath("data", "1_processed")
experiment_dir = root_dir.joinpath("experiment")

# Data Loading and Preprocessing

- Import the data
- Feature engineering
- Convert the data to tensors

## Import the data

In [6]:
# Which station/year to load and which raw columns to keep.
year = 2018
site_name = "Banqiao"
columns = [
    "SiteEngName",
    "PM2.5",
    "AMB_TEMP",
    "CH4",
    "CO",
    "NMHC",
    "read_time",
]

# Load the selected columns, then run the pipeline's basic preprocessing on them.
# NOTE(review): standardize_df is defined in helper_functions; judging by the
# frame shown further down it presumably lower-cases the column names - confirm.
pm25_df = epa_taiwan_data_pipeline.standardize_df(
    epa_taiwan_data_pipeline.import_epa_data(site_name=site_name, year=year)[columns]
)

## Feature engineering

In [7]:
# Positional split: the first `train_split` fraction of the rows is used for
# training.
train_split = 0.6

# Take an explicit copy. New columns are assigned to `train_data` later in the
# notebook; doing that on a bare slice of `pm25_df` is ambiguous (view vs. copy)
# and only runs quietly because the chained-assignment warning is disabled.
train_size = int(len(pm25_df) * train_split)
train_data = pm25_df.iloc[:train_size, :].copy()
print(f"All data length:{len(pm25_df)} \nTrain data length:{len(train_data)}")
train_data.tail()

All data length:8760 
Train data length:5256


Unnamed: 0,siteengname,pm2.5,amb_temp,ch4,co,nmhc,read_time
399077,Banqiao,11.0,31.4,1.8,0.41,0.13,2018-08-07 19:00:00
399153,Banqiao,11.0,30.8,1.8,0.35,0.1,2018-08-07 20:00:00
399229,Banqiao,10.0,30.4,1.8,0.31,0.09,2018-08-07 21:00:00
399305,Banqiao,8.0,30.2,1.8,0.3,0.08,2018-08-07 22:00:00
399381,Banqiao,9.0,29.9,1.8,0.23,0.06,2018-08-07 23:00:00


In [8]:
# Min-max scale each listed column of the training frame into a new
# "<column>_normalized" column.
normalized_columns = ['pm2.5', 'amb_temp', 'ch4', 'co', 'nmhc']

for column in normalized_columns:
    normalized_column_name = column + '_normalized'
    # Compute the statistics once per column instead of once per use.
    column_min = train_data[column].min()
    column_range = train_data[column].max() - column_min
    if column_range == 0:
        # A constant column would give 0/0 = NaN for every row, and those NaNs
        # would end up in the model inputs. Map it to 0.0 instead.
        train_data[normalized_column_name] = 0.0
    else:
        train_data[normalized_column_name] = (train_data[column] - column_min) / column_range

## Convert to tensor

In [9]:
class AqiDataset(Dataset):
    """Sliding-window dataset over the rows of a DataFrame.

    Sample ``i`` pairs the ``history_len`` consecutive rows starting at
    position ``i`` (the features) with the single row that immediately follows
    that window (the target). Rows are addressed by position, not by index
    label.

    Args:
        data: DataFrame containing every column named in ``x_cols`` and
            ``y_cols``.
        history_len: Number of consecutive rows in each feature window.
        x_cols: Feature column names. Defaults to the five ``*_normalized``
            columns in ``DEFAULT_X_COLS``.
        y_cols: Target column names. Defaults to ``['pm2.5_normalized']``.

    Raises:
        ValueError: If ``history_len`` is negative.
    """

    DEFAULT_X_COLS = (
        'pm2.5_normalized',
        'amb_temp_normalized',
        'ch4_normalized',
        'co_normalized',
        'nmhc_normalized',
    )
    DEFAULT_Y_COLS = ('pm2.5_normalized',)

    def __init__(self, data, history_len, x_cols=None, y_cols=None):
        if history_len < 0:
            raise ValueError(f"history_len must be non-negative, got {history_len}")
        self.data = data
        self.history_len = history_len
        self.x_cols = list(self.DEFAULT_X_COLS if x_cols is None else x_cols)
        self.y_cols = list(self.DEFAULT_Y_COLS if y_cols is None else y_cols)

    def __len__(self):
        """Return the number of (window, target) pairs, never less than zero."""
        # Clamp at zero: len() raises ValueError on a negative result, which is
        # what happened when the frame was shorter than one window.
        # The `len` attribute is still assigned for code that reads it.
        self.len = max(len(self.data) - self.history_len, 0)
        return self.len

    def __getitem__(self, index):
        """Return ``(x, y)`` float tensors for the sample at ``index``.

        ``x`` has shape ``(history_len, len(x_cols))`` and ``y`` has shape
        ``(len(y_cols),)``. Negative indices count from the end.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if index < 0:
            # Without this, a negative start combined with a positive stop
            # silently produced an empty or misaligned window.
            index += n_samples
        if not 0 <= index < n_samples:
            raise IndexError(f"index out of range for dataset with {n_samples} samples")

        window_end = index + self.history_len
        x = self.data.iloc[index:window_end][self.x_cols].values
        # A single row may come back with object dtype when the frame also holds
        # non-numeric columns, hence the explicit cast.
        y = self.data.iloc[window_end][self.y_cols].values.astype('float')
        x = torch.tensor(x).float()
        y = torch.tensor(y).float()
        return x, y

In [10]:
# Sanity check: build a dataset with a 48-row window, then print its length and
# the tensor shapes of its first sample.
temp_train_dataset = AqiDataset(data=train_data, history_len=48)
print(len(temp_train_dataset))

x, y = temp_train_dataset[0]
print(x.shape, y.shape)

5208
torch.Size([48, 5]) torch.Size([1])


In [11]:
# Wrap the dataset in a DataLoader (batches of 4) and print the shapes of the
# first batch.
temp_train_data_loader = DataLoader(dataset=temp_train_dataset, batch_size=4)

first_batch = next(iter(temp_train_data_loader))
X, Y = first_batch
print(X.shape, Y.shape)

torch.Size([4, 48, 5]) torch.Size([4, 1])


# Training Pipeline

- Prepare the model
- Initialize the loss function and the optimizer
- Training process
- Plot the loss curve

In [12]:
# test the model
# Smoke test: build LSTNet with its default arguments and push one batch through
# it so the output shape can be compared with the target shape.
temp_model = lstnet_gokul.LSTNet()

for X, Y in temp_train_data_loader:
    print(X.shape)
    out = temp_model(X)
    print(Y.shape, out.shape)
    break  # one batch is enough for a shape check

torch.Size([4, 48, 5])
torch.Size([4, 1]) torch.Size([4, 1])


In [13]:
# Baseline settings for a single run.
history_len = 24 * 2  # look-back window, in rows
batch_size = 8
epochs = 10

# Optimizer settings (presumably learning rate and L2 weight decay - confirm
# against the training loop).
lr = 1e-2
weight_decay = 1e-2

# Experimentation

- Prepare different scenarios
    - [x] number of epochs --> [50, 100, 200]
    - [x] lookback periods --> [24, 24*2, 24*7, 24*30]
    - [x] batch size --> [16, 64, 128]
    - [x] loss function --> [MSE (nn.MSELoss()), MAE (nn.L1Loss()), Huber Loss (nn.SmoothL1Loss())]
- Log the experiment
- Monitor the results with MLflow or ~~TensorBoard~~

In [14]:
# Candidate values for each experiment dimension.
# NOTE(review): `epochs` was a plain int in the previous cell and is rebound to a
# list here - anything that runs after this cell sees the list.
HOURS_PER_DAY = 24  # the sample rows shown above are one hour apart
epochs = [10, 20, 50]
lookback_periods = [
    HOURS_PER_DAY // 2,
    HOURS_PER_DAY,
    HOURS_PER_DAY * 2,
    HOURS_PER_DAY * 7,
]
batch_sizes = [16, 64, 128]
loss_functions = [
    nn.MSELoss(),
    nn.SmoothL1Loss(),
]

# Inference

- Load the best model
- Prepare the test data
- Save the result

# Deployment