In [1]:
import sys
import os
import pathlib
import glob
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from dateutil import parser
# from datetime import datetime, timedelta

import matplotlib.pyplot as plt
# import seaborn as sns
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
%load_ext autoreload
%autoreload 2
from helper_functions import epa_taiwan_data_pipeline

In [3]:
# Pick the compute device. `torch.cuda.is_available` must be *called*: the bare
# function object is always truthy, which would select "cuda" even on a
# CPU-only machine and make later `.to(device)` calls fail.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
# Project directories, all derived from the parent of the current working
# directory.
root_dir = pathlib.Path.cwd().parent
raw_data_dir = root_dir.joinpath("data", "0_raw")
processed_data_dir = root_dir.joinpath("data", "1_processed")
experiment_dir = root_dir.joinpath("experiment")

# Data Loading and Preprocessing

- Import the data
- Preprocess the data
- Turn the data into tensors

In [7]:
# Which station/year to load and which raw columns to keep.
year = 2018
site_name = "Banqiao"
columns = [
    "SiteEngName",
    "PM2.5",
    "AMB_TEMP",
    "CH4",
    "CO",
    "NMHC",
    "read_time",
]

# Load the raw EPA records for the selected site and year.
raw_site_df = epa_taiwan_data_pipeline.import_epa_data(site_name=site_name, year=year)

# Keep only the selected columns, then apply the project's basic preprocessing.
pm25_df = epa_taiwan_data_pipeline.standardize_df(raw_site_df[columns])

In [9]:
# Make sure the time difference between consecutive readings is always constant.
# The gaps are computed on a temporary Series instead of a helper column, so
# `pm25_df` is never modified and the cell can be re-run safely.
# `drop_duplicates().tolist()` yields a plain list of scalars, which avoids
# relying on the pandas-version-dependent return type of `Series.unique()`.
time_diff_list = pm25_df["read_time"].diff().dropna().drop_duplicates().tolist()

if not time_diff_list:
    # Fewer than two rows: there is no gap to measure.
    print("not enough rows to compute a time difference")
elif len(time_diff_list) > 1:
    # Irregular sampling: show every distinct gap that was found.
    print(f"{time_diff_list}")
else:
    print(f"time difference is always: {time_diff_list[0]}")

time difference is always: 0 days 01:00:00


## Feature Engineering

# Training Pipeline

- Prepare the model
- Initialize the loss function and the optimizer
- Training process
- Plot the loss curve

In [10]:
# Fraction of rows used for training. The split takes the leading rows of
# `pm25_df` without shuffling.
train_split = 0.6
train_size = int(len(pm25_df) * train_split)

# `.copy()` gives `train_data` its own storage. Without it the frame is a slice
# of `pm25_df`, and adding columns to it later triggers SettingWithCopyWarning
# (and, depending on the pandas version, ambiguous write behaviour).
train_data = pm25_df.iloc[:train_size, :].copy()
print(f"All data length:{len(pm25_df)} \nTrain data length:{len(train_data)}")
train_data.tail()

All data length:8760 
Train data length:5256


Unnamed: 0,siteengname,pm2.5,amb_temp,ch4,co,nmhc,read_time
399077,Banqiao,11.0,31.4,1.8,0.41,0.13,2018-08-07 19:00:00
399153,Banqiao,11.0,30.8,1.8,0.35,0.1,2018-08-07 20:00:00
399229,Banqiao,10.0,30.4,1.8,0.31,0.09,2018-08-07 21:00:00
399305,Banqiao,8.0,30.2,1.8,0.3,0.08,2018-08-07 22:00:00
399381,Banqiao,9.0,29.9,1.8,0.23,0.06,2018-08-07 23:00:00


In [None]:
# Columns to min-max scale. The names must match the standardized frame
# ('pm2.5' and 'amb_temp'); 'pm25' / 'ambtemp' do not exist and raise KeyError.
normalized_columns = ['pm2.5', 'amb_temp', 'ch4', 'co', 'nmhc']

# Build every scaled column first, then attach them in one step with `assign`.
# `assign` returns a new frame, so nothing is written into a slice of `pm25_df`
# and re-running the cell recomputes the same values from the raw columns.
normalized_features = {}
for column in normalized_columns:
    column_min = train_data[column].min()
    column_range = train_data[column].max() - column_min
    # NOTE: a constant column has a zero range and comes out as NaN.
    normalized_features[column + '_normalized'] = (train_data[column] - column_min) / column_range

train_data = train_data.assign(**normalized_features)

# Experimentation

- Prepare different scenarios
- Log the experiments
- Monitor the results with MLflow or TensorBoard

# Inferencing

- Load the best model
- Prepare the test data
- Save the result

# Deployment