Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

# Time Series Air Quality Data of India (2010-2023)

Dataset source: https://www.kaggle.com/datasets/abhisheksjha/time-series-air-quality-data-of-india-2010-2023/

*The data preprocessing in this notebook is adapted from [this Kaggle notebook](https://www.kaggle.com/code/sotiristzamaras/india-s-air-quality-eda-ensemble-forecasting-pt-1).*

<a id="1"></a>
> <h1 style = 'font-family: Times New Roman'><b> <b style = 'color: #42c2f5'>1.</b> Import Necessary Libraries </b></h1>

In [None]:
import os
import sys
import time
import glob
import warnings
warnings.filterwarnings("ignore")

import numpy as np   
import pandas as pd  


# Relative prefix pointing two directory levels above the working directory.
path_append = "../../"
sys.path.append(path_append)  # Add that location (two levels up) to the module search path.

In [None]:
dataset_name = "Time Series Air Quality Data of India"

# Folder holding the dataset files, expressed relative to `path_append`.
DATASET_SRC = path_append + '../data/Time Series Air Quality Data of India (2010-2023)'

# Station metadata; three columns are discarded right after loading.
df_states = (
    pd.read_csv(f'{DATASET_SRC}/stations_info.csv')
    .drop(columns=['agency', 'station_location', 'start_month'])
)
df_states.head()

In [None]:
# Distinct values found in the 'state' column of the station metadata.
unique_states = df_states['state'].unique()
unique_states

In [None]:
def combine_state_df(state_name):
    '''
    Combine all files of one state into a single dataframe, attaching the city information.

    The state code is taken from the first two characters of the first
    ``file_name`` registered for the state in ``df_states``; every
    ``<state code>*.csv`` file under ``DATASET_SRC`` is then loaded, tagged with
    the city of its station and concatenated.

    Parameters
    ----------
        state_name (str): The name of the state, as listed in ``df_states['state']``

    Return
    ------
        df (DataFrame): The combined dataframe from all files of a specific state

    Raises
    ------
        IndexError: If the state, or one of the matched files, has no row in ``df_states``
        ValueError: If no file matches the state code
    '''

    state_file_names = df_states.loc[df_states['state'] == state_name, 'file_name']
    if state_file_names.empty:
        raise IndexError(f'Unknown state {state_name!r}: no matching row in the station metadata')

    state_code = state_file_names.iloc[0][:2]
    # glob yields matches in arbitrary order; sorting makes the row order of the result reproducible.
    state_files = sorted(glob.glob(f'{DATASET_SRC}/{state_code}*.csv'))
    print(f'Combining a total of {len(state_files)} files...\n')
    if not state_files:
        raise ValueError(f'No files matching {state_code}*.csv found in {DATASET_SRC}')

    combined_df = []

    for state_file in state_files:
        # File name without directory and extension, independent of how DATASET_SRC is written.
        file_name = os.path.splitext(os.path.basename(state_file))[0]
        city_matches = df_states.loc[df_states['file_name'] == file_name, 'city']
        if city_matches.empty:
            raise IndexError(f'File {file_name!r} has no matching row in the station metadata')

        file_df = pd.read_csv(state_file)
        file_df['city'] = city_matches.iloc[0]
        file_df['city'] = file_df['city'].astype('string')
        combined_df.append(file_df)

    return pd.concat(combined_df)

In [None]:
# Load and concatenate every file of the state 'Delhi'.
df = combine_state_df('Delhi')

In [None]:
def create_dt_index(dataframe):
    '''
    Index the frame by the parsed 'From Date' values.

    The 'To Date' column is discarded, 'From Date' is converted to datetime,
    renamed to 'datetime' and used as the index. The input frame is not modified.

    Parameters
    ----------
        dataframe (DataFrame): Frame containing 'From Date' and 'To Date' columns

    Return
    ------
        df (DataFrame): The frame indexed by 'datetime'
    '''
    return (
        dataframe
        .drop(columns='To Date')
        .assign(**{'From Date': lambda frame: pd.to_datetime(frame['From Date'])})
        .rename(columns={'From Date': 'datetime'})
        .set_index('datetime')
    )

# Index the combined frame by its 'From Date' timestamps (the 'To Date' column is dropped).
df = create_dt_index(df)

In [None]:
# Target column name -> columns whose values are folded into it by merge_columns.
# NOTE(review): the NOx and Ozone source columns are labelled ppb while their targets are
# labelled ug/m3; values are copied as they are — confirm that no unit conversion is needed.
_reduction_pairs = [
    ("Xylene (ug/m3)",    ["Xylene ()"]),
    ("MP-Xylene (ug/m3)", ["MP-Xylene ()"]),
    ("Benzene (ug/m3)",   ["Benzene ()"]),
    ("Toluene (ug/m3)",   ["Toluene ()"]),
    ("SO2 (ug/m3)",       ["SO2 ()"]),
    ("NOx (ug/m3)",       ["NOx (ppb)"]),
    ("Ozone (ug/m3)",     ["Ozone (ppb)"]),
    ("AT (degree C)",     ["AT ()"]),
    ("WD (degree)",       ["WD (degree C)", "WD (deg)", "WD ()"]),
    ("WS (m/s)",          ["WS ()"]),
]
reduction_groups = dict(_reduction_pairs)

In [None]:
def merge_columns(dataframe, columns):
    '''
    Merges the records of alternative columns into a single target column.

    For every ``target -> sources`` entry, missing values of the target column are
    filled from each source column in list order, and that source column is then
    dropped. A missing target column is created (filled with NaN) only when at
    least one of its source columns exists. The input DataFrame is left unchanged.

    Parameters
    ----------
        dataframe (DataFrame): The DataFrame to read from
        columns (dict[str, list[str]]): Maps the name of the column to merge records
            into to the list of column names to retrieve records from

    Return
    ------
        dataframe (DataFrame): A new DataFrame holding the merged records, without the source columns
    '''

    # Work on a copy: the column assignments below would otherwise modify the caller's frame
    # until the first `drop` rebinds the local name.
    dataframe = dataframe.copy()

    for column, cols_to_merge in columns.items():
        # Check if the original column exist, otherwise create it
        if column not in dataframe.columns and any(name in dataframe.columns for name in cols_to_merge):
            dataframe[column] = np.nan

        for col_name in cols_to_merge:
            if col_name in dataframe.columns:
                # Only gaps of the target are filled; existing target values take precedence.
                dataframe[column] = dataframe[column].fillna(dataframe[col_name])
                dataframe = dataframe.drop(columns=[col_name])

    return dataframe
# Fold the alternative columns listed in reduction_groups into their target columns.
df = merge_columns(df, reduction_groups)

In [None]:
# Discard rows, then columns, that contain no value at all.
df = (
    df
    .dropna(how='all')
    .dropna(how='all', axis='columns')
)

In [None]:
# Minimum fraction of rows in which a column must be non-missing for the column to be kept.
threshold = 0.6
# `thresh` is documented as an integer count of non-missing values; rounding the product up
# gives the same cut-off as comparing the counts against the raw float.
min_non_missing = int(np.ceil(df.shape[0] * threshold))
df = df.dropna(thresh=min_non_missing, axis=1)

In [None]:
# Average the numeric columns within each 60-minute bin of the datetime index.
df = df.resample('60min').mean(numeric_only=True)

In [None]:
# Replace readings above the cut-offs below with NaN; the gaps are re-filled afterwards.
# NOTE(review): the cut-off values and date ranges are hard-coded — presumably taken from the
# referenced EDA of this data; confirm before reusing them for another state.
df['PM2.5 (ug/m3)'] = df['PM2.5 (ug/m3)'].mask(df['PM2.5 (ug/m3)'].gt(950))
df['CO (mg/m3)'] = df['CO (mg/m3)'].mask(((df.index > '2015') & df['CO (mg/m3)'].gt(35)))
df['Ozone (ug/m3)'] = df['Ozone (ug/m3)'].mask(df['Ozone (ug/m3)'].gt(185))
df['NOx (ug/m3)'] = df['NOx (ug/m3)'].mask((
    ((df.index < '2013') & (df['NOx (ug/m3)'].gt(380))) |
    ((df.index > '2015') & (df.index < '2016') & (df['NOx (ug/m3)'].gt(400))) |
    ((df.index > '2016') & (df['NOx (ug/m3)'].gt(450)))
))
# Carry the last valid observation forward. `interpolate(method='pad')` is deprecated in
# recent pandas releases; `ffill()` is the direct equivalent.
df = df.ffill()
# Values still missing (gaps at the start, which have no earlier observation) fall back to the column mean.
df = df.fillna(df.mean())

In [None]:
# The index already holds datetimes, so no text format has to be supplied for the conversion.
date_time = pd.to_datetime(df.index)
# Seconds since the Unix epoch for every index entry.
timestamp = date_time.map(pd.Timestamp.timestamp)
day = 24*60*60
year = (365.2425)*day  # mean length of a Gregorian year, in seconds

# Encode the position within the day and within the year as sine/cosine pairs.
# Columns are added in the order: Day sin, Day cos, Year sin, Year cos.
for period_name, period_seconds in (('Day', day), ('Year', year)):
    angle = timestamp * (2 * np.pi / period_seconds)
    df[f'{period_name} sin'] = np.sin(angle)
    df[f'{period_name} cos'] = np.cos(angle)

DATETIME_FEATURES = ['Day sin', 'Day cos', 'Year sin', 'Year cos']

In [None]:
# Keep only rows later than 2014-01-01 00:00, compared against an explicit timestamp
# rather than a bare year number.
df = df.loc[df.index > pd.Timestamp('2014-01-01')]

In [None]:
# Preview the first rows of the prepared frame.
df.head()

In [None]:
import torch
from sklearn.model_selection import train_test_split
from tools.preprocessing.template_dataset import TemplateDataset

# Chronological split (shuffle=False keeps the row order): the last 20% of the rows is held out,
# then 25% of the remainder (20% of the total) becomes the validation split.
train_df, testset = train_test_split(df, test_size=0.2, shuffle=False)
train_df, val_df = train_test_split(train_df, test_size=0.25, shuffle=False)

# Inputs are all columns except the last one; the last column is the label.
# NOTE(review): given the columns added earlier in this notebook, the last column is
# 'Year cos' — confirm that this is the intended regression target.
train_df_x = train_df.iloc[:, :-1] # all columns except the last one
train_df_y = train_df.iloc[:, -1:] # only the last column

val_df_x = val_df.iloc[:, :-1] # all columns except the last one
val_df_y = val_df.iloc[:, -1:] # only the last column

print('train df shape: ', train_df.shape)
print('val df shape: ', val_df.shape)
print('test df shape: ', testset.shape)

# Sequence length bounds handed to TemplateDataset; the validation set uses one fixed length.
min_seq_len = 16
max_seq_len = 32
trainset = TemplateDataset(train_df_x, train_df_y, min_seq_len = min_seq_len, max_seq_len = max_seq_len)
valset = TemplateDataset(val_df_x, val_df_y, min_seq_len = max_seq_len, max_seq_len = max_seq_len)

In [None]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub

# Second-axis sizes of the training inputs and labels.
# NOTE(review): presumably the number of feature columns and label columns — confirm against TemplateDataset.
num_features = trainset.X.shape[1]
num_classes = trainset.y.shape[1]
# Describe the data as a regression task with the observation and label sizes read above.
data_config = DataConfig(dataset_name = dataset_name, task_type='regression', obs_shape=[num_features], label_size=num_classes)

# Build the training parameters with MLParameters, then override the error function and the epoch count.
ml_params = MLParameters(model_name = 'gpt')
ml_params.training.error_function = 'mae'
ml_params.training.num_epoch = 1

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
# NOTE(review): use_wandb=True presumably requires a configured Weights & Biases account — confirm before running unattended.
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=True) 

In [None]:

# Run training; the training set is passed first and the validation set second.
trainer_hub.train(trainset, valset)

In [None]:
# Evaluate on the held-out split (`testset`), which is not used anywhere else; the validation
# set has already been passed to `train`. The dataset is built the same way as `valset`.
test_df_x = testset.iloc[:, :-1] # all columns except the last one
test_df_y = testset.iloc[:, -1:] # only the last column
trainer_hub.test(TemplateDataset(test_df_x, test_df_y, min_seq_len = max_seq_len, max_seq_len = max_seq_len))