# **2025 ML FALL HW1: PM2.5 Prediction (Regression)**

Author: MLTAs

Methods:
* Training with all data
* Optimizer: RMSProp (default)
* TODOs:
  - Complete the `valid()` function
  - Tune the hyperparameters in `train_config`
  - Implement 2nd-order polynomial regression model (without interaction terms) in `minibatch_2()`
  - Implement feature normalization in `normalize_train_data()`
  - Feature selection



# **Import Some Packages**

In [19]:
import numpy as np
import csv
import math
import pandas as pd
import os

# **Fix random seed**


This is for the reproduction of your result. **DO NOT modify this secton!**


In [20]:
seed = 9487
np.random.seed(seed)

# **Download training data**


In [21]:
# !gdown --id "1Hfzrcm69QwdFvdeF0uASoQlcVxKw_hHy" --output "train.csv"
# !gdown --id '155N6fzI7vAFzHAGdy6jkaWIksWH6Y1G2' --output "test.csv"

# Incase the links above die, you can use the following instead.
#!gdown --id '11abE854Eyv4BA7qt5k8r_80sJ3KuOQUN' --output "train.csv"
#!gdown --id '1uod-Z4ztluXnuHtgUbm39nMudUKqXHMl' --output "test.csv"

# If the data is still missing, you can manually download it from kaggle, and upload the files under /content

In [22]:
def valid(x, y):
  # TODO: Try to filter out extreme values.
  #  ex: If PM2.5 > 100, then we don't use the data to train (return False), otherwise return True,

  

  return True


# Create your dataset
def parse2train(data, feats):

  x = []
  y = []

  # Use data #0~#7 to predict #8 => Total data length should be decresased by 8.
  total_length = data.shape[1] - 8

  for i in range(total_length):
    x_tmp = data[feats, i:i+8] # Use data #0~#7 to predict #8, data #1~#8 to predict #9, etc.
    y_tmp = data[-1, i+8] # last column of (i+8)th row: PM2.5

    # Filter out extreme values to train.
    if valid(x_tmp, y_tmp):
      x.append(x_tmp.reshape(-1,))
      y.append(y_tmp)

  # x.shape: (n, 15, 8)
  # y.shape: (n, 1)
  x = np.array(x)
  y = np.array(y)

  return x,y


#**Gradient descent**
###**RMSProp**
1. $v_t=\beta \cdot v_{t-1} + (1-\beta)(\nabla w_t)^2$
2. $w_{t+1}=w_t - \frac{\eta}{\sqrt{(v_t)}+\epsilon}\nabla w_t$




* This is our gradient descent algorithm. RMSProp was implemented in `minibatch()`.
* You can implement other algorithm, such as SGD or other gradient descent variants listed below, which may (or may not) improve performance.
* However, **modules like sklearn and pytorch are not allowed!!!**
* Ref:
  - Prof. G. Hinton's lecture: https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
  - Prof. Hung-Yi Lee's video: https://youtu.be/HYUXEeh3kwY?si=RtLjSj51WK1pmz87

###**Adam (RMSProp + Momemtum)**
* Ref:
  - Paper: https://arxiv.org/pdf/1412.6980
  - Prof. Hung-Yi Lee's video: https://youtu.be/HYUXEeh3kwY?si=RtLjSj51WK1pmz87

###**AdamW (Adam with decoupled weight decay)**
* Ref:
  - Paper: https://arxiv.org/pdf/1711.05101




In [23]:
def minibatch(x, y, config):
    # Randomize the data in minibatch
    index = np.arange(x.shape[0])
    np.random.shuffle(index)
    x = x[index]
    y = y[index]

    # Initialization
    batch_size = config.batch_size
    lr = config.lr
    epoch = config.epoch
    decay_rate = config.decay_rate
    epsilon = 1e-8

    # Linear regression: only contains two parameters (w, b).
    w = np.full(x[0].shape, 0.1).reshape(-1, 1)
    w
    bias = 0.1

    # Optimizer states
    cache_w = np.zeros_like(w)
    cache_b = 0.0

    # Training loop
    for num in range(epoch):
        epoch_loss = 0
        count = 0
        for b in range(int(x.shape[0] / batch_size)):
            x_batch = x[b * batch_size:(b + 1) * batch_size]
            y_batch = y[b * batch_size:(b + 1) * batch_size].reshape(-1, 1)

            # Prediction of linear regression
            pred = np.dot(x_batch, w) + bias

            # Loss
            loss = y_batch - pred

            # Compute gradient
            g_t = np.dot(x_batch.transpose(), loss) * (-2)
            g_t_b = loss.sum(axis=0) * (-2)

            # Update cache
            cache_w = decay_rate * cache_w + (1 - decay_rate) * g_t**2
            cache_b = decay_rate * cache_b + (1 - decay_rate) * g_t_b**2

            # Update weight & bias
            w -= lr * g_t / (np.sqrt(cache_w) + epsilon)
            bias -= lr * g_t_b / (np.sqrt(cache_b) + epsilon)

            # Accumulate loss for RMSE
            epoch_loss += np.sum(loss ** 2)
            count += y_batch.shape[0]

        rmse = np.sqrt(epoch_loss / count)
        print(f"Epoch {num+1}/{epoch}, RMSE: {rmse:.4f}")

    return w, bias

# TODO: Implement 2-nd polynomial regression version for the report.
def minibatch_2(x, y, config):
    # Randomize the data in minibatch
    index = np.arange(x.shape[0])
    np.random.shuffle(index)
    x = x[index]
    y = y[index]
    
    # Initialization
    batch_size = config.batch_size
    lr = config.lr
    epoch = config.epoch
    decay_rate = config.decay_rate
    epsilon = 1e-8
    
    # Polynomial regression
    w2 = np.full(x[0].shape, 0.1).reshape(-1, 1)
    w1 = np.full(x[0].shape, 0.1).reshape(-1, 1)
    bias = 0.1

    # Optimizer states
    cache_w2 = np.zeros_like(w2)
    cache_w1 = np.zeros_like(w1)
    cache_b = 0.0
    
    # Training loop
    for num in range(epoch):
        epoch_loss = 0
        count = 0
        for b in range(int(x.shape[0] / batch_size)):
            x_batch = x[b * batch_size:(b + 1) * batch_size]
            y_batch = y[b * batch_size:(b + 1) * batch_size].reshape(-1, 1)
            
            # Prediction of polynomial regression
            pred = np.dot(x_batch**2, w2) + np.dot(x_batch, w1) + bias
            
            # Loss
            loss = y_batch - pred
            
            # Compute gradient - Fixed the gradient computation
            g_t2 = np.dot((x_batch**2).transpose(), loss) * (-2)  # Gradient for w2
            g_t1 = np.dot(x_batch.transpose(), loss) * (-2)      # Gradient for w1
            g_t_b = loss.sum(axis=0) * (-2)                      # Gradient for bias
            
            # Update cache
            cache_w2 = decay_rate * cache_w2 + (1 - decay_rate) * g_t2**2
            cache_w1 = decay_rate * cache_w1 + (1 - decay_rate) * g_t1**2
            cache_b = decay_rate * cache_b + (1 - decay_rate) * g_t_b**2
            
            # Update weights & bias
            w2 -= lr * g_t2 / (np.sqrt(cache_w2) + epsilon)
            w1 -= lr * g_t1 / (np.sqrt(cache_w1) + epsilon)
            bias -= lr * g_t_b / (np.sqrt(cache_b) + epsilon)
            
            # Accumulate loss for RMSE
            epoch_loss += np.sum(loss ** 2)
            count += y_batch.shape[0]
        rmse = np.sqrt(epoch_loss / count)
        print(f"Epoch {num+1}/{epoch}, RMSE: {rmse:.4f}")
    
    # Calculate RMSE on the entire training set
    total_pred = np.dot(x**2, w2) + np.dot(x, w1) + bias
    total_loss = y.reshape(-1, 1) - total_pred
    total_rmse = np.sqrt(np.mean(total_loss ** 2))
    print(f"Total Training RMSE: {total_rmse:.4f}")
    return w2, w1, bias

In [24]:
from argparse import Namespace

# TODO: Tune the config to boost your performance.
train_config = Namespace(
    batch_size = 16,
    lr = 0.001,
    epoch = 200,
    decay_rate = 0.9
)

use_norm = True

# **Training your regression model**

In [25]:
train_df = pd.read_csv("train.csv")
train_df

Unnamed: 0,AMB_TEMP,CO,NO,NO2,NOx,O3,PM10,WS_HR,RAINFALL,RH,SO2,WD_HR,WIND_DIREC,WIND_SPEED,PM2.5
0,10.8,0.32,1.7,8.6,10.3,22.9,21,0.6,0.0,71,1.9,172,171,0.6,15
1,10.8,0.27,1.6,6.2,7.8,23.8,20,1.4,0.0,71,1.7,161,129,1.8,13
2,11.0,0.25,0.9,5.4,6.3,27.4,21,0.8,0.0,68,1.6,152,147,1.5,12
3,11.0,0.23,0.7,3.1,3.8,29.5,21,1.8,0.0,68,1.6,138,145,1.7,9
4,11.3,0.22,0.8,2.9,3.8,30.7,16,1.9,0.0,67,1.6,140,139,1.7,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5769,29.0,0.41,1.2,14.0,15.3,23.0,21,1.4,0.0,74,2.8,149,168,2.0,14
5770,28.2,0.33,1.7,11.7,13.5,19.5,23,2.1,0.0,78,2.3,187,179,2.5,15
5771,28.0,0.29,1.3,9.1,10.4,17.6,17,1.5,0.0,78,2.0,173,200,1.5,13
5772,28.0,0.27,1.4,9.5,11.0,15.4,17,1.1,0.0,75,1.8,171,135,0.9,10


In [26]:
# TODO: Normalize each column (except PM2.5) for the report (use z-score normalization)
def normalize_train_data(df):
    """
    Steps:
    1. For each column (except PM2.5): calculate mean and std
    2. Apply standardization: (column - mean) / std
    3. Store normalization parameters for later use on test data

    Returns:
        normalized_df: DataFrame with normalized features
        norm_params: Dict with {'column': {'mean': X, 'std': Y}}

    Hint: Loop through data.columns, skip PM2.5
    """
    # Your implementation here
    normalize_df = df.copy()
    norm_params = {}
    for col in df.columns:
        if col != 'PM2.5':
            mean = df[col].mean()
            std = df[col].std()
            normalize_df[col] = (df[col] - mean) / std
            norm_params[col] = {'mean': mean, 'std': std}
    return normalize_df, norm_params

In [27]:
# Choose your features to train.
# Hint:
# 1. You can select more than one feature.
# 2. You should select "good" features.

# TODO: Carefully justify which feature should be chosen.
# feats = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
feats = [11, 14]

In [28]:
# Training data preprocessing
def train_processing(train_df, norm=False):
    """Process training train_df with optional normalization"""

    if norm:
        # Normalize training data and save parameters (mean & std)
        data_norm, norm_params = normalize_train_data(train_df)
        data_values = data_norm.values
    else:
        # Use raw training data
        data_values = train_df.values
        norm_params = None

    # Common processing steps
    train_data = np.transpose(np.array(np.float64(data_values)))
    train_x, train_y = parse2train(train_data, feats)

    return train_x, train_y, norm_params

train_x, train_y, norm_params = train_processing(train_df, norm=use_norm)

In [29]:
w, bias = minibatch(train_x, train_y, train_config)
print(w.shape, bias)

Epoch 1/200, RMSE: 4.9434
Epoch 2/200, RMSE: 4.3302
Epoch 3/200, RMSE: 4.0484
Epoch 4/200, RMSE: 3.9133
Epoch 5/200, RMSE: 3.8387
Epoch 6/200, RMSE: 3.7924
Epoch 7/200, RMSE: 3.7618
Epoch 8/200, RMSE: 3.7408
Epoch 9/200, RMSE: 3.7260
Epoch 10/200, RMSE: 3.7153
Epoch 11/200, RMSE: 3.7074
Epoch 12/200, RMSE: 3.7016
Epoch 13/200, RMSE: 3.6971
Epoch 14/200, RMSE: 3.6938
Epoch 15/200, RMSE: 3.6911
Epoch 16/200, RMSE: 3.6891
Epoch 17/200, RMSE: 3.6875
Epoch 18/200, RMSE: 3.6862
Epoch 19/200, RMSE: 3.6852
Epoch 20/200, RMSE: 3.6844
Epoch 21/200, RMSE: 3.6838
Epoch 22/200, RMSE: 3.6832
Epoch 23/200, RMSE: 3.6828
Epoch 24/200, RMSE: 3.6824
Epoch 25/200, RMSE: 3.6822
Epoch 26/200, RMSE: 3.6819
Epoch 27/200, RMSE: 3.6817
Epoch 28/200, RMSE: 3.6816
Epoch 29/200, RMSE: 3.6814
Epoch 30/200, RMSE: 3.6813
Epoch 31/200, RMSE: 3.6812
Epoch 32/200, RMSE: 3.6811
Epoch 33/200, RMSE: 3.6811
Epoch 34/200, RMSE: 3.6810
Epoch 35/200, RMSE: 3.6810
Epoch 36/200, RMSE: 3.6809
Epoch 37/200, RMSE: 3.6809
Epoch 38/2

In [30]:
# # Plot feature to PM2.5 scatter plot
# import matplotlib.pyplot as plt
# plt.figure(figsize=(10, 6))
# plt.scatter(train_df["NO"], train_df["PM2.5"], alpha=0.5)
# plt.title('Feature vs PM2.5 Scatter Plot')
# plt.xlabel('Feature Value')
# plt.ylabel('PM2.5 Value')
# plt.show()

In [31]:
# # Plot each feature to PM2.5 scatter plot and save the figures
# for feature in train_df.columns:
#     if feature != 'PM2.5':
#         plt.figure(figsize=(10, 6))
#         plt.scatter(train_df[feature], train_df["PM2.5"], alpha=0.5)
#         plt.title(f'{feature} vs PM2.5 Scatter Plot')
#         plt.xlabel(f'{feature} Value')
#         plt.ylabel('PM2.5 Value')
#         plt.savefig(f'{feature}_vs_PM2.5_scatter.png')
#         plt.close()

# **Testing:**


In [32]:
def parse2test(data, feats):
  x = []
  for i in range(90):
    x_tmp = data[feats,8*i: 8*i+8]
    x.append(x_tmp.reshape(-1,))

  # x.shape: (n, 15, 8)
  x = np.array(x)
  return x

In [33]:
def normalize_test_data(df, norm_params):
    data_norm = df.copy()

    for col, params in norm_params.items():
        if col in df.columns:
            data_norm[col] = (df[col] - params['mean']) / params['std']

    return data_norm

In [34]:
test_df = pd.read_csv('test.csv')
test_df

Unnamed: 0,AMB_TEMP,CO,NO,NO2,NOx,O3,PM10,WS_HR,RAINFALL,RH,SO2,WD_HR,WIND_DIREC,WIND_SPEED,PM2.5
0,27.5,0.22,0.7,9.0,9.8,13.2,31.0,1.2,0.0,79.0,1.7,180.0,171.0,1.2,20.0
1,27.2,0.17,0.4,5.0,5.4,15.7,20.0,1.5,0.0,79.0,1.6,192.0,187.0,1.9,8.0
2,26.8,0.17,0.4,4.3,4.8,12.8,16.0,1.6,0.0,81.0,1.3,181.0,180.0,1.8,9.0
3,26.7,0.19,0.4,4.1,4.5,12.0,21.0,1.7,0.0,80.0,1.5,179.0,188.0,2.3,6.0
4,26.4,0.22,0.4,4.1,4.6,10.1,23.0,2.2,0.0,81.0,1.5,184.0,186.0,1.9,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,16.0,0.26,0.3,3.9,4.2,47.1,34.0,2.7,0.0,70.0,0.5,130.0,133.0,2.8,13.0
716,15.6,0.25,0.4,3.3,3.7,44.1,27.0,3.0,0.0,74.0,0.6,136.0,131.0,3.2,15.0
717,15.7,0.24,0.4,3.7,4.1,44.1,29.0,2.9,0.0,73.0,0.6,133.0,129.0,2.7,12.0
718,15.1,0.24,0.6,10.5,11.1,29.9,9.0,0.8,0.0,95.0,0.6,24.0,21.0,1.1,8.0


In [35]:
# Testing data preprocessing
def test_processing(test_df, norm=False, norm_params=norm_params):
    if norm:
        if norm_params is None:
            raise ValueError("norm_params required when norm=True")

        # Apply training normalization parameters to testing data
        data_norm = normalize_test_data(test_df, norm_params)
        data_values = data_norm.values
    else:
        # Use raw testing data
        data_values = test_df.values

    # Common processing steps
    test_data = np.transpose(np.array(np.float64(data_values)))
    test_x = parse2test(test_data, feats)

    return test_x

test_x = test_processing(test_df, norm=use_norm, norm_params=norm_params)

# **Write result as .csv**

---



In [36]:
with open('my_sol.csv', 'w', newline='') as csvf:
  writer = csv.writer(csvf)
  writer.writerow(['Id','Predicted'])

  print(test_x.shape)
  for i in range(int(test_x.shape[0])):
    # Prediction of linear regression
    prediction = (np.dot(np.reshape(w,-1),test_x[i]) + bias)[0]
    writer.writerow([i, prediction])

(90, 16)
