In [3]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import lightgbm as lgb

In [4]:
# Model inputs. The order of this list is the column order of the feature
# matrix selected from the train and test frames, so keep it stable.
FEATURE_COLUMNS = [
    # Location
    'lat', 'lon',
    # Weather fields (identifiers look like GRIB/NCL-style variable names;
    # TODO confirm the data source)
    'TMP_P0_L1_GLL0', 'SPFH_P0_2L108_GLL0', 'RH_P0_L4_GLL0',
    'PWAT_P0_L200_GLL0', 'UGRD_P0_L6_GLL0', 'GUST_P0_L1_GLL0',
    'PRES_P0_L7_GLL0',
    # Land-use columns (presumably per-class coverage; verify against the dataset)
    'CultivatedLand', 'WoodLand', 'GrassLand', 'Waters',
    'UrbanRural', 'UnusedLand', 'Ocean',
    # Terrain and aerosol
    'ELEVATION', 'AOD',
    # Calendar
    'month', 'year', 'weekday',
]

# Columns to predict; each one is modelled independently.
target_columns = [
    'o3',
    'pm10',
    'pm2_5',
    'no2',
]

# Read the train and test splits from CSV files under ./dataset
# (paths are relative to the notebook's working directory).
train_set = pd.read_csv(filepath_or_buffer='./dataset/train_set.csv')
test_set = pd.read_csv(filepath_or_buffer='./dataset/test_set.csv')

In [5]:
# Parameter set passed to lgb.train for every target.
# GPU training is currently switched off; the disabled options were
# 'device': 'gpu', 'gpu_platform_id': 0 and 'gpu_device_id': 0.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    # Tree shape
    num_leaves=1500,
    max_depth=20,
    # Step size
    learning_rate=0.05,
    # Column / row subsampling
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    # Logging level
    verbose=-1,
)

In [6]:
def train_lgb(train_set, target_column, num_rounds=800, lgb_params=None):
    """
    Train a LightGBM model on the given training set for a specified target column.

    The feature matrix is `train_set[FEATURE_COLUMNS]` and the label is
    `train_set[target_column]`.

    Args:
    train_set (pd.DataFrame): The training dataset.
    target_column (str): The name of the target column.
    num_rounds (int): Number of boosting rounds. Defaults to 800.
    lgb_params (dict or None): LightGBM parameters. When None, the
        module-level `params` dict is used.

    Returns:
    lgb.Booster: Trained LightGBM model.
    """
    # Fall back to the notebook-wide configuration unless the caller overrides it.
    active_params = params if lgb_params is None else lgb_params

    # Prepare the training data
    X_train = train_set[FEATURE_COLUMNS]
    y_train = train_set[target_column]

    # Create LightGBM dataset
    train_data = lgb.Dataset(X_train, label=y_train)

    # Train the model; the round count is passed by keyword so it cannot be
    # confused with any other positional argument of lgb.train.
    return lgb.train(active_params, train_data, num_boost_round=num_rounds)

In [7]:
def predict_lgb(model, test_set, target_column):
    """
    Run a trained model over the test set and return predictions with the ground truth.

    Args:
    model (lgb.Booster): Trained LightGBM model; only its `predict` method is used.
    test_set (pd.DataFrame): The test dataset. Must contain every column in
        FEATURE_COLUMNS as well as `target_column`.
    target_column (str): The name of the target column.

    Returns:
    tuple: `(y_pred, y_test)` where
        - y_pred: whatever `model.predict` returns for the feature columns.
        - y_test (pd.Series): the `target_column` values taken from `test_set`.
    """
    # Split the frame into model inputs and the observed target.
    features = test_set[FEATURE_COLUMNS]
    actual = test_set[target_column]

    # Score the feature matrix and hand both pieces back to the caller.
    return model.predict(features), actual

In [10]:
def evaluate_model(y_test, y_pred):
    """
    Evaluate the model performance using various metrics.

    Both inputs are converted to float NumPy arrays before any metric is
    computed, so the result is the same whether the caller passes a pandas
    Series (with any index) or a plain array.

    Args:
    y_test (pd.Series or array-like): Actual values.
    y_pred (np.array or array-like): Predicted values, aligned positionally
        with `y_test`.

    Returns:
    dict: A dictionary containing the evaluation metrics:
        - "RMSE": square root of the mean squared error.
        - "SMAPE": symmetric mean absolute percentage error, in percent.
          A sample where actual and predicted are both zero contributes 0.
        - "MAE": mean absolute error.
        - "R2": coefficient of determination.
    """
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)

    # SMAPE term is 2|p - a| / (|p| + |a|). The denominator is zero only when
    # both values are zero, in which case the error is zero too; treat that
    # term as 0 instead of letting 0/0 turn the whole metric into NaN.
    numerator = 2 * np.abs(predicted - actual)
    denominator = np.abs(predicted) + np.abs(actual)
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    smape = np.mean(terms) * 100

    return {
        "RMSE": rmse,
        "SMAPE": smape,
        "MAE": mae,
        "R2": r2
    }

In [12]:

# Fit, score, persist and report one model per target column.
for target_column in target_columns:
    print(f"\n--- Processing {target_column} ---")

    # Fit on the training split, then predict the held-out split.
    lgb_model = train_lgb(train_set, target_column)
    y_pred, y_test = predict_lgb(lgb_model, test_set, target_column)

    # Write actual vs. predicted values to '<target>_results.csv' in the
    # current working directory (no index column).
    results_df = pd.DataFrame({'y_test': y_test, 'predictions': y_pred})
    results_df.to_csv(f'{target_column}_results.csv', index=False)

    # Compute the evaluation metrics and print one "name: value" line each.
    metrics = evaluate_model(y_test, y_pred)
    for metric, value in metrics.items():
        print(f"{metric}: {value}")


--- Processing o3 ---
RMSE: 13.182913084622786
SMAPE: 18.55019421688177
MAE: 9.982861611922699
R2: 0.7838088284058791

--- Processing pm10 ---
RMSE: 30.139230066977923
SMAPE: 26.134141942020918
MAE: 15.499543182866114
R2: 0.7518057614399709

--- Processing pm2_5 ---
RMSE: 15.83370898405577
SMAPE: 28.54110193432686
MAE: 9.002091174875636
R2: 0.7690399196693183

--- Processing no2 ---
RMSE: 8.068337885363944
SMAPE: 28.418755811008158
MAE: 6.023987340566541
R2: 0.7271012637136374
