# Imports

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import json
import time
import os
import numpy as np


In [2]:
# Data directory based on root folder
DATA_DIR = 'data'


# GET ABSOLUTE PATH
head, _ = os.path.split(os.getcwd()) # Get parent directory, from notebooks
DATA_DIR = os.path.join(head, 'data')

# DATA LOADING

In [3]:
def load_data(filename: str, prefix: str="", data_dir: os.PathLike=DATA_DIR):
    """Load data file with a specific prefix"""

    filename = f"{prefix}-{filename}" if len(prefix) else filename

    filepath = os.path.join(data_dir, filename)
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Data file not found: {filepath}. Run scraping first.")

    df = pd.read_csv(filepath)
    df.columns = df.columns.str.strip()
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])

    # if 'date' in df.columns:
    #     df['date'] = pd.to_datetime(df['date'])

    return df

# ============================================================================
# DATA LOADING FUNCTIONS WINDSPEED AND DIRECTION
# ============================================================================

def load_all_stations(input_dir: os.PathLike =DATA_DIR, file_pattern: str=""):
    """Load data for all stations"""
    import glob
    pattern = os.path.join(input_dir, file_pattern)
    station_files = glob.glob(pattern)

    station_dfs = []
    for filepath in station_files:
        filename = os.path.basename(filepath)
        station_dfs.append(load_data(filename, data_dir=input_dir))

    return pd.concat(station_dfs).reset_index(drop=True)

def load_stations_metadata(input_dir: os.PathLike=DATA_DIR):
    """Load stations metadata"""
    filename = os.path.join(input_dir, "stations_metadata.csv")
    if not os.path.exists(filename):
        raise FileNotFoundError(f"Metadata file not found: {filename}")

    return pd.read_csv(filename)


In [4]:
REGIONS_PREFIX = ['east', 'west', 'north', 'south', 'central']

pm25_hourly_df_temp = {region: load_data(f"pm2.5-hourly-230701-251101.csv", prefix=region, data_dir=os.path.join(DATA_DIR, "pm2.5-hourly")) for region in REGIONS_PREFIX}
pm25_hourly_df = []
for region, df in pm25_hourly_df_temp.items():
    df['region'] = region
    pm25_hourly_df.append(df)
pm25_hourly_df = pd.concat(pm25_hourly_df).reset_index(drop=True)
pm25_hourly_df['region'] = pm25_hourly_df['region'].astype('category')
pm25_hourly_df['pm25'] = pd.to_numeric(pm25_hourly_df['pm25'], errors='coerce')
pm25_hourly_df.drop(columns=['date'], axis=1, inplace=True)
del pm25_hourly_df_temp


pm25_daily_df_temp = {region: load_data(f"singapore-air-quality.csv", prefix=region, data_dir=os.path.join(DATA_DIR, "pm2.5-daily")) for region in REGIONS_PREFIX}
pm25_daily_df = []
for region, df in pm25_daily_df_temp.items():
    df['region'] = region
    pm25_daily_df.append(df)
pm25_daily_df = pd.concat(pm25_daily_df).reset_index(drop=True)
pm25_daily_df['region'] = pm25_daily_df['region'].astype('category')
pm25_daily_df.drop(columns=['pm10', 'o3', 'no2', 'so2', 'co', 'psi'], axis=1, inplace=True)
pm25_daily_df['pm25'] = pd.to_numeric(pm25_daily_df['pm25'], errors='coerce')
del pm25_daily_df_temp


wind_speed_df = load_all_stations(os.path.join(DATA_DIR, 'wind-speed'), file_pattern="*-wind-speed-hourly-*.csv")
wind_speed_df.drop(columns=['last_update'], axis=1, inplace=True)
wind_speed_df['station_name'] = wind_speed_df['station_name'].astype('category')

wind_direction_df = load_all_stations(os.path.join(DATA_DIR, 'wind-direction'), file_pattern="*-wind-direction-hourly-*.csv")
wind_direction_df.drop(columns=['last_update'], axis=1, inplace=True)
wind_direction_df['station_name'] = wind_direction_df['station_name'].astype('category')

air_temperature_df = load_all_stations(os.path.join(DATA_DIR, 'air-temperature'), file_pattern="*-air-temperature-hourly-*.csv")
air_temperature_df.drop(columns=['last_update'], axis=1, inplace=True)
air_temperature_df['station_name'] = air_temperature_df['station_name'].astype('category')

In [6]:
# 1) make sure timestamp is datetime64 and date is also datetime64 (not python date)
wind_speed_df["date"] = pd.to_datetime(wind_speed_df["timestamp"].dt.date)

# (optional) make station_name a plain string to avoid category weirdness
wind_speed_df["station_name"] = wind_speed_df["station_name"].astype(str)

# 2) group JUST the numeric column, then reset_index
wind_speed_daily_df = (
    wind_speed_df
    .groupby(["station_name", "date"])["wind_speed_avg"]
    .agg(["mean", "max", "min", "std"])
    .reset_index()
)
wind_speed_daily_df.rename({'date': 'timestamp'}, axis=1, inplace=True)
wind_speed_df.drop(columns=['date'], axis=1, inplace=True)

# 3) rename columns to something nice
wind_speed_daily_df = wind_speed_daily_df.rename(columns={
    "mean": "wind_speed_avg_mean",
    "max": "wind_speed_avg_max",
    "min": "wind_speed_avg_min",
    "std": "wind_speed_avg_std",
})

# get one lat/lon per station
station_coords = (
    wind_speed_df
    .groupby("station_name")[["latitude", "longitude"]]
    .first()
    .reset_index()
)

wind_speed_daily_df = wind_speed_daily_df.merge(station_coords, on="station_name", how="left")

# WIND DIRECTION
wind_direction_df["date"] = pd.to_datetime(wind_direction_df["timestamp"].dt.date)
wind_direction_df["station_name"] = wind_direction_df["station_name"].astype(str)

wind_direction_daily_df = (
    wind_direction_df
    .groupby(["station_name", "date"])["wind_direction_avg"]
    .agg(["mean", "max", "min", "std"])
    .reset_index()
    .rename(columns={
        "mean": "wind_direction_avg_mean",
        "max": "wind_direction_avg_max",
        "min": "wind_direction_avg_min",
        "std": "wind_direction_avg_std",
    })
)
wind_direction_daily_df.rename({'date': 'timestamp'}, axis=1, inplace=True)
wind_direction_df.drop(columns=['date'], axis=1, inplace=True)


station_coords = (
    wind_direction_df
    .groupby("station_name")[["latitude", "longitude"]]
    .first()
    .reset_index()
)

wind_direction_daily_df = wind_direction_daily_df.merge(station_coords, on="station_name", how="left")

# AIR TEMPERATURE
air_temperature_df["date"] = pd.to_datetime(air_temperature_df["timestamp"].dt.date)
air_temperature_df["station_name"] = air_temperature_df["station_name"].astype(str)

air_temperature_daily_df = (
    air_temperature_df
    .groupby(["station_name", "date"])["air_temperature_avg"]
    .agg(["mean", "max", "min", "std"])
    .reset_index()
    .rename(columns={
        "mean": "air_temperature_avg_mean",
        "max": "air_temperature_avg_max",
        "min": "air_temperature_avg_min",
        "std": "air_temperature_avg_std",
    })
)
air_temperature_daily_df.rename({'date': 'timestamp'}, axis=1, inplace=True)
air_temperature_df.drop(columns=['date'], axis=1, inplace=True)


station_coords = (
    air_temperature_df
    .groupby("station_name")[["latitude", "longitude"]]
    .first()
    .reset_index()
)

air_temperature_daily_df = air_temperature_daily_df.merge(station_coords, on="station_name", how="left")


In [7]:
# PLAYGROUND AREA TO VIEW LOADED DATAFRAMES

wind_direction_df.head(60)


Unnamed: 0,timestamp,station_name,wind_direction_avg,wind_direction_std,reading_count,latitude,longitude
0,2023-01-01 00:00:00+08:00,Upper Changi Road North,56.066598,0.068556,60.0,1.3678,103.9826
1,2023-01-01 01:00:00+08:00,Upper Changi Road North,56.867296,0.112889,60.0,1.3678,103.9826
2,2023-01-01 02:00:00+08:00,Upper Changi Road North,63.101041,0.128357,60.0,1.3678,103.9826
3,2023-01-01 03:00:00+08:00,Upper Changi Road North,66.554556,0.075083,60.0,1.3678,103.9826
4,2023-01-01 04:00:00+08:00,Upper Changi Road North,66.099435,0.042358,60.0,1.3678,103.9826
5,2023-01-01 05:00:00+08:00,Upper Changi Road North,62.364784,0.052254,60.0,1.3678,103.9826
6,2023-01-01 06:00:00+08:00,Upper Changi Road North,54.312672,0.151527,60.0,1.3678,103.9826
7,2023-01-01 07:00:00+08:00,Upper Changi Road North,32.768455,0.439317,54.0,1.3678,103.9826
8,2023-01-10 11:00:00+08:00,Upper Changi Road North,74.578086,0.053767,45.0,1.3678,103.9826
9,2023-01-10 12:00:00+08:00,Upper Changi Road North,71.098144,1.682904,60.0,1.3678,103.9826


In [8]:
pm25_daily_df.head()

wind_direction_daily_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13093 entries, 0 to 13092
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   station_name             13093 non-null  object        
 1   timestamp                13093 non-null  datetime64[ns]
 2   wind_direction_avg_mean  13093 non-null  float64       
 3   wind_direction_avg_max   13093 non-null  float64       
 4   wind_direction_avg_min   13093 non-null  float64       
 5   wind_direction_avg_std   13074 non-null  float64       
 6   latitude                 13093 non-null  float64       
 7   longitude                13093 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 818.4+ KB


# FEATURE ENGINEERING FOR REGRESSION

In [9]:
REGION_COORDS = pd.DataFrame({
    "region": ["central", "north", "south", "east", "west"],
    "latitude": [1.3521, 1.4180, 1.2800, 1.3500, 1.3400],
    "longitude": [103.8198, 103.8270, 103.8500, 103.9400, 103.7000]
})

In [10]:
from typing import Callable

def regression_features_pm25_daily(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Lag features
    for lag in [1, 2, 3, 4, 5, 6, 7, 14, 28]:
        df[f'pm25_lag_{lag}d'] = df['pm25'].shift(lag)

    # Rolling statistics
    for window in [3, 7, 14, 28]:
        min_valid = max(1, window // 2)
        df[f'pm25_rolling_mean_{window}d'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'pm25_rolling_std_{window}d'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'pm25_rolling_min_{window}d'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'pm25_rolling_max_{window}d'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean


def regression_features_pm25_hourly(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Lag features
    for lag in [1, 2, 3, 6, 12, 24]:
        df[f'pm25_lag_{lag}h'] = df['pm25'].shift(lag)

    # Rolling statistics
    for window in [6, 12, 24, 72, 168]:
        min_valid = max(1, window // 2)
        df[f'pm25_rolling_mean_{window}h'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'pm25_rolling_std_{window}h'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'pm25_rolling_min_{window}h'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'pm25_rolling_max_{window}h'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean


def regression_features_wind_direction(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Map coordinates to a sensor region
    df["region"] = _map_coords_to_region(df, REGION_COORDS)

    # Convert direction to vector components (recommended for modeling)
    df['wind_u'] = -np.sin(np.deg2rad(df['wind_direction_avg']))
    df['wind_v'] = -np.cos(np.deg2rad(df['wind_direction_avg']))

    # Lag features (using vector components)
    for lag in [1, 2, 3, 6, 12, 24, 48, 72, 168]:
        df[f'wind_u_lag_{lag}h'] = df['wind_u'].shift(lag)
        df[f'wind_v_lag_{lag}h'] = df['wind_v'].shift(lag)
        df[f'wind_direction_lag_{lag}h'] = df['wind_direction_avg'].shift(lag)

    # Rolling statistics
    for window in [6, 12, 24, 72, 168]:
        min_valid = max(1, window // 2)
        df[f'wind_u_rolling_mean_{window}h'] = df['wind_u'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'wind_v_rolling_mean_{window}h'] = df['wind_v'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'direction_std_rolling_mean_{window}h'] = df['wind_direction_std'].shift(1).rolling(window, min_periods=min_valid).mean()

    df_clean = df.dropna()

    return df_clean


def regression_features_wind_speed(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Map coordinates to a sensor region
    df["region"] = _map_coords_to_region(df, REGION_COORDS)

    # Lag features
    for lag in [1, 2, 3, 6, 12, 24, 48, 72, 168]:
        df[f'wind_speed_lag_{lag}h'] = df['wind_speed_avg'].shift(lag)

    # Rolling statistics
    for window in [6, 12, 24, 72, 168]:
        min_valid = max(1, window // 2)
        df[f'wind_speed_rolling_mean_{window}h'] = df['wind_speed_avg'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'wind_speed_rolling_std_{window}h'] = df['wind_speed_avg'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'wind_speed_rolling_min_{window}h'] = df['wind_speed_avg'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'wind_speed_rolling_max_{window}h'] = df['wind_speed_avg'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean


def regression_features_air_temperature(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Map coordinates to a sensor region
    df["region"] = _map_coords_to_region(df, REGION_COORDS)

    # Lag features
    for lag in [1, 2, 3, 6, 12, 24, 48, 72, 168]:
        df[f'air_temperature_lag_{lag}h'] = df['air_temperature_avg'].shift(lag)

    # Rolling statistics
    for window in [6, 12, 24, 72, 168]:
        min_valid = max(1, window // 2)
        df[f'air_temperature_rolling_mean_{window}h'] = df['air_temperature_avg'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'air_temperature_rolling_std_{window}h'] = df['air_temperature_avg'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'air_temperature_rolling_min_{window}h'] = df['air_temperature_avg'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'air_temperature_rolling_max_{window}h'] = df['air_temperature_avg'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean

def regression_features_wind_direction_daily(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Map coordinates to a sensor region
    df["region"] = _map_coords_to_region(df, REGION_COORDS)

    # Convert direction to vector components (recommended for modeling)
    df['wind_u'] = -np.sin(np.deg2rad(df['wind_direction_avg_mean']))
    df['wind_v'] = -np.cos(np.deg2rad(df['wind_direction_avg_mean']))

    # Lag features (using vector components)
    for lag in [1, 2, 3, 4, 5, 6, 7, 14, 28]:
        df[f'wind_u_lag_{lag}d'] = df['wind_u'].shift(lag)
        df[f'wind_v_lag_{lag}d'] = df['wind_v'].shift(lag)
        df[f'wind_direction_lag_{lag}d'] = df['wind_direction_avg_mean'].shift(lag)

    # Rolling statistics
    for window in [3, 7, 14, 28]:
        min_valid = max(1, window // 2)
        df[f'wind_u_rolling_mean_{window}d'] = df['wind_u'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'wind_v_rolling_mean_{window}d'] = df['wind_v'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'direction_std_rolling_mean_{window}d'] = df['wind_direction_avg_std'].shift(1).rolling(window, min_periods=min_valid).mean()

    df_clean = df.dropna()

    return df_clean


def regression_features_wind_speed_daily(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Map coordinates to a sensor region
    df["region"] = _map_coords_to_region(df, REGION_COORDS)

    # Lag features
    for lag in [1, 2, 3, 4, 5, 6, 7, 14, 28]:
        df[f'wind_speed_lag_{lag}d'] = df['wind_speed_avg_mean'].shift(lag)

    # Rolling statistics
    for window in [3, 7, 14, 28]:
        min_valid = max(1, window // 2)
        df[f'wind_speed_rolling_mean_{window}d'] = df['wind_speed_avg_mean'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'wind_speed_rolling_std_{window}d'] = df['wind_speed_avg_mean'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'wind_speed_rolling_min_{window}d'] = df['wind_speed_avg_mean'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'wind_speed_rolling_max_{window}d'] = df['wind_speed_avg_mean'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean


def regression_features_air_temperature_daily(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Map coordinates to a sensor region
    df["region"] = _map_coords_to_region(df, REGION_COORDS)

    # Lag features
    for lag in [1, 2, 3, 4, 5, 6, 7, 14, 28]:
        df[f'air_temperature_lag_{lag}d'] = df['air_temperature_avg_mean'].shift(lag)

    # Rolling statistics
    for window in [3, 7, 14, 28]:
        min_valid = max(1, window // 2)
        df[f'air_temperature_rolling_mean_{window}d'] = df['air_temperature_avg_mean'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'air_temperature_rolling_std_{window}d'] = df['air_temperature_avg_mean'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'air_temperature_rolling_min_{window}d'] = df['air_temperature_avg_mean'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'air_temperature_rolling_max_{window}d'] = df['air_temperature_avg_mean'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean

def _map_coords_to_region(df: pd.DataFrame, region_coords: pd.DataFrame) -> pd.Series:
    """
    For each row in df (with latitude/longitude) find the nearest region from region_coords.
    Returns a pandas Series of region names aligned with df.
    """
    # df: N x 2 (lat, lon)
    sensor_xy = df[["latitude", "longitude"]].to_numpy()  # shape (N, 2)
    # region_coords: M x 2 (lat, lon)
    region_xy = region_coords[["latitude", "longitude"]].to_numpy()  # shape (M, 2)

    # compute squared distances (N, M)
    # distance^2 = (lat1 - lat2)^2 + (lon1 - lon2)^2
    diff_lat = sensor_xy[:, [0]] - region_xy[:, 0]  # (N, 1) - (M,) -> (N, M)
    diff_lon = sensor_xy[:, [1]] - region_xy[:, 1]
    dist_sq = diff_lat**2 + diff_lon**2  # (N, M)

    # index of closest region for each sensor row
    nearest_idx = dist_sq.argmin(axis=1)  # (N,)

    # map to region names
    regions = region_coords["region"].to_numpy()
    return pd.Series(regions[nearest_idx], index=df.index, name="region")

def apply_func_to_groups(df: pd.DataFrame, group_col: list[str], func: Callable[[pd.DataFrame], pd.DataFrame]) -> pd.DataFrame:
    """Apply a function to each group in the DataFrame and combine results"""
    grouped = df.groupby(group_col)
    processed_groups = []

    for name, group in grouped:
        processed_group = func(group)
        processed_groups.append(processed_group)

    return pd.concat(processed_groups).reset_index(drop=True)

def add_time_features(df: pd.DataFrame, add_hour: bool = False) -> pd.DataFrame:
    """Add time-based features to the DataFrame"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Time-based features
    if add_hour:
        df['hour'] = (df['timestamp'].dt.hour).astype('int8')
    df['day_of_week'] = (df['timestamp'].dt.dayofweek).astype('int8')
    df['day_of_month'] = (df['timestamp'].dt.day).astype('int8')
    df['month'] = (df['timestamp'].dt.month).astype('int8')
    df['year'] = df['timestamp'].dt.year
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(bool)

    return df

In [11]:
pm25_daily_df = add_time_features(pm25_daily_df, add_hour=False)
pm25_hourly_df = add_time_features(pm25_hourly_df, add_hour=True)
wind_direction_df = add_time_features(wind_direction_df, add_hour=True)
wind_speed_df = add_time_features(wind_speed_df, add_hour=True)
air_temperature_df = add_time_features(air_temperature_df, add_hour=True)
wind_direction_daily_df = add_time_features(wind_direction_daily_df, add_hour=False)
wind_speed_daily_df = add_time_features(wind_speed_daily_df, add_hour=False)
air_temperature_daily_df = add_time_features(air_temperature_daily_df, add_hour=False)


In [12]:
pm25_daily_df = apply_func_to_groups(pm25_daily_df, ['region'], regression_features_pm25_daily)
pm25_hourly_df = apply_func_to_groups(pm25_hourly_df, ['region', 'hour'], regression_features_pm25_hourly)
wind_direction_df = apply_func_to_groups(wind_direction_df, ['station_name', 'hour'], regression_features_wind_direction)
wind_speed_df = apply_func_to_groups(wind_speed_df, ['station_name', 'hour'], regression_features_wind_speed)
air_temperature_df = apply_func_to_groups(air_temperature_df, ['station_name', 'hour'], regression_features_air_temperature)


wind_direction_daily_df = apply_func_to_groups(wind_direction_daily_df, ['station_name'], regression_features_wind_direction_daily)
wind_speed_daily_df = apply_func_to_groups(wind_speed_daily_df, ['station_name'], regression_features_wind_speed_daily)
air_temperature_daily_df = apply_func_to_groups(air_temperature_daily_df, ['station_name'], regression_features_air_temperature_daily)


  grouped = df.groupby(group_col)
  grouped = df.groupby(group_col)


In [13]:
wind_direction_df.sort_values('timestamp').head(20)

Unnamed: 0,timestamp,station_name,wind_direction_avg,wind_direction_std,reading_count,latitude,longitude,hour,day_of_week,day_of_month,...,direction_std_rolling_mean_12h,wind_u_rolling_mean_24h,wind_v_rolling_mean_24h,direction_std_rolling_mean_24h,wind_u_rolling_mean_72h,wind_v_rolling_mean_72h,direction_std_rolling_mean_72h,wind_u_rolling_mean_168h,wind_v_rolling_mean_168h,direction_std_rolling_mean_168h
197241,2023-07-04 00:00:00+08:00,Upper Changi Road North,136.284523,0.277187,60.0,1.3678,103.9826,0,1,4,...,0.342364,-0.231205,0.54047,0.255512,-0.143837,0.411524,0.22633,-0.413835,-0.09623,0.22313
217255,2023-07-04 00:00:00+08:00,Woodlands Avenue 9,83.212754,0.691519,60.0,1.44387,103.78538,0,1,4,...,0.20921,-0.077396,0.214065,0.225863,-0.061691,0.174369,0.278999,-0.161832,-0.305012,0.237289
78667,2023-07-04 00:00:00+08:00,Kim Chuan Road,354.49431,1.115611,59.0,1.3399,103.8878,0,1,4,...,0.267278,0.458087,0.21796,0.349232,0.254996,0.111967,0.319284,-0.228215,-0.266953,0.230591
38509,2023-07-04 00:00:00+08:00,Clementi Road,55.662297,0.573353,60.0,1.3337,103.7768,0,1,4,...,0.496263,-0.439556,0.037654,0.430933,-0.308723,-0.010966,0.390861,-0.151856,-0.450044,0.323934
39344,2023-07-04 01:00:00+08:00,Clementi Road,46.747706,0.29209,60.0,1.3337,103.7768,1,1,4,...,0.450999,-0.453327,0.039036,0.388331,-0.271053,-0.076223,0.414325,-0.113155,-0.487747,0.324969
197429,2023-07-04 01:00:00+08:00,Upper Changi Road North,120.306759,0.160094,60.0,1.3678,103.9826,1,1,4,...,0.38583,-0.191691,0.636694,0.303496,-0.164763,0.458693,0.272443,-0.421258,-0.090111,0.250577
79493,2023-07-04 01:00:00+08:00,Kim Chuan Road,31.186005,0.993322,60.0,1.3399,103.8878,1,1,4,...,0.308418,0.489519,0.287179,0.378063,0.266914,0.150085,0.329605,-0.196411,-0.272664,0.241094
218065,2023-07-04 01:00:00+08:00,Woodlands Avenue 9,30.871252,0.419744,60.0,1.44387,103.78538,1,1,4,...,0.238365,-0.197975,0.168248,0.219692,-0.05157,0.078315,0.260411,-0.140819,-0.370901,0.224144
218873,2023-07-04 02:00:00+08:00,Woodlands Avenue 9,18.032573,0.910018,60.0,1.44387,103.78538,2,1,4,...,0.398955,-0.196448,0.089997,0.416017,-0.016351,0.07872,0.344573,-0.08307,-0.377543,0.271517
40179,2023-07-04 02:00:00+08:00,Clementi Road,53.48613,0.269464,60.0,1.3337,103.7768,2,1,4,...,0.470931,-0.376388,0.062392,0.36637,-0.229945,-0.05808,0.327949,-0.073627,-0.468737,0.312636


In [14]:
# print(pm25_daily_df.info())
print(pm25_daily_df.dtypes)
# print(pm25_daily_df.columns)
# print(pm25_hourly_df.info())
print(pm25_hourly_df.dtypes)
# print(pm25_hourly_df.columns)
# print(wind_direction_df.info())
print(wind_direction_df.dtypes)
# print(wind_direction_df.columns)
# print(wind_speed_df.info())
print(wind_speed_df.dtypes)
# print(wind_speed_df.columns)
# print(air_temperature_df.region.head(200000))
print(air_temperature_df.dtypes)
# print(air_temperature_df.columns)
# print(wind_direction_daily_df.info())
print(wind_direction_daily_df.dtypes)
# print(wind_direction_daily_df.columns)
# print(wind_speed_daily_df.info())
print(wind_speed_daily_df.dtypes)
# print(wind_speed_daily_df.columns)
# print(air_temperature_daily_df.info())
print(air_temperature_daily_df.dtypes)


timestamp                datetime64[ns]
pm25                            float64
region                         category
day_of_week                        int8
day_of_month                       int8
month                              int8
year                              int32
is_weekend                         bool
pm25_lag_1d                     float64
pm25_lag_2d                     float64
pm25_lag_3d                     float64
pm25_lag_4d                     float64
pm25_lag_5d                     float64
pm25_lag_6d                     float64
pm25_lag_7d                     float64
pm25_lag_14d                    float64
pm25_lag_28d                    float64
pm25_rolling_mean_3d            float64
pm25_rolling_std_3d             float64
pm25_rolling_min_3d             float64
pm25_rolling_max_3d             float64
pm25_rolling_mean_7d            float64
pm25_rolling_std_7d             float64
pm25_rolling_min_7d             float64
pm25_rolling_max_7d             float64


# Data Ingestion

In [15]:
import hopsworks
project = hopsworks.login(engine="python", project="akeelaf")
fs = project.get_feature_store()

2025-11-15 19:37:36,888 INFO: Initializing external client
2025-11-15 19:37:36,888 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2025-11-15 19:37:38,289 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1277076


In [None]:
# %%script echo skipping --no-raise-error

# Feature Group 1: Hourly PM2.5 features
fg_pm25_hourly = fs.get_or_create_feature_group(
    name="pm25_hourly",
    description="Hourly PM2.5 features with short-term patterns",
    version=3,
    primary_key=["region", "timestamp"],
    partition_key=["region"],
    event_time="timestamp",
)
fg_pm25_hourly.insert(pm25_hourly_df)

# # Feature Group 2: Daily PM2.5 features
fg_pm25_daily = fs.get_or_create_feature_group(
    name="pm25_daily",
    description="Daily PM2.5 aggregations for long-term trends",
    version=3,
    primary_key=["region", "timestamp"],
    partition_key=["region"],
    event_time="timestamp",
)
fg_pm25_daily.insert(pm25_daily_df)

# # Feature Group 3: Wind Direction features
fg_wind_direction = fs.get_or_create_feature_group(
    name="wind_direction_hourly",
    description="Wind direction features with vector components",
    version=3,
    primary_key=["region", "timestamp"],
    partition_key=["station_name", "region"],
    event_time="timestamp",
)
fg_wind_direction.insert(wind_direction_df)

# # Feature Group 4: Wind Speed features
fg_wind_speed = fs.get_or_create_feature_group(
    name="wind_speed_hourly",
    description="Wind speed features",
    version=3,
    primary_key=["region", "timestamp"],
    partition_key=["station_name", "region"],
    event_time="timestamp",
)
fg_wind_speed.insert(wind_speed_df)

# # Feature Group 5: Air Temperature features
fg_air_temperature = fs.get_or_create_feature_group(
    name="air_temperature_hourly",
    description="Air temperature features",
    version=3,
    primary_key=["region", "timestamp"],
    partition_key=["station_name", "region"],
    event_time="timestamp",
)
fg_air_temperature.insert(air_temperature_df)

time.sleep(150) # this is to avoid exceeding 5 parallel job limit

# Feature Group 6: Wind Direction features (daily)
fg_wind_direction_daily = fs.get_or_create_feature_group(
    name="wind_direction_daily",
    description="Daily wind direction features with vector components",
    version=3,
    primary_key=["region", "timestamp"],
    partition_key=["station_name", "region"],
    event_time="timestamp",
)
fg_wind_direction_daily.insert(wind_direction_daily_df)

# Feature Group 7: Wind Speed features (daily)
fg_wind_speed_daily = fs.get_or_create_feature_group(
    name="wind_speed_daily",
    description="Daily wind speed features",
    version=3,
    primary_key=["region", "timestamp"],
    partition_key=["station_name", "region"],
    event_time="timestamp",
)
fg_wind_speed_daily.insert(wind_speed_daily_df)

# Feature Group 8: Air Temperature features (daily)
fg_air_temperature_daily = fs.get_or_create_feature_group(
    name="air_temperature_daily",
    description="Daily air temperature features",
    version=3,
    primary_key=["station_name", "timestamp"],
    partition_key=["station_name", "region"],
    event_time="timestamp",
)
fg_air_temperature_daily.insert(air_temperature_daily_df)


Uploading Dataframe: 100.00% |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| Rows 91197/91197 | Elapsed Time: 00:03 | Remaining Time: 00:00


Launching job: pm25_hourly_3_offline_fg_materialization


KeyboardInterrupt: 

In [16]:
# FORECASRT_HOURS_AHEAD = 24
# # Target variables
# df['target'] = df['wind_speed_avg'].shift(-FORECASRT_HOURS_AHEAD)

# df['target'] = df['wind_direction_avg'].shift(-FORECASRT_HOURS_AHEAD)

# df['target'] = df['pm25'].shift(-FORECASRT_HOURS_AHEAD)
