# Imports

In [44]:
import pandas as pd
from datetime import datetime, timedelta
import time
import os
import numpy as np


In [45]:
# Data directory based on root folder
DATA_DIR = 'data'


# GET ABSOLUTE PATH
head, _ = os.path.split(os.getcwd()) # Get parent directory, from notebooks
DATA_DIR = os.path.join(head, 'data')

# DATA LOADING

In [46]:
def load_data(filename: str, prefix: str="", data_dir: os.PathLike=DATA_DIR):
    """Load data file with a specific prefix"""

    filename = f"{prefix}-{filename}" if len(prefix) else filename

    filepath = os.path.join(data_dir, filename)
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Data file not found: {filepath}. Run scraping first.")

    df = pd.read_csv(filepath)
    df.columns = df.columns.str.strip()
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])

    # if 'date' in df.columns:
    #     df['date'] = pd.to_datetime(df['date'])

    return df

# ============================================================================
# DATA LOADING FUNCTIONS WINDSPEED AND DIRECTION
# ============================================================================

def load_all_stations(input_dir: os.PathLike =DATA_DIR, file_pattern: str=""):
    """Load data for all stations"""
    import glob
    pattern = os.path.join(input_dir, file_pattern)
    station_files = glob.glob(pattern)

    station_dfs = []
    for filepath in station_files:
        filename = os.path.basename(filepath)
        station_dfs.append(load_data(filename, data_dir=input_dir))

    return pd.concat(station_dfs).reset_index(drop=True)

def load_stations_metadata(input_dir: os.PathLike=DATA_DIR):
    """Load stations metadata"""
    filename = os.path.join(input_dir, "stations_metadata.csv")
    if not os.path.exists(filename):
        raise FileNotFoundError(f"Metadata file not found: {filename}")

    return pd.read_csv(filename)


In [47]:
REGIONS_PREFIX = ['east', 'west', 'north', 'south', 'central']

pm25_hourly_df_temp = {region: load_data(f"pm2.5-hourly-230701-251101.csv", prefix=region, data_dir=os.path.join(DATA_DIR, "pm2.5-hourly")) for region in REGIONS_PREFIX}
pm25_hourly_df = []
for region, df in pm25_hourly_df_temp.items():
    df['region'] = region
    pm25_hourly_df.append(df)
pm25_hourly_df = pd.concat(pm25_hourly_df).reset_index(drop=True)
pm25_hourly_df['region'] = pm25_hourly_df['region'].astype('category')
pm25_hourly_df['pm25'] = pd.to_numeric(pm25_hourly_df['pm25'], errors='coerce')
pm25_hourly_df.drop(columns=['date'], axis=1, inplace=True)
del pm25_hourly_df_temp


pm25_daily_df_temp = {region: load_data(f"singapore-air-quality.csv", prefix=region, data_dir=os.path.join(DATA_DIR, "pm2.5-daily")) for region in REGIONS_PREFIX}
pm25_daily_df = []
for region, df in pm25_daily_df_temp.items():
    df['region'] = region
    pm25_daily_df.append(df)
pm25_daily_df = pd.concat(pm25_daily_df).reset_index(drop=True)
pm25_daily_df['region'] = pm25_daily_df['region'].astype('category')
pm25_daily_df.drop(columns=['pm10', 'o3', 'no2', 'so2', 'co', 'psi'], axis=1, inplace=True)
pm25_daily_df['pm25'] = pd.to_numeric(pm25_daily_df['pm25'], errors='coerce')
del pm25_daily_df_temp


wind_speed_df = load_all_stations(os.path.join(DATA_DIR, 'wind-speed'), file_pattern="*-wind-speed-hourly-*.csv")
wind_speed_df.drop(columns=['last_update'], axis=1, inplace=True)
wind_speed_df['station_name'] = wind_speed_df['station_name'].astype('category')

wind_direction_df = load_all_stations(os.path.join(DATA_DIR, 'wind-direction'), file_pattern="*-wind-direction-hourly-*.csv")
wind_direction_df.drop(columns=['last_update'], axis=1, inplace=True)
wind_direction_df['station_name'] = wind_direction_df['station_name'].astype('category')

air_temperature_df = load_all_stations(os.path.join(DATA_DIR, 'air-temperature'), file_pattern="*-air-temperature-hourly-*.csv")
air_temperature_df.drop(columns=['last_update'], axis=1, inplace=True)
air_temperature_df['station_name'] = air_temperature_df['station_name'].astype('category')

In [48]:
REGION_COORDS = pd.DataFrame({
    "region": ["central", "north", "south", "east", "west"],
    "latitude": [1.3521, 1.4180, 1.2800, 1.3500, 1.3400],
    "longitude": [103.8198, 103.8270, 103.8500, 103.9400, 103.7000]
})

def _map_coords_to_region(df: pd.DataFrame, region_coords: pd.DataFrame) -> pd.Series:
    """
    For each row in df (with latitude/longitude) find the nearest region from region_coords.
    Returns a pandas Series of region names aligned with df.
    """
    # df: N x 2 (lat, lon)
    sensor_xy = df[["latitude", "longitude"]].to_numpy()  # shape (N, 2)
    # region_coords: M x 2 (lat, lon)
    region_xy = region_coords[["latitude", "longitude"]].to_numpy()  # shape (M, 2)

    # compute squared distances (N, M)
    # distance^2 = (lat1 - lat2)^2 + (lon1 - lon2)^2
    diff_lat = sensor_xy[:, [0]] - region_xy[:, 0]  # (N, 1) - (M,) -> (N, M)
    diff_lon = sensor_xy[:, [1]] - region_xy[:, 1]
    dist_sq = diff_lat**2 + diff_lon**2  # (N, M)

    # index of closest region for each sensor row
    nearest_idx = dist_sq.argmin(axis=1)  # (N,)

    # map to region names
    regions = region_coords["region"].to_numpy()
    return pd.Series(regions[nearest_idx], index=df.index, name="region")

In [49]:
# 1) make sure timestamp is datetime64 and date is also datetime64 (not python date)
wind_speed_df["date"] = pd.to_datetime(wind_speed_df["timestamp"].dt.date)

# (optional) make station_name a plain string to avoid category weirdness
wind_speed_df["station_name"] = wind_speed_df["station_name"].astype(str)
wind_speed_df.drop(columns=['station_name'], axis=1, inplace=True)

# Map coordinates to a sensor region
wind_speed_df["region"] = _map_coords_to_region(wind_speed_df, REGION_COORDS)

# 2) group JUST the numeric column, then reset_index
wind_speed_daily_df = (
    wind_speed_df
    .groupby(["region", "date"])["wind_speed_avg"]
    .agg(["mean", "max", "min", "std"])
    .reset_index()
)
wind_speed_daily_df.rename({'date': 'timestamp'}, axis=1, inplace=True)
wind_speed_df.drop(columns=['date'], axis=1, inplace=True)

# 3) rename columns to something nice
wind_speed_daily_df = wind_speed_daily_df.rename(columns={
    "mean": "wind_speed_avg_mean",
    "max": "wind_speed_avg_max",
    "min": "wind_speed_avg_min",
    "std": "wind_speed_avg_std",
})

# get one lat/lon per station
station_coords = (
    wind_speed_df
    .groupby("region")[["latitude", "longitude"]]
    .first()
    .reset_index()
)

wind_speed_daily_df = wind_speed_daily_df.merge(station_coords, on="region", how="left")

# WIND DIRECTION
wind_direction_df["date"] = pd.to_datetime(wind_direction_df["timestamp"].dt.date)
wind_direction_df["region"] = _map_coords_to_region(wind_direction_df, REGION_COORDS)
wind_direction_df.drop(columns=['station_name'], axis=1, inplace=True)

wind_direction_daily_df = (
    wind_direction_df
    .groupby(["region", "date"])["wind_direction_avg"]
    .agg(["mean", "max", "min", "std"])
    .reset_index()
    .rename(columns={
        "mean": "wind_direction_avg_mean",
        "max": "wind_direction_avg_max",
        "min": "wind_direction_avg_min",
        "std": "wind_direction_avg_std",
    })
)
wind_direction_daily_df.rename({'date': 'timestamp'}, axis=1, inplace=True)
wind_direction_df.drop(columns=['date'], axis=1, inplace=True)


station_coords = (
    wind_direction_df
    .groupby("region")[["latitude", "longitude"]]
    .first()
    .reset_index()
)

wind_direction_daily_df = wind_direction_daily_df.merge(station_coords, on="region", how="left")

# AIR TEMPERATURE
air_temperature_df["date"] = pd.to_datetime(air_temperature_df["timestamp"].dt.date)
air_temperature_df["region"] = _map_coords_to_region(air_temperature_df, REGION_COORDS)
air_temperature_df.drop(columns=['station_name'], axis=1, inplace=True)
air_temperature_daily_df = (
    air_temperature_df
    .groupby(["region", "date"])["air_temperature_avg"]
    .agg(["mean", "max", "min", "std"])
    .reset_index()
    .rename(columns={
        "mean": "air_temperature_avg_mean",
        "max": "air_temperature_avg_max",
        "min": "air_temperature_avg_min",
        "std": "air_temperature_avg_std",
    })
)
air_temperature_daily_df.rename({'date': 'timestamp'}, axis=1, inplace=True)
air_temperature_df.drop(columns=['date'], axis=1, inplace=True)


station_coords = (
    air_temperature_df
    .groupby("region")[["latitude", "longitude"]]
    .first()
    .reset_index()
)

air_temperature_daily_df = air_temperature_daily_df.merge(station_coords, on="region", how="left")


In [50]:
# PLAYGROUND AREA TO VIEW LOADED DATAFRAMES

wind_direction_df.head(50000)


Unnamed: 0,timestamp,wind_direction_avg,wind_direction_std,reading_count,latitude,longitude,region
0,2023-01-01 00:00:00+08:00,56.066598,0.068556,60.0,1.3678,103.9826,east
1,2023-01-01 01:00:00+08:00,56.867296,0.112889,60.0,1.3678,103.9826,east
2,2023-01-01 02:00:00+08:00,63.101041,0.128357,60.0,1.3678,103.9826,east
3,2023-01-01 03:00:00+08:00,66.554556,0.075083,60.0,1.3678,103.9826,east
4,2023-01-01 04:00:00+08:00,66.099435,0.042358,60.0,1.3678,103.9826,east
...,...,...,...,...,...,...,...
49995,2023-03-16 09:00:00+08:00,56.614044,0.142326,60.0,1.3764,103.8492,central
49996,2023-03-16 10:00:00+08:00,60.493069,0.212711,60.0,1.3764,103.8492,central
49997,2023-03-16 11:00:00+08:00,65.805034,0.101880,60.0,1.3764,103.8492,central
49998,2023-03-16 12:00:00+08:00,70.220346,0.096111,60.0,1.3764,103.8492,central


In [51]:
pm25_daily_df.head()

wind_direction_daily_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5093 entries, 0 to 5092
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   region                   5093 non-null   object        
 1   timestamp                5093 non-null   datetime64[ns]
 2   wind_direction_avg_mean  5093 non-null   float64       
 3   wind_direction_avg_max   5093 non-null   float64       
 4   wind_direction_avg_min   5093 non-null   float64       
 5   wind_direction_avg_std   5093 non-null   float64       
 6   latitude                 5093 non-null   float64       
 7   longitude                5093 non-null   float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 318.4+ KB


# FEATURE ENGINEERING FOR REGRESSION

In [52]:
from typing import Callable

def regression_features_pm25_daily(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Lag features
    for lag in [1, 2, 3, 4, 5, 6, 7, 14, 28]:
        df[f'pm25_lag_{lag}d'] = df['pm25'].shift(lag)

    # Rolling statistics
    for window in [3, 7, 14, 28]:
        min_valid = max(1, window // 2)
        df[f'pm25_rolling_mean_{window}d'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'pm25_rolling_std_{window}d'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'pm25_rolling_min_{window}d'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'pm25_rolling_max_{window}d'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean


def regression_features_pm25_hourly(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Lag features
    for lag in [1, 2, 3, 6, 12, 24]:
        df[f'pm25_lag_{lag}h'] = df['pm25'].shift(lag)

    # Rolling statistics
    for window in [6, 12, 24, 72, 168]:
        min_valid = max(1, window // 2)
        df[f'pm25_rolling_mean_{window}h'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'pm25_rolling_std_{window}h'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'pm25_rolling_min_{window}h'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'pm25_rolling_max_{window}h'] = df['pm25'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean


def regression_features_wind_direction(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Convert direction to vector components (recommended for modeling)
    df['wind_u'] = -np.sin(np.deg2rad(df['wind_direction_avg']))
    df['wind_v'] = -np.cos(np.deg2rad(df['wind_direction_avg']))

    # Lag features (using vector components)
    for lag in [1, 2, 3, 6, 12, 24, 48, 72, 168]:
        df[f'wind_u_lag_{lag}h'] = df['wind_u'].shift(lag)
        df[f'wind_v_lag_{lag}h'] = df['wind_v'].shift(lag)
        df[f'wind_direction_lag_{lag}h'] = df['wind_direction_avg'].shift(lag)

    # Rolling statistics
    for window in [6, 12, 24, 72, 168]:
        min_valid = max(1, window // 2)
        df[f'wind_u_rolling_mean_{window}h'] = df['wind_u'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'wind_v_rolling_mean_{window}h'] = df['wind_v'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'direction_std_rolling_mean_{window}h'] = df['wind_direction_std'].shift(1).rolling(window, min_periods=min_valid).mean()

    df_clean = df.dropna()

    return df_clean


def regression_features_wind_speed(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Lag features
    for lag in [1, 2, 3, 6, 12, 24, 48, 72, 168]:
        df[f'wind_speed_lag_{lag}h'] = df['wind_speed_avg'].shift(lag)

    # Rolling statistics
    for window in [6, 12, 24, 72, 168]:
        min_valid = max(1, window // 2)
        df[f'wind_speed_rolling_mean_{window}h'] = df['wind_speed_avg'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'wind_speed_rolling_std_{window}h'] = df['wind_speed_avg'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'wind_speed_rolling_min_{window}h'] = df['wind_speed_avg'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'wind_speed_rolling_max_{window}h'] = df['wind_speed_avg'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean


def regression_features_air_temperature(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Lag features
    for lag in [1, 2, 3, 6, 12, 24, 48, 72, 168]:
        df[f'air_temperature_lag_{lag}h'] = df['air_temperature_avg'].shift(lag)

    # Rolling statistics
    for window in [6, 12, 24, 72, 168]:
        min_valid = max(1, window // 2)
        df[f'air_temperature_rolling_mean_{window}h'] = df['air_temperature_avg'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'air_temperature_rolling_std_{window}h'] = df['air_temperature_avg'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'air_temperature_rolling_min_{window}h'] = df['air_temperature_avg'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'air_temperature_rolling_max_{window}h'] = df['air_temperature_avg'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean

def regression_features_wind_direction_daily(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Convert direction to vector components (recommended for modeling)
    df['wind_u'] = -np.sin(np.deg2rad(df['wind_direction_avg_mean']))
    df['wind_v'] = -np.cos(np.deg2rad(df['wind_direction_avg_mean']))

    # Lag features (using vector components)
    for lag in [1, 2, 3, 4, 5, 6, 7, 14, 28]:
        df[f'wind_u_lag_{lag}d'] = df['wind_u'].shift(lag)
        df[f'wind_v_lag_{lag}d'] = df['wind_v'].shift(lag)
        df[f'wind_direction_lag_{lag}d'] = df['wind_direction_avg_mean'].shift(lag)

    # Rolling statistics
    for window in [3, 7, 14, 28]:
        min_valid = max(1, window // 2)
        df[f'wind_u_rolling_mean_{window}d'] = df['wind_u'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'wind_v_rolling_mean_{window}d'] = df['wind_v'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'direction_std_rolling_mean_{window}d'] = df['wind_direction_avg_std'].shift(1).rolling(window, min_periods=min_valid).mean()

    df_clean = df.dropna()

    return df_clean


def regression_features_wind_speed_daily(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Lag features
    for lag in [1, 2, 3, 4, 5, 6, 7, 14, 28]:
        df[f'wind_speed_lag_{lag}d'] = df['wind_speed_avg_mean'].shift(lag)

    # Rolling statistics
    for window in [3, 7, 14, 28]:
        min_valid = max(1, window // 2)
        df[f'wind_speed_rolling_mean_{window}d'] = df['wind_speed_avg_mean'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'wind_speed_rolling_std_{window}d'] = df['wind_speed_avg_mean'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'wind_speed_rolling_min_{window}d'] = df['wind_speed_avg_mean'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'wind_speed_rolling_max_{window}d'] = df['wind_speed_avg_mean'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean


def regression_features_air_temperature_daily(df: pd.DataFrame):
    """Prepare features for time series regression modeling"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Lag features
    for lag in [1, 2, 3, 4, 5, 6, 7, 14, 28]:
        df[f'air_temperature_lag_{lag}d'] = df['air_temperature_avg_mean'].shift(lag)

    # Rolling statistics
    for window in [3, 7, 14, 28]:
        min_valid = max(1, window // 2)
        df[f'air_temperature_rolling_mean_{window}d'] = df['air_temperature_avg_mean'].shift(1).rolling(window, min_periods=min_valid).mean()
        df[f'air_temperature_rolling_std_{window}d'] = df['air_temperature_avg_mean'].shift(1).rolling(window, min_periods=min_valid).std()
        df[f'air_temperature_rolling_min_{window}d'] = df['air_temperature_avg_mean'].shift(1).rolling(window, min_periods=min_valid).min()
        df[f'air_temperature_rolling_max_{window}d'] = df['air_temperature_avg_mean'].shift(1).rolling(window, min_periods=min_valid).max()

    df_clean = df.dropna()

    return df_clean

def apply_func_to_groups(df: pd.DataFrame, group_col: list[str], func: Callable[[pd.DataFrame], pd.DataFrame]) -> pd.DataFrame:
    """Apply a function to each group in the DataFrame and combine results"""
    grouped = df.groupby(group_col)
    processed_groups = []

    for name, group in grouped:
        processed_group = func(group)
        processed_groups.append(processed_group)

    return pd.concat(processed_groups).reset_index(drop=True)

def add_time_features(df: pd.DataFrame, add_hour: bool = False) -> pd.DataFrame:
    """Add time-based features to the DataFrame"""
    df = df.copy().sort_values('timestamp').reset_index(drop=True)

    # Time-based features
    if add_hour:
        df['hour'] = (df['timestamp'].dt.hour).astype('int8')
    df['day_of_week'] = (df['timestamp'].dt.dayofweek).astype('int8')
    df['day_of_month'] = (df['timestamp'].dt.day).astype('int8')
    df['month'] = (df['timestamp'].dt.month).astype('int8')
    df['year'] = df['timestamp'].dt.year
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(bool)

    return df

In [53]:
pm25_daily_df = add_time_features(pm25_daily_df, add_hour=False)
pm25_hourly_df = add_time_features(pm25_hourly_df, add_hour=True)
wind_direction_df = add_time_features(wind_direction_df, add_hour=True)
wind_speed_df = add_time_features(wind_speed_df, add_hour=True)
air_temperature_df = add_time_features(air_temperature_df, add_hour=True)
wind_direction_daily_df = add_time_features(wind_direction_daily_df, add_hour=False)
wind_speed_daily_df = add_time_features(wind_speed_daily_df, add_hour=False)
air_temperature_daily_df = add_time_features(air_temperature_daily_df, add_hour=False)


In [54]:
pm25_daily_df = apply_func_to_groups(pm25_daily_df, ['region'], regression_features_pm25_daily)
pm25_hourly_df = apply_func_to_groups(pm25_hourly_df, ['region', 'hour'], regression_features_pm25_hourly)
wind_direction_df = apply_func_to_groups(wind_direction_df, ['region', 'hour'], regression_features_wind_direction)
wind_speed_df = apply_func_to_groups(wind_speed_df, ['region', 'hour'], regression_features_wind_speed)
air_temperature_df = apply_func_to_groups(air_temperature_df, ['region', 'hour'], regression_features_air_temperature)


wind_direction_daily_df = apply_func_to_groups(wind_direction_daily_df, ['region'], regression_features_wind_direction_daily)
wind_speed_daily_df = apply_func_to_groups(wind_speed_daily_df, ['region'], regression_features_wind_speed_daily)
air_temperature_daily_df = apply_func_to_groups(air_temperature_daily_df, ['region'], regression_features_air_temperature_daily)






In [55]:
wind_direction_daily_df.sort_values('timestamp').head(20000)

Unnamed: 0,region,timestamp,wind_direction_avg_mean,wind_direction_avg_max,wind_direction_avg_min,wind_direction_avg_std,latitude,longitude,day_of_week,day_of_month,...,direction_std_rolling_mean_3d,wind_u_rolling_mean_7d,wind_v_rolling_mean_7d,direction_std_rolling_mean_7d,wind_u_rolling_mean_14d,wind_v_rolling_mean_14d,direction_std_rolling_mean_14d,wind_u_rolling_mean_28d,wind_v_rolling_mean_28d,direction_std_rolling_mean_28d
0,central,2023-02-08,46.295062,263.239046,1.993722,42.555060,1.37640,103.84920,2,8,...,113.009784,-0.169062,0.605966,127.451698,-0.003106,0.519555,132.707377,-0.227158,0.303812,118.436202
1990,north,2023-02-08,56.005667,295.596917,12.867162,48.156629,1.41720,103.74855,2,8,...,120.139920,-0.597240,0.293284,129.887700,-0.480924,0.297988,130.093244,-0.472814,0.085041,108.020489
2963,south,2023-02-08,50.115725,356.849305,1.415319,81.010038,1.27990,103.87030,2,8,...,102.995899,0.524135,0.558861,125.687060,0.544105,0.361664,130.788732,0.202195,0.331144,121.712098
3958,west,2023-02-08,63.356286,358.044898,0.043789,68.723804,1.29377,103.61843,2,8,...,139.310156,-0.382633,0.755992,136.465881,-0.380924,0.704683,140.343757,-0.460989,0.558398,125.684419
995,east,2023-02-08,131.056182,359.612307,0.434006,133.328964,1.36780,103.98260,2,8,...,132.761439,-0.388600,0.727038,136.790497,-0.437226,0.758647,141.955019,-0.522598,0.652511,133.075351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,east,2025-11-01,136.491805,307.241909,13.847271,59.869253,1.36780,103.98260,5,1,...,62.753692,0.409612,0.784414,67.793125,0.604165,0.583245,66.173770,0.414640,0.666946,70.905828
994,central,2025-11-01,122.206443,355.913403,9.109982,60.513963,1.37640,103.84920,5,1,...,67.413768,0.689673,0.481305,72.690416,0.719592,0.360424,68.043173,0.429629,0.462003,73.762552
2962,north,2025-11-01,147.934004,358.804757,14.816042,105.272599,1.41720,103.74855,5,1,...,42.039246,0.689588,0.625158,52.621348,0.804889,0.428611,55.738561,0.614232,0.497384,64.461917
3957,south,2025-11-01,96.239255,186.355135,38.556611,35.035283,1.27990,103.87030,5,1,...,51.736483,0.240970,0.815759,62.865415,0.497753,0.676922,59.591691,0.128468,0.710987,65.934724


In [56]:
# print(pm25_daily_df.info())
print(pm25_daily_df.dtypes)
# print(pm25_daily_df.columns)
# print(pm25_hourly_df.info())
print(pm25_hourly_df.dtypes)
# print(pm25_hourly_df.columns)
# print(wind_direction_df.info())
print(wind_direction_df.dtypes)
# print(wind_direction_df.columns)
# print(wind_speed_df.info())
print(wind_speed_df.dtypes)
# print(wind_speed_df.columns)
# print(air_temperature_df.region.head(200000))
print(air_temperature_df.dtypes)
# print(air_temperature_df.columns)
# print(wind_direction_daily_df.info())
print(wind_direction_daily_df.dtypes)
# print(wind_direction_daily_df.columns)
# print(wind_speed_daily_df.info())
print(wind_speed_daily_df.dtypes)
# print(wind_speed_daily_df.columns)
# print(air_temperature_daily_df.info())
print(air_temperature_daily_df.dtypes)


timestamp                datetime64[ns]
pm25                            float64
region                         category
day_of_week                        int8
day_of_month                       int8
month                              int8
year                              int32
is_weekend                         bool
pm25_lag_1d                     float64
pm25_lag_2d                     float64
pm25_lag_3d                     float64
pm25_lag_4d                     float64
pm25_lag_5d                     float64
pm25_lag_6d                     float64
pm25_lag_7d                     float64
pm25_lag_14d                    float64
pm25_lag_28d                    float64
pm25_rolling_mean_3d            float64
pm25_rolling_std_3d             float64
pm25_rolling_min_3d             float64
pm25_rolling_max_3d             float64
pm25_rolling_mean_7d            float64
pm25_rolling_std_7d             float64
pm25_rolling_min_7d             float64
pm25_rolling_max_7d             float64


# Data Ingestion

In [57]:
import hopsworks
project = hopsworks.login(engine="python", project="akeelaf")
fs = project.get_feature_store()

2025-11-16 15:35:27,261 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-11-16 15:35:27,266 INFO: Initializing external client
2025-11-16 15:35:27,266 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2025-11-16 15:35:28,566 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1277076


In [58]:
# %%script echo skipping --no-raise-error

# Feature Group 1: Hourly PM2.5 features
fg_pm25_hourly = fs.get_or_create_feature_group(
    name="pm25_hourly",
    description="Hourly PM2.5 features with short-term patterns",
    version=4,
    primary_key=["region", "timestamp"],
    partition_key=["region"],
    event_time="timestamp",
)
fg_pm25_hourly.insert(pm25_hourly_df)

# # Feature Group 2: Daily PM2.5 features
fg_pm25_daily = fs.get_or_create_feature_group(
    name="pm25_daily",
    description="Daily PM2.5 aggregations for long-term trends",
    version=4,
    primary_key=["region", "timestamp"],
    partition_key=["region"],
    event_time="timestamp",
)
fg_pm25_daily.insert(pm25_daily_df)

# # Feature Group 3: Wind Direction features
fg_wind_direction = fs.get_or_create_feature_group(
    name="wind_direction_hourly",
    description="Wind direction features with vector components",
    version=4,
    primary_key=["region", "timestamp"],
    partition_key=["region"],
    event_time="timestamp",
)
fg_wind_direction.insert(wind_direction_df)

# # Feature Group 4: Wind Speed features
fg_wind_speed = fs.get_or_create_feature_group(
    name="wind_speed_hourly",
    description="Wind speed features",
    version=4,
    primary_key=["region", "timestamp"],
    partition_key=["region"],
    event_time="timestamp",
)
fg_wind_speed.insert(wind_speed_df)

# # Feature Group 5: Air Temperature features
fg_air_temperature = fs.get_or_create_feature_group(
    name="air_temperature_hourly",
    description="Air temperature features",
    version=4,
    primary_key=["region", "timestamp"],
    partition_key=["region"],
    event_time="timestamp",
)
fg_air_temperature.insert(air_temperature_df)

time.sleep(150) # this is to avoid exceeding 5 parallel job limit

# Feature Group 6: Wind Direction features (daily)
fg_wind_direction_daily = fs.get_or_create_feature_group(
    name="wind_direction_daily",
    description="Daily wind direction features with vector components",
    version=4,
    primary_key=["region", "timestamp"],
    partition_key=["region"],
    event_time="timestamp",
)
fg_wind_direction_daily.insert(wind_direction_daily_df)

# Feature Group 7: Wind Speed features (daily)
fg_wind_speed_daily = fs.get_or_create_feature_group(
    name="wind_speed_daily",
    description="Daily wind speed features",
    version=4,
    primary_key=["region", "timestamp"],
    partition_key=["region"],
    event_time="timestamp",
)
fg_wind_speed_daily.insert(wind_speed_daily_df)

# Feature Group 8: Air Temperature features (daily)
fg_air_temperature_daily = fs.get_or_create_feature_group(
    name="air_temperature_daily",
    description="Daily air temperature features",
    version=4,
    primary_key=["region", "timestamp"],
    partition_key=["region"],
    event_time="timestamp",
)
fg_air_temperature_daily.insert(air_temperature_daily_df)


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1277076/fs/1263683/fg/1721739


Uploading Dataframe: 100.00% |██████████| Rows 91197/91197 | Elapsed Time: 00:03 | Remaining Time: 00:00


Launching job: pm25_hourly_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1277076/jobs/named/pm25_hourly_4_offline_fg_materialization/executions
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1277076/fs/1263683/fg/1721740


Uploading Dataframe: 100.00% |██████████| Rows 20774/20774 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: pm25_daily_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1277076/jobs/named/pm25_daily_4_offline_fg_materialization/executions
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1277076/fs/1263683/fg/1721741


Uploading Dataframe: 100.00% |██████████| Rows 286424/286424 | Elapsed Time: 00:19 | Remaining Time: 00:00


Launching job: wind_direction_hourly_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1277076/jobs/named/wind_direction_hourly_4_offline_fg_materialization/executions
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1277076/fs/1263683/fg/1721742


Uploading Dataframe: 100.00% |██████████| Rows 286425/286425 | Elapsed Time: 00:13 | Remaining Time: 00:00


Launching job: wind_speed_hourly_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1277076/jobs/named/wind_speed_hourly_4_offline_fg_materialization/executions
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1277076/fs/1263683/fg/1721743


Uploading Dataframe: 100.00% |██████████| Rows 308753/308753 | Elapsed Time: 00:14 | Remaining Time: 00:00


Launching job: air_temperature_hourly_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1277076/jobs/named/air_temperature_hourly_4_offline_fg_materialization/executions
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1277076/fs/1263683/fg/1721745


Uploading Dataframe: 100.00% |██████████| Rows 4953/4953 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: wind_direction_daily_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1277076/jobs/named/wind_direction_daily_4_offline_fg_materialization/executions
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1277076/fs/1263683/fg/1721746


Uploading Dataframe: 100.00% |██████████| Rows 4953/4953 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: wind_speed_daily_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1277076/jobs/named/wind_speed_daily_4_offline_fg_materialization/executions
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1277076/fs/1263683/fg/1721747


Uploading Dataframe: 100.00% |██████████| Rows 4954/4954 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: air_temperature_daily_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1277076/jobs/named/air_temperature_daily_4_offline_fg_materialization/executions


(Job('air_temperature_daily_4_offline_fg_materialization', 'SPARK'), None)

In [59]:
# FORECASRT_HOURS_AHEAD = 24
# # Target variables
# df['target'] = df['wind_speed_avg'].shift(-FORECASRT_HOURS_AHEAD)

# df['target'] = df['wind_direction_avg'].shift(-FORECASRT_HOURS_AHEAD)

# df['target'] = df['pm25'].shift(-FORECASRT_HOURS_AHEAD)
