<span style="font-weight:bold; font-size: 3rem; color:#333;">Part 01: Feature Backfill for Air Quality Data</span>

Backfill historical air quality and weather data for all sensors.



In [2]:
import datetime
import requests
import pandas as pd
import hopsworks
import json
import dotenv
import os
import great_expectations as ge

In [3]:
# Load environment variables from a local .env file (if any) before anything reads
# them; a later cell reads AQICN_API_KEY via os.getenv.
dotenv.load_dotenv()
# Log in to Hopsworks with the Python engine and keep handles that the rest of the
# notebook uses: the project, its feature store, and the secrets API.
project = hopsworks.login(engine="python")
fs = project.get_feature_store()
secrets = hopsworks.get_secrets_api()


2025-11-10 13:51:41,000 INFO: Initializing external client
2025-11-10 13:51:41,001 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2025-11-10 13:51:42,513 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286307


In [4]:
# Publish the AQICN API key from the local environment as a Hopsworks secret.
AQICN_API_KEY = os.getenv("AQICN_API_KEY")

# Validate before touching the stored secret: os.getenv returns None when the
# variable is unset, and deleting the existing secret first would leave the
# project without any key if the subsequent create call then fails.
if not AQICN_API_KEY:
    raise RuntimeError(
        "AQICN_API_KEY is not set; define it in the environment or .env file before running this cell"
    )

# Secrets are replaced by delete-then-create.
secret = secrets.get_secret("AQICN_API_KEY")
if secret is not None:
    secret.delete()
secrets.create_secret("AQICN_API_KEY", AQICN_API_KEY)

Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets


Secret('AQICN_API_KEY', 'PRIVATE')

In [5]:
# Sensor configuration
# Sensor id for each monitored region.
SENSOR_IDS = dict(central=1666, east=1664, west=1665, north=1662, south=1663)

# Historical CSV export for each sensor, keyed by sensor id. The file names are
# derived from the region names so the two mappings cannot drift apart.
CSV_FILES = {
    sensor_id: f"../data/{region},-singapore-air-quality.csv"
    for region, sensor_id in SENSOR_IDS.items()
}

# Latitude/longitude recorded for each region, one row per region.
REGION_COORDS = pd.DataFrame(
    [
        ("central", 1.3521, 103.8198),
        ("north", 1.4180, 103.8270),
        ("south", 1.2800, 103.8500),
        ("east", 1.3500, 103.9400),
        ("west", 1.3400, 103.7000),
    ],
    columns=["region", "latitude", "longitude"],
)

# City-level settings: the weather request and the stored sensor locations use
# this single coordinate pair.
CITY = COUNTRY = "Singapore"
LATITUDE, LONGITUDE = 1.3667, 103.8



In [6]:
# Load and process air quality data for all sensors
def load_air_quality_data(sensor_id, csv_file, start_date="2016-01-01"):
    """Load one sensor's PM2.5 history from a CSV export and clean it.

    Args:
        sensor_id: Identifier written into the ``sensor_id`` column of every row.
        csv_file: Path (or file-like object) of a CSV that has at least the
            columns ``date`` and ``pm25``.
        start_date: Earliest date (inclusive) to keep. Defaults to
            "2016-01-01", the cutoff that used to be hard-coded.

    Returns:
        A tuple ``(df_aq, nulls_removed)``: the cleaned frame with the columns
        ``date``, ``pm25`` (float32), ``city`` and ``sensor_id``, and the number
        of rows dropped because ``pm25`` was missing.
    """
    df = pd.read_csv(csv_file, parse_dates=['date'], skipinitialspace=True)
    df_aq = df[['date', 'pm25']].copy()
    df_aq['pm25'] = df_aq['pm25'].astype('float32')
    df_aq['city'] = CITY
    df_aq['sensor_id'] = sensor_id
    df_aq = df_aq[df_aq["date"] >= start_date]

    # Drop on pm25 only, so the count returned to the caller is exactly the
    # number of rows removed (a bare dropna() would drop on any column).
    nulls_removed = df_aq['pm25'].isna().sum()
    df_aq = df_aq.dropna(subset=['pm25'])
    return df_aq, nulls_removed

# Exclusive upper bound on the number of missing pm25 rows tolerated per sensor.
MAX_NULLS_PER_SENSOR = 20

all_aq_data = []
nulls_removed_list = []
for sensor_name, sensor_id in SENSOR_IDS.items():
    csv_file = CSV_FILES[sensor_id]
    # Keep the per-sensor frame under its own name so it does not shadow the
    # combined df_aq built after the loop.
    sensor_aq_df, nulls_removed = load_air_quality_data(sensor_id, csv_file)
    all_aq_data.append(sensor_aq_df)
    nulls_removed_list.append(nulls_removed)
    print(f"Loaded {len(sensor_aq_df)} records for {sensor_name} (sensor_id: {sensor_id})")

# Sanity checks: every sensor must have lost the same number of rows to missing
# pm25 values, and that number must stay below MAX_NULLS_PER_SENSOR.
# The per-sensor counts are included in the error so a failure is diagnosable.
nulls_by_sensor = {name: int(count) for name, count in zip(SENSOR_IDS, nulls_removed_list)}
if len(set(nulls_by_sensor.values())) > 1:
    raise AssertionError(f"Nulls removed are not equal: {nulls_by_sensor}")
if any(count >= MAX_NULLS_PER_SENSOR for count in nulls_by_sensor.values()):
    raise AssertionError(f"{MAX_NULLS_PER_SENSOR} or more nulls removed: {nulls_by_sensor}")

df_aq = pd.concat(all_aq_data, ignore_index=True)
print(f"\nTotal air quality records: {len(df_aq)}")

Loaded 3587 records for central (sensor_id: 1666)
Loaded 3586 records for east (sensor_id: 1664)
Loaded 3586 records for west (sensor_id: 1665)
Loaded 3587 records for north (sensor_id: 1662)
Loaded 3586 records for south (sensor_id: 1663)

Total air quality records: 17932


In [7]:
# Load weather data
# Request daily weather history for the city coordinates from the Open-Meteo
# archive endpoint, from the earliest air-quality date through today.
earliest_aq_date = df_aq['date'].min().strftime('%Y-%m-%d')
url = "https://archive-api.open-meteo.com/v1/archive"

# Daily variables to request; the same list drives the DataFrame columns below.
weather_variables = ["temperature_2m_mean", "precipitation_sum", "wind_speed_10m_max", "wind_direction_10m_dominant"]

params = {
    "latitude": LATITUDE,
    "longitude": LONGITUDE,
    "start_date": earliest_aq_date,
    "format": "json",
    "end_date": datetime.datetime.now().strftime("%Y-%m-%d"),
    "daily": weather_variables,
}

# requests applies no timeout by default; without one a stalled connection
# would block the notebook indefinitely.
response = requests.get(url, params=params, timeout=60)
if response.status_code != 200:
    raise requests.exceptions.RequestException(f"Failed to retrieve weather data. Status Code: {response.status_code}")

data = response.json()
daily = data["daily"]

# One row per day: the date column first, then one column per requested variable.
weather_df = pd.DataFrame({
    "date": pd.to_datetime(daily["time"]),
    **{variable: daily[variable] for variable in weather_variables},
})
weather_df["city"] = CITY

print(f"Weather records: {len(weather_df)}")

Weather records: 3602


In [8]:
# Preview the weather frame; pandas abbreviates the middle rows of a long frame.
weather_df

Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city
0,2016-01-01,25.6,6.2,18.5,21,Singapore
1,2016-01-02,25.4,10.3,11.0,9,Singapore
2,2016-01-03,26.0,2.1,13.1,19,Singapore
3,2016-01-04,26.8,2.2,10.8,32,Singapore
4,2016-01-05,26.8,3.1,12.2,36,Singapore
...,...,...,...,...,...,...
3597,2025-11-06,28.1,0.1,14.2,258,Singapore
3598,2025-11-07,27.9,0.1,12.7,281,Singapore
3599,2025-11-08,28.5,0.0,14.3,288,Singapore
3600,2025-11-09,27.6,6.6,13.2,283,Singapore


In [9]:
# Total number of missing values across all columns of weather_df.
# NOTE(review): despite the name, nothing is removed from weather_df here; the
# count is only displayed as the cell result.
nulls_removed = weather_df.isna().sum().sum()
nulls_removed

0

In [10]:
# Define data validation expectations
def _column_min_expectation(column, max_value):
    """Build an expectation that the minimum of `column` lies above -0.1 (strict) and at most `max_value`."""
    return ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={"column": column, "min_value": -0.1, "max_value": max_value, "strict_min": True},
    )


# Air quality: a single expectation on the pm25 column.
aq_expectation_suite = ge.core.ExpectationSuite(expectation_suite_name="aq_expectation_suite")
aq_expectation_suite.add_expectation(_column_min_expectation("pm25", 500.0))

# Weather: the same kind of expectation on precipitation and wind speed.
weather_expectation_suite = ge.core.ExpectationSuite(expectation_suite_name="weather_expectation_suite")
for col in ("precipitation_sum", "wind_speed_10m_max"):
    weather_expectation_suite.add_expectation(_column_min_expectation(col, 1000.0))

In [11]:
# Save sensor locations as secret
# One record per sensor, in SENSOR_IDS order.
# NOTE(review): every record carries the city-level LATITUDE/LONGITUDE, not the
# per-region values in REGION_COORDS — confirm this is intended.
sensor_locations = [
    {
        "country": COUNTRY,
        "city": CITY,
        "street": region.capitalize(),
        "sensor_id": region_sensor_id,
        "latitude": LATITUDE,
        "longitude": LONGITUDE,
    }
    for region, region_sensor_id in SENSOR_IDS.items()
]
secret_json = json.dumps(sensor_locations)

# Secrets are replaced by delete-then-create: remove any stored copy first.
secret = secrets.get_secret("SENSOR_LOCATION_JSON")
if secret is not None:
    secret.delete()

print(secret_json)
secrets.create_secret("SENSOR_LOCATION_JSON", secret_json)
print(f"Saved {len(sensor_locations)} sensor locations")

[{"country": "Singapore", "city": "Singapore", "street": "Central", "sensor_id": 1666, "latitude": 1.3667, "longitude": 103.8}, {"country": "Singapore", "city": "Singapore", "street": "East", "sensor_id": 1664, "latitude": 1.3667, "longitude": 103.8}, {"country": "Singapore", "city": "Singapore", "street": "West", "sensor_id": 1665, "latitude": 1.3667, "longitude": 103.8}, {"country": "Singapore", "city": "Singapore", "street": "North", "sensor_id": 1662, "latitude": 1.3667, "longitude": 103.8}, {"country": "Singapore", "city": "Singapore", "street": "South", "sensor_id": 1663, "latitude": 1.3667, "longitude": 103.8}]
Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets
Saved 5 sensor locations


In [12]:
# Create and populate air quality feature group
aq_fg_settings = dict(
    name='air_quality',
    description='Air Quality characteristics of each day',
    version=1,
    primary_key=['city', 'sensor_id'],
    event_time="date",
    expectation_suite=aq_expectation_suite,
)
air_quality_fg = fs.get_or_create_feature_group(**aq_fg_settings)

air_quality_fg.insert(df_aq)

# Human-readable description per feature, applied in this order.
aq_feature_descriptions = {
    "date": "Date of measurement of air quality",
    "city": "City where the air quality was measured",
    "pm25": "Particles less than 2.5 micrometers in diameter (fine particles) pose health risk",
    "sensor_id": "Sensor ID of the air quality measurement",
}
for feature_name, feature_description in aq_feature_descriptions.items():
    air_quality_fg.update_feature_description(feature_name, feature_description)

# Keep the feature group as the cell's displayed value, as before.
air_quality_fg

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286307/fs/1265775/fg/1668624
2025-11-10 13:53:13,640 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1286307/fs/1265775/fg/1668624


Uploading Dataframe: 100.00% |██████████| Rows 17932/17932 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: air_quality_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286307/jobs/named/air_quality_1_offline_fg_materialization/executions


<hsfs.feature_group.FeatureGroup at 0x17e594590>

In [13]:
# Create and populate weather feature group
weather_fg_settings = dict(
    name='weather',
    description='Weather characteristics of each day',
    version=1,
    primary_key=['city'],
    event_time="date",
    expectation_suite=weather_expectation_suite,
)
weather_fg = fs.get_or_create_feature_group(**weather_fg_settings)

# wait=True is passed so the call waits for the launched job to finish.
weather_fg.insert(weather_df, wait=True)

# Human-readable description per feature, applied in this order.
weather_feature_descriptions = {
    "date": "Date of measurement of weather",
    "city": "City where weather is measured/forecast for",
    "temperature_2m_mean": "Temperature in Celsius",
    "precipitation_sum": "Precipitation (rain/snow) in mm",
    "wind_speed_10m_max": "Wind speed at 10m above ground",
    "wind_direction_10m_dominant": "Dominant Wind direction over the day",
}
for feature_name, feature_description in weather_feature_descriptions.items():
    weather_fg.update_feature_description(feature_name, feature_description)

# Keep the feature group as the cell's displayed value, as before.
weather_fg


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286307/fs/1265775/fg/1668625
2025-11-10 13:53:35,671 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1286307/fs/1265775/fg/1668625


Uploading Dataframe: 100.00% |██████████| Rows 3602/3602 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286307/jobs/named/weather_1_offline_fg_materialization/executions
2025-11-10 13:53:53,286 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-11-10 13:53:59,721 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-11-10 13:56:16,233 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-11-10 13:56:16,390 INFO: Waiting for log aggregation to finish.
2025-11-10 13:56:25,079 INFO: Execution finished successfully.


<hsfs.feature_group.FeatureGroup at 0x17e5fcdd0>