<span style="font-width:bold; font-size: 3rem; color:#333;">Part 02: Daily Feature Pipeline</span>

Daily pipeline to fetch and store air quality and weather data for all sensors.

In [1]:
import datetime
import requests
import pandas as pd
import hopsworks
import json
import warnings
warnings.filterwarnings("ignore")

In [2]:
project = hopsworks.login(engine="python")
fs = project.get_feature_store() 
secrets = hopsworks.get_secrets_api()

# This line will fail if you have not registered the AQICN_API_KEY as a secret in Hopsworks
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value
location_str = secrets.get_secret("SENSOR_LOCATION_JSON").value
locations = json.loads(location_str)


today = datetime.date.today()


2025-11-10 13:55:52,646 INFO: Initializing external client
2025-11-10 13:55:52,646 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-11-10 13:55:54,287 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286307


In [3]:
CITY = "Singapore"
COUNTRY = "Singapore"
LATITUDE = 1.3667
LONGITUDE = 103.8

In [4]:
# Retrieve feature groups
air_quality_fg = fs.get_feature_group(
    name='air_quality',
    version=1,
)
weather_fg = fs.get_feature_group(
    name='weather',
    version=1,
)

In [5]:
# Get air quality data for all sensors
def get_pm25(sensor_id, city, date, api_key):
    url = f"https://api.waqi.info/feed/@{sensor_id}/?token={api_key}"
    response = requests.get(url)
    if response.status_code != 200:
        raise requests.exceptions.RequestException(f"Failed to retrieve data. Status Code: {response.status_code}")
    
    data = response.json()
    if data['status'] != 'ok':
        raise Exception(f"Error: {data['status']}")
    
    aqi_data = data['data']
    pm25_value = aqi_data['iaqi'].get('pm25', {}).get('v', None)
    
    return pd.DataFrame({
        'date': [pd.to_datetime(date)],
        'pm25': [float(pm25_value) if pm25_value is not None else None],
        'city': [city],
        'sensor_id': [sensor_id]
    })

all_aq_data = []
for location in locations:
    sensor_id = location['sensor_id']
    city = location['city']
    df_aq = get_pm25(sensor_id, city, today, AQICN_API_KEY)
    all_aq_data.append(df_aq)
    print(f"Retrieved PM2.5 for sensor {sensor_id} ({location['street']}): {df_aq['pm25'].iloc[0]}")

aq_today_df = pd.concat(all_aq_data, ignore_index=True)
aq_today_df['pm25'] = aq_today_df['pm25'].astype('float32')
print(f"\nTotal sensors processed: {len(all_aq_data)}")

Retrieved PM2.5 for sensor 1666 (Central): 34.0
Retrieved PM2.5 for sensor 1664 (East): 53.0
Retrieved PM2.5 for sensor 1665 (West): 25.0
Retrieved PM2.5 for sensor 1662 (North): 42.0
Retrieved PM2.5 for sensor 1663 (South): 34.0

Total sensors processed: 5


In [6]:
# Insert air quality data for all sensors
air_quality_fg.insert(aq_today_df)
print("Air quality data inserted successfully")

2025-11-10 14:09:50,358 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1286307/fs/1265775/fg/1668624


Uploading Dataframe: 100.00% |██████████| Rows 5/5 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: air_quality_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286307/jobs/named/air_quality_1_offline_fg_materialization/executions
Air quality data inserted successfully


In [10]:
url = "https://api.open-meteo.com/v1/ecmwf"

params = {
    "latitude": LATITUDE,
    "longitude": LONGITUDE,
    "format": "json",
    "hourly": ["temperature_2m", "precipitation", "wind_speed_10m", "wind_direction_10m"]
}

response = requests.get(url, params=params)
if response.status_code != 200:
    raise requests.exceptions.RequestException(f"Failed to retrieve weather data. Status Code: {response.status_code}")

data = response.json()


print(data)



hourly = data["hourly"]
hourly_df = pd.DataFrame({
    "time": pd.to_datetime(hourly["time"]),
    "temperature_2m": hourly["temperature_2m"],
    "precipitation": hourly["precipitation"],
    "wind_speed_10m": hourly["wind_speed_10m"],
    "wind_direction_10m": hourly["wind_direction_10m"]
})

hourly_df = hourly_df.set_index('time')

mean_temp = hourly_df["temperature_2m"].mean()
precipitation_sum = hourly_df["precipitation"].sum()
max_wind_speed = hourly_df["wind_speed_10m"].max()
dominant_wind_direction = hourly_df["wind_direction_10m"].mode()[0] if not hourly_df["wind_direction_10m"].mode().empty else np.nan

daily_weather_df = pd.DataFrame({
    "date": [datetime.datetime.now().strftime("%Y-%m-%d")],
    "temperature_2m_mean": [mean_temp],
    "precipitation_sum": [precipitation_sum],
    "wind_speed_10m_max": [max_wind_speed],
    "wind_direction_10m_dominant": [dominant_wind_direction],
    "city": [CITY]
})



{'latitude': 1.5, 'longitude': 103.5, 'generationtime_ms': 0.05996227264404297, 'utc_offset_seconds': 0, 'timezone': 'GMT', 'timezone_abbreviation': 'GMT', 'elevation': 39.0, 'hourly_units': {'time': 'iso8601', 'temperature_2m': '°C', 'precipitation': 'mm', 'wind_speed_10m': 'km/h', 'wind_direction_10m': '°'}, 'hourly': {'time': ['2025-11-10T00:00', '2025-11-10T01:00', '2025-11-10T02:00', '2025-11-10T03:00', '2025-11-10T04:00', '2025-11-10T05:00', '2025-11-10T06:00', '2025-11-10T07:00', '2025-11-10T08:00', '2025-11-10T09:00', '2025-11-10T10:00', '2025-11-10T11:00', '2025-11-10T12:00', '2025-11-10T13:00', '2025-11-10T14:00', '2025-11-10T15:00', '2025-11-10T16:00', '2025-11-10T17:00', '2025-11-10T18:00', '2025-11-10T19:00', '2025-11-10T20:00', '2025-11-10T21:00', '2025-11-10T22:00', '2025-11-10T23:00', '2025-11-11T00:00', '2025-11-11T01:00', '2025-11-11T02:00', '2025-11-11T03:00', '2025-11-11T04:00', '2025-11-11T05:00', '2025-11-11T06:00', '2025-11-11T07:00', '2025-11-11T08:00', '2025-11

In [11]:
daily_weather_df

Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city
0,2025-11-10,25.45,54.1,14.5,360,Singapore


In [13]:
# Ensure the 'date' column is a timestamp type compatible with the Feature Group schema
daily_weather_df['date'] = pd.to_datetime(daily_weather_df['date'])

# Insert daily weather features into the feature group
weather_fg.insert(daily_weather_df, wait=True)
print("Weather data inserted successfully")


2025-11-10 14:12:43,887 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1286307/fs/1265775/fg/1668625


Uploading Dataframe: 100.00% |██████████| Rows 1/1 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286307/jobs/named/weather_1_offline_fg_materialization/executions
2025-11-10 14:13:00,799 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-11-10 14:13:03,992 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-11-10 14:14:58,999 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-11-10 14:14:59,185 INFO: Waiting for log aggregation to finish.
2025-11-10 14:15:07,856 INFO: Execution finished successfully.
Weather data inserted successfully
