<span style="font-width:bold; font-size: 3rem; color:#333;">Part 02: Daily Feature Pipeline</span>

Daily pipeline to fetch and store air quality and weather data for all sensors.

In [None]:
import datetime
import requests
import pandas as pd
import hopsworks
import json
import warnings
warnings.filterwarnings("ignore")

In [4]:
project = hopsworks.login(engine="python")
fs = project.get_feature_store() 
secrets = hopsworks.get_secrets_api()

# This line will fail if you have not registered the AQICN_API_KEY as a secret in Hopsworks
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value
location_str = secrets.get_secret("SENSOR_LOCATION_JSON").value
locations = json.loads(location_str)


today = datetime.date.today()


2025-11-06 01:30:41,489 INFO: Initializing external client
2025-11-06 01:30:41,497 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-11-06 01:30:43,438 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279137


In [10]:
CITY = "Singapore"
COUNTRY = "Singapore"
LATITUDE = 1.3667
LONGITUDE = 103.8

In [8]:
# Retrieve feature groups
air_quality_fg = fs.get_feature_group(
    name='air_quality',
    version=1,
)
weather_fg = fs.get_feature_group(
    name='weather',
    version=1,
)

In [5]:
# Get air quality data for all sensors
def get_pm25(sensor_id, city, date, api_key):
    url = f"https://api.waqi.info/feed/@{sensor_id}/?token={api_key}"
    response = requests.get(url)
    if response.status_code != 200:
        raise requests.exceptions.RequestException(f"Failed to retrieve data. Status Code: {response.status_code}")
    
    data = response.json()
    if data['status'] != 'ok':
        raise Exception(f"Error: {data['status']}")
    
    aqi_data = data['data']
    pm25_value = aqi_data['iaqi'].get('pm25', {}).get('v', None)
    
    return pd.DataFrame({
        'date': [pd.to_datetime(date)],
        'pm25': [float(pm25_value) if pm25_value is not None else None],
        'city': [city],
        'sensor_id': [sensor_id]
    })

all_aq_data = []
for location in locations:
    sensor_id = location['sensor_id']
    city = location['city']
    df_aq = get_pm25(sensor_id, city, today, AQICN_API_KEY)
    all_aq_data.append(df_aq)
    print(f"Retrieved PM2.5 for sensor {sensor_id} ({location['street']}): {df_aq['pm25'].iloc[0]}")

aq_today_df = pd.concat(all_aq_data, ignore_index=True)
aq_today_df['pm25'] = aq_today_df['pm25'].astype('float32')
print(f"\nTotal sensors processed: {len(all_aq_data)}")

Retrieved PM2.5 for sensor 1666 (Central): 95.0
Retrieved PM2.5 for sensor 1664 (East): 93.0
Retrieved PM2.5 for sensor 1665 (West): 55.0
Retrieved PM2.5 for sensor 1662 (North): 72.0
Retrieved PM2.5 for sensor 1663 (South): 74.0

Total sensors processed: 5


In [9]:
# Insert air quality data for all sensors
air_quality_fg.insert(aq_today_df)
print("Air quality data inserted successfully")

2025-11-06 01:31:53,311 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279137/fs/1265747/fg/1596048


Uploading Dataframe: 100.00% |██████████| Rows 5/5 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: air_quality_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279137/jobs/named/air_quality_1_offline_fg_materialization/executions
Air quality data inserted successfully


In [None]:
url = "https://api.open-meteo.com/v1/ecmwf"

params = {
    "latitude": LATITUDE,
    "longitude": LONGITUDE,
    "format": "json",
    "hourly": ["temperature_2m", "precipitation", "wind_speed_10m", "wind_direction_10m"]
}

response = requests.get(url, params=params)
if response.status_code != 200:
    raise requests.exceptions.RequestException(f"Failed to retrieve weather data. Status Code: {response.status_code}")

data = response.json()


print(data)



hourly = data["hourly"]
hourly_df = pd.DataFrame({
    "time": pd.to_datetime(hourly["time"]),
    "temperature_2m": hourly["temperature_2m"],
    "precipitation": hourly["precipitation"],
    "wind_speed_10m": hourly["wind_speed_10m"],
    "wind_direction_10m": hourly["wind_direction_10m"]
})

hourly_df = hourly_df.set_index('time')

mean_temp = hourly_df["temperature_2m"].mean()
precipitation_sum = hourly_df["precipitation"].sum()
max_wind_speed = hourly_df["wind_speed_10m"].max()
dominant_wind_direction = hourly_df["wind_direction_10m"].mode()[0] if not hourly_df["wind_direction_10m"].mode().empty else np.nan

daily_weather_df = pd.DataFrame({
    "date": [datetime.datetime.now().strftime("%Y-%m-%d")],
    "temperature_2m_mean": [mean_temp],
    "precipitation_sum": [precipitation_sum],
    "wind_speed_10m_max": [max_wind_speed],
    "wind_direction_10m_dominant": [dominant_wind_direction],
    "city": [CITY]
})



{'latitude': 1.5, 'longitude': 103.5, 'generationtime_ms': 0.05984306335449219, 'utc_offset_seconds': 0, 'timezone': 'GMT', 'timezone_abbreviation': 'GMT', 'elevation': 39.0, 'hourly_units': {'time': 'iso8601', 'temperature_2m': '°C', 'precipitation': 'mm', 'wind_speed_10m': 'km/h', 'wind_direction_10m': '°'}, 'hourly': {'time': ['2025-11-06T00:00', '2025-11-06T01:00', '2025-11-06T02:00', '2025-11-06T03:00', '2025-11-06T04:00', '2025-11-06T05:00', '2025-11-06T06:00', '2025-11-06T07:00', '2025-11-06T08:00', '2025-11-06T09:00', '2025-11-06T10:00', '2025-11-06T11:00', '2025-11-06T12:00', '2025-11-06T13:00', '2025-11-06T14:00', '2025-11-06T15:00', '2025-11-06T16:00', '2025-11-06T17:00', '2025-11-06T18:00', '2025-11-06T19:00', '2025-11-06T20:00', '2025-11-06T21:00', '2025-11-06T22:00', '2025-11-06T23:00', '2025-11-07T00:00', '2025-11-07T01:00', '2025-11-07T02:00', '2025-11-07T03:00', '2025-11-07T04:00', '2025-11-07T05:00', '2025-11-07T06:00', '2025-11-07T07:00', '2025-11-07T08:00', '2025-11

In [20]:
daily_weather_df

Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city
0,2025-11-06,26.475,52.1,16.5,360,Singapore


In [21]:
weather_fg.insert(daily_weather_df, wait=True)


FeatureStoreException: Features are not compatible with Feature Group schema: 
 - date (expected type: 'timestamp', derived from input: 'date') has the wrong type.
Note that feature (or column) names are case insensitive and spaces are automatically replaced with underscores.