# Weather data enrichment

This notebook shows how to fetch hourly weather information from the [Open-Meteo](https://open-meteo.com/) API and combine it with the cleaned trip data. We select a small subset of trips, request temperature, rain and weather code for each corresponding hour, then merge the results back.


In [None]:
import pandas as pd
import requests_cache
from retry_requests import retry
import openmeteo_requests


In [None]:
# Build the HTTP stack used by the Open-Meteo client, innermost layer first:
#   1. a caching session (cache name '.cache', expire_after=3600) so re-running
#      the notebook does not repeat identical API requests,
#   2. a retry wrapper around it (up to 5 retries, backoff factor 0.2),
#   3. the Open-Meteo client that sends its requests through that session.
cache_session = requests_cache.CachedSession(
    '.cache',
    expire_after=3600,
)
retry_session = retry(
    cache_session,
    backoff_factor=0.2,
    retries=5,
)
openmeteo = openmeteo_requests.Client(session=retry_session)


In [None]:
# Read the cleaned trips, interpret `start_time` as America/Los_Angeles wall-clock
# time, and derive `time_rounded`: the start time converted to UTC and rounded to
# the nearest hour.
df = pd.read_csv('jupyter/cleaned_trip_data.csv', parse_dates=['start_time'])

# DST handling: wall-clock times skipped by the spring-forward jump are shifted
# forward to the next valid instant; times that occur twice in the fall-back hour
# cannot be resolved and become NaT.
local = df['start_time'].dt.tz_localize(
    'America/Los_Angeles',
    ambiguous='NaT',
    nonexistent='shift_forward',
)
df = df.assign(time_rounded=local.dt.tz_convert('UTC').dt.round('h'))
df.head()


In [None]:
# Keep only the first 50 distinct hours to limit the number of API calls.
#
# `time_rounded` can contain NaT (ambiguous local times are localized with
# ambiguous='NaT'). NaT sorts last, so without the dropna() it would slip into
# the selection only when the data has fewer than 50 distinct hours -- making
# `filtered` depend on dataset size and producing a request for an undefined
# hour. Dropping it up front keeps the selection to real hours in every case.
selected_hours = (
    df['time_rounded']
    .dropna()
    .drop_duplicates()
    .sort_values()
    .head(50)
)

# Trips that start in one of the selected hours.
filtered = df[df['time_rounded'].isin(selected_hours)]

# One row per distinct (location, hour) pair: each row is one weather lookup.
requests_df = filtered[['start_lat', 'start_lon', 'time_rounded']].drop_duplicates()
requests_df.head()


In [None]:
def fetch_hourly_weather(lat, lon, timestamp):
    """Return temperature, rain and weather code for the given UTC hour.

    Parameters
    ----------
    lat, lon : float
        Coordinates sent as the ``latitude`` / ``longitude`` request parameters.
    timestamp : pandas.Timestamp
        The hour to look up. It is compared against a UTC time axis, so it is
        expected to be timezone-aware UTC. Missing values (NaT/None) are
        accepted and yield the "no data" result.

    Returns
    -------
    tuple
        ``(temperature_2m, rain, weather_code)`` taken from the hourly slot
        closest to ``timestamp``, or ``(None, None, None)`` when ``timestamp``
        is missing or the response has no slot within one time step of it.
    """
    # NaT has no strftime(); treat an undefined hour as "no data" instead of raising.
    if pd.isna(timestamp):
        return None, None, None

    # NOTE(review): this is the forecast endpoint; it may not serve dates far in
    # the past. If the trips are historical, confirm whether the archive
    # endpoint should be used instead.
    url = 'https://api.open-meteo.com/v1/forecast'
    day = timestamp.strftime("%Y-%m-%d")
    params = {
        'latitude': lat,
        'longitude': lon,
        'hourly': ['temperature_2m', 'rain', 'weather_code'],
        'start_date': day,
        'end_date': day,
        'timezone': 'UTC',
        'timeformat': 'unixtime',
    }
    responses = openmeteo.weather_api(url, params=params)
    hourly = responses[0].Hourly()

    # The client describes the hourly axis by its start, end and step (all in
    # seconds) rather than returning one timestamp per value, so the axis is
    # rebuilt here. The end bound is exclusive.
    step = pd.Timedelta(seconds=hourly.Interval())
    times = pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=step,
        inclusive="left",
    )
    if len(times) == 0:
        return None, None, None

    # Nearest slot, but only if it lies within one step of the requested hour;
    # otherwise get_indexer reports -1 and we return "no data".
    idx = times.get_indexer([timestamp], method="nearest", tolerance=step)[0]
    if idx == -1:
        return None, None, None

    # Variables come back in the same order as the 'hourly' request list.
    temperature = hourly.Variables(0).ValuesAsNumpy()[idx]
    rain = hourly.Variables(1).ValuesAsNumpy()[idx]
    code = hourly.Variables(2).ValuesAsNumpy()[idx]
    return temperature, rain, code

# Collect weather for each distinct (location, hour) pair and attach it to the trips.
#
# Each lookup is a network call. A failure for one row is reported and recorded
# as missing values instead of aborting the whole loop, which matches the error
# handling of the next cell.
# NOTE(review): the next cell repeats this collection and merge under the name
# `merged`; consider keeping only one of the two.
records = []
for _, row in requests_df.iterrows():
    try:
        temp, rain, code = fetch_hourly_weather(row["start_lat"], row["start_lon"], row["time_rounded"])
    except Exception as e:
        print("weather fetch failed:", e)
        temp = rain = code = None
    records.append({
        "start_lat": row["start_lat"],
        "start_lon": row["start_lon"],
        "time_rounded": row["time_rounded"],
        "temperature_2m": temp,
        "rain": rain,
        "weather_code": code,
    })
weather_df = pd.DataFrame(records)

# Left join so every selected trip is kept, even when its lookup produced no data.
result = filtered.merge(weather_df, on=["start_lat", "start_lon", "time_rounded"], how="left")
result.head()


In [None]:
# Look up the weather once per distinct (location, hour) pair. A lookup that
# raises is reported and stored as missing values so the remaining rows are
# still processed.
records = []
for _, row in requests_df.iterrows():
    lat, lon, hour = row['start_lat'], row['start_lon'], row['time_rounded']
    try:
        temp, rain, code = fetch_hourly_weather(lat, lon, hour)
    except Exception as e:
        print('weather fetch failed:', e)
        temp, rain, code = None, None, None
    record = {
        'start_lat': lat,
        'start_lon': lon,
        'time_rounded': hour,
        'temperature_2m': temp,
        'rain': rain,
        'weather_code': code,
    }
    records.append(record)
weather_df = pd.DataFrame(records)

# Left join on location and hour: every selected trip is kept, with empty
# weather columns where the lookup failed.
merged = pd.merge(
    filtered,
    weather_df,
    how='left',
    on=['start_lat', 'start_lon', 'time_rounded'],
)
merged.head()
