In [7]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple

class WeatherDataLoader:
    """Load historical weather data from Open-Meteo API for dual granularity models."""
    
    BASE_URL = "https://archive-api.open-meteo.com/v1/archive"
    SYDNEY_COORDS = {"latitude": -33.8678, "longitude": 151.2073}
    
    # Features for daily rain classification (aggregated daily values)
    DAILY_FEATURES = [
        "temperature_2m_max", "temperature_2m_min", "temperature_2m_mean",
        "relative_humidity_2m_max", "relative_humidity_2m_min", 
        "pressure_msl_mean", "wind_speed_10m_max", "wind_speed_10m_mean",
        "wind_direction_10m_dominant", "precipitation_sum", "rain_sum",
        "shortwave_radiation_sum", "daylight_duration"
    ]
    
    # Features for hourly precipitation regression (detailed hourly values)
    HOURLY_FEATURES = [
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "precipitation", "rain", "pressure_msl", "cloud_cover",
        "wind_speed_10m", "wind_direction_10m", "shortwave_radiation",
        "surface_pressure", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high"
    ]
    
    def fetch_daily_data(
        self, 
        start_date: str, 
        end_date: str,
        features: Optional[List[str]] = None
    ) -> pd.DataFrame:

        if features is None:
            features = self.DAILY_FEATURES
            
        params = {
            **self.SYDNEY_COORDS,
            "start_date": start_date,
            "end_date": end_date,
            "daily": features,
            "timezone": "Australia/Sydney"
        }
        
        response = requests.get(self.BASE_URL, params=params)
        response.raise_for_status()
        
        data = response.json()
        daily_data = data["daily"]
        
        df = pd.DataFrame({
            "date": pd.to_datetime(daily_data["time"]),
            **{k: v for k, v in daily_data.items() if k != "time"}
        })
        
        return df.set_index("date")
    
    def fetch_hourly_data(
        self, 
        start_date: str, 
        end_date: str,
        features: Optional[List[str]] = None
    ) -> pd.DataFrame:

        if features is None:
            features = self.HOURLY_FEATURES
            
        params = {
            **self.SYDNEY_COORDS,
            "start_date": start_date,
            "end_date": end_date,
            "hourly": features,
            "timezone": "Australia/Sydney"
        }
        
        response = requests.get(self.BASE_URL, params=params)
        response.raise_for_status()
        
        data = response.json()
        hourly_data = data["hourly"]
        
        df = pd.DataFrame({
            "datetime": pd.to_datetime(hourly_data["time"]),
            **{k: v for k, v in hourly_data.items() if k != "time"}
        })
        
        return df.set_index("datetime")
    
    def create_target_variables(
        self, 
        daily_df: pd.DataFrame, 
        hourly_df: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:

        # Create 7-day rain classification target (daily)
        daily_df = daily_df.copy()
        daily_df['rain_in_7_days'] = (daily_df['rain_sum'].shift(-7) > 0).astype(int)
        
        # Create 3-day cumulative precipitation target (hourly)
        hourly_df = hourly_df.copy()
        hourly_df['precipitation_next_72h'] = (
            hourly_df['precipitation']
            .rolling(window=72)
            .sum()
            .shift(-72)
        )
        
        # Remove rows where targets cannot be calculated
        daily_df = daily_df.dropna(subset=['rain_in_7_days'])
        hourly_df = hourly_df.dropna(subset=['precipitation_next_72h'])
        
        return daily_df, hourly_df
    
    def load_complete_datasets(
        self, 
        start_date: str = "2020-01-01",
        end_date: str = "2025-06-30"
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        
        print("Fetching daily weather data...")
        daily_df = self.fetch_daily_data(start_date, end_date)
        
        print("Fetching hourly weather data...")
        hourly_df = self.fetch_hourly_data(start_date, end_date)
        
        print("Creating target variables...")
        daily_with_targets, hourly_with_targets = self.create_target_variables(
            daily_df, hourly_df
        )
        
        print(f"Daily dataset: {len(daily_with_targets)} days")
        print(f"Hourly dataset: {len(hourly_with_targets)} hours")
        
        return daily_with_targets, hourly_with_targets

In [8]:
daily_with_targets, hourly_with_targets = WeatherDataLoader().load_complete_datasets()

Fetching daily weather data...
Fetching hourly weather data...
Creating target variables...
Daily dataset: 2008 days
Hourly dataset: 48120 hours


In [9]:
daily_with_targets

Unnamed: 0_level_0,temperature_2m_max,temperature_2m_min,temperature_2m_mean,relative_humidity_2m_max,relative_humidity_2m_min,pressure_msl_mean,wind_speed_10m_max,wind_speed_10m_mean,wind_direction_10m_dominant,precipitation_sum,rain_sum,shortwave_radiation_sum,daylight_duration,rain_in_7_days
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-01-01,23.9,19.2,21.6,82,56,1014.3,19.6,16.3,168,0.0,0.0,28.18,51704.47,1
2020-01-02,25.2,20.2,22.1,86,55,1017.9,19.5,14.3,137,0.9,0.9,24.85,51671.07,1
2020-01-03,28.8,20.8,23.7,95,56,1014.4,26.3,14.4,67,1.7,1.7,26.92,51634.70,0
2020-01-04,41.8,19.9,29.4,97,18,1008.0,41.8,15.5,29,0.0,0.0,28.23,51595.43,1
2020-01-05,24.4,20.4,21.9,73,59,1017.9,34.6,24.8,169,0.2,0.2,22.91,51553.35,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-06-26,14.1,6.7,10.5,66,41,1024.0,25.0,16.6,238,0.0,0.0,11.13,35670.66,0
2025-06-27,14.4,7.2,10.9,78,59,1030.0,16.4,11.9,230,0.0,0.0,6.27,35685.55,0
2025-06-28,16.3,7.9,12.0,89,63,1027.8,10.9,6.7,267,0.0,0.0,9.40,35703.02,0
2025-06-29,17.2,6.1,11.7,94,48,1024.9,10.7,5.3,269,0.0,0.0,10.95,35723.03,0


In [11]:
hourly_with_targets

Unnamed: 0_level_0,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,pressure_msl,cloud_cover,wind_speed_10m,wind_direction_10m,shortwave_radiation,surface_pressure,cloud_cover_low,cloud_cover_mid,cloud_cover_high,precipitation_next_72h
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-01-01 00:00:00,19.7,78,15.8,0.0,0.0,1013.5,99,19.3,189,0.0,1007.5,63,97,18,2.6
2020-01-01 01:00:00,19.5,79,15.7,0.0,0.0,1012.8,100,19.6,193,0.0,1006.8,100,99,86,2.6
2020-01-01 02:00:00,19.5,78,15.5,0.0,0.0,1012.9,100,17.5,199,0.0,1006.9,100,77,70,2.6
2020-01-01 03:00:00,19.2,77,15.2,0.0,0.0,1012.5,100,17.7,197,0.0,1006.5,100,0,0,2.6
2020-01-01 04:00:00,19.6,74,14.8,0.0,0.0,1012.6,100,19.0,195,0.0,1006.6,100,0,0,2.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-06-27 19:00:00,13.2,72,8.3,0.0,0.0,1030.3,29,14.3,185,0.0,1024.1,15,18,0,0.0
2025-06-27 20:00:00,13.3,72,8.4,0.0,0.0,1030.3,14,13.8,187,0.0,1024.1,3,12,0,0.0
2025-06-27 21:00:00,13.0,73,8.3,0.0,0.0,1030.1,25,12.0,194,0.0,1023.9,18,18,0,0.1
2025-06-27 22:00:00,12.4,76,8.2,0.0,0.0,1030.0,38,10.0,214,0.0,1023.7,4,37,0,0.4


In [12]:
import os

# Save datasets as CSV to the specified local path
base_dir = "/Users/afrazrupak/weather_forecast/weather_forecast/data/raw/"
os.makedirs(base_dir, exist_ok=True)

daily_path = os.path.join(base_dir, "daily_with_targets.csv")
hourly_path = os.path.join(base_dir, "hourly_with_targets.csv")

daily_with_targets.to_csv(daily_path, index=True)
hourly_with_targets.to_csv(hourly_path, index=True)

print(f"Saved daily dataset to: {daily_path}")
print(f"Saved hourly dataset to: {hourly_path}")

Saved daily dataset to: /Users/afrazrupak/weather_forecast/weather_forecast/data/raw/daily_with_targets.csv
Saved hourly dataset to: /Users/afrazrupak/weather_forecast/weather_forecast/data/raw/hourly_with_targets.csv
