In [None]:
import os
from datetime import timedelta, datetime
import pickle
import pandas as pd

from private_uoapi import (LightsailWrapper, LSAuth, LSConfig, DateRangeParams, convert_to_dataframe)
from gnn_package.src.utils.sensor_utils import get_sensor_name_id_map
from gnn_package.config.paths import RAW_TIMESERIES_DIR

In [None]:
# Vehicle class kept by fetch_traffic_data; it is also embedded in the output filenames.
VEH_CLASS = 'person'

# Display the directory the pickled time series are written to. A bare expression is
# only rendered by the notebook when it is the last line of the cell.
RAW_TIMESERIES_DIR

In [None]:
def _iter_fetch_windows(start_date, end_date, window_size_days):
    """Return consecutive (window_start, window_end) pairs covering start_date..end_date."""
    step = timedelta(days=window_size_days)
    windows = []
    window_start = start_date
    # Walk forward until the whole range is covered; the last window is clipped to
    # end_date, so a trailing partial day is still requested instead of being dropped.
    while window_start < end_date:
        window_end = min(window_start + step, end_date)
        windows.append((window_start, window_end))
        window_start = window_end
    return windows


async def fetch_traffic_data(start_date, end_date, output_filename, window_size_days=7):
    """
    Fetch traffic data from the API and save it in the format required by run_experiment.py.

    The requested range is split into consecutive windows of at most
    ``window_size_days`` days. Each window is fetched, filtered to ``VEH_CLASS``,
    summed over everything except timestamp/class/location, and appended to one
    time series per sensor ID. Sensors with fewer than 24 observations are dropped
    before the result is pickled to ``RAW_TIMESERIES_DIR / output_filename``.

    A window that fails is reported and skipped (best effort); a summary of failed
    windows is printed at the end.

    Parameters:
    -----------
    start_date : datetime
        Start date for data fetching
    end_date : datetime
        End date for data fetching
    output_filename : str
        Name of the output file to save the data
    window_size_days : int
        Size of the time window for each API request (to avoid timeout issues).
        Must be at least 1.

    Returns:
    --------
    dict
        Mapping of sensor ID to a pandas Series of summed values indexed by
        timestamp, sorted and without duplicate timestamps.

    Raises:
    -------
    ValueError
        If ``window_size_days`` is smaller than 1.
    """
    if window_size_days < 1:
        raise ValueError(f"window_size_days must be >= 1, got {window_size_days}")

    print(f"Fetching traffic data from {start_date} to {end_date}")

    # Initialize API client
    config = LSConfig()
    auth = LSAuth(config)
    client = LightsailWrapper(config, auth)

    print(f"Using base URL: {config.base_url}")
    print(f"Using username: {config.username}")
    # Fixed-width mask: does not reveal the secret's length and does not crash when unset.
    print(f"Using secret key: {'********' if config.secret_key else '<not set>'}")

    # Get sensor name to ID mapping
    name_id_map = get_sensor_name_id_map()

    # Create time windows to fetch data in chunks
    windows = _iter_fetch_windows(start_date, end_date, window_size_days)
    num_windows = len(windows)

    series_parts = {}  # sensor_id -> list of per-window Series, in fetch order
    unique_veh_classes = set()
    failed_windows = []

    for i, (window_start, window_end) in enumerate(windows):
        print(f"Fetching window {i+1}/{num_windows}: {window_start} to {window_end}")

        # Create date range parameters
        date_range_params = DateRangeParams(
            start_date=window_start,
            end_date=window_end,
            max_date_range=timedelta(days=window_size_days + 1),
        )

        try:
            # Fetch data for this window
            count_data = await client.get_traffic_data(date_range_params)
            counts_df = convert_to_dataframe(count_data)

            # Filter to only include the vehicle class of interest.
            # Note: the class set is collected after filtering, so it can only ever
            # contain VEH_CLASS (or be empty).
            counts_df = counts_df[counts_df["veh_class"] == VEH_CLASS]
            unique_veh_classes.update(counts_df["veh_class"].unique())

            # Aggregate data on direction
            counts_df = counts_df.groupby(["dt", "veh_class", "location"]).agg({"value": "sum"}).reset_index()

            # Build one Series per location in a single step instead of row by row
            for location, location_df in counts_df.groupby("location"):
                if location_df.empty:
                    continue

                # Get the sensor ID from the mapping
                if location not in name_id_map:
                    print(f"Warning: Location {location} not found in mapping, skipping")
                    continue
                sensor_id = name_id_map[location]

                window_series = pd.Series(
                    location_df["value"].to_numpy(),
                    index=pd.DatetimeIndex(location_df["dt"]).rename(None),
                )
                series_parts.setdefault(sensor_id, []).append(window_series)

            # Report the rows of the whole window, not just the last location seen
            print(f"Processed {len(counts_df)} records for window {i+1}")

        except Exception as e:
            # Deliberate best effort: one bad window must not abort the whole fetch
            print(f"Error fetching data for window {i+1}: {str(e)}")
            failed_windows.append(i + 1)

    # Process and clean the series
    all_data = {}
    for sensor_id, parts in series_parts.items():
        combined = pd.concat(parts)
        # Adjacent windows share a boundary timestamp; keep the first value seen,
        # then sort by timestamp.
        combined = combined[~combined.index.duplicated(keep='first')]
        all_data[sensor_id] = combined.sort_index()

    # Filter out sensors with very little data
    MIN_DATA_POINTS = 24  # one day of data if observations are hourly (TODO confirm cadence)
    filtered_data = {k: v for k, v in all_data.items() if len(v) >= MIN_DATA_POINTS}

    print(f"Collected data for {len(filtered_data)} sensors out of {len(name_id_map)} total sensors")
    if failed_windows:
        print(f"Warning: {len(failed_windows)} of {num_windows} windows failed: {failed_windows}")

    # Save the data
    output_path = RAW_TIMESERIES_DIR / output_filename
    with open(output_path, "wb") as f:
        pickle.dump(filtered_data, f)

    print(f"Unique vehicle classes found: {unique_veh_classes}")
    print(f"Data saved to {output_path}")

    return filtered_data

In [None]:
# Example 1: the "1wk" sample
async def fetch_data_example_1wk():
    """Fetch the "1wk" sample and return what fetch_traffic_data returns.

    The range ends 10 days before the current time and begins 17 days before
    that end; it is requested in 1-day windows and saved as
    ``test_data_1wk_<VEH_CLASS>.pkl``.

    NOTE(review): the span is 17 days although the name and filename say one
    week -- confirm this is intended.
    """
    range_end = datetime.now() - timedelta(days=10)  # skip the most recent 10 days
    range_start = range_end - timedelta(days=17)

    request = dict(
        start_date=range_start,
        end_date=range_end,
        output_filename=f"test_data_1wk_{VEH_CLASS}.pkl",
        window_size_days=1,  # small windows to avoid request timeouts
    )
    return await fetch_traffic_data(**request)

# Example 2: the "1mo" sample
async def fetch_data_example_1mo():
    """Fetch the "1mo" sample and return what fetch_traffic_data returns.

    The range ends 10 days before the current time and begins 40 days before
    that end; it is requested in 7-day windows and saved as
    ``test_data_1mo_<VEH_CLASS>.pkl``.

    NOTE(review): the span is 40 days although the name and filename say one
    month -- confirm this is intended.
    """
    range_end = datetime.now() - timedelta(days=10)  # skip the most recent 10 days
    range_start = range_end - timedelta(days=40)

    request = dict(
        start_date=range_start,
        end_date=range_end,
        output_filename=f"test_data_1mo_{VEH_CLASS}.pkl",
        window_size_days=7,  # weekly windows
    )
    return await fetch_traffic_data(**request)

# Example 3: Fetch data for 3 months
async def fetch_data_example_3mo():
    """Fetch the "3mo" sample and return what fetch_traffic_data returns.

    The range ends 10 days before the current time and begins 90 days before
    that end; it is requested in 7-day windows and saved as
    ``test_data_3mo_<VEH_CLASS>.pkl``.
    """
    range_end = datetime.now() - timedelta(days=10)  # skip the most recent 10 days
    range_start = range_end - timedelta(days=90)

    request = dict(
        start_date=range_start,
        end_date=range_end,
        output_filename=f"test_data_3mo_{VEH_CLASS}.pkl",
        window_size_days=7,  # weekly windows
    )
    return await fetch_traffic_data(**request)

# Example 4: Fetch data for 1 year
async def fetch_data_example_1yr():
    """Fetch the "1yr" sample and return what fetch_traffic_data returns.

    The range ends 10 days before the current time and begins 365 days before
    that end; it is requested in 30-day windows and saved as
    ``test_data_1yr_<VEH_CLASS>.pkl``.
    """
    range_end = datetime.now() - timedelta(days=10)  # skip the most recent 10 days
    range_start = range_end - timedelta(days=365)

    request = dict(
        start_date=range_start,
        end_date=range_end,
        output_filename=f"test_data_1yr_{VEH_CLASS}.pkl",
        window_size_days=30,  # 30-day windows
    )
    return await fetch_traffic_data(**request)

In [None]:
# Run every example fetch in turn; each call is awaited to completion before the next
# one starts. Top-level `await` is not valid in a plain Python script, so this cell
# relies on the notebook kernel supporting it.
# Each call returns the filtered {sensor_id: Series} dict that fetch_traffic_data also
# pickles under RAW_TIMESERIES_DIR.
data_1wk = await fetch_data_example_1wk()
data_1mo = await fetch_data_example_1mo()
data_3mo = await fetch_data_example_3mo()
data_1yr = await fetch_data_example_1yr()

In [None]:
# Summarise the "1wk" result instead of dumping every Series into the cell output:
# distribution of the number of records per sensor (count = number of sensors kept).
pd.Series(
    {sensor_id: len(series) for sensor_id, series in data_1wk.items()},
    name="n_records",
    dtype="int64",
).describe()