In [0]:
%pip install lxml

In [0]:
import pandas as pd
from modules.clients import SnotelClient, NWSClient
from modules.database import upsert_to_delta, get_table_watermark
from modules.transformer import generate_combined_forecast
from config.settings import *

In [0]:
# 1. Setup Parameters
# run_mode controls how much observation history is requested:
#   "incremental" - resume from the newest date already stored in TBL_SNOW_OBS
#   "full"        - reload everything starting at BACKFILL_START_DATE
run_mode = "full"  # options: ["incremental", "full"]

# Fail fast on a typo. Without this check any unrecognised value silently takes
# the full-load branch below and is still forwarded to upsert_to_delta as `mode`.
VALID_RUN_MODES = ("incremental", "full")
if run_mode not in VALID_RUN_MODES:
    raise ValueError(f"Unknown run_mode {run_mode!r}; expected one of {VALID_RUN_MODES}")

# Earliest date requested when there is no stored watermark to resume from.
# Kept in one place so the incremental fallback and the full load cannot drift.
BACKFILL_START_DATE = "2026-01-05"

snotel = SnotelClient()
nws = NWSClient()

# 2. Determine Dates
if run_mode == "incremental":
    # Resume from the watermark date itself (no +1 day offset), so that day is
    # requested again on the next run.
    last_date = get_table_watermark(TBL_SNOW_OBS, "date")
    start_date = last_date.strftime('%Y-%m-%d') if last_date else BACKFILL_START_DATE
    # NOTE(review): incremental runs end one day after today while full runs end
    # today — confirm this asymmetry is intentional.
    end_date = (pd.Timestamp.now() + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
else:
    start_date = BACKFILL_START_DATE
    end_date = pd.Timestamp.now().strftime('%Y-%m-%d')

print(f"Running in {run_mode} mode with dates: {start_date} to {end_date}")

In [0]:
# 3. Process Observations
# Load the station catalogue from Delta and keep only its first five rows.
# NOTE(review): the five-station cap looks like a development limit — confirm
# before relying on this notebook for a complete load.
stations = spark.table(TBL_WEATHER_STATIONS).toPandas().head(5)

# Request SNOTEL history station by station (in row order) and keep only the
# results for which the client returned something other than None.
fetched_frames = (
    snotel.fetch_historical_data(station['site_id'], station['ntwk'], start_date, end_date)
    for _, station in stations.iterrows()
)
all_obs = [frame for frame in fetched_frames if frame is not None]

if all_obs:
    from modules.transformer import transform_historical_data

    # Stack the raw per-station frames into a single frame.
    obs_df = pd.concat(all_obs)

    # Map the raw columns (Station Id -> site_id, etc.) onto the Delta schema;
    # merging the untransformed frame is what produced the
    # [DELTA_MERGE_UNRESOLVED_EXPRESSION] error.
    clean_obs_df = transform_historical_data(obs_df)

    # 'date_hr' together with 'site_id' forms the merge key, giving a more
    # granular uniqueness constraint.
    merge_keys = ["date_hr", "site_id"]
    upsert_to_delta(clean_obs_df, TBL_SNOW_OBS, merge_keys, mode=run_mode)

In [0]:
# 4. Process Forecasts (Always upsert/refresh)
# For every station, fetch the hourly forecast and the snow grid data from the
# NWS client and tag each frame with the station's site_id.
all_hourly = []
all_snow_grid = []
for _, row in stations.iterrows():
    # NOTE(review): assumes the NWS client may return None on a failed lookup,
    # as the SNOTEL client does in the observation step — confirm against the
    # client. Skipping None avoids an AttributeError on `.assign`.
    hourly = nws.get_hourly_forecast(row['lat'], row['lon'])
    if hourly is not None:
        all_hourly.append(hourly.assign(site_id=row['site_id']))

    snow_grid = nws.get_snow_grid_data(row['lat'], row['lon'])
    if snow_grid is not None:
        all_snow_grid.append(snow_grid.assign(site_id=row['site_id']))

# pd.concat raises ValueError on an empty list, so only write a table when at
# least one frame was collected (mirrors the `if all_obs:` guard used for
# observations). An empty fetch leaves the existing table untouched.
if all_hourly:
    upsert_to_delta(pd.concat(all_hourly), TBL_WEATHER_FCST, ["startTime", "site_id"], mode="overwrite")
else:
    print("No hourly forecasts fetched; skipping weather forecast write")

if all_snow_grid:
    upsert_to_delta(pd.concat(all_snow_grid), TBL_SNOW_FCST, ["snow_start", "site_id"], mode="overwrite")
else:
    print("No snow grid data fetched; skipping snow forecast write")


In [0]:
# 5. Aggregate (Efficient Spark Join)
# Runs generate_combined_forecast from modules.transformer with no arguments.
# NOTE(review): presumably this joins the observation and forecast tables
# written in the earlier steps into gold.weather.combined_forecast (queried in
# the following cell) — confirm against the transformer module, which is not
# visible from this notebook.
generate_combined_forecast()

In [0]:
%sql
-- Preview every column of the combined forecast table.
SELECT *
FROM gold.weather.combined_forecast

In [0]:
# Render the bronze.raw.weather_forecast table for a visual check.
forecast_preview = spark.table("bronze.raw.weather_forecast")
display(forecast_preview)