# New Metadata parser
Instead of looping through each station individually, build the metadata dict all at once.

In [1]:
import synoptic

In [2]:
# Request data from Synoptic's "latest" service (first positional argument).
# The commented-out keyword arguments are alternative filters kept for
# experimentation.
S = synoptic.SynopticAPI(
    "latest",
    # vars="air_temp,wind_speed,ozone_concentration",
    state="ut",
    # network=1,
    # NOTE(review): presumably asks for the complete station metadata -- confirm
    # against the Synoptic API documentation.
    complete=True,
)
# Bare last expression: display the response summary.
S

🚚💨 Speedy delivery from Synoptic's [32mlatest[0m service.
📦 Received data from 1,447 stations.


╭─ Synoptic latest service ─────
│ Stations : 1,447
│ QC Checks: 1
╰──────────────────────────────────────╯

In [3]:
import polars as pl


def unnest_period_of_record(
    df: pl.DataFrame | pl.LazyFrame,
) -> pl.DataFrame | pl.LazyFrame:
    """Replace the PERIOD_OF_RECORD struct column with two datetime columns.

    The struct's ``start`` and ``end`` fields are cast to strings, parsed as
    UTC datetimes, and exposed as ``PERIOD_OF_RECORD_START`` and
    ``PERIOD_OF_RECORD_END``.

    Parameters
    ----------
    df : pl.DataFrame | pl.LazyFrame
        Frame holding a ``PERIOD_OF_RECORD`` struct column with ``start`` and
        ``end`` fields.

    Returns
    -------
    pl.DataFrame | pl.LazyFrame
        Same kind of frame, with the struct column swapped for the two
        datetime columns.
    """

    def _utc_field(field_name: str, column_name: str) -> pl.Expr:
        # One struct field -> timezone-aware datetime expression.
        return (
            pl.col("PERIOD_OF_RECORD")
            .struct.field(field_name)
            .cast(pl.String)
            .str.to_datetime(time_zone="UTC")
            .alias(column_name)
        )

    # Rebuild the struct from the parsed fields under the original column name
    # so that unnesting puts the new columns where the struct used to be.
    parsed_record = pl.struct(
        _utc_field("start", "PERIOD_OF_RECORD_START"),
        _utc_field("end", "PERIOD_OF_RECORD_END"),
    ).alias("PERIOD_OF_RECORD")

    return df.with_columns(parsed_record).unnest("PERIOD_OF_RECORD")


def station_metadata_to_dataframe(STATION: list[dict]):
    """Build the station metadata DataFrame from the STATION list.

    Parameters
    ----------
    STATION : list[dict]
        One dict per station. The ``OBSERVATIONS``, ``SENSOR_VARIABLES``,
        ``LATENCY`` and ``QC`` entries are ignored; the input dicts are not
        modified.

    Returns
    -------
    pl.DataFrame
        One row per station with lower-cased column names, typed
        identifier/coordinate columns, a boolean ``is_active`` column derived
        from ``STATUS``, and the period of record split into start/end
        datetime columns.
    """
    non_metadata_keys = ("OBSERVATIONS", "SENSOR_VARIABLES", "LATENCY", "QC")
    records = [
        {key: value for key, value in station.items() if key not in non_metadata_keys}
        for station in STATION
    ]

    # ACTIVE -> True, INACTIVE -> False, anything else -> null.
    status = pl.col("STATUS")
    inactive_or_null = pl.when(status == "INACTIVE").then(False)
    is_active = pl.when(status == "ACTIVE").then(True).otherwise(inactive_or_null)

    lf = (
        pl.DataFrame(records, infer_schema_length=None)
        .lazy()
        .with_columns(
            pl.col("STID").cast(pl.String),
            pl.col("ID", "MNET_ID").cast(pl.UInt32),
            pl.col("ELEVATION", "LATITUDE", "LONGITUDE").cast(pl.Float64),
            is_active=is_active,
        )
        .drop("UNITS", "STATUS")
    )

    # Optional columns: only touch them when the response provides them.
    available = lf.collect_schema().names()
    if "RESTRICTED" in available:
        lf = lf.rename({"RESTRICTED": "is_restricted"})
    if "ELEV_DEM" in available:
        # This isn't in the Latency request
        lf = lf.with_columns(pl.col("ELEV_DEM").cast(pl.Float64))

    lf = unnest_period_of_record(lf)

    # Lower-case every column name (looked up after the unnest so the new
    # period-of-record columns are included).
    lowercase_names = {name: name.lower() for name in lf.collect_schema().names()}
    return lf.rename(lowercase_names).collect()


def NEW_parse_stations_latest_nearesttime(STATION):
    """Placeholder for the new Latest/Nearest-time STATION parser.

    Not implemented yet: ``STATION`` is ignored and ``None`` is returned.
    """
    return None


In [4]:
# Unpack Latest/Nearest time JSON into parts
#
# The nested payloads are read with `.get` instead of `.pop`, so `S.STATION`
# is left untouched and this cell gives the same result when re-run;
# `station_metadata_to_dataframe` strips these keys from its own copies.
# The keys are upper-case, matching the ones that function removes.

observations = []
qc = []
latency = []
sensor_variables = []

for s in S.STATION:
    stid = {"stid": s["STID"]}
    # `or {}` covers both a missing key and a key that is present but None.
    observations.append(stid | (s.get("OBSERVATIONS") or {}))
    qc.append(stid | (s.get("QC") or {}))
    latency.append(stid | (s.get("LATENCY") or {}))
    sensor_variables.append(stid | (s.get("SENSOR_VARIABLES") or {}))


# Get Metadata DataFrame
metadata = station_metadata_to_dataframe(S.STATION)

# Get Observations DataFrame (needs more processing)
# infer_schema_length=None: scan every row to infer the schema.
df = pl.DataFrame(observations, infer_schema_length=None)

In [5]:
# BUG: Synoptic API ozone_concentration_value_1, the value is returned as string and not float
#
# Only apply the workaround when the column is present AND its "value" field
# is still a string. This keeps the cell from failing when the request has no
# ozone data, and makes it a no-op once the field is already Float64
# (e.g. when the cell is re-run).
ozone_col = "ozone_concentration_value_1"
ozone_dtype = df.schema.get(ozone_col)
if hasattr(ozone_dtype, "fields") and pl.Field("value", pl.String) in ozone_dtype.fields:
    df = df.with_columns(
        pl.struct(
            [
                pl.col(ozone_col)
                .struct.field("value")
                .replace("", None)  # empty strings become null before the cast
                .cast(pl.Float64),
                pl.col(ozone_col).struct.field("date_time"),
            ]
        ).alias(ozone_col)
    )


In [6]:
# Separate columns by value type
# TODO: Still need to handle sky_condition types
#
# Every struct column lands in exactly one list. Anything whose "value" field
# is neither Float64 nor String (nested structs, integers, a missing "value"
# field, ...) is collected in `cols_with_other` and reported, so it is never
# dropped silently from the later processing.

cols_with_float = []
cols_with_string = []
cols_with_other = []
for col, schema in df.schema.items():
    if hasattr(schema, "fields"):
        if pl.Field("value", pl.Float64) in schema.fields:
            cols_with_float.append(col)
        elif pl.Field("value", pl.String) in schema.fields:
            cols_with_string.append(col)
        else:
            cols_with_other.append(col)
            print(f"WARNING: Unknown struct for {col=} {schema=}")
    else:
        # Non-struct columns (e.g. the station id) are only reported.
        print(f"{col=}, {schema=}")

col='stid', schema=String


In [7]:
# Unpack the Float observations
#
# Column names look like "<variable>_value_<sensor_index>[d]", where a
# trailing "d" marks a derived value. The sensor index is matched with `\d+`
# so that multi-digit indices (e.g. "_value_10") are parsed in full.

observed_float = (
    df.select(["stid"] + cols_with_float)
    .select("stid", "^.*value.*$")
    # Long format: one row per (station, observation column).
    .unpivot(index="stid")
    .with_columns(
        pl.col("variable").str.extract_groups(
            r"(?<variable>.+)_value_(?<sensor_index>\d+)(?<is_derived>d?)"
        )
    )
    .unnest("variable")
    .with_columns(
        pl.col("is_derived") == "d",
        pl.col("sensor_index").cast(pl.UInt32),
        # Map each variable name to its unit through S.UNITS.
        pl.col("variable").replace(S.UNITS).alias("units"),
    )
    # Split the observation struct into its own columns.
    .unnest("value")
    .with_columns(pl.col("date_time").str.to_datetime())
    # Drop rows that contain a null in any column.
    .drop_nulls()
)
observed_float

stid,variable,sensor_index,is_derived,value,date_time,units
str,str,u32,bool,f64,"datetime[μs, UTC]",str
"""WBB""","""pressure""",1,false,85321.0,2024-11-17 05:45:00 UTC,"""Pascals"""
"""GNI""","""pressure""",1,false,87058.0,2024-10-31 07:40:00 UTC,"""Pascals"""
"""HATUT""","""pressure""",1,false,87010.0,2024-11-17 05:45:00 UTC,"""Pascals"""
"""LMS""","""pressure""",1,false,87237.0,2024-11-17 05:30:00 UTC,"""Pascals"""
"""LMR""","""pressure""",1,false,84175.0,2024-11-17 05:30:00 UTC,"""Pascals"""
…,…,…,…,…,…,…
"""UGSPG""","""wet_bulb_temp""",2,true,-0.81,2024-11-16 23:00:00 UTC,"""Celsius"""
"""UUCMF""","""wet_bulb_temp""",2,true,-1.59,2024-11-17 05:30:00 UTC,"""Celsius"""
"""UUPYF""","""wet_bulb_temp""",2,true,-4.74,2024-11-17 05:30:00 UTC,"""Celsius"""
"""NGLO161462""","""precip_interval""",1,false,0.0,2024-11-15 19:16:00 UTC,"""Millimeters"""


In [8]:
# Unpack the string observations
#
# Column names look like "<variable>_value_<sensor_index>[d]", where a
# trailing "d" marks a derived value. The sensor index is matched with `\d+`
# so that multi-digit indices (e.g. "_value_10") are parsed in full.


observed_string = (
    df.select(["stid"] + cols_with_string)
    .select("stid", "^.*value.*$")
    # Long format: one row per (station, observation column).
    .unpivot(index="stid")
    .with_columns(
        pl.col("variable").str.extract_groups(
            r"(?<variable>.+)_value_(?<sensor_index>\d+)(?<is_derived>d?)"
        )
    )
    .unnest("variable")
    .with_columns(
        pl.col("is_derived") == "d",
        pl.col("sensor_index").cast(pl.UInt32),
        # Map each variable name to its unit through S.UNITS.
        pl.col("variable").replace(S.UNITS).alias("units"),
    )
    # Split the observation struct into its own columns.
    .unnest("value")
    # Keep string values apart from the Float64 "value" column of the float
    # observations.
    .rename({"value": "value_string"})
    .with_columns(pl.col("date_time").str.to_datetime())
    # Drop rows that contain a null in any column.
    .drop_nulls()
)
observed_string


stid,variable,sensor_index,is_derived,date_time,value_string,units
str,str,u32,bool,"datetime[μs, UTC]",str,str
"""WBB""","""wind_cardinal_direction""",1,true,2024-11-17 05:45:00 UTC,"""E""","""code"""
"""HOL""","""wind_cardinal_direction""",1,true,2024-11-17 05:45:00 UTC,"""NNE""","""code"""
"""SBE""","""wind_cardinal_direction""",1,true,2024-11-17 05:00:00 UTC,"""SSE""","""code"""
"""SB2""","""wind_cardinal_direction""",1,true,2024-11-17 05:45:00 UTC,"""NW""","""code"""
"""PCB""","""wind_cardinal_direction""",1,true,2024-11-17 05:00:00 UTC,"""E""","""code"""
…,…,…,…,…,…,…
"""UGSPG""","""wind_cardinal_direction""",2,true,2024-11-16 23:00:00 UTC,"""E""","""code"""
"""UGSCM""","""wind_cardinal_direction""",2,true,2024-11-17 05:30:00 UTC,"""NNW""","""code"""
"""UGSDV""","""wind_cardinal_direction""",2,true,2024-11-04 17:30:00 UTC,"""W""","""code"""
"""UGSES""","""wind_cardinal_direction""",2,true,2024-11-17 05:30:00 UTC,"""N""","""code"""


In [9]:
# Stack the float and string observations into a single long table.
# "diagonal_relaxed" fills columns that exist in only one frame
# (`value` / `value_string`) with nulls in the other frame's rows.

observed = pl.concat(
    (observed_float, observed_string),
    how="diagonal_relaxed",
)

In [10]:
# Attach the station metadata to every observation row.
# This is an inner join on `stid`: stations without any observation row are
# not included. (how="full", coalesce=True would keep them as well.)

observed.join(metadata, on="stid", how="inner")

stid,variable,sensor_index,is_derived,value,date_time,units,value_string,id,name,elevation,latitude,longitude,mnet_id,state,timezone,elev_dem,nwszone,nwsfirezone,gacc,shortname,sgid,county,country,wims_id,cwa,period_of_record_start,period_of_record_end,providers,qc_flagged,is_restricted,is_active
str,str,u32,bool,f64,"datetime[μs, UTC]",str,str,u32,str,f64,f64,f64,u32,str,str,f64,str,str,str,str,str,str,str,str,str,"datetime[μs, UTC]","datetime[μs, UTC]",list[struct[2]],bool,bool,bool
"""WBB""","""pressure""",1,false,85321.0,2024-11-17 05:45:00 UTC,"""Pascals""",,1,"""U of U William Browning Buildi…",4806.0,40.76623,-111.84755,153,"""UT""","""America/Denver""",4727.7,"""UT105""","""SLC478""","""GBCC""","""UUNET""","""GB25""","""Salt Lake""","""US""",,"""SLC""",1997-01-01 00:00:00 UTC,2024-11-17 04:50:00 UTC,"[{""U of U MesoWest Group"",""http://meso1.chpc.utah.edu/mesowest_overview/""}, {""U-ATAQ"",""http://air.utah.edu/""}]",false,false,true
"""GNI""","""pressure""",1,false,87058.0,2024-10-31 07:40:00 UTC,"""Pascals""",,34,"""Gunnison Island""",4242.0,41.33216,-112.85432,153,"""UT""","""America/Denver""",4202.8,"""UT101""","""SLC478""","""GBCC""","""UUNET""","""GB25""","""Box Elder""","""US""",,"""SLC""",1998-05-22 00:00:00 UTC,2024-11-17 04:45:00 UTC,"[{""U of U MesoWest Group"",""http://meso1.chpc.utah.edu/mesowest_overview/""}, {""Utah Department of Natural Resources"",""http://www.dnr.utah.gov""}, {""SLC WFO/NWS Western Region"",""http://www.wrh.noaa.gov/slc""}]",false,false,true
"""HATUT""","""pressure""",1,false,87010.0,2024-11-17 05:45:00 UTC,"""Pascals""",,35,"""Hat Island""",4242.0,41.07073,-112.58621,153,"""UT""","""America/Denver""",4245.4,"""UT101""","""SLC478""","""GBCC""","""UUNET""","""GB25""","""Box Elder""","""US""",,"""SLC""",1998-09-02 00:00:00 UTC,2024-11-17 04:45:00 UTC,"[{""U of U MesoWest Group"",""http://meso1.chpc.utah.edu/mesowest_overview/""}, {""Utah Department of Natural Resources"",""http://www.dnr.utah.gov""}, {""SLC WFO/NWS Western Region"",""http://www.wrh.noaa.gov/slc""}]",false,false,true
"""LMS""","""pressure""",1,false,87237.0,2024-11-17 05:30:00 UTC,"""Pascals""",,36,"""Locomotive Springs""",4242.0,41.701,-112.86181,153,"""UT""","""America/Denver""",4215.9,"""UT101""","""SLC478""","""GBCC""","""UUNET""","""GB25""","""Box Elder""","""US""",,"""SLC""",1999-07-02 00:00:00 UTC,2024-11-17 04:45:00 UTC,"[{""U of U MesoWest Group"",""http://meso1.chpc.utah.edu/mesowest_overview/""}, {""SLC WFO/NWS Western Region"",""http://www.wrh.noaa.gov/slc""}]",false,false,true
"""LMR""","""pressure""",1,false,84175.0,2024-11-17 05:30:00 UTC,"""Pascals""",,39,"""Lakeside Mountain""",5039.0,41.06084,-112.89173,153,"""UT""","""America/Denver""",5150.9,"""UT101""","""SLC478""","""GBCC""","""UUNET""","""GB25""","""Box Elder""","""US""",,"""SLC""",1999-12-16 00:00:00 UTC,2024-11-17 04:45:00 UTC,"[{""U of U MesoWest Group"",""http://meso1.chpc.utah.edu/mesowest_overview/""}, {""SLC WFO/NWS Western Region"",""http://www.wrh.noaa.gov/slc""}]",false,false,true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""UGSPG""","""wind_cardinal_direction""",2,true,,2024-11-16 23:00:00 UTC,"""code""","""E""",239721,"""Phrag EC""",4203.0,41.0604,-112.1013,153,"""UT""","""America/Denver""",,"""UT104""","""SLC478""","""GBCC""","""UUNET""","""GB25""","""Davis""","""US""",,"""SLC""",2024-07-09 15:30:00 UTC,2024-11-16 23:00:00 UTC,[],false,false,true
"""UGSCM""","""wind_cardinal_direction""",2,true,,2024-11-17 05:30:00 UTC,"""code""","""NNW""",239855,"""Cedar Mesa EC""",6135.0,37.52475,-109.7458,153,"""UT""","""America/Denver""",,"""UT029""","""GJT491""","""GBCC""","""UUNET""","""GB32""","""San Juan""","""US""",,"""GJT""",2024-07-12 20:00:00 UTC,2024-11-17 04:30:00 UTC,[],false,false,true
"""UGSDV""","""wind_cardinal_direction""",2,true,,2024-11-04 17:30:00 UTC,"""code""","""W""",239856,"""Desert View (Myton) EC""",5032.0,40.12644,-109.9547,153,"""UT""","""America/Denver""",,"""UT024""","""GJT486""","""GBCC""","""UUNET""","""GB29""","""Uintah""","""US""",,"""GJT""",2024-07-12 19:30:00 UTC,2024-11-04 17:30:00 UTC,[],false,false,true
"""UGSES""","""wind_cardinal_direction""",2,true,,2024-11-17 05:30:00 UTC,"""code""","""N""",239857,"""Escalante EC""",5674.0,37.73532,-111.5708,153,"""UT""","""America/Denver""",,"""UT128""","""SLC496""","""GBCC""","""UUNET""","""GB35""","""Garfield""","""US""",,"""SLC""",2024-07-12 19:30:00 UTC,2024-11-17 04:30:00 UTC,[],false,false,true
