In [1]:
from datetime import datetime, timedelta, date
from pprint import pprint
import json

from dateutil.relativedelta import relativedelta
import numpy as np
import pandas as pd
import plotly.express as px
import requests

from src.params import *
from src.nws import NWSClient
from src.nws import (
    fahrenheit_to_celsius,
    celsius_to_fahrenheit,
    load_clis,
    load_one_minutes,
    end_of_day,
    start_of_day,
    in_date_range
)

In [2]:
# Instantiate the NWS client imported from src.nws.
# NOTE(review): `nws` is not referenced by any other cell visible in this notebook —
# confirm it is still needed.
nws = NWSClient()

# Initial Questions

In the long term, I'd like to build a model that accurately predicts the probability of the daily high temperature falling in a given 2-degree range, given the available information. Using the link from @BigJ, I now have 1-minute observations for the four thermometers MIA, NYC, MDW, and AUS. The first questions:
1. @BigJ claims that the max temps reported in CLIs are exactly the max of these 1-minute readings. Verify this.
    - Result: this is not true (see the comparison below). That raises the question of where the CLI value actually comes from; it may be worth checking with NWS again.
2. Given (1) is true, how much does the linear interpolation of the 5-minute readings deviate from the 1-minute readings? I'd like to see this both as a time series and as a histogram. Same for the precise 1-hour readings. Does the deviation follow a particular distribution?

In [3]:
# Goal of this section: check whether each CLI's reported max temperature equals
# the max of the 1-minute observations. Start by loading every CLI for the station.
station = StationID.MIA
clis = load_clis(station)

# Display the loaded CLIs via without_raw_text() (raw_text shows up as '...' in the output)
clis_without_raw_text = [cli.without_raw_text() for cli in clis]
pprint(clis_without_raw_text)

[CLI(issuance_time=datetime.datetime(2024, 10, 25, 4, 25, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
     issuing_office='MFL',
     summary_date=datetime.date(2024, 10, 24),
     raw_text='...',
     max_temp=84,
     max_temp_time=datetime.datetime(2024, 10, 24, 14, 22, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
     min_temp=73,
     min_temp_time=datetime.datetime(2024, 10, 24, 5, 31, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
     avg_temp=79,
     valid_time=None),
 CLI(issuance_time=datetime.datetime(2024, 11, 11, 16, 22, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>),
     issuing_office='MFL',
     summary_date=datetime.date(2024, 11, 11),
     raw_text='...',
     max_temp=87,
     max_temp_time=datetime.datetime(2024, 11, 11, 11, 17, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>),
     min_temp=74,
     min_temp_time=datetime.datetime(2024, 11, 11, 3, 50, tzinfo=<DstTzInfo 'America

In [4]:
# Keep only the CLIs that report a max temperature, ordered by summary date
clis = sorted(
    (cli for cli in clis if cli.max_temp is not None),
    key=lambda record: record.summary_date,
)

In [5]:
# Load the 1-minute observations spanning the earliest to the latest CLI summary date
summary_dates = [cli.summary_date for cli in clis]
start_date, end_date = min(summary_dates), max(summary_dates)

# Day boundaries are taken in the station's own timezone
station_tz = STATION_TZ[station]
one_min_df = load_one_minutes(
    station,
    start=start_of_day(start_date, tz=station_tz),
    end=end_of_day(end_date, tz=station_tz),
)

[PosixPath('../data/inputs/one_minute/MIA/one_minute.MIA.start-2010-05-01T00-00-00.end-2010-05-31T23-59-00.csv'), PosixPath('../data/inputs/one_minute/MIA/one_minute.MIA.start-2019-03-29T17-13-00.end-2019-03-31T23-59-00.csv'), PosixPath('../data/inputs/one_minute/MIA/one_minute.MIA.start-2003-11-03T04-39-00.end-2003-11-30T23-59-00.csv'), PosixPath('../data/inputs/one_minute/MIA/one_minute.MIA.start-2018-09-01T00-00-00.end-2018-09-30T23-59-00.csv'), PosixPath('../data/inputs/one_minute/MIA/one_minute.MIA.start-2019-07-01T00-00-00.end-2019-07-31T23-59-00.csv'), PosixPath('../data/inputs/one_minute/MIA/one_minute.MIA.start-2008-02-01T00-00-00.end-2008-02-28T15-34-00.csv'), PosixPath('../data/inputs/one_minute/MIA/one_minute.MIA.start-2002-08-01T00-00-00.end-2002-08-31T23-59-00.csv'), PosixPath('../data/inputs/one_minute/MIA/one_minute.MIA.start-2005-07-01T00-00-00.end-2005-07-31T23-59-00.csv'), PosixPath('../data/inputs/one_minute/MIA/one_minute.MIA.start-2022-11-01T01-00-00.end-2022-11-3

  df[col] = pd.to_datetime(df[col])


In [6]:
# Slice the 1-minute observations into one frame per CLI, restricted to that CLI's window
clis_df: list[pd.DataFrame] = []
for cli in clis:
    station_tz = STATION_TZ[station]
    window_start = start_of_day(cli.summary_date, tz=station_tz)
    # The window ends at the CLI's valid_time when it has one,
    # otherwise at the end of the summary day
    window_end = (
        cli.valid_time
        if cli.valid_time is not None
        else end_of_day(cli.summary_date, tz=station_tz)
    )
    in_window = one_min_df["valid"].apply(
        lambda ts: in_date_range(ts, window_start, window_end)
    )
    clis_df.append(one_min_df[in_window])

# Drop every CLI whose window holds no 1-minute observations, keeping both lists aligned
non_empty_pairs = [
    (cli, window_df) for cli, window_df in zip(clis, clis_df) if not window_df.empty
]
clis = [cli for cli, _ in non_empty_pairs]
clis_df = [window_df for _, window_df in non_empty_pairs]


In [8]:
# Plot the time span covered by the 1-minute observations of each CLI window.
# NOTE(review): `pytz` is not imported explicitly anywhere in this notebook; presumably it
# arrives via `from src.params import *` — confirm, or import it explicitly.
timeline_records = []
for cli_index, window_df in enumerate(clis_df):
    # Convert the timestamps to UTC once per frame and reuse the result for both
    # ends of the span (the conversion is a row-wise apply, so it is not free).
    valid_utc = window_df["valid"].apply(lambda dt: dt.astimezone(pytz.utc))
    timeline_records.append(
        {"CLI": cli_index, "Start": valid_utc.min(), "End": valid_utc.max()}
    )
timeline_df = pd.DataFrame(timeline_records)

fig = px.timeline(
    timeline_df,
    x_start="Start",
    x_end="End",
    y="CLI",
    title="1-minute observation coverage per CLI (UTC)",
)
fig.update_yaxes(autorange="reversed") # otherwise tasks are listed from the bottom up
fig.show()

In [9]:
# Compare each CLI's reported max temperature (and the time it occurred) against the
# max of the 1-minute observations inside the same window.
#
# The per-CLI frames in `clis_df` (built in an earlier cell, one per entry of `clis`)
# are reused here instead of re-filtering `one_min_df` with a row-wise apply for every CLI.
# Every result list receives exactly one entry per CLI, so looking results up by a
# CLI's position in `clis` stays valid and the `/ len(clis)` denominators below match
# the number of comparisons made.
assert len(clis) == len(clis_df), "clis and clis_df must be index-aligned"

temp_match = []  # CLI max equals the 1-minute max
time_match = []  # CLI max time equals the time of the 1-minute max
temp_lb = []  # CLI max is less than or equal to the 1-minute max

one_min_temp_max = []
one_min_temp_time = []

for cli, relevant_meas in zip(clis, clis_df):
    cli_start = start_of_day(cli.summary_date, tz=STATION_TZ[station])
    if cli.valid_time is not None:
        cli_end = cli.valid_time
    else:
        cli_end = end_of_day(cli.summary_date, tz=STATION_TZ[station])

    if relevant_meas.empty:
        print(f"No 1-minute observations for {cli_start} to {cli_end}")
        # Record placeholders rather than skipping, so the lists keep one entry per CLI.
        # A CLI without observations counts as "no match" in the summary below.
        one_min_temp_max.append(None)
        one_min_temp_time.append(None)
        temp_match.append(False)
        time_match.append(False)
        temp_lb.append(False)
        continue

    # Row label of the highest temperature reading in this window
    idx = relevant_meas["tmpf"].idxmax()
    one_min_max = relevant_meas.loc[idx, "tmpf"]
    one_min_max_time = relevant_meas.loc[idx, "valid"]

    one_min_temp_max.append(one_min_max)
    one_min_temp_time.append(one_min_max_time)

    # Now give a summary of the comparison
    print(f"CLI: {cli_start} to {cli_end}")
    print(f"CLI max: {cli.max_temp}, 1-min max: {one_min_max}")
    print(
        f"CLI max temp time: {cli.max_temp_time}, 1-min max temp time: {one_min_max_time}"
    )

    temp_match.append(cli.max_temp == one_min_max)
    time_match.append(cli.max_temp_time == one_min_max_time)
    temp_lb.append(cli.max_temp <= one_min_max)

# Summary counts: exact temperature matches, exact time matches, and CLIs whose max
# does not exceed the 1-minute max
print(f"{sum(int(x) for x in temp_match)} / {len(clis)}")
print(f"{sum(int(x) for x in time_match)} / {len(clis)}")
print(f"{sum(int(x) for x in temp_lb)} / {len(clis)}")

CLI: 2024-10-21 00:00:00-04:00 to 2024-10-21 23:59:59.999999-04:00
CLI max: 86, 1-min max: 86
CLI max temp time: 2024-10-21 14:45:00-04:00, 1-min max temp time: 2024-10-21 12:20:00-04:00
CLI: 2024-10-22 00:00:00-04:00 to 2024-10-22 23:59:59.999999-04:00
CLI max: 86, 1-min max: 87
CLI max temp time: 2024-10-22 13:47:00-04:00, 1-min max temp time: 2024-10-22 13:44:00-04:00
CLI: 2024-10-22 00:00:00-04:00 to 2024-10-22 16:00:00-04:00
CLI max: 86, 1-min max: 87
CLI max temp time: 2024-10-22 13:47:00-04:00, 1-min max temp time: 2024-10-22 13:44:00-04:00
CLI: 2024-10-23 00:00:00-04:00 to 2024-10-23 16:00:00-04:00
CLI max: 87, 1-min max: 88
CLI max temp time: 2024-10-23 13:26:00-04:00, 1-min max temp time: 2024-10-23 13:08:00-04:00
CLI: 2024-10-23 00:00:00-04:00 to 2024-10-23 23:59:59.999999-04:00
CLI max: 87, 1-min max: 88
CLI max temp time: 2024-10-23 13:26:00-04:00, 1-min max temp time: 2024-10-23 13:08:00-04:00
CLI: 2024-10-24 00:00:00-04:00 to 2024-10-24 23:59:59.999999-04:00
CLI max: 84,

In [10]:
# List the CLIs whose reported max is above the max seen in the 1-minute observations
for position, cli in enumerate(clis):
    if temp_lb[position]:
        # CLI max is within the 1-minute max; nothing to report
        continue

    print(f"CLI: {cli.summary_date}")
    print(f"CLI max: {cli.max_temp}")
    print(f"CLI max time: {cli.max_temp_time}")

    print(f"1-min max: {one_min_temp_max[position]}")
    print(f"1-min max time: {one_min_temp_time[position]}")


CLI: 2024-10-26
CLI max: 86
CLI max time: 2024-10-26 12:30:00-04:00
1-min max: 79
1-min max time: 2024-10-26 23:45:00-04:00


In [6]:
# NOTE(review): scratch cell — it only displays the type of `pd.StringDtype()`, and its
# execution count (In [6]) is out of order relative to the cells before it. It looks
# unrelated to the CLI analysis; consider removing it.
type(pd.StringDtype())



pandas.core.arrays.string_.StringDtype