In [1]:
from datetime import datetime, timedelta, date
from dateutil.relativedelta import relativedelta
from pprint import pprint
import json
import requests
import numpy as np
import pandas as pd

from src.params import *
from src.nws import NWSClient
from src.nws import (
    fahrenheit_to_celsius,
    celsius_to_fahrenheit,
    load_clis,
    load_one_minutes,
    end_of_day,
    start_of_day,
    in_date_range,
    CLI
)

In [2]:
stations = [station for station in StationID if station != StationID.PIR]

clis = [cli  for station in stations for cli in load_clis(station)]
clis = sorted(clis, key=lambda cli: (cli.station.value, cli.summary_date, cli.issuance_time))

pprint([cli.without_raw_text()  for cli in sorted(clis, key=lambda cli: (cli.summary_date, cli.issuance_time, cli.station.value))])

[CLI(station=<StationID.MDW: 'MDW'>,
     issuance_time=datetime.datetime(2024, 10, 22, 1, 33, tzinfo=<DstTzInfo 'America/Chicago' CDT-1 day, 19:00:00 DST>),
     issuing_office='LOT',
     summary_date=datetime.date(2024, 10, 21),
     raw_text='...',
     is_afternoon_report=False,
     max_temp=82,
     max_temp_time=datetime.datetime(2024, 10, 21, 16, 6, tzinfo=<DstTzInfo 'America/Chicago' CDT-1 day, 19:00:00 DST>),
     min_temp=53,
     min_temp_time=datetime.datetime(2024, 10, 21, 4, 7, tzinfo=<DstTzInfo 'America/Chicago' CDT-1 day, 19:00:00 DST>),
     avg_temp=68,
     valid_time=None,
     is_correction=False),
 CLI(station=<StationID.MIA: 'MIA'>,
     issuance_time=datetime.datetime(2024, 10, 22, 4, 22, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
     issuing_office='MFL',
     summary_date=datetime.date(2024, 10, 21),
     raw_text='...',
     is_afternoon_report=False,
     max_temp=86,
     max_temp_time=datetime.datetime(2024, 10, 21, 14, 45, tzinfo=<

In [3]:
from dataclasses import asdict

# Turn the clis into a dateframe
df = pd.DataFrame([asdict(cli.without_raw_text()) for cli in clis])
df.set_index(['station', 'summary_date', 'issuance_time'], inplace=True)

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,issuing_office,raw_text,is_afternoon_report,max_temp,max_temp_time,min_temp,min_temp_time,avg_temp,valid_time,is_correction
station,summary_date,issuance_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AUS,2024-11-03,2024-11-03 17:58:00-06:00,EWX,...,True,89.0,2024-11-03 15:39:00-06:00,73,2024-11-03 06:29:00-06:00,81.0,2024-11-03 17:00:00-06:00,False
AUS,2024-11-03,2024-11-04 02:18:00-06:00,EWX,...,False,89.0,2024-11-03 15:39:00-06:00,73,2024-11-03 06:29:00-06:00,81.0,,False
AUS,2024-11-04,2024-11-04 07:32:00-06:00,EWX,...,False,76.0,2024-11-04 03:07:00-06:00,75,2024-11-04 05:59:00-06:00,76.0,2024-11-04 07:00:00-06:00,False
AUS,2024-11-04,2024-11-04 16:36:00-06:00,EWX,...,True,88.0,2024-11-04 15:17:00-06:00,75,2024-11-04 07:47:00-06:00,82.0,2024-11-04 16:00:00-06:00,False
AUS,2024-11-04,2024-11-04 17:46:00-06:00,EWX,...,True,88.0,2024-11-04 15:17:00-06:00,75,2024-11-04 07:47:00-06:00,82.0,2024-11-04 17:00:00-06:00,False


In [16]:
def gby_func(x, op: str = 'max'):
    x = x.reset_index()
    if op == 'max':
        x = x[x['issuance_time'] == x['issuance_time'].max()]
    elif op == 'min':
        x = x[x['issuance_time'] == x['issuance_time'].min()]
    return x

# Do filters
corrected_df = df[df["is_correction"]]
afternoon_df = df[df["is_afternoon_report"]]
end_of_day = df.groupby(['station', 'summary_date'])
end_of_day = end_of_day.apply(lambda x: gby_func(x, 'max'))
end_of_day = end_of_day.reset_index(drop=True)
end_of_day = end_of_day.set_index(['station', 'summary_date'])

first_afternoon = afternoon_df.groupby(['station', 'summary_date'])
first_afternoon = first_afternoon.apply(lambda x: gby_func(x, 'min'))
first_afternoon = first_afternoon.reset_index(drop=True)
first_afternoon = first_afternoon.set_index(['station', 'summary_date'])


In [17]:
display(end_of_day.head())
display(first_afternoon.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,issuance_time,issuing_office,raw_text,is_afternoon_report,max_temp,max_temp_time,min_temp,min_temp_time,avg_temp,valid_time,is_correction
station,summary_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AUS,2024-11-03,2024-11-04 02:18:00-06:00,EWX,...,False,89.0,2024-11-03 15:39:00-06:00,73,2024-11-03 06:29:00-06:00,81.0,,False
AUS,2024-11-04,2024-11-05 02:18:00-06:00,EWX,...,False,88.0,2024-11-04 15:17:00-06:00,62,2024-11-04 23:59:00-06:00,75.0,,False
AUS,2024-11-05,2024-11-06 02:37:00-06:00,EWX,...,False,74.0,2024-11-05 16:14:00-06:00,49,2024-11-05 23:40:00-06:00,62.0,,False
AUS,2024-11-06,2024-11-07 03:12:00-06:00,EWX,...,False,81.0,2024-11-06 16:19:00-06:00,43,2024-11-06 06:33:00-06:00,62.0,,False
AUS,2024-11-07,2024-11-08 03:10:00-06:00,EWX,...,False,89.0,2024-11-07 15:24:00-06:00,69,2024-11-07 06:44:00-06:00,79.0,,False


Unnamed: 0_level_0,Unnamed: 1_level_0,issuance_time,issuing_office,raw_text,is_afternoon_report,max_temp,max_temp_time,min_temp,min_temp_time,avg_temp,valid_time,is_correction
station,summary_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AUS,2024-11-03,2024-11-03 17:58:00-06:00,EWX,...,True,89.0,2024-11-03 15:39:00-06:00,73,2024-11-03 06:29:00-06:00,81.0,2024-11-03 17:00:00-06:00,False
AUS,2024-11-04,2024-11-04 16:36:00-06:00,EWX,...,True,88.0,2024-11-04 15:17:00-06:00,75,2024-11-04 07:47:00-06:00,82.0,2024-11-04 16:00:00-06:00,False
AUS,2024-11-05,2024-11-05 16:49:00-06:00,EWX,...,True,74.0,2024-11-05 15:57:00-06:00,61,2024-11-05 05:25:00-06:00,68.0,2024-11-05 16:00:00-06:00,False
AUS,2024-11-06,2024-11-06 16:35:00-06:00,EWX,...,True,81.0,2024-11-06 15:59:00-06:00,43,2024-11-06 06:33:00-06:00,62.0,2024-11-06 16:00:00-06:00,False
AUS,2024-11-07,2024-11-07 17:19:00-06:00,EWX,...,True,89.0,2024-11-07 15:24:00-06:00,69,2024-11-07 06:44:00-06:00,79.0,2024-11-07 17:00:00-06:00,False


In [21]:
# Does end of day temp differ from the afternoon temp?
difference: pd.Series = end_of_day['max_temp'] - first_afternoon['max_temp']
difference.value_counts()


max_temp
0.0    149
1.0      5
4.0      1
3.0      1
Name: count, dtype: int64

In [16]:
# print(f"Number of Corrected CLIs: {len(corrected_df)} -- Number of stations: {len(stations)}")
# for i, cli in corrected_clis:
#     prev_cli = clis[i-1]
#     if prev_cli.station == cli.station and prev_cli.summary_date == cli.summary_date:
#         print(f"Corrected CLI:\n{cli.raw_text}\n\n")
#         print(f"Previous CLI: \n{prev_cli.raw_text}\n\n")



Number of Corrected CLIs: 2 -- Number of stations: 7
Corrected CLI:
419
CDUS41 KOKX 090652 CCA
CLINYC

CLIMATE REPORT...CORRECTED
NATIONAL WEATHER SERVICE NEW YORK, NY
152 AM EST SAT NOV 09 2024

...................................

...THE CENTRAL PARK NY CLIMATE SUMMARY FOR NOVEMBER 8 2024...

CLIMATE NORMAL PERIOD 1991 TO 2020
CLIMATE RECORD PERIOD 1869 TO 2024


WEATHER ITEM   OBSERVED TIME   RECORD YEAR NORMAL DEPARTURE LAST
                VALUE   (LST)  VALUE       VALUE  FROM      YEAR
                                                  NORMAL
...................................................................
TEMPERATURE (F)
 YESTERDAY
  MAXIMUM         69    230 PM  76    1975  57     12       54
  MINIMUM         50    643 AM  29    1886  44      6       43
                                      2019
  AVERAGE         60                        50     10       49

PRECIPITATION (IN)
  YESTERDAY        0.00          7.40 1977   0.12  -0.12     0.00
  MONTH TO DATE    T            

In [17]:
from datetime import time
from src.nws import CLI



# How many times is the afternoon report corrected?
total_temp_corrected_count = 0
total_corrected_count = 0
total_count = len([cli for cli in clis if is_afternoon_report(cli)])
for i, cli in corrected_clis:
    prev_cli = clis[i-1]
    if prev_cli.station == cli.station and prev_cli.summary_date == cli.summary_date:
        if is_afternoon_report(prev_cli):
            total_corrected_count += 1
            if cli.max_temp != prev_cli.max_temp:
                total_temp_corrected_count += 1

print(f"Total Corrected Count: {total_corrected_count} / {total_count}")
print(f"Total Temp Corrected Count: {total_temp_corrected_count} / {total_count}")


Total Corrected Count: 0 / 173
Total Temp Corrected Count: 0 / 173


In [18]:
from collections import defaultdict

# How many times is the daily max temp different from the afternoon report
total_temp_diff_count = 0
afternoon_reports = [cli for cli in clis if is_afternoon_report(cli)]
# assert len(afternoon_reports) == set((cli.station, cli.summary_date) for cli in afternoon_reports)


# Did we overcount any summary dates?
afternoon_reports_by_station_summary_date = {k: [cli for cli in clis if is_afternoon_report(cli)] for k, clis in clis_by_station_summary_date.items()}
for k, clis in clis_by_station_summary_date.items():
    if len(afternoon_reports_by_station_summary_date[k]) > 1:
        print(f"Station: {k[0]} -- Summary Date: {k[1]} -- Afternoon Report Count: {len(afternoon_reports_by_station_summary_date[k])} -- Total Count: {len(clis)}")
        print(f"Afternoon Report Times: {[cli.issuance_time for cli in afternoon_reports_by_station_summary_date[k]]}")

# Get the daily report for the day


Station: AUS -- Summary Date: 2024-11-04 -- Afternoon Report Count: 2 -- Total Count: 4
Afternoon Report Times: [datetime.datetime(2024, 11, 4, 16, 36, tzinfo=<DstTzInfo 'America/Chicago' CST-1 day, 18:00:00 STD>), datetime.datetime(2024, 11, 4, 17, 46, tzinfo=<DstTzInfo 'America/Chicago' CST-1 day, 18:00:00 STD>)]
Station: AUS -- Summary Date: 2024-11-05 -- Afternoon Report Count: 3 -- Total Count: 5
Afternoon Report Times: [datetime.datetime(2024, 11, 5, 16, 49, tzinfo=<DstTzInfo 'America/Chicago' CST-1 day, 18:00:00 STD>), datetime.datetime(2024, 11, 5, 16, 53, tzinfo=<DstTzInfo 'America/Chicago' CST-1 day, 18:00:00 STD>), datetime.datetime(2024, 11, 5, 17, 41, tzinfo=<DstTzInfo 'America/Chicago' CST-1 day, 18:00:00 STD>)]
Station: AUS -- Summary Date: 2024-11-06 -- Afternoon Report Count: 2 -- Total Count: 4
Afternoon Report Times: [datetime.datetime(2024, 11, 6, 16, 35, tzinfo=<DstTzInfo 'America/Chicago' CST-1 day, 18:00:00 STD>), datetime.datetime(2024, 11, 6, 17, 51, tzinfo=<Ds

In [30]:
from dataclasses import asdict

# Make a dataframe that includes all the issuance times of all the clis
df = pd.DataFrame([asdict(cli.without_raw_text()) for cli in clis])
df.drop(columns=['raw_text'], inplace=True)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype                          
---  ------          --------------  -----                          
 0   issuance_time   55 non-null     datetime64[ns, America/Chicago]
 1   issuing_office  55 non-null     object                         
 2   summary_date    55 non-null     object                         
 3   max_temp        55 non-null     int64                          
 4   max_temp_time   54 non-null     datetime64[ns, America/Chicago]
 5   min_temp        55 non-null     int64                          
 6   min_temp_time   54 non-null     datetime64[ns, America/Chicago]
 7   avg_temp        55 non-null     int64                          
 8   valid_time      27 non-null     datetime64[ns, America/Chicago]
dtypes: datetime64[ns, America/Chicago](4), int64(3), object(2)
memory usage: 4.0+ KB


Unnamed: 0,issuance_time,issuing_office,summary_date,max_temp,max_temp_time,min_temp,min_temp_time,avg_temp,valid_time
0,2024-10-22 01:33:00-05:00,LOT,2024-10-21,82,2024-10-21 16:06:00-05:00,53,2024-10-21 04:07:00-05:00,68,NaT
1,2024-10-22 16:36:00-05:00,LOT,2024-10-22,74,2024-10-22 14:59:00-05:00,55,2024-10-22 06:18:00-05:00,65,2024-10-22 16:00:00-05:00
2,2024-10-23 01:34:00-05:00,LOT,2024-10-22,75,2024-10-22 15:45:00-05:00,55,2024-10-22 06:18:00-05:00,65,NaT
3,2024-10-23 16:36:00-05:00,LOT,2024-10-23,65,2024-10-23 14:47:00-05:00,58,2024-10-23 06:37:00-05:00,62,2024-10-23 16:00:00-05:00
4,2024-10-24 01:37:00-05:00,LOT,2024-10-23,65,NaT,46,NaT,56,NaT


In [31]:
# Plot the ranges of the CLI timeframes
import plotly.express as px

issue_df = df[["issuance_time"]].copy()
issue_df["issuance_time"] = issue_df["issuance_time"].dt.time
# Filter on those that are between 4-5pm
hours = range(16, 18)
issue_df = issue_df[issue_df["issuance_time"].apply(lambda t: t.hour in hours)]
issue_df["issuance_time"] = issue_df["issuance_time"].apply(lambda t: ((t.hour - min(hours)) * 60 + t.minute))

binsize = 1 # minute
fig = px.histogram(issue_df, x="issuance_time", nbins=180//binsize)
fig.show()


In [None]:
# MIA Release times
# 16:21 to 16:40 Eastern
# NYC Release times
# 16:32 to 16:52 Eastern
# AUS Release times
# 16:22 - 17:59 Central
# MDW Release times
# 16:32 to 16:51 Central
