## Optimized version

In [None]:
# BOOTSTRAP: define everything needed in a single cell, then run the demo.
# (Spreading the definitions across several cells did not work; the cause is still undiagnosed.)

from __future__ import annotations
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import calendar
import matplotlib as mpl

AWS_BUCKET = "s3://noaa-ghcn-pds"

# 1) Helpers
def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose known column spellings use canonical names.

    For every canonical name the accepted spellings are tried in order; a
    spelling matches either exactly or case-insensitively. The first match
    wins and that column is renamed to the canonical name. Columns that match
    nothing are left untouched. The input frame is not modified.
    """
    aliases = (
        ("ID", ("ID", "id")),
        ("DATE", ("DATE", "date")),
        ("ELEMENT", ("ELEMENT", "element")),
        ("VALUE", ("VALUE", "DATA VALUE", "DATA_VALUE", "data value")),
        ("MFLAG", ("MFLAG", "M-FLAG", "M_FLAG")),
        ("QFLAG", ("QFLAG", "Q-FLAG", "Q_FLAG")),
        ("SFLAG", ("SFLAG", "S-FLAG", "S_FLAG")),
        ("OBS_TIME", ("OBS_TIME", "OBS-TIME", "OBS TIME")),
    )

    # Lower-cased name -> actual column name (later columns win on collisions).
    by_lowercase = {}
    for column in df.columns:
        by_lowercase[column.lower()] = column

    def _first_match(candidates):
        """Return the existing column matching the earliest candidate, else None."""
        for candidate in candidates:
            if candidate in df.columns:
                return candidate
            folded = candidate.lower()
            if folded in by_lowercase:
                return by_lowercase[folded]
        return None

    mapping = {}
    for canonical, candidates in aliases:
        found = _first_match(candidates)
        if found is not None:
            mapping[found] = canonical
    return df.rename(columns=mapping)

# Data loading: the bucket is read anonymously (storage_options={"anon": True})
def load_station_timeseries(station_id: str) -> pd.DataFrame:
    """Read a single station from AWS (Parquet preferred; CSV fallback).

    The per-station Parquet dataset under ``AWS_BUCKET`` is tried first. If
    that read fails for any reason, the per-station CSV file is read instead.
    Both reads are anonymous (``storage_options={"anon": True}``).

    Args:
        station_id: Station identifier used to build the object paths,
            e.g. ``"USW00013871"``.

    Returns:
        A new DataFrame with standardized column names, restricted to
        whichever of ID, DATE, ELEMENT, VALUE, MFLAG, QFLAG, SFLAG and
        OBS_TIME are present. DATE is converted to datetimes (unparseable
        values become NaT). When a QFLAG column exists, only rows whose
        QFLAG is null are kept.

    Raises:
        KeyError: If the data has no recognizable DATE column.
        Exception: Whatever ``pd.read_csv`` raises when the fallback read
            fails as well.
    """
    base = AWS_BUCKET
    try:
        df = pd.read_parquet(f"{base}/parquet/by_station/STATION={station_id}/",
                             storage_options={"anon": True})
    except Exception:
        # Deliberate best-effort fallback: any Parquet failure (missing engine,
        # missing object, network error) falls through to the CSV copy.
        df = pd.read_csv(f"{base}/csv/by_station/{station_id}.csv",
                         storage_options={"anon": True}, low_memory=False)
    df = _standardize_columns(df)

    dates = df["DATE"]
    # pd.api.types copes with pandas extension dtypes (e.g. tz-aware
    # datetimes), which np.issubdtype cannot interpret.
    if not pd.api.types.is_datetime64_any_dtype(dates):
        if pd.api.types.is_numeric_dtype(dates):
            # read_csv infers all-digit dates as integers, and pd.to_datetime
            # treats integers as nanoseconds since the epoch, which would
            # silently turn every date into 1970-01-01. Parse them as text.
            # NOTE(review): assumes numeric dates are YYYYMMDD - confirm
            # against the bucket's documentation.
            as_text = pd.to_numeric(dates, errors="coerce").astype("Int64").astype(str)
            df["DATE"] = pd.to_datetime(as_text, format="%Y%m%d", errors="coerce")
        else:
            df["DATE"] = pd.to_datetime(dates, errors="coerce")

    if "QFLAG" in df.columns:
        # Keep only observations that carry no quality flag.
        df = df[df["QFLAG"].isna()]
    keep = [c for c in ["ID", "DATE", "ELEMENT", "VALUE", "MFLAG", "QFLAG", "SFLAG", "OBS_TIME"]
            if c in df.columns]
    return df[keep].copy()

def _to_celsius(series: pd.Series) -> pd.Series:
    """Coerce *series* to numbers and divide by ten.

    Entries that cannot be parsed as numbers become NaN.
    NOTE(review): presumably the raw values are tenths of a degree Celsius;
    confirm against the data source.
    """
    numeric = pd.to_numeric(series, errors="coerce")
    return numeric.div(10.0)

def _maybe_to_units(series: pd.Series, units: str) -> pd.Series:
    """Return *series* in the requested units, given Celsius input.

    ``units`` is case-insensitive. "C" returns the input object unchanged,
    "F" returns a converted series, and anything else raises ValueError.
    """
    code = units.upper()
    if code not in ("C", "F"):
        raise ValueError("units must be 'C' or 'F'")
    return series if code == "C" else series * 9 / 5 + 32

# 2) REQUIRED BY ASSIGNMENT 3
def daily_normals_and_records(station_id: str, units: str = "F") -> pd.DataFrame:
    """Per-calendar-day stats for 1991–2020. Returns columns in rubric order.

    Args:
        station_id: Station identifier passed to ``load_station_timeseries``.
        units: "C" or "F" (case-insensitive); the unit of the returned values.

    Returns:
        A DataFrame indexed by the 366 "MM-DD" labels of a leap year
        (so "02-29" is included) with the columns record_min_temp,
        average_min_temp, average_max_temp and record_max_temp, in that
        order. Days with no usable observations are NaN.

    Raises:
        ValueError: If ``units`` is not "C" or "F". This is checked before
            any data is loaded.
    """
    stat_columns = ["record_min_temp", "average_min_temp",
                    "average_max_temp", "record_max_temp"]
    all_mmdd = pd.date_range("2000-01-01", "2000-12-31", freq="D").strftime("%m-%d")

    # Fail fast on a bad unit code instead of after the (slow) remote read.
    _maybe_to_units(pd.Series(dtype=float), units)

    df = load_station_timeseries(station_id)
    in_window = (
        df["ELEMENT"].isin(["TMAX", "TMIN"])
        & (df["DATE"] >= "1991-01-01")
        & (df["DATE"] <= "2020-12-31")
    )
    temps = df.loc[in_window, ["DATE", "ELEMENT", "VALUE"]].copy()
    # Plain string labels keep the pivoted column index simple.
    # NOTE(review): ELEMENT may arrive as a categorical column, which would
    # explain the warning captured on the pivot_table call - unconfirmed.
    temps["ELEMENT"] = temps["ELEMENT"].astype(str)
    temps["temp_c"] = _to_celsius(temps["VALUE"])
    # Rows without a numeric value contribute nothing to any statistic.
    temps = temps.dropna(subset=["temp_c"])
    if temps.empty:
        # No usable observations in the window: return the all-NaN table
        # rather than failing inside the pivot / .dt accessor.
        return pd.DataFrame(index=all_mmdd, columns=stat_columns, dtype=float)

    wide = (
        temps.pivot_table(index="DATE", columns="ELEMENT", values="temp_c", aggfunc="first")
        # Guarantee both columns exist even if the station lacks one element.
        .reindex(columns=["TMAX", "TMIN"])
        .reset_index()
    )
    wide["mmdd"] = wide["DATE"].dt.strftime("%m-%d")
    grp = wide.groupby("mmdd")
    normals = pd.DataFrame({
        "record_min_temp": grp["TMIN"].min(),
        "average_min_temp": grp["TMIN"].mean(),
        "average_max_temp": grp["TMAX"].mean(),
        "record_max_temp": grp["TMAX"].max(),
    }).sort_index()
    for col in normals.columns:
        normals[col] = _maybe_to_units(normals[col], units)
    return normals.reindex(all_mmdd)

def actuals_for_year(station_id: str, year: int, units: str = "F") -> pd.DataFrame:
    """Return the observed daily high and low for one calendar year.

    Args:
        station_id: Station identifier passed to ``load_station_timeseries``.
        year: Calendar year to select (matched against the DATE column).
        units: "C" or "F" (case-insensitive); the unit of tmax/tmin.

    Returns:
        A DataFrame sorted by date with the columns date, mmdd ("MM-DD"
        label of the date), tmax and tmin. It has one row per date that has
        at least one numeric TMAX/TMIN value, and is empty (same columns)
        when the year has none.

    Raises:
        ValueError: If ``units`` is not "C" or "F". This is checked before
            any data is loaded.
    """
    # Fail fast on a bad unit code instead of after the (slow) remote read.
    _maybe_to_units(pd.Series(dtype=float), units)

    df = load_station_timeseries(station_id)
    wanted = df["ELEMENT"].isin(["TMAX", "TMIN"]) & (df["DATE"].dt.year == year)
    temps = df.loc[wanted, ["DATE", "ELEMENT", "VALUE"]].copy()
    # Plain string labels keep the pivoted column index simple.
    # NOTE(review): ELEMENT may arrive as a categorical column, which would
    # explain the warning captured on the pivot_table call - unconfirmed.
    temps["ELEMENT"] = temps["ELEMENT"].astype(str)
    temps["temp_c"] = _to_celsius(temps["VALUE"])
    # Rows without a numeric value would only produce all-NaN dates.
    temps = temps.dropna(subset=["temp_c"])
    if temps.empty:
        # Nothing recorded for this year: return a typed, empty table rather
        # than failing inside the pivot / .dt accessor.
        return pd.DataFrame({
            "date": pd.Series(dtype="datetime64[ns]"),
            "mmdd": pd.Series(dtype=object),
            "tmax": pd.Series(dtype=float),
            "tmin": pd.Series(dtype=float),
        })

    wide = (
        temps.pivot_table(index="DATE", columns="ELEMENT", values="temp_c", aggfunc="first")
        # Guarantee both columns exist even if the station lacks one element.
        .reindex(columns=["TMAX", "TMIN"])
        .reset_index()
    )
    out = pd.DataFrame({
        "date": wide["DATE"],
        "mmdd": wide["DATE"].dt.strftime("%m-%d"),
        "tmax": wide["TMAX"],
        "tmin": wide["TMIN"],
    }).sort_values("date").reset_index(drop=True)
    out[["tmax", "tmin"]] = out[["tmax", "tmin"]].apply(_maybe_to_units, units=units)
    return out

def plot_weather_climatology(normals_df: pd.DataFrame, actuals_df: pd.DataFrame,
                             title: str | None = None, units: str = "F", show: bool = True):
    """Plot record range, normal range and one year of actual highs and lows.

    Both inputs are aligned to the 366 "MM-DD" labels of a leap year, so the
    x axis always runs from day 1 to day 366; days missing from either input
    are left blank.

    Args:
        normals_df: Frame indexed by "MM-DD" labels with the columns
            record_min_temp, record_max_temp, average_min_temp and
            average_max_temp.
        actuals_df: Frame with the columns mmdd, tmax and tmin. It is
            expected to hold at most one row per "MM-DD" label.
        title: Optional axes title.
        units: Unit label for the y axis only; no conversion happens here.
        show: When True, call ``plt.show()`` before returning.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    mmdd = pd.date_range("2000-01-01", "2000-12-31", freq="D").strftime("%m-%d")
    N = normals_df.reindex(mmdd)
    A = actuals_df.set_index("mmdd").reindex(mmdd)
    x = np.arange(len(mmdd)) + 1

    fig, ax = plt.subplots(figsize=(14, 6), dpi=120)
    ax.fill_between(x, N["record_min_temp"], N["record_max_temp"], alpha=0.15, label="Record range")
    ax.fill_between(x, N["average_min_temp"], N["average_max_temp"], alpha=0.25, label="Normal range (1991–2020)")
    ax.plot(x, A["tmax"], linewidth=1.2, label="Actual High")
    ax.plot(x, A["tmin"], linewidth=1.2, label="Actual Low")
    ax.set_xlim(1, len(x))

    # Put each month label at the middle of its month. Evenly spaced ticks
    # drift because months differ in length (they pushed the December label
    # to the very end of the axis).
    month_lengths = np.array([calendar.monthrange(2000, m)[1] for m in range(1, 13)])
    month_starts = np.concatenate(([1], 1 + np.cumsum(month_lengths[:-1])))
    ax.set_xticks(month_starts + (month_lengths - 1) / 2)
    ax.set_xticklabels([calendar.month_abbr[m] for m in range(1, 13)])

    # Upper-case so a lowercase unit code (accepted elsewhere) labels cleanly.
    ax.set_ylabel(f"Temperature (°{units.upper()})")
    if title:
        ax.set_title(title, pad=10)
    ax.grid(True, alpha=0.3)
    ax.legend(loc="upper right")
    if show:
        plt.show()
    return fig, ax

# 3) DEMO (adjust these and run)
# Demo configuration: edit these values, then re-run the cell.
STATION_ID = "USW00013871"   # Huntsville Intl
UNITS = "F"
PLOT_YEAR = 2020

# Compute the per-calendar-day table and the selected year's observations.
normals_df = daily_normals_and_records(station_id=STATION_ID, units=UNITS)
actuals_df = actuals_for_year(station_id=STATION_ID, year=PLOT_YEAR, units=UNITS)

# Show the first five rows of each table as a sanity check.
display(normals_df.head(n=5))
display(actuals_df.head(n=5))



  wide = (df.pivot_table(index="DATE", columns="ELEMENT", values="temp_c", aggfunc="first")
  wide = (df.pivot_table(index="DATE", columns="ELEMENT", values="temp_c", aggfunc="first")


Unnamed: 0,record_min_temp,average_min_temp,average_max_temp,record_max_temp
01-01,17.24,36.656,55.298,71.96
01-02,17.06,38.102,54.632,78.08
01-03,12.02,36.548,53.738,73.94
01-04,15.98,34.034,54.434,75.02
01-05,10.04,33.026,54.914,71.96


Unnamed: 0,date,mmdd,tmax,tmin
0,2020-01-01,01-01,55.04,28.22
1,2020-01-02,01-02,68.0,44.96
2,2020-01-03,01-03,71.06,57.02
3,2020-01-04,01-04,57.02,32.0
4,2020-01-05,01-05,53.96,29.12


In [21]:
# Re-declare the station and units for this cell (overwrites earlier values).
STATION_ID = "USW00013871"  # Huntsville International, AL (change as needed)
UNITS = "F"

# Recompute the per-calendar-day table for the station.
normals_df = daily_normals_and_records(STATION_ID, units=UNITS)
# Trailing bare expression: must stay last so the notebook renders these rows.
normals_df.head(10)


  wide = (df.pivot_table(index="DATE", columns="ELEMENT", values="temp_c", aggfunc="first")


Unnamed: 0,record_min_temp,average_min_temp,average_max_temp,record_max_temp
01-01,17.24,36.656,55.298,71.96
01-02,17.06,38.102,54.632,78.08
01-03,12.02,36.548,53.738,73.94
01-04,15.98,34.034,54.434,75.02
01-05,10.04,33.026,54.914,71.96
01-06,13.1,33.872,54.47,73.94
01-07,8.24,32.306,54.464,73.94
01-08,10.22,32.492,53.21,73.04
01-09,15.98,34.952,54.872,73.04
01-10,15.98,36.254,55.316,71.96


In [None]:
# Save tables and the climatology figure under ./outputs
out_dir = Path("outputs")
out_dir.mkdir(parents=True, exist_ok=True)

normals_path = out_dir / f"{STATION_ID}_normals_1991_2020_{UNITS}.csv"
actuals_path = out_dir / f"{STATION_ID}_actuals_{PLOT_YEAR}_{UNITS}.csv"

# The normals table keeps its "MM-DD" index; the actuals table has a plain
# positional index that is not worth writing.
normals_df.to_csv(normals_path)
actuals_df.to_csv(actuals_path, index=False)

# No backend switch here: Figure.savefig writes PNG/PDF files regardless of
# the active backend, while matplotlib.use("Agg") would change the backend
# for the rest of the session and break later interactive/inline plots.
fig, ax = plot_weather_climatology(
    normals_df, actuals_df,
    title=f"{STATION_ID} — Records, Normals (1991–2020), and {PLOT_YEAR} Actuals",
    units=UNITS, show=False
)
png_path = out_dir / f"{STATION_ID}_climatology_{PLOT_YEAR}_{UNITS}.png"
pdf_path = out_dir / f"{STATION_ID}_climatology_{PLOT_YEAR}_{UNITS}.pdf"
try:
    fig.savefig(png_path, dpi=200, bbox_inches="tight", facecolor="white")
    fig.savefig(pdf_path, bbox_inches="tight")
finally:
    # Release the figure even if writing a file fails.
    plt.close(fig)

for saved_path in (normals_path, actuals_path, png_path, pdf_path):
    print("Saved:", saved_path)


Saved: outputs\USW00013871_normals_1991_2020_F.csv
Saved: outputs\USW00013871_actuals_2020_F.csv
Saved: outputs\USW00013871_climatology_2020_F.png
Saved: outputs\USW00013871_climatology_2020_F.pdf
