<a href="https://colab.research.google.com/github/bernebas/Bernebas/blob/main/Untitled51.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import requests
import pandas as pd
import datetime as dt
from pathlib import Path
from urllib.parse import quote

# =======================
# Config
# =======================
# Root URL of the archive folder. The fetch loop appends "/<year>/<file name>".
BASE_URL = ("https://ftp.cpc.ncep.noaa.gov/htdocs/products/analysis_monitoring/"
            "cdus/degree_days/archives/Heating%20degree%20Days/monthly%20states")
YEARS = list(range(1997, 2026))          # 1997..2025 inclusive

# (month number, file-name spellings to try, in order).
# The first spelling of each entry is also used as the Month_Name written to
# the output, so keep the canonical abbreviation first.
MONTHS = [
    (1,  ["Jan", "jan", "JAN"]),
    (2,  ["Feb", "feb", "FEB"]),
    (3,  ["Mar", "mar", "MAR"]),
    (4,  ["Apr", "apr", "APR"]),
    (5,  ["May", "may", "MAY"]),
    (6,  ["Jun", "jun", "JUN"]),
    (7,  ["Jul", "jul", "JUL"]),
    (8,  ["Aug", "aug", "AUG"]),
    # September is tried under both the 3- and 4-letter abbreviation. "sept"
    # was the only casing missing; it is appended last so the order of the
    # previously tried spellings is unchanged.
    (9,  ["Sep", "Sept", "sep", "SEPT", "SEP", "sept"]),
    (10, ["Oct", "oct", "OCT"]),
    (11, ["Nov", "nov", "NOV"]),
    (12, ["Dec", "dec", "DEC"]),
]
# Sent with every HTTP request made by the fetch loop.
HEADERS = {"User-Agent": "Mozilla/5.0 (academic data-collection script)"}

# All outputs (CSV, info text, optional raw files) are written under OUTDIR.
OUTDIR = Path("Heat_data_full_1997_2025")
OUTDIR.mkdir(parents=True, exist_ok=True)
SAVE_RAW = False  # set True if you also want to save each monthly text file

CSV_NAME = "cpc_hdd_full_1997_2025_sorted.csv"
INFO_TXT = "info_about_the_data.txt"

# =======================
# Helpers
# =======================
def to_float(x: str):
    """Parse one numeric token into a float.

    Surrounding whitespace is ignored and a single trailing ``%`` is dropped
    before conversion. A blank token yields NaN. Text that ``float()`` cannot
    parse raises ``ValueError``.
    """
    token = x.strip()
    if token == "":
        return float("nan")
    number_text = token[:-1] if token.endswith("%") else token
    return float(number_text)

def parse_month_text(year: int, mon_abbr: str, mnum: int, raw_text: str):
    """
    Turn the text of one monthly states file into a list of row dicts.

    A line is kept only if, after stripping, it is non-empty, does not start
    with one of the header/decoration prefixes, splits on runs of two or more
    whitespace characters into at least nine tokens, and its last eight tokens
    all convert through ``to_float``. Everything before those eight tokens is
    joined with single spaces to form the state name.

    ``year``, ``mnum`` and ``mon_abbr`` are copied unchanged into every row as
    "Year", "Month" and "Month_Name". Returns one dict per accepted line, in
    file order.
    """
    skip_prefixes = ("STATE", "TOTAL", "FROM", "NORM", "PRCT", "====", "---")
    # Output keys for the eight trailing numeric tokens, in column order.
    value_columns = (
        "MON_TOTAL",
        "MON_DEV_FROM_NORM",
        "MON_DEV_FROM_LYR",
        "CUM_TOTAL",
        "CUM_DEV_FROM_NORM",
        "CUM_DEV_FROM_LYR",
        "CUM_DEV_FROM_NORM_PRCT",
        "CUM_DEV_FROM_LYR_PRCT",
    )

    records = []
    for raw_line in raw_text.splitlines():
        line = raw_line.strip()
        # Blank lines and header/decorative lines carry no data.
        if not line or line.startswith(skip_prefixes):
            continue

        # Columns are separated by 2+ whitespace characters.
        parts = re.split(r"\s{2,}", line)
        if len(parts) < 9:
            continue  # not a data row

        state = " ".join(parts[:-8]).strip()
        if not state:
            continue

        try:
            values = [to_float(token) for token in parts[-8:]]
        except ValueError:
            # A non-numeric token means this was not a data row after all.
            continue

        record = {
            "State": state,
            "Year": year,
            "Month": mnum,
            "Month_Name": mon_abbr,
        }
        record.update(zip(value_columns, values))
        records.append(record)
    return records

# =======================
# Fetch + Parse
# =======================
all_rows = []   # parsed row dicts from every month that was downloaded
misses = []     # (year, month number) pairs for which no file-name variant worked

# A single Session is reused for the whole crawl so connections to the host are
# pooled instead of being re-established for every candidate file name.
with requests.Session() as session:
    session.headers.update(HEADERS)

    for year in YEARS:
        if SAVE_RAW:
            (OUTDIR / str(year)).mkdir(parents=True, exist_ok=True)

        for mnum, variants in MONTHS:
            # Use the canonical month name for output (first variant list item)
            mon_abbr = variants[0].title() if variants else f"M{mnum}"

            for v in variants:
                fname = f"{v} {year}.txt"     # try each casing/variant
                url = f"{BASE_URL}/{year}/{quote(fname)}"

                # Only the network call can raise RequestException, so keep
                # the try body to that one statement.
                try:
                    r = session.get(url, timeout=30)
                except requests.RequestException as exc:
                    # Report the failure so a timeout is distinguishable from
                    # a file that is simply absent, then try the next variant.
                    print(f"[ERR ] {url} ({exc})")
                    continue

                if r.status_code != 200 or not r.text.strip():
                    # 404 is the expected answer for a spelling that does not
                    # exist; anything else is worth surfacing.
                    if r.status_code not in (200, 404):
                        print(f"[HTTP {r.status_code}] {url}")
                    continue

                print(f"[OK ] {url}")
                if SAVE_RAW:
                    raw_path = OUTDIR / str(year) / fname.replace(" ", "_")
                    raw_path.write_text(r.text, encoding="utf-8")

                rows = parse_month_text(year, mon_abbr, mnum, r.text)
                if not rows:
                    # The file was fetched but no line matched the expected
                    # layout; it still counts as found, as before.
                    print(f"[WARN] no data rows parsed from {url}")
                all_rows.extend(rows)
                break
            else:
                # No variant produced a usable response for this month.
                print(f"[MISS] {year} month {mnum:02d}")
                misses.append((year, mnum))

# =======================
# Save CSV
# =======================
csv_path = OUTDIR / CSV_NAME
if not all_rows:
    print("\nNo rows parsed — check connectivity or BASE_URL.")
else:
    # One row per (State, Year, Month), ordered by those three keys.
    df = pd.DataFrame(all_rows).sort_values(["State", "Year", "Month"])
    df.to_csv(csv_path, index=False)
    print(f"\nSaved CSV: {csv_path} (rows: {len(df)})")

# Report any misses grouped by year
if not misses:
    print("\nNo missing months reported.")
else:
    print("\nSummary of missing months:")
    months_by_year = {}
    for miss_year, miss_month in misses:
        months_by_year.setdefault(miss_year, []).append(miss_month)
    for miss_year in sorted(months_by_year):
        listing = ", ".join(f"{m:02d}" for m in sorted(months_by_year[miss_year]))
        print(f"  {miss_year}: {listing}")

# =======================
# Write Info Text
# =======================
# The info file lives next to the CSV inside OUTDIR.
info_path = OUTDIR / INFO_TXT
today = dt.date.today().isoformat()  # ISO date stamped into the "Last updated" line

# Plain-text description of the dataset; only the date, source URL and the two
# output file names are interpolated.
contents = f"""INFO ABOUT THE DATA
====================

Last updated: {today}

WHAT THIS DATASET IS
--------------------
This dataset contains monthly **Heating Degree Days (HDD)** by U.S. state, compiled from the
Climate Prediction Center (CPC) "Monthly States" archives. HDD measures heating demand:
for each day, HDD = max(0, 65°F - daily average temperature). Monthly HDD is the sum over days in a month.

WHY HDD MATTERS (simple explanation)
------------------------------------
- Bigger HDD number => it was colder overall, so more heating was needed.
- Smaller HDD number => it was warmer, so less heating was needed.

TIME COVERAGE
-------------
1997–2025 (inclusive), one row per (State, Year, Month). Some months may be missing if not posted by CPC.

SOURCE
------
CPC archives (FTP over HTTP):
{BASE_URL}/
Each year typically has 12 text files (one per month), but names/casing vary across years.

UNITS
-----
HDD values are in degree-days (°F·days) using a 65°F base.

WHAT FILES YOU HAVE LOCALLY
---------------------------
- Combined CSV: {csv_path.name}
- This README:  {info_path.name}
- Raw monthly text files (optional) under year folders if SAVE_RAW=True.

HOW THE CSV IS SORTED
---------------------
Rows are sorted by State → Year → Month (1–12).

COLUMNS (PLAIN ENGLISH)
-----------------------
- State: U.S. state name.
- Year: Four-digit year.
- Month: Month number (1=Jan, …, 12=Dec).
- Month_Name: Month abbreviation.
- MON_TOTAL: HDD total for the month.
- MON_DEV_FROM_NORM: Difference from the long-term normal (same month/state).
- MON_DEV_FROM_LYR: Difference from the same month last year.
- CUM_TOTAL: Cumulative HDD from the start of the season up to this month.
- CUM_DEV_FROM_NORM: Cumulative difference from normal for the season to date.
- CUM_DEV_FROM_LYR: Cumulative difference from last season for the season to date.
- CUM_DEV_FROM_NORM_PRCT: Same as above, in percent.
- CUM_DEV_FROM_LYR_PRCT: Same as above, in percent.

MISSING/ODD VALUES
------------------
- Some months/years may not be listed by CPC (especially earlier years); those will be missing.
- Empty fields are stored as blank/NaN in the CSV.
- Filenames vary by case (e.g., 'Apr 1999.txt' vs 'apr 1999.txt'); this script tries multiple variants.

SWITCHING TO CDD
----------------
To collect **Cooling Degree Days**, change BASE_URL to the 'Cooling%20Degree%20Days/monthly%20states' folder.
Everything else stays the same.

"""
# Overwrite any previous info file with the freshly rendered text.
with info_path.open("w", encoding="utf-8") as info_file:
    info_file.write(contents)
print(f"Wrote info file: {info_path.resolve()}")



[MISS] 1997 month 01
[MISS] 1997 month 02
[MISS] 1997 month 03
[MISS] 1997 month 04
[MISS] 1997 month 05
[MISS] 1997 month 06
[MISS] 1997 month 07
[MISS] 1997 month 08
[MISS] 1997 month 09
[OK ] https://ftp.cpc.ncep.noaa.gov/htdocs/products/analysis_monitoring/cdus/degree_days/archives/Heating%20degree%20Days/monthly%20states/1997/Oct%201997.txt
[OK ] https://ftp.cpc.ncep.noaa.gov/htdocs/products/analysis_monitoring/cdus/degree_days/archives/Heating%20degree%20Days/monthly%20states/1997/nov%201997.txt
[OK ] https://ftp.cpc.ncep.noaa.gov/htdocs/products/analysis_monitoring/cdus/degree_days/archives/Heating%20degree%20Days/monthly%20states/1997/dec%201997.txt
[OK ] https://ftp.cpc.ncep.noaa.gov/htdocs/products/analysis_monitoring/cdus/degree_days/archives/Heating%20degree%20Days/monthly%20states/1998/jan%201998.txt
[OK ] https://ftp.cpc.ncep.noaa.gov/htdocs/products/analysis_monitoring/cdus/degree_days/archives/Heating%20degree%20Days/monthly%20states/1998/feb%201998.txt
[OK ] https://ft

In [2]:
# Offer the generated files as browser downloads when running inside Colab.
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the files are already on the local disk, so there
    # is nothing to download.
    pass
else:
    # Attempt each download on its own so that a failure on one file (for
    # example a CSV that was never written) does not silently skip the other.
    for artifact in (csv_path, info_path):
        try:
            files.download(str(artifact))
        except Exception as exc:  # best-effort step: report, do not abort
            print(f"[WARN] could not download {artifact}: {exc}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>