In [6]:
## NOAA GSOM DATA 
import hashlib
import json
import os
import time
from datetime import datetime, timezone

import pandas as pd
import requests

#request API token from https://www.ncdc.noaa.gov/cdo-web/token
NOAA_TOKEN = "LPNBoLnCBrwwGQMWwLMlAQjEmoagiRqi"
BASE_URL = "https://www.ncei.noaa.gov/access/services/data/v1"

#define our parameters to be used 
dataset = "global-summary-of-the-month"
station_id="USC00118740"
end_date="2025-10-31"
start_date="1902-08-01"
format="csv"


OUTPUT_FILENAME = f"{station_id}_GSOM_{start_date}_to_{end_date}.csv"
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = os.path.join(project_root, "data", "raw")
docs_dir = os.path.join(project_root, "documentation")

output_path = os.path.join(data_dir, OUTPUT_FILENAME)
metadata_path = os.path.join(docs_dir, "gsom_data_acquisition.txt")

#create directories if needed for user
os.makedirs(data_dir, exist_ok=True)
os.makedirs(docs_dir, exist_ok=True)


params = {
    "dataset":   dataset,
    "stations":  station_id,
    "startDate": start_date,
    "endDate":   end_date,
    "format":    format,
}
headers = {"token": NOAA_TOKEN}

#define our checksum 
def compute_sha256(file_path):
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            sha256_hash.update(chunk)
    return sha256_hash.hexdigest()


try:
    response = requests.get(BASE_URL, params=params, headers=headers, timeout=60)
    response.raise_for_status()
except requests.exceptions.Timeout:
    print("Request timed out. Try again or check your connection.")
    raise
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    print(f"Response content: {response.text[:300]} ...")
    raise
except Exception as err:
    print(f"Other error occurred: {err}")
    raise
else:
    print("Request successful!")


with open(output_path, "wb") as f:
    f.write(response.content)

checksum = compute_sha256(output_path)
print(f"Checksum: {checksum}")

Request successful!
Checksum: 04318a24fe9bea7df4dbcc76c88611c16042ec268c77189f766f666c5fc3de43


In [7]:
# NOAA GSOM DF 
df = pd.read_csv(output_path)
print("\nFirst 5 rows")
print(df.head())
print(f"Number of records {len(df)}")



First 5 rows
       STATION     DATE  ADPT  ASLP  ASTP  AWBT  AWND  CDSD  CLDD  DP01  ...  \
0  USC00118740  1902-08   NaN   NaN   NaN   NaN   NaN   NaN  96.0   8.0  ...   
1  USC00118740  1902-09   NaN   NaN   NaN   NaN   NaN   NaN  22.9   9.0  ...   
2  USC00118740  1902-10   NaN   NaN   NaN   NaN   NaN   NaN   3.3   6.0  ...   
3  USC00118740  1902-11   NaN   NaN   NaN   NaN   NaN   NaN   0.3   8.0  ...   
4  USC00118740  1902-12   NaN   NaN   NaN   NaN   NaN   NaN   0.0  12.0  ...   

   WDF2  WDF5  WDFG  WDFM  WDMV  WSF1  WSF2  WSF5  WSFG  WSFM  
0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
1   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
3   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
4   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  

[5 rows x 150 columns]
Number of records 1479


In [None]:
# NOAA GSOM METADATA
metadata = f"""
Date Retrieved: {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')}
Dataset: {dataset}
Station ID: {station_id}
Station Name: Champaign 3 S, IL US
Period of Record: {start_date} to {end_date}
Source URL: {BASE_URL}
Local File: {output_path}
File SHA-256 Checksum: {checksum}

Reproduction Instructions:
1. Request an API token from https://www.ncdc.noaa.gov/cdo-web/token
2. Run the above GSOM_Acquisition.ipynb Notebook cells with the same parameters
3. Verify the resulting file's integrity using:
bash
$ shasum -a 256 {OUTPUT_FILENAME}

The output hash should match:
{checksum}

Notes:
- This data is saved to 'data/raw' for our pipeline
- Metadata and checksum documentation are stored in '/documentation'
"""
with open(metadata_path, "w") as f:
    f.write(metadata)