In [6]:
"""
NOAA GSOM DATA ALTERATION NOTEBOOK

PURPOSE:
This notebook utilizes the acquired raw NOAA Global Summary of the Month (GSOM) weather
data to select relevant climate variables for agricultural analysis and flag the months 
that fall in the corn growing season. This notebook does not produce cleaned data nor annualized data. 

Input:
Raw data: "../data/raw/USC00118740_GSOM_1902-08-01_to_2025-10-31.csv"

Output:
File: data/processed/gsom_monthly_selected.csv
Key Variables: Temperature, precipitation, extreme weather events, and growing degree days

Variable Selection Rationale: 
Temperature metrics: Critical for crop growth and development
Precipitation: Essential for corn production (we are looking at non-irrigated agriculture)
Extreme events: Heat stress and frost can damage crops
Growing degree days: Standard metric for crop development
Days above/below thresholds: Capture extreme weather impacts

Documentation on each variable is provided here: 
https://www.ncei.noaa.gov/data/global-summary-of-the-month/doc/GSOM_documentation.pdf

Reproducibility:
Rerunning this notebook with the same parameters will retrieve the same dataset.
Ensure the checksum generated in /documentation/gsom_data_acquisition.txt matches what was
acquired in /Notebooks/GSOM_Acquisition.ipynb

Authors: Brady Brooks
Date: November 2025
"""

import os
import pandas as pd 
from datetime import datetime


# Raw GSOM export, addressed relative to the notebook's working directory.
raw_csv_path = "../data/raw/USC00118740_GSOM_1902-08-01_to_2025-10-31.csv"
df = pd.read_csv(raw_csv_path)

# Quick sanity report on what was loaded: (rows, columns) and the column count.
raw_shape = df.shape
raw_column_count = len(df.columns)
print(f"Raw GSOM data {raw_shape}")
print(f"Original columns: {raw_column_count}")

# The project root is taken to be the parent of the current working directory;
# processed files go under <project_root>/data/processed.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)
processed_dir = os.path.join(project_root, "data", "processed")

# Make sure the output directory exists (no error if it is already there).
os.makedirs(processed_dir, exist_ok=True)


# Mapping of raw GSOM column codes -> descriptive column names used in the processed file.
# Code meanings follow the NOAA GSOM documentation linked in the notebook header.
# NOTE(review): the unit suffixes (_f, _in, _mph) assume the raw file was requested in
# standard (imperial) units — confirm against the acquisition notebook.
variables_to_keep = {
    #identification
    'STATION': 'station_id',
    'DATE': 'date',
    
    #temperature variables
    'TAVG': 'temp_avg_f',          # Average monthly temperature
    'TMAX': 'temp_max_f',          # Average of daily max temps
    'TMIN': 'temp_min_f',          # Average of daily min temps
    'EMXT': 'temp_extreme_max_f',  # Highest temperature in month
    'EMNT': 'temp_extreme_min_f',  # Lowest temperature in month
    
    #precipitation (essential for non-irrigated corn)
    'PRCP': 'precip_total_in',     # Total monthly precipitation
    'EMXP': 'precip_max_daily_in', # Highest single-day precipitation
    
    #precipitation Days (Drought/wetness indicators)
    'DP01': 'days_precip_01in',    # Days with >= 0.01" precip
    'DP10': 'days_precip_10in',    # Days with >= 0.10" precip
    'DP1X': 'days_precip_1in',     # Days with >= 1.00" precip
    
    #cooling/heating degree days
    'CLDD': 'cooling_degree_days', # Monthly sum of cooling degree days (base day average 65°F)
    'HTDD': 'heating_degree_days', # Monthly sum of heating degree days (base day average 65°F)
    'CDSD': 'cooling_dd_season',   # Season-to-date cooling DD
    'HDSD': 'heating_dd_season',   # Season-to-date heating DD
    
    #extreme temperature days (Stress indicators)
    'DT32': 'days_min_lte_32f',    # Days with min temp <= 32F (frost)
    'DX32': 'days_max_lte_32f',    # Days with max temp <= 32F (hard freeze)
    'DX70': 'days_max_gte_70f',    # Days with max temp >= 70F
    'DX90': 'days_max_gte_90f',    # Days with max temp >= 90F (heat stress)
    # GSOM defines DT00 by the daily MINIMUM temperature (days with min temp <= 0F).
    # NOTE(review): the output name below says "max"; it is kept unchanged so existing
    # readers of the processed CSV keep working — consider renaming to 'days_min_lte_0f'.
    'DT00': 'days_max_lte_0f',     # Days with min temp <= 0F (extreme cold)
    
    # Additional Relevant Variables
    'AWND': 'wind_speed_avg_mph',  # Average wind speed
}

# Fail fast with a readable message if the raw export lacks any expected column
# (still a KeyError, as plain column selection would raise, but it names the GSOM codes).
missing_columns = [code for code in variables_to_keep if code not in df.columns]
if missing_columns:
    raise KeyError(
        f"Raw GSOM data is missing expected columns: {missing_columns}"
    )

# Keep only the selected columns (in mapping order) and give them descriptive names.
# The result is a new frame, so later edits to df_selected do not touch the raw df.
df_selected = df[list(variables_to_keep)].copy()
df_selected = df_selected.rename(columns=variables_to_keep)


Raw GSOM data (1479, 150)
Original columns: 150


In [7]:
# Months treated as the corn growing season (on average April-September)
# per https://ag.purdue.edu/news/department/agry/kernel-news/2022/03/best-time-plant-corn.html#:~:text=Soil%20temperature%20is%20also%20always,the%20decision%20to%20start%20planting
growing_season_months = [4, 5, 6, 7, 8, 9]

# Parse the year-month strings once, then derive every calendar feature from the result.
parsed_dates = pd.to_datetime(df_selected['date'], format='%Y-%m')
month_numbers = parsed_dates.dt.month

# 'date' is replaced in place; 'year', 'month' and 'is_growing_season' are appended in that order.
df_selected = df_selected.assign(
    date=parsed_dates,
    year=parsed_dates.dt.year,
    month=month_numbers,
    is_growing_season=month_numbers.isin(growing_season_months),
)

# Summary of the parsed range and how many rows fall inside the growing season.
first_year = df_selected['year'].min()
last_year = df_selected['year'].max()
growing_month_count = df_selected['is_growing_season'].sum()

print("Date parsed successfully")
print(f"Year range: {first_year} to {last_year}")
print(f"Total months: {len(df_selected)}")
print(f"Growing season months: {growing_month_count}")

Date parsed successfully
Year range: 1902 to 2025
Total months: 1479
Growing season months: 740


In [8]:
# Destination of the processed monthly table, inside the processed-data directory.
output_filename = "gsom_monthly_selected.csv"
output_path = os.path.join(processed_dir, output_filename)

# Write every selected month to CSV (growing-season rows are flagged, not filtered out);
# index=False keeps the pandas row index out of the file.
df_selected.to_csv(output_path, index=False)
# The message names the dataset this notebook actually writes (NOAA GSOM, not USDA NASS).
print(f"\nProcessed NOAA GSOM data written to: {output_path}")



Processed USDA NASS data written to: c:\Users\adenm\Documents\GitHub\IS477\data\processed\gsom_monthly_selected.csv
