In [None]:
"""
NOAA GSOM CLEANING AND ANNUALIZATION NOTEBOOK

PURPOSE:
This notebook cleans the monthly GSOM weather data and aggregates it to annual values
suitable for integration with USDA NASS corn production data. The focus is on growing season
(April-September) climate variables that influence corn yields

Input:
File: data/processed/gsom_monthly_selected.csv
Format: Monthly climate observations (1902-present)
Source: Output from GSOM_Acquistion notebook

Output: 
File: data/cleaned/gsom_annual_clean.csv
Format: Annual climate summaries (one row per year)
Ready for: Integration with NASS corn data by year 

NOTE: This file does not acquire the GSOM monthly selected data and instead assumes the user has acquired
the dataset from GSOM_Acquisition.ipynb and additionally has run GSOM_Alteration.ipynb. This code performs data validation, cleaning, aggregation to
annual-level metrics and computation of derived indicators. 

Authors: Brady Brooks and Aden Krueger
Date: November 2025
"""

import pandas as pd 
import numpy as np 
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
cleaned_dir = os.path.join(project_root, "data", "cleaned")

#create dir for user if it does not exist
os.makedirs(cleaned_dir, exist_ok=True)

df_monthly = pd.read_csv("../data/processed/gsom_monthly_selected.csv")
df_monthly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1479 entries, 0 to 1478
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   station_id           1479 non-null   object 
 1   date                 1479 non-null   object 
 2   temp_avg_c           1475 non-null   float64
 3   temp_max_c           1475 non-null   float64
 4   temp_min_c           1476 non-null   float64
 5   temp_extreme_max_c   1475 non-null   float64
 6   temp_extreme_min_c   1476 non-null   float64
 7   precip_total_mm      1478 non-null   float64
 8   precip_max_daily_mm  1478 non-null   float64
 9   days_precip_01in     1478 non-null   float64
 10  days_precip_10in     1478 non-null   float64
 11  days_precip_1in      1478 non-null   float64
 12  cooling_degree_days  1475 non-null   float64
 13  heating_degree_days  1475 non-null   float64
 14  cooling_dd_season    1441 non-null   float64
 15  heating_dd_season    1447 non-null   f

In [95]:
#ensuring our data is correct
df_monthly['date'] = pd.to_datetime(df_monthly['date'])
if 'year' not in df_monthly.columns:
    df_monthly['year'] = df_monthly['date'].dt.year
if 'month' not in df_monthly.columns:
    df_monthly['month'] = df_monthly['date'].dt.month
if 'is_growing_season' not in df_monthly.columns:
    df_monthly['is_growing_season'] = df_monthly['month'].isin([4, 5, 6, 7, 8, 9])

print(f"Loaded {len(df_monthly)} monthly records")
print(f"Date range: {df_monthly['date'].min().strftime('%Y-%m')} to {df_monthly['date'].max().strftime('%Y-%m')}")
print(f"Years: {df_monthly['year'].nunique()}")
print(f"Columns: {len(df_monthly.columns)}")

print(df_monthly[['date', 'year', 'month', 'temp_avg_c', 'precip_total_mm', 'is_growing_season']].head(5))


Loaded 1479 monthly records
Date range: 1902-08 to 2025-10
Years: 124
Columns: 25
        date  year  month  temp_avg_c  precip_total_mm  is_growing_season
0 1902-08-01  1902      8       21.22            248.5               True
1 1902-09-01  1902      9       16.81            124.4               True
2 1902-10-01  1902     10       13.42             53.4              False
3 1902-11-01  1902     11        8.50             61.7              False
4 1902-12-01  1902     12       -2.92             74.7              False


In [96]:
#missing value assessment
missing_summary = df_monthly.isnull().sum()
print("\nVariables with missing data:")
for col in df_monthly.columns:
    missing = missing_summary[col]
    if missing > 0:
        pct = (missing / len(df_monthly)) * 100
        print (f" {col}: {missing} ({pct:.1f}%)")


Variables with missing data:
 temp_avg_c: 4 (0.3%)
 temp_max_c: 4 (0.3%)
 temp_min_c: 3 (0.2%)
 temp_extreme_max_c: 4 (0.3%)
 temp_extreme_min_c: 3 (0.2%)
 precip_total_mm: 1 (0.1%)
 precip_max_daily_mm: 1 (0.1%)
 days_precip_01in: 1 (0.1%)
 days_precip_10in: 1 (0.1%)
 days_precip_1in: 1 (0.1%)
 cooling_degree_days: 4 (0.3%)
 heating_degree_days: 4 (0.3%)
 cooling_dd_season: 38 (2.6%)
 heating_dd_season: 32 (2.2%)
 days_min_lte_32f: 3 (0.2%)
 days_max_lte_32f: 4 (0.3%)
 days_max_gte_70f: 4 (0.3%)
 days_max_gte_90f: 4 (0.3%)
 days_max_lte_0f: 3 (0.2%)
 wind_speed_avg_mph: 1479 (100.0%)


In [97]:
temp_cols = ['temp_avg_c', 'temp_max_c', 'temp_min_c', 
              'temp_extreme_max_c', 'temp_extreme_min_c']

# convert celsius to fahrenheit
for col in temp_cols:
    if col in df_monthly.columns:
        df_monthly[col.replace('_c', '_f')] = (df_monthly[col] * 9/5) + 32

#drop old unit columns
drop_cols = temp_cols
df = df_monthly.drop(columns=[c for c in drop_cols if c in df_monthly.columns])
df

Unnamed: 0,station_id,date,precip_total_mm,precip_max_daily_mm,days_precip_01in,days_precip_10in,days_precip_1in,cooling_degree_days,heating_degree_days,cooling_dd_season,...,days_max_lte_0f,wind_speed_avg_mph,year,month,is_growing_season,temp_avg_f,temp_max_f,temp_min_f,temp_extreme_max_f,temp_extreme_min_f
0,USC00118740,1902-08-01,248.5,78.7,8.0,6.0,4.0,96.0,6.5,,...,0.0,,1902,8,True,70.196,80.420,59.990,89.96,48.02
1,USC00118740,1902-09-01,124.4,47.0,9.0,8.0,1.0,22.9,68.6,,...,0.0,,1902,9,True,62.258,73.040,51.476,84.02,33.98
2,USC00118740,1902-10-01,53.4,25.4,6.0,2.0,1.0,3.3,155.5,,...,0.0,,1902,10,False,56.156,66.848,45.482,78.08,30.92
3,USC00118740,1902-11-01,61.7,24.1,8.0,6.0,0.0,0.3,295.4,,...,0.0,,1902,11,False,47.300,56.048,38.534,75.02,14.00
4,USC00118740,1902-12-01,74.7,12.4,12.0,8.0,0.0,0.0,658.7,,...,1.0,,1902,12,False,26.744,33.728,19.778,51.08,-0.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1474,USC00118740,2025-06-01,66.9,24.9,8.0,5.0,0.0,159.3,1.4,193.4,...,0.0,,2025,6,True,74.480,84.452,64.508,93.92,48.92
1475,USC00118740,2025-07-01,113.5,17.8,15.0,12.0,0.0,236.8,0.0,430.2,...,0.0,,2025,7,True,78.746,88.322,69.170,93.92,62.96
1476,USC00118740,2025-08-01,30.7,19.8,7.0,3.0,0.0,155.8,3.3,586.0,...,0.0,,2025,8,True,73.850,84.740,62.978,95.00,48.92
1477,USC00118740,2025-09-01,48.6,27.2,4.0,3.0,1.0,103.7,16.6,689.7,...,0.0,,2025,9,True,70.214,84.020,56.426,95.00,44.96


In [98]:
df_grow = df[df['is_growing_season'] == True].copy()
print(f"Filtered to growing season: {len(df_grow)} records")

Filtered to growing season: 740 records


In [99]:
#make a dictionary of how we're going to aggregate
agg_dict = {
    'temp_avg_f': 'mean',
    'temp_max_f': 'mean',
    'temp_min_f': 'mean',
    'temp_extreme_max_f': 'max',
    'temp_extreme_min_f': 'min',
    'precip_total_mm': 'sum',
    'precip_max_daily_mm': 'max',
    'days_max_gte_90f': 'sum',
    'days_min_lte_32f': 'sum'
}

df_annual = df_grow.groupby('year').agg(agg_dict).reset_index()
print(f"Aggregated to {len(df_annual)} annual records")

Aggregated to 124 annual records


In [100]:
df_annual

Unnamed: 0,year,temp_avg_f,temp_max_f,temp_min_f,temp_extreme_max_f,temp_extreme_min_f,precip_total_mm,precip_max_daily_mm,days_max_gte_90f,days_min_lte_32f
0,1902,66.227,76.730,55.733,89.96,33.98,372.9,78.7,2.0,0.0
1,1903,65.378,77.111,53.633,96.08,19.04,525.2,59.7,20.0,9.0
2,1904,64.028,75.488,52.568,91.94,21.92,394.7,40.9,8.0,12.0
3,1905,66.533,77.603,55.454,96.98,26.06,480.4,43.9,19.0,11.0
4,1906,67.127,79.034,55.226,93.92,26.06,452.2,56.1,23.0,4.0
...,...,...,...,...,...,...,...,...,...,...
119,2021,67.883,78.287,57.479,95.00,21.92,639.7,50.8,18.0,6.0
120,2022,67.496,77.873,57.122,98.96,19.94,499.1,56.1,27.0,10.0
121,2023,67.910,79.277,56.540,98.06,26.96,402.8,43.9,23.0,3.0
122,2024,68.702,78.797,58.607,95.00,30.02,799.4,94.5,20.0,3.0


In [101]:
if {'temp_max_f', 'temp_min_f'}.issubset(df_annual.columns):
    base = 50
    avg_temp = (df_annual['temp_max_f'] + df_annual['temp_min_f']) / 2
    gdd = (avg_temp - base).clip(lower=0) * 180 
    df_annual['gdd_base50'] = gdd

#temperature range (F)
df_annual['temp_range_f'] = df_annual['temp_max_f'] - df_annual['temp_min_f']


In [102]:
months_per_year = df_grow.groupby('year').size()
df_annual['months_available'] = df_annual['year'].map(months_per_year)
df_annual['complete_season'] = df_annual['months_available'] == 6
df_annual['data_quality_score'] = (
    df_annual['complete_season'].astype(int) * 40 +
    (~df_annual['temp_avg_f'].isna()).astype(int) * 30 +
    (~df_annual['precip_total_mm'].isna()).astype(int) * 30
)

df_annual

Unnamed: 0,year,temp_avg_f,temp_max_f,temp_min_f,temp_extreme_max_f,temp_extreme_min_f,precip_total_mm,precip_max_daily_mm,days_max_gte_90f,days_min_lte_32f,gdd_base50,temp_range_f,months_available,complete_season,data_quality_score
0,1902,66.227,76.730,55.733,89.96,33.98,372.9,78.7,2.0,0.0,2921.67,20.997,2,False,60
1,1903,65.378,77.111,53.633,96.08,19.04,525.2,59.7,20.0,9.0,2766.96,23.478,6,True,100
2,1904,64.028,75.488,52.568,91.94,21.92,394.7,40.9,8.0,12.0,2525.04,22.920,6,True,100
3,1905,66.533,77.603,55.454,96.98,26.06,480.4,43.9,19.0,11.0,2975.13,22.149,6,True,100
4,1906,67.127,79.034,55.226,93.92,26.06,452.2,56.1,23.0,4.0,3083.40,23.808,6,True,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,2021,67.883,78.287,57.479,95.00,21.92,639.7,50.8,18.0,6.0,3218.94,20.808,6,True,100
120,2022,67.496,77.873,57.122,98.96,19.94,499.1,56.1,27.0,10.0,3149.55,20.751,6,True,100
121,2023,67.910,79.277,56.540,98.06,26.96,402.8,43.9,23.0,3.0,3223.53,22.737,6,True,100
122,2024,68.702,78.797,58.607,95.00,30.02,799.4,94.5,20.0,3.0,3366.36,20.190,6,True,100


In [103]:
print("\nAnnual Dataset Summary")
print(f"Years: {df_annual['year'].min()}–{df_annual['year'].max()}")
print(f"Variables: {len(df_annual.columns)}")
print(f"Mean Growing Degree Days: {df_annual['gdd_base50'].mean():.0f}")
print(f"Mean Annual Precipitation: {df_annual['precip_total_mm'].mean():.1f} mm")
print(f"Complete Growing Seasons: {df_annual['complete_season'].sum()} years")
print(f"Average Data Quality Score: {df_annual['data_quality_score'].mean():.1f}/100")


Annual Dataset Summary
Years: 1902–2025
Variables: 15
Mean Growing Degree Days: 3046
Mean Annual Precipitation: 578.6 mm
Complete Growing Seasons: 123 years
Average Data Quality Score: 99.7/100


In [104]:
output_filename = "gsom_annual_clean.csv"
output_path = os.path.join(cleaned_dir, output_filename)

df_annual.to_csv(output_path, index=False)
print(f"\nCleaned and annualized GSOM data written to: {output_path}")


Cleaned and annualized GSOM data written to: c:\Users\adenm\Documents\GitHub\IS477\data\cleaned\gsom_annual_clean.csv


# GSOM Annual Climate Data Dictionary

| **Column** | **Description** | **Aggregation Method** | **Interpretation** |
|:--|:--|:--|:--|
| **year** | Calendar year of observation | Derived from date column | Primary grouping variable for annual aggregation |
| **temp_avg_f** | Mean of daily or monthly average temperatures (°F) | Mean of monthly averages across the growing season | Indicates overall thermal conditions for that year |
| **temp_max_f** | Average of monthly mean maximums (°F) | Mean of monthly means of daily highs | Proxy for typical daytime heat |
| **temp_min_f** | Average of monthly mean minimums (°F) | Mean of monthly means of daily lows | Proxy for nighttime cooling or stress |
| **temp_extreme_max_f** | Highest temperature recorded that year (°F) | Max of monthly extreme maximums | Extreme-heat stress indicator |
| **temp_extreme_min_f** | Lowest temperature recorded that year (°F) | Min of monthly extreme minimums | Winterkill or frost indicator |
| **precip_total_mm** | Total precipitation for the year (mm) | Sum of monthly totals | Total moisture available to crops |
| **precip_max_daily_mm** | Maximum single-day precipitation event (mm) | Max of monthly maxima | Proxy for heavy rainfall intensity |
| **days_max_gte_90f** | Days with max temperature ≥ 90°F | Sum across months | Hot-day stress index |
| **days_min_lte_32f** | Days with min temperature ≤ 32°F | Sum across months | Frost frequency indicator |
| **gdd_base50** | Growing Degree Days (base 50°F) | Computed as Σ(max(Tavg – 50, 0)) across growing months | Crop heat accumulation metric (corn/soybeans) |
| **temp_range_f** | Mean (temp_max_f – temp_min_f) | Derived | Daily thermal amplitude proxy |
| **months_available** | Number of months with valid data in growing season | Count of months not missing | Completeness check |
| **complete_season** | Boolean flag for ≥ 6 valid months (April–Sept) | Derived | Marks whether year’s data are suitable for agronomic analysis |
| **data_quality_score** | Composite completeness score (0–100) | Weighted by missingness and flag quality | Used to filter high-confidence records |
