In [2]:
import pandas as pd

"""
This notebook is to be utilized for the altering of the NASS dataset
With this notebook we intend to alter the NASS format from a long format to a wide format for our purposes 
"""

df = pd.read_csv("nass_qs_1902_to_2025.csv")
print(len(df))

# filter to the stats we want with TOTAL domain
df_filtered = df[
    (df['Domain'] == 'TOTAL') &
    (df['Data Item'].isin([
        'CORN, GRAIN - ACRES PLANTED',
        'CORN, GRAIN - ACRES HARVESTED',
        'CORN, GRAIN - YIELD, MEASURED IN BU / ACRE',
        'CORN, GRAIN - PRODUCTION, MEASURED IN BU'
    ]))
].copy()

# clean the Value column
df_filtered['Value'] = df_filtered['Value'].astype(str).str.replace(',', '')
df_filtered['Value'] = pd.to_numeric(df_filtered['Value'], errors='coerce')

# pivot to wide format
df_wide = df_filtered.pivot_table(
    index='Year',
    columns='Data Item',
    values='Value',
    aggfunc='first'
).reset_index()

# check what columns we actually got from our alterations
print("Columns after pivot:", df_wide.columns.tolist())
print("Shape:", df_wide.shape)

# Rename using a dictionary (safer than assuming column order)
df_wide = df_wide.rename(columns={
    'CORN, GRAIN - ACRES PLANTED': 'acres_planted',
    'CORN, GRAIN - ACRES HARVESTED': 'acres_harvested',
    'CORN, GRAIN - YIELD, MEASURED IN BU / ACRE': 'yield_bu_per_acre',
    'CORN, GRAIN - PRODUCTION, MEASURED IN BU': 'production_bushels'
})

# rename Year to lowercase for consistency
df_wide = df_wide.rename(columns={'Year': 'year'})
print("\nFinal columns:", df_wide.columns.tolist())
len(df_wide)

print("Year range:", df_wide['year'].min(), "to", df_wide['year'].max())
print("Number of years:", df_wide['year'].max() - df_wide['year'].min() + 1)
print("Actual rows:", len(df_wide))

# Check for missing years
all_years = set(range(int(df_wide['year'].min()), int(df_wide['year'].max()) + 1))
actual_years = set(df_wide['year'].astype(int))
missing_years = sorted(all_years - actual_years)
print(f"\nMissing years: {missing_years if missing_years else 'None'}")

4641
Columns after pivot: ['Year', 'CORN, GRAIN - ACRES HARVESTED', 'CORN, GRAIN - PRODUCTION, MEASURED IN BU', 'CORN, GRAIN - YIELD, MEASURED IN BU / ACRE']
Shape: (124, 4)

Final columns: ['year', 'acres_harvested', 'production_bushels', 'yield_bu_per_acre']
Year range: 1902 to 2025
Number of years: 124
Actual rows: 124

Missing years: None
