# Setting up API
Source: USDA NASS Quick Stats API (https://quickstats.nass.usda.gov/api)


In [52]:
import os

import pandas as pd
import requests

In [165]:
# Download state-level annual survey yields for corn and wheat from the
# USDA NASS Quick Stats API and stack both responses into `df_raw`.

# The API key is a credential, so it is read from the environment instead of
# being committed with the notebook. Any key that was ever stored in this file
# should be treated as exposed and rotated.
API_KEY = os.environ.get("NASS_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the NASS_API_KEY environment variable to your USDA NASS "
        "Quick Stats API key before running this cell."
    )

base_url = 'https://quickstats.nass.usda.gov/api/api_GET/'

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the kernel indefinitely.
REQUEST_TIMEOUT = 60

dfs = []

for crop in ['CORN', 'WHEAT']:
    params = {
            'key' : API_KEY,
            'source_desc' : 'SURVEY',
            'sector_desc' : 'CROPS',
            'group_desc' : 'FIELD CROPS',
            'commodity_desc': crop,
            'agg_level_desc' : 'STATE',
            'freq_desc' : 'ANNUAL',
            'statisticcat_desc' : 'YIELD',
            'prodn_practice_desc': 'ALL PRODUCTION PRACTICES',
            'year__GE': '1961',
            'format' : 'JSON'}
    # Explicit class rule for Wheat
    if crop == "WHEAT":
        params["class_desc"] = "ALL CLASSES"

    r = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
    # Show the status and a short preview of the body first, so a failed
    # request can still be diagnosed from the cell output.
    print(r.status_code)
    print(r.text[:1000])
    r.raise_for_status()

    # The JSON payload keeps its records under the top-level "data" key.
    data = r.json()["data"]
    df = pd.DataFrame(data)
    # Tag every row with the crop that was requested in this iteration.
    df["commodity"] = crop

    dfs.append(df)

# One combined frame with a fresh 0..n-1 index across both crops.
df_raw = pd.concat(dfs, ignore_index=True)

200
{"data":[{"domain_desc":"TOTAL","short_desc":"CORN, GRAIN - YIELD, MEASURED IN BU / ACRE","asd_code":"","commodity_desc":"CORN","load_time":"2025-11-14 12:00:00.000","end_code":"00","region_desc":"","county_name":"","statisticcat_desc":"YIELD","domaincat_desc":"NOT SPECIFIED","county_ansi":"","year":2025,"country_name":"UNITED STATES","country_code":"9000","util_practice_desc":"GRAIN","freq_desc":"ANNUAL","state_alpha":"OT","asd_desc":"","sector_desc":"CROPS","prodn_practice_desc":"ALL PRODUCTION PRACTICES","zip_5":"","begin_code":"00","watershed_desc":"","state_ansi":"","state_fips_code":"98","agg_level_desc":"STATE","location_desc":"OTHER STATES","Value":"157.8","group_desc":"FIELD CROPS","congr_district_code":"","state_name":"OTHER STATES","week_ending":"","class_desc":"ALL CLASSES","source_desc":"SURVEY","CV (%)":"","unit_desc":"BU / ACRE","watershed_code":"00000000","county_code":"","reference_period_desc":"YEAR"},{"CV (%)":"","source_desc":"SURVEY","class_desc":"ALL CLASSES",

In [166]:
# Narrow the raw download to full-year, bushel-per-acre yields reported for
# "ALL UTILIZATION PRACTICES", then count repeated (state, year, commodity) keys.
# NOTE(review): the corn record previewed in the download output carries
# util_practice_desc == "GRAIN", so this filter may exclude corn rows - confirm.
is_bu_per_acre = df_raw["unit_desc"].eq("BU / ACRE")
is_all_utilization = df_raw["util_practice_desc"].eq("ALL UTILIZATION PRACTICES")
is_full_year = df_raw["reference_period_desc"].eq("YEAR")

df_clean = df_raw.loc[is_bu_per_acre & is_all_utilization & is_full_year].copy()

# Left as the last expression so the notebook displays the duplicate count.
df_clean.duplicated(subset=["state_alpha", "year", "commodity"]).sum()

np.int64(0)

In [167]:
# Columns carried forward from the raw download; everything else is dropped.
COLS_KEEP = [
    "state_alpha",
    "state_name",
    "year",
    "commodity_desc",
    "unit_desc",
    "Value",
    "reference_period_desc",
    'source_desc',
]

# Raw API column name -> name used in the rest of the notebook.
column_names = {
    "commodity_desc": "crop",
    "unit_desc": "yield_unit",
    "Value": "yield",
}

# Start again from the raw frame (not from an earlier filtered frame), keep
# only the selected columns, and apply the shorter names.
df_clean = df_raw.loc[:, COLS_KEEP].rename(columns=column_names)

In [182]:
# Build `df_final`: one bushel-per-acre yield per (state, year, crop).

# "OT" is the "OTHER STATES" aggregate rather than an individual state, so drop it.
df_clean = df_clean[df_clean["state_alpha"] != "OT"].copy()

# Keep only the rows allowed to compete for a panel slot: bushel-per-acre
# yields reported for the full year or as the November forecast.
# The unit filter must run BEFORE de-duplication; applied afterwards, a row in
# a different unit could win a (state, year, crop) slot and then be removed,
# dropping that key from the panel altogether.
yield_candidates = df_clean[
    df_clean["reference_period_desc"].isin(["YEAR", "YEAR - NOV FORECAST"])
    & (df_clean["yield_unit"] == "BU / ACRE")
].copy()

# "yield" is still the text value returned by the API, and sorting text is
# lexicographic ("98.5" would rank above "157.8"). Rank on a numeric copy
# instead; entries that are not numbers become NaN. The stored "yield" column
# itself is left untouched so downstream readers see the same values as before.
yield_candidates["_yield_num"] = pd.to_numeric(
    yield_candidates["yield"].astype(str).str.replace(",", "", regex=False).str.strip(),
    errors="coerce",
)

# Some full-year yields are 0, in which case the November forecast should be
# used. Sorting ascending and taking the last row per key keeps the largest
# numeric yield, so a YEAR value of 0 loses to a positive forecast.
# na_position="first" stops NaN from being picked over a real number.
# NOTE(review): when both rows are positive this keeps whichever is larger,
# which may be the forecast rather than the final figure - confirm intent.
df_final = (
    yield_candidates
      .sort_values(["state_alpha", "year", "crop", "_yield_num"], na_position="first")
      .groupby(["state_alpha", "year", "crop"], as_index=False)
      .tail(1)
      .drop(columns="_yield_num")
      .reset_index(drop=True)
)

# Panel duplicates should now be 0
print(df_final.duplicated(["state_alpha", "year", "crop"]).sum())

0


# Saving Dataset

In [183]:
# Persist the final yield panel as Parquet, leaving the positional index out.
output_path = "data/processed/crop_yield_annual.parquet"
df_final.to_parquet(output_path, index=False)