In [2]:
# Imports + constants
import os
from dotenv import load_dotenv
import requests
import pandas as pd
import re
from pathlib import Path
import numpy as np
from dateutil import parser as dateparser
import duckdb


# Read the dotenv file from the user's home directory before looking up the
# API keys, so that no credential is hard-coded in the notebook.
load_dotenv(os.path.expanduser("~/.config/mysecrets/env"))
# os.environ[...] raises KeyError right here if the variable is not defined.
NASS_KEY = os.environ["NASS_KEY"]
NASS_BASE = "https://quickstats.nass.usda.gov/api"

AMS_BASE = "https://marsapi.ams.usda.gov/services/v1.2"
AMS_KEY = os.environ["AMS_KEY"]  # KeyError here if AMS_KEY is not defined

# Unit-conversion constants (all weights in pounds)
LB_PER_BUSHEL_SOY = 60.0                # USDA standard weight
LB_PER_METRIC_TON = 2_204.622_621_85    # 1000 kg in lb (1 lb = 0.45359237 kg), rounded
BUSHELS_PER_METRIC_TON_SOY = LB_PER_METRIC_TON / LB_PER_BUSHEL_SOY  # ≈ 36.744

LB_PER_BUSHEL_CORN = 56.0
BUSHELS_PER_METRIC_TON_CORN = LB_PER_METRIC_TON / LB_PER_BUSHEL_CORN  # ≈ 39.369

# Month label -> month number, for labels of the form "JAN" ... "DEC"
MONTH_MAP: dict[str, int] = {
    abbr: number
    for number, abbr in enumerate(
        ("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
         "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"),
        start=1,
    )
}

# Same mapping for labels of the form "FIRST OF JAN" ... "FIRST OF DEC"
MONTH_MAP_2: dict[str, int] = {
    f"FIRST OF {abbr}": number for abbr, number in MONTH_MAP.items()
}

# Root folder for all inputs and outputs. Kept as a plain string because later
# cells build file names with `DATA_FOLDER + "/..."`.
DATA_FOLDER = '/home/akimovh/rockets_feathers/eggs/data' # enter the directory you are working in

# Folder that is scanned for the manually downloaded *.txt reports.
TXT_DATA_DIR = Path(DATA_FOLDER) / "2848_txt"

# Database file that receives every table created in this notebook.
con = duckdb.connect(DATA_FOLDER + "/duckdb/egg_usda.duckdb")
# os.cpu_count() may return None when the count cannot be determined; fall back
# to a single thread instead of sending "PRAGMA threads=None".
con.execute(f"PRAGMA threads={os.cpu_count() or 1}")

<_duckdb.DuckDBPyConnection at 0x6d4371b1af0>

In [3]:
# Some utility functions

def column_filter_format(df:pd.DataFrame, value_name:str) -> pd.DataFrame:
    """Keep the three columns used downstream and give them working names.

    Parameters
    ----------
    df : frame that must contain "year", "reference_period_desc" and "Value".
    value_name : new name for the "Value" column.

    Returns
    -------
    A new frame with columns ``["year", "month", value_name]``, always in that
    order ("reference_period_desc" is renamed to "month").

    Raises
    ------
    KeyError
        If any of the three required columns is absent.
    """
    # An ordered tuple rather than a set: selecting through a set made the
    # output column order depend on string hashing, i.e. vary between runs.
    required_cols = ("year", "reference_period_desc", "Value")
    missing = sorted(set(required_cols) - set(df.columns))
    if missing:
        raise KeyError(f"Expected columns missing from API response: {missing}")
    return df[list(required_cols)].rename(columns={
        "reference_period_desc": "month",
        "Value":                 value_name,
    })

def parse_value(df:pd.DataFrame, value_name:str) -> pd.DataFrame:
    """Convert a text column with thousands separators into numbers.

    Parameters
    ----------
    df : frame containing the column ``value_name``.
    value_name : name of the column to convert.

    Returns
    -------
    A new frame in which ``value_name`` is numeric; commas are removed first
    and anything that still cannot be parsed becomes NaN. The input frame is
    left untouched (the previous version overwrote the caller's column).
    """
    without_commas = (
        df[value_name]
        .astype(str)
        .str.replace(",", "", regex=False)
    )
    # assign() returns a copy and keeps the column at its original position.
    return df.assign(**{value_name: pd.to_numeric(without_commas, errors="coerce")})

def data_filter_format(df:pd.DataFrame, month_map:dict) -> pd.DataFrame:
    """Replace the "year"/"month" columns by a single first-of-month "date".

    Parameters
    ----------
    df : frame with a "year" column and a text "month" column.
    month_map : mapping from upper-cased month label to month number.

    Returns
    -------
    A new frame without "year" and "month", with a datetime "date" column
    (day fixed to 1) appended, sorted by "date" with a fresh 0..n-1 index.
    The input frame is not modified.

    Raises
    ------
    ValueError
        If a month label is not a key of ``month_map``; the message lists the
        offending rows with their original labels.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    out = df.copy()
    out["year"] = pd.to_numeric(out["year"], errors="coerce").astype("Int64")
    out["month"] = out["month"].str.upper().map(month_map)
    unmapped = out["month"].isna()
    if unmapped.any():
        # Report the rows from the untouched input so the bad label is visible
        # (after mapping it would only show up as NaN).
        bad_rows = df.loc[unmapped]
        raise ValueError(f"Unmapped month values:\n{bad_rows}")
    out["month"] = out["month"].astype("Int64")
    out["date"] = pd.to_datetime(dict(year=out["year"], month=out["month"], day=1))
    out = out.drop(['year', 'month'], axis = 1)

    return out.sort_values('date').reset_index(drop=True)

# NASS data

## Feed prices

### Soy prices

In [4]:
# Data extraction: national monthly soybean "PRICE RECEIVED" in $ / BU, 1990 onward
params = dict(
    key=NASS_KEY,
    format="JSON",
    source_desc="SURVEY",
    sector_desc="CROPS",
    group_desc="FIELD CROPS",
    commodity_desc="SOYBEANS",
    statisticcat_desc="PRICE RECEIVED",
    agg_level_desc="NATIONAL",
    freq_desc="MONTHLY",
    unit_desc="$ / BU",
    year__GE=1990,
)

response = requests.get(f"{NASS_BASE}/api_GET/", params=params, timeout=120)
response.raise_for_status()

# A missing or empty "data" list is treated as a query mistake, not as a result.
raw_data = response.json().get("data", [])
if not raw_data:
    raise ValueError("NASS API returned empty dataset — check query parameters")

soy_prices = pd.DataFrame(raw_data)

In [5]:
# Data formatting: keep/rename columns, parse numbers, build the date column,
# then convert the price from $ per bushel to $ per metric ton.
soy_prices = (
    soy_prices
    .pipe(column_filter_format, "cost_sb_us")
    .pipe(parse_value, "cost_sb_us")
    .pipe(data_filter_format, MONTH_MAP)
    .assign(cost_sb_us=lambda d: d["cost_sb_us"] * BUSHELS_PER_METRIC_TON_SOY)
)

In [6]:
# Saving: CSV copy (the frame index is written as the first column, pandas'
# default) plus a DuckDB table that is replaced on every run.
soy_prices.to_csv(f"{DATA_FOLDER}/usda/cost_sb_us_monthly.csv")
con.execute("create or replace table cost_sb_us_monthly as select * from soy_prices")
# Last expression of the cell: list the tables currently in the database.
con.execute('show tables').df()

Unnamed: 0,name
0,cost_sb_us_monthly


### Corn prices

In [7]:
# Data extraction: national monthly corn "PRICE RECEIVED" in $ / BU, 1990 onward
params = dict(
    key=NASS_KEY,
    format="JSON",
    source_desc="SURVEY",
    sector_desc="CROPS",
    group_desc="FIELD CROPS",
    commodity_desc="CORN",
    statisticcat_desc="PRICE RECEIVED",
    agg_level_desc="NATIONAL",
    unit_desc="$ / BU",
    freq_desc="MONTHLY",
    year__GE=1990,
)

response = requests.get(f"{NASS_BASE}/api_GET/", params=params, timeout=120)
response.raise_for_status()

# A missing or empty "data" list is treated as a query mistake, not as a result.
raw_data = response.json().get("data", [])
if not raw_data:
    raise ValueError("NASS API returned empty dataset — check query parameters")

corn_prices = pd.DataFrame(raw_data)

In [8]:
# Data formatting: keep/rename columns, parse numbers, build the date column,
# then convert the price from $ per bushel to $ per metric ton.
corn_prices = (
    corn_prices
    .pipe(column_filter_format, "cost_corn_us")
    .pipe(parse_value, "cost_corn_us")
    .pipe(data_filter_format, MONTH_MAP)
    .assign(cost_corn_us=lambda d: d["cost_corn_us"] * BUSHELS_PER_METRIC_TON_CORN)
)

In [9]:
# Saving: CSV copy (the frame index is written as the first column, pandas'
# default) plus a DuckDB table that is replaced on every run.
corn_prices.to_csv(f"{DATA_FOLDER}/usda/cost_corn_us_monthly.csv")
con.execute("create or replace table cost_corn_us_monthly as select * from corn_prices")
# Last expression of the cell: list the tables currently in the database.
con.execute('show tables').df()

Unnamed: 0,name
0,cost_corn_us_monthly
1,cost_sb_us_monthly


## Poultry data

### Egg production

In [10]:
# Data extraction: national monthly table-egg production (dozen), 2000 onward
params = {
    "key": NASS_KEY,
    "format": "JSON",
    "source_desc": "SURVEY",
    "sector_desc": "ANIMALS & PRODUCTS",
    "group_desc": "POULTRY",
    "commodity_desc": "EGGS",
    "short_desc": "EGGS, TABLE - PRODUCTION, MEASURED IN DOZEN",
    "agg_level_desc": "NATIONAL",
    "freq_desc": "MONTHLY",
    "year__GE": 2000,
}

r = requests.get(f"{NASS_BASE}/api_GET/", params=params, timeout=120)
r.raise_for_status()

egg_prod_doz = pd.DataFrame(r.json().get("data", []))
# Same guard as the crop-price cells: stop here with a clear message instead
# of failing later on missing columns.
if egg_prod_doz.empty:
    raise ValueError("NASS API returned empty dataset — check query parameters")


In [11]:
# Data formatting: keep/rename columns, parse numbers, build the date column.
egg_prod_doz = (
    egg_prod_doz
    .pipe(column_filter_format, "egg_prod_doz")
    .pipe(parse_value, "egg_prod_doz")
    .pipe(data_filter_format, MONTH_MAP)
)

In [12]:
# Saving: CSV copy (the frame index is written as the first column, pandas'
# default) plus a DuckDB table that is replaced on every run.
egg_prod_doz.to_csv(f"{DATA_FOLDER}/usda/egg_prod_doz_monthly.csv")
con.execute("create or replace table egg_prod_doz_monthly as select * from egg_prod_doz")
# Last expression of the cell: list the tables currently in the database.
con.execute('show tables').df()

Unnamed: 0,name
0,cost_corn_us_monthly
1,cost_sb_us_monthly
2,egg_prod_doz_monthly


### Layers inventory

In [13]:
# Data extraction: national inventory of table-egg layers, 2000 onward.
# No freq_desc filter is sent for this series; the formatting cell maps its
# reference periods with MONTH_MAP_2 ("FIRST OF <MON>" labels).
params = {
    "key": NASS_KEY,
    "format": "JSON",
    "source_desc": "SURVEY",
    "sector_desc": "ANIMALS & PRODUCTS",
    "group_desc": "POULTRY",
    "commodity_desc": "CHICKENS",
    "statisticcat_desc": "INVENTORY",
    "class_desc": "LAYERS, TABLE",
    "agg_level_desc": "NATIONAL",
    "year__GE": 2000,
}

r = requests.get(f"{NASS_BASE}/api_GET/", params=params, timeout=120)
r.raise_for_status()

layer_inv = pd.DataFrame(r.json().get("data", []))
# Same guard as the crop-price cells: stop here with a clear message instead
# of failing later on missing columns.
if layer_inv.empty:
    raise ValueError("NASS API returned empty dataset — check query parameters")

In [14]:
# Data formatting: keep/rename columns, parse numbers, build the date column
# (this series uses the "FIRST OF <MON>" labels, hence MONTH_MAP_2).
layer_inv = (
    layer_inv
    .pipe(column_filter_format, "layer_inv")
    .pipe(parse_value, "layer_inv")
    .pipe(data_filter_format, MONTH_MAP_2)
)

In [15]:
# Saving: CSV copy (the frame index is written as the first column, pandas'
# default) plus a DuckDB table that is replaced on every run.
layer_inv.to_csv(f"{DATA_FOLDER}/usda/layer_inv_monthly.csv")
con.execute("create or replace table layer_inv_monthly as select * from layer_inv")
# Last expression of the cell: list the tables currently in the database.
con.execute('show tables').df()

Unnamed: 0,name
0,cost_corn_us_monthly
1,cost_sb_us_monthly
2,egg_prod_doz_monthly
3,layer_inv_monthly


### Layers loss

In [16]:
# Data extraction: national monthly layer losses ("LOSS, DEATH & RENDERED",
# in head), 2000 onward
params = {
    "key": NASS_KEY,
    "format": "JSON",
    "source_desc": "SURVEY",
    "sector_desc": "ANIMALS & PRODUCTS",
    "group_desc": "POULTRY",
    "commodity_desc": "CHICKENS",
    "statisticcat_desc": "LOSS, DEATH & RENDERED",
    "class_desc": "LAYERS",
    "unit_desc": "HEAD",
    "agg_level_desc": "NATIONAL",
    "freq_desc": "MONTHLY",
    "year__GE": 2000,
}

r = requests.get(f"{NASS_BASE}/api_GET/", params=params, timeout=120)
r.raise_for_status()

loss_dth_render = pd.DataFrame(r.json().get("data", []))
# Same guard as the crop-price cells: stop here with a clear message instead
# of failing later on missing columns.
if loss_dth_render.empty:
    raise ValueError("NASS API returned empty dataset — check query parameters")

In [17]:
# Data formatting: keep/rename columns, parse numbers, build the date column.
loss_dth_render = (
    loss_dth_render
    .pipe(column_filter_format, "loss_dth_render")
    .pipe(parse_value, "loss_dth_render")
    .pipe(data_filter_format, MONTH_MAP)
)


In [18]:
# Saving: CSV copy (the frame index is written as the first column, pandas'
# default) plus a DuckDB table that is replaced on every run.
loss_dth_render.to_csv(f"{DATA_FOLDER}/usda/loss_dth_render_monthly.csv")
con.execute("create or replace table loss_dth_render_monthly as select * from loss_dth_render")
# Last expression of the cell: list the tables currently in the database.
con.execute('show tables').df()

Unnamed: 0,name
0,cost_corn_us_monthly
1,cost_sb_us_monthly
2,egg_prod_doz_monthly
3,layer_inv_monthly
4,loss_dth_render_monthly


# AMS data

## Egg prices

### Old data (in txt format, downloaded manually from MMN archive)

In [19]:
# Parsing txt files

# Pattern for the report header line, which has the shape
#   "Washington, DC  <Weekday>. <Mon> <day>, <year>  USDA Market News".
# The named group `date` captures the "<Weekday>. <Mon> <day>, <year>" part.
HEADER_RE = re.compile(
    r"^Washington,\s*DC\s+(?P<date>(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\.\s+"
    r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4})\s+"
    r"USDA\s+Market\s+News\s*$"
)

def parse_one_file(fp: Path) -> pd.DataFrame:
    """Parse one archived text report into one row per region.

    Parameters
    ----------
    fp : path of the text file (read as latin-1).

    Returns
    -------
    DataFrame with columns ``date``, ``region``, ``ex_large``, ``large`` and
    ``medium``; ``date`` is the normalized date from the header line and is the
    same for every row. A table without data rows yields an empty frame.

    Raises
    ------
    ValueError
        If the header line or the "REGIONS" table header is missing, or if the
        last three tokens of a table row are not numbers. (Previously a missing
        header surfaced as a message-less StopIteration.)
    """
    lines = fp.read_text(encoding="latin-1", errors="replace").splitlines()

    # 1) date from the first line that matches the header pattern
    header = next(
        (m for m in (HEADER_RE.match(line.strip()) for line in lines) if m),
        None,
    )
    if header is None:
        raise ValueError(f"{fp.name}: report header line with the date not found")
    date = pd.to_datetime(header.group("date"), format="%a. %b %d, %Y").normalize()

    # 2) the table body starts on the line after the one beginning with "REGIONS"
    table_header = next(
        (i for i, line in enumerate(lines) if line.strip().startswith("REGIONS")),
        None,
    )
    if table_header is None:
        raise ValueError(f"{fp.name}: 'REGIONS' table header not found")

    # 3) collect rows until a footer marker is reached
    rows = []
    for line in lines[table_header + 1:]:
        s = line.strip()
        if not s:
            continue
        if s.startswith(("Computed", "Source:", "Prepared:")):
            break

        parts = s.split()
        if len(parts) < 4:
            # fewer than a region name plus three prices: not a data row
            continue

        # The region name may contain spaces; the prices are the last three tokens.
        region = " ".join(parts[:-3])
        try:
            ex_large, large, medium = map(float, parts[-3:])
        except ValueError as exc:
            raise ValueError(f"{fp.name}: non-numeric price in row {s!r}") from exc

        rows.append(
            {"date": date, "region": region, "ex_large": ex_large, "large": large, "medium": medium}
        )

    return pd.DataFrame(rows)

# Parse every *.txt file in the folder; a file that fails is recorded and
# skipped so that one bad report does not stop the whole run.
files = sorted(p for p in TXT_DATA_DIR.iterdir() if p.is_file() and p.suffix.lower() == ".txt")

dfs = []
errors = []

for fp in files:
    try:
        dfs.append(parse_one_file(fp))
    except Exception as e:
        # Include the exception type: some exceptions carry no message at all.
        errors.append((fp.name, f"{type(e).__name__}: {e}"))

df = pd.concat(dfs, ignore_index=True).sort_values(["date", "region"]).reset_index(drop=True)

print("Rows:", len(df))
print("Files with errors:", len(errors))
if errors:
    # display() is required here: a bare expression inside an `if` block is
    # not rendered as cell output.
    display(pd.DataFrame(errors, columns=["file", "error"]).head(20))



Rows: 1890
Files with errors: 0


In [20]:
# Some reports were published on a day other than Friday; their dates are
# moved so that all dates are exactly one week apart.
df["date"] = pd.to_datetime(df["date"]).dt.normalize()

# Weekly offset anchored on Friday (weekday=4); read by week_end_friday below.
fri = pd.offsets.Week(weekday=4)  # Friday

# Snap a publication date to the closest Friday
# (e.g. Mon -> previous Fri; Thu -> next Fri; Fri -> unchanged)
def week_end_friday(d: pd.Timestamp) -> pd.Timestamp:
    """Return the Friday nearest to ``d``, using the module-level offset ``fri``."""
    before = fri.rollback(d)       # Friday on or before d
    after = fri.rollforward(d)     # Friday on or after d
    # On equal distance the earlier Friday is returned.
    if (d - before) <= (after - d):
        return before
    return after

# Canonical week-ending Friday for every report date
df["week_fri"] = df["date"].apply(week_end_friday)

# ---- checks on canonical week index ----
dup = df.duplicated(["week_fri", "region"]).sum()
print("Duplicate (week_fri, region) rows:", dup)

dates = pd.DatetimeIndex(sorted(df["week_fri"].unique()))
expected = pd.date_range(dates.min(), dates.max(), freq="W-FRI")

# Fridays without a report, and canonical dates that are not Fridays
missing = expected.difference(dates)
extra = dates.difference(expected)

print("Canonical range:", dates.min().date(), "→", dates.max().date())
print("Unique canonical weeks:", len(dates))
print("Expected Fridays:", len(expected))
print("Missing weeks:", len(missing))
print("Extra weeks:", len(extra))

# show the ones that moved
moved = (
    df.loc[df["date"] != df["week_fri"], ["date", "week_fri"]]
      .drop_duplicates()
      .sort_values("date")
)
display(moved)

# From here on `date` holds the canonical Friday.
df['date'] = df["week_fri"]
df = df.drop("week_fri", axis = 1).copy()
region_map = {
    "COMBINED REGIONAL": "National",
    "MIDWEST": "Midwest",
    "NORTHEAST": "Northeast",
    "SOUTH CENTRAL": "South Central",
    "SOUTHEAST": "Southeast",
}

# replace() instead of map(): labels that are not keys of region_map are left
# as they are. With map() they became NaN, so running this cell a second time
# (when the labels are already renamed) wiped the whole column.
df["region"] = df["region"].replace(region_map)


Duplicate (week_fri, region) rows: 0
Canonical range: 2017-11-03 → 2025-01-24
Unique canonical weeks: 378
Expected Fridays: 378
Missing weeks: 0
Extra weeks: 0


Unnamed: 0,date,week_fri
5,2017-11-09,2017-11-10
390,2019-05-06,2019-05-03
465,2019-08-19,2019-08-16
695,2020-07-02,2020-07-03
825,2020-12-31,2021-01-01
1080,2021-12-23,2021-12-24
1085,2021-12-30,2021-12-31
1310,2022-11-10,2022-11-11


### New Data (now in pdf, but I get JSON through API)

In [21]:
# Downloading the report through the AMS API for the window start..today

SLUG = 2848
# Start of the request window; the text-file data parsed above ends on
# 2025-01-24 according to the printed canonical range.
start = "01/20/2025"
end = pd.Timestamp.today().strftime("%m/%d/%Y")

r = requests.get(
    f"{AMS_BASE}/reports/{SLUG}",
    params={"q": f"report_begin_date={start}:{end}", "allSections": "true"},
    auth=(AMS_KEY, ""),  # key sent as the user name with an empty password
    timeout=120,
)
# Status and URL are printed before raise_for_status() so they are visible
# even when the request failed.
print("status:", r.status_code)
print("url:", r.url)
r.raise_for_status()

rep_range = r.json()

# NOTE(review): the section is picked by position (fifth element of the
# response) — presumably the one holding the price rows; confirm against the
# API response, since the order of sections is not checked here.
pdf_df = pd.DataFrame(rep_range[4]['results'])


status: 200
url: https://marsapi.ams.usda.gov/services/v1.2/reports/2848?q=report_begin_date%3D01%2F20%2F2025%3A01%2F24%2F2026&allSections=true


In [22]:
# Filtering: keep the three egg classes for "Delivered Warehouse" and reshape
# to one row per (report date, region) with one price column per class.

pdf_df = pdf_df.query(
    "`class` in ['Extra Large', 'Large', 'Medium'] and delivery == 'Delivered Warehouse'"
)
pdf_df = pdf_df[['report_begin_date', 'class', 'region', 'avg_price']]

fri = pd.offsets.Week(weekday=4)  # Friday

pdf_df = (
    pdf_df
    .pivot(
        index=["report_begin_date", "region"],
        columns="class",
        values="avg_price",
    )
    .reset_index()
    # pivot() leaves the name "class" on the columns axis; drop it.
    .rename_axis(columns=None)
    # Rename by label instead of copying `df.columns` positionally: the old
    # form depended on the pivot's column order and failed with a length
    # mismatch as soon as `df` had gained extra columns (e.g. on a re-run).
    .rename(columns={
        "report_begin_date": "date",
        "Extra Large": "ex_large",
        "Large": "large",
        "Medium": "medium",
    })
    .assign(
        # parse once, strip the time part, then move to the Friday on/after
        date=lambda d: (
            pd.to_datetime(d["date"], format="%m/%d/%Y")
            .dt.normalize()
            .map(fri.rollforward)  # Sunday->Friday, Friday->Friday
        )
    )
    # raises KeyError if one of the three classes is absent from the response
    .astype({"ex_large": float, "large": float, "medium": float})
    .loc[:, ["date", "region", "ex_large", "large", "medium"]]
)



In [23]:
# Connecting old (txt) and new (API) data.
# Both sources can cover the same week, so de-duplicate on the key
# (date, region) rather than on whole rows, and keep the result in `df` too:
# the monthly aggregation reads `df`, and a week present twice would be
# double-weighted there. keep="last" prefers the API row because `pdf_df`
# comes second in the concat.
df = (
    pd.concat([df, pdf_df])
    .drop_duplicates(subset=["date", "region"], keep="last")
    .sort_values(["date", "region"])
)
wholesale_price = df.copy()

In [24]:
# Saving: CSV copy (the frame index is written as the first column, pandas'
# default) plus a DuckDB table that is replaced on every run.
wholesale_price.to_csv(f"{DATA_FOLDER}/usda/wholesale_price_weekly.csv")
con.execute("create or replace table wholesale_price_weekly as select * from wholesale_price")
# Last expression of the cell: list the tables currently in the database.
con.execute('show tables').df()

Unnamed: 0,name
0,cost_corn_us_monthly
1,cost_sb_us_monthly
2,egg_prod_doz_monthly
3,layer_inv_monthly
4,loss_dth_render_monthly
5,wholesale_price_weekly


In [25]:
# Aggregating weekly data to monthly
# `date` was moved to Fridays in the cells above, so the 7-day window defined
# here runs from the preceding Saturday through that Friday.
df["week_start"] = df["date"] - pd.Timedelta(days=6)  # six days before `date`
df["week_end"]   = df["date"] # the `date` itself (last day of the window)

def expand_week_to_months(row):
    """Expand one weekly row into one row per calendar day of its window.

    ``row`` must expose ``week_start``, ``week_end``, ``ex_large``, ``large``,
    ``medium`` and ``region``. Every day carries the week's prices and region,
    plus ``month`` = first day of the month that day falls in.
    """
    days = pd.date_range(start=row.week_start, end=row.week_end, freq="D")
    month_starts = days.to_period("M").to_timestamp()
    daily = {
        "date": days,
        "month": month_starts,
        "price_ex_large": row.ex_large,
        "price_large": row.large,
        "price_medium": row.medium,
        "region": row.region,
    }
    return pd.DataFrame(daily)

# One row per (week, day): a week that straddles two months contributes to
# each month in proportion to the number of its days in that month.
expanded_df = pd.concat(
    list(df.apply(expand_week_to_months, axis=1)),
    ignore_index=True,
)

# Day-weighted monthly mean per region; `month` is exposed as `date`.
monthly_df = (
    expanded_df
    .groupby(["month", "region"], as_index=False)
    .mean()
    .loc[:, ['month', 'region', 'price_ex_large', 'price_large', 'price_medium']]
    .rename(columns={"month": "date"})
)


In [26]:
# Saving: CSV copy (the frame index is written as the first column, pandas'
# default) plus a DuckDB table that is replaced on every run.
monthly_df.to_csv(f"{DATA_FOLDER}/usda/wholesale_price_monthly.csv")
con.execute("create or replace table wholesale_price_monthly as select * from monthly_df")
# Last expression of the cell: list the tables currently in the database.
con.execute('show tables').df()

Unnamed: 0,name
0,cost_corn_us_monthly
1,cost_sb_us_monthly
2,egg_prod_doz_monthly
3,layer_inv_monthly
4,loss_dth_render_monthly
5,wholesale_price_monthly
6,wholesale_price_weekly


# Creating one monthly df

The merged table is restricted to January 2018 through September 2025.

In [27]:
from functools import reduce

# Frames to combine; only the national rows of the wholesale prices are used.
dfs = [
    soy_prices,
    corn_prices,
    egg_prod_doz,
    layer_inv,
    loss_dth_render,
    monthly_df[monthly_df["region"] == "National"],
]

# Chain inner joins on `date` from left to right: a month survives only if it
# is present in every frame.
df = reduce(
    lambda merged, nxt: merged.merge(nxt, on="date", how="inner"),
    dfs,
)

# Keep 2018-01-01 <= date < 2025-10-01.
df = df.loc[lambda d: (d["date"] >= "2018-01-01") & (d["date"] < "2025-10-01")]

In [28]:
# Saving: CSV copy (the frame index is written as the first column, pandas'
# default) plus a DuckDB table that is replaced on every run.
df.to_csv(f"{DATA_FOLDER}/usda/egg_stat_monthly.csv")
con.execute("create or replace table egg_stat_monthly as select * from df")
# Last expression of the cell: list the tables currently in the database.
con.execute('show tables').df()

Unnamed: 0,name
0,cost_corn_us_monthly
1,cost_sb_us_monthly
2,egg_prod_doz_monthly
3,egg_stat_monthly
4,layer_inv_monthly
5,loss_dth_render_monthly
6,wholesale_price_monthly
7,wholesale_price_weekly
