In [82]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

import re
import sys
import os

sys.path.append(os.path.abspath("../../")) #vscode import 

from settings.settings import USDA_NASS_API_KEY

Export sales

In [None]:
# Fetch the export sales page and flatten its first HTML table into a CSV file.
response = requests.get('https://apps.fas.usda.gov/export-sales/h107.htm')
# Fail loudly on an HTTP error instead of parsing an error page as if it were the report.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")

table = soup.find('table')
if table is None:
    # Without this guard the failure would be an opaque AttributeError on None below.
    raise ValueError('No <table> element found in the export sales page')

# One list of stripped cell texts per <tr>; only <td> cells are collected.
rows = []
for tr in table.find_all("tr"):
    cells = [td.get_text(strip=True) for td in tr.find_all("td")]
    rows.append(cells)

# Column names pair the cells of rows 1 and 2 of the table. A pair that was
# already seen gets an "NMY" prefix so the name stays distinct
# (NOTE(review): presumably "next marketing year" - confirm against the report layout).
cols = []
for i, j in zip(rows[1], rows[2]):
    if f"{i} {j}" not in cols:
        cols.append(f"{i} {j}")
    else:
        cols.append(f"NMY {i} {j}")

# Data rows start at row 4; dropna() removes rows with missing cells
# (for example rows shorter than the header).
data = rows[4:]
df = pd.DataFrame(data, columns=cols)
df = df.dropna()
df.to_csv('../../data/US_export_sales/export_sales.csv', index=False)

Crop progress / condition

In [None]:
def download_data(category, class_desc='WINTER'):
    """Download national wheat survey records from the USDA NASS Quick Stats API.

    Parameters
    ----------
    category : str
        Value sent as ``statisticcat_desc`` (the notebook uses ``'CONDITION'``
        and ``'PROGRESS'``).
    class_desc : str, optional
        Value sent as ``class_desc``. Defaults to ``'WINTER'``, the value that
        used to be hard-coded, so one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per item of the ``data`` list in the JSON response.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    KeyError
        If the JSON payload has no ``data`` entry.
    """
    endpoint = "https://quickstats.nass.usda.gov/api/api_GET/"
    # Let requests build and URL-encode the query string (values such as
    # "FIELD CROPS" contain spaces) instead of concatenating it by hand.
    request_params = {
        'key': USDA_NASS_API_KEY,
        'source_desc': 'SURVEY',
        'sector_desc': 'CROPS',
        'group_desc': 'FIELD CROPS',
        'commodity_desc': 'WHEAT',
        'statisticcat_desc': category,
        'agg_level_desc': 'NATIONAL',
        'class_desc': class_desc,
        'format': 'JSON',
    }

    r = requests.get(endpoint, params=request_params)
    # Surface HTTP failures directly rather than as a confusing KeyError/JSON error below.
    r.raise_for_status()
    return pd.DataFrame(r.json()['data'])

In [None]:
# download_data already restricts the query to the WINTER wheat class, so only
# the statistic category is passed (a second positional argument raised a
# TypeError with the one-argument signature).
conditions = download_data('CONDITION')
progress = download_data('PROGRESS')

conditions.to_csv('../../data/crop_progress/conditions.csv', index=False)
progress.to_csv('../../data/crop_progress/progress.csv', index=False)

Grain Stocks

In [356]:
def format_grain_stocks_txt(data):
    """Parse the plain-text body of a "Grain Stocks" report into a long table.

    The function reads the table of contents to get the titles of its first
    two entries, locates the table that sits between those two titles in the
    body of the report, and turns every dated data line of that table into one
    output row per year column.

    Parameters
    ----------
    data : str
        Full text of the report.

    Returns
    -------
    pandas.DataFrame
        Columns ``Commodity``, ``Date``, ``On``, ``Off``, ``Totalall`` and
        ``Year``. An empty, column-less DataFrame is returned (and
        ``'No contents table'`` is printed) when the text has no usable
        ``Contents`` section.

    Raises
    ------
    ValueError
        If the ``Information Contacts`` entry or one of the table titles
        cannot be found, or if a value in a data line is not an integer.
    IndexError
        If the located table is too short to contain a year line.
    """
    # Normalise line endings, drop empty lines and trim surrounding whitespace.
    lines = [line.strip() for line in data.replace('\r', '').split('\n') if line]

    # The two lines after "Contents" give the titles of the tables of interest;
    # the dotted leader and page number after the title are cut off.
    try:
        content_start = lines.index('Contents')
        title_domestic = lines[content_start + 1].split('.')[0]
        title_mt = lines[content_start + 2].split('.')[0]
    except (ValueError, IndexError):
        # ValueError: no "Contents" line; IndexError: nothing after it.
        print('No contents table')
        return pd.DataFrame({})

    # Everything from the "Information Contacts" contents entry onwards is the
    # report body, where the table titles appear again as standalone lines.
    content_end = [line.split('.')[0] for line in lines].index('Information Contacts')
    tables = lines[content_end:]
    title_domestic_idx = tables.index(title_domestic)
    title_mt_idx = tables.index(title_mt)

    # Keep only the first table, starting at its first '---' ruler line.
    stocks = tables[title_domestic_idx:title_mt_idx]
    table_start = next((i for i, item in enumerate(stocks) if item.startswith('---')), None)
    stocks = stocks[table_start:]

    # Fixed layout: line 1 holds the years, data starts at line 8 and the last
    # three lines of the table are not data.
    year_line = stocks[1]
    stocks_data = stocks[8:-3]
    year_list = year_line.replace(' ', '').split(':')[1:]

    date_pattern = re.compile(r'(.+?)\s+((?:March|June|September|December)\s+\d+.*)')

    # A line with nothing after its first colon names the commodity for the
    # dated lines that follow; a line with values after the colon is a data line.
    commodity = None
    result = []
    for entry in stocks_data:
        fields = entry.split(':')
        if len(fields) < 2:
            # No colon at all: neither a commodity header nor a data line.
            continue
        label, values = fields[0], fields[1]

        if not values:
            if len(label) >= 2:
                commodity = label.strip()
            continue

        # Prefix the current commodity, then strip the dotted leader.
        name_part = f'{commodity} {label}'.strip().replace('.', '')

        # Separate product and date
        match = date_pattern.match(name_part)
        if match:
            product = match.group(1).strip()
            date = match.group(2).strip()
        else:
            product = name_part
            date = ''

        # Extract numbers (thousands separators removed)
        numbers = [int(num.replace(',', '')) for num in values.strip().split()]
        result.append([product, date] + numbers)

    # Each data line carries three values per year; emit one row per year.
    rows = []
    for row in result:
        product, date = row[0], row[1]
        if len(row) >= 5:
            rows.append([product, date, row[2], row[3], row[4], year_list[0]])
        if len(row) >= 8:
            rows.append([product, date, row[5], row[6], row[7], year_list[1]])

    return pd.DataFrame(rows, columns=['Commodity', 'Date', 'On', 'Off', 'Totalall', 'Year'])

In [397]:
endpoint = 'https://esmis.nal.usda.gov'

# Month-end dates every three months; December dates are pushed one month
# later before the listing pages are queried.
report_dates = pd.date_range(start=datetime(2010, 6, 1), end=datetime(2026, 1, 1), freq='3ME')
shifted_dates = report_dates + pd.DateOffset(months=1)
report_dates = report_dates.where(report_dates.month != 12, shifted_dates)
# One report was rescheduled, so its date is added by hand.
rescheduled = pd.DatetimeIndex(['2019-02-01'])
report_dates = sorted(report_dates.append(rescheduled))

# Maps the publication date (YYYY-MM-DD) to the href of the first .txt link
# published in the requested month.
urls = {}
for date in report_dates:
    report_year_month = date.strftime("%Y-%m")

    html = requests.get(f'{endpoint}/publication/grain-stocks?date={report_year_month}').text
    soup = BeautifulSoup(html, "html.parser")

    for link in soup.find_all('a'):
        # Only anchors that contain a <time> element are considered.
        if not link.time:
            continue
        published = datetime.fromisoformat(link.time['datetime'].replace("Z", "+00:00"))
        if published.strftime("%Y-%m") != report_year_month:
            continue
        if not link['href'].endswith('.txt'):
            continue
        urls[published.strftime("%Y-%m-%d")] = link['href']
        break

In [402]:
# Download each report found above, parse it and tag its rows with the
# publication date before stacking everything into a single CSV.
full_df = []
for report_date, url in urls.items():
    report_text = requests.get(f'{endpoint}{url}').text
    report_df = format_grain_stocks_txt(report_text)
    report_df['Report_date'] = report_date
    full_df.append(report_df)

grain_stocks = pd.concat(full_df).reset_index(drop=True)
grain_stocks.to_csv('../../data/grain_stocks/grain_stocks.csv', index=False)

No contents table


River Water Levels

In [None]:
# Human-readable site label -> identifier sent as `monitoring_location_id`
# by download_rivers. The label is later written to the `location` column.
location_dict = {
    "Mississippi River at St. Louis, MO": "USGS-07010000",
    "Ohio River at Cincinnati, OH": "USGS-03255000",
    "Illinois River at Meredosia, IL": "USGS-05585500",
    "Ohio River at Louisville, KY": "USGS-03294500",
}

In [None]:
def download_rivers(location_id):
    """Download the daily records of one monitoring location from the USGS water data API.

    Parameters
    ----------
    location_id : str
        Value sent as ``monitoring_location_id`` (e.g. ``"USGS-07010000"``).

    Returns
    -------
    pandas.DataFrame
        One row per feature of the response, built from each feature's
        ``properties`` mapping.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    KeyError
        If the JSON payload has no ``features`` entry.

    Notes
    -----
    A single page of at most 50000 items is requested; no further pages are
    fetched.
    """
    url = "https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items"
    params = {
        'f': 'json',
        'lang': 'en-US',
        'limit': 50000,
        'skipGeometry': 'false',
        'sortby': 'time',
        'offset': 0,
        'monitoring_location_id': location_id,
        # NOTE(review): presumably the gage height parameter, given the output file name - confirm.
        'parameter_code': '00065',
    }
    r = requests.get(url, params=params)
    # Surface HTTP failures directly instead of as a KeyError on 'features'.
    r.raise_for_status()
    features = r.json()['features']
    return pd.DataFrame([feature['properties'] for feature in features])

In [100]:
# Fetch every configured site, label its rows with the site name and stack
# the results into one CSV.
data_list = []
for location_name, location_id in location_dict.items():
    site_df = download_rivers(location_id)
    site_df['location'] = location_name
    data_list.append(site_df)

rivers = pd.concat(data_list)
rivers.to_csv('../../data/rivers/rivers_gage_height.csv', index=False)