In [82]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

import re
import sys
import os

sys.path.append(os.path.abspath("../../")) #vscode import 

from settings.settings import USDA_NASS_API_KEY

Export sales

In [None]:
# Scrape the first HTML table of the USDA FAS export-sales page and save it as CSV.
# A timeout is set because requests waits indefinitely by default, and the status
# is checked so an HTTP error page is never parsed as if it were the report.
response = requests.get('https://apps.fas.usda.gov/export-sales/h107.htm', timeout=60)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")

table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found on the export-sales page; the layout may have changed.")

# One list of cell texts per <tr>; rows without <td> cells become empty lists.
rows = [
    [td.get_text(strip=True) for td in tr.find_all("td")]
    for tr in table.find_all("tr")
]

# Column names are the pairwise combination of the second and third table rows.
# When a combined name has already been used, the repeat is prefixed with "NMY"
# (presumably "next marketing year" -- TODO confirm against the page layout).
cols = []
for i, j in zip(rows[1], rows[2]):
    name = f"{i} {j}"
    cols.append(name if name not in cols else f"NMY {name}")

# Data starts at the fifth row; rows padded with missing values are dropped.
data = rows[4:]
df = pd.DataFrame(data, columns=cols)
df = df.dropna()
df.to_csv('../../data/US_export_sales/export_sales.csv', index=False)

Crop progress / condition

In [None]:
def download_data(category, class_desc='WINTER'):
    """Download national-level wheat survey records from the USDA NASS Quick Stats API.

    Parameters
    ----------
    category : str
        Value sent as the ``statisticcat_desc`` filter (the notebook uses
        ``'CONDITION'`` and ``'PROGRESS'``).
    class_desc : str, optional
        Value sent as the ``class_desc`` filter. Defaults to ``'WINTER'``,
        the value that used to be hard-coded in the query string.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the ``data`` array in the JSON response.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    endpoint = "https://quickstats.nass.usda.gov/api/api_GET/"
    # Passing the filters as a dict lets requests URL-encode values such as
    # "FIELD CROPS" instead of relying on a hand-assembled query string.
    request_params = {
        "key": USDA_NASS_API_KEY,
        "source_desc": "SURVEY",
        "sector_desc": "CROPS",
        "group_desc": "FIELD CROPS",
        "commodity_desc": "WHEAT",
        "statisticcat_desc": category,
        "agg_level_desc": "NATIONAL",
        "class_desc": class_desc,
        "format": "JSON",
    }

    # requests has no default timeout; set one so a stalled connection cannot hang the notebook.
    r = requests.get(endpoint, params=request_params, timeout=60)
    r.raise_for_status()
    df = pd.DataFrame(r.json()['data'])
    return df

In [None]:
# download_data is called with the statistic category only; the winter-wheat
# class filter is applied inside download_data itself. Passing 'WINTER' as an
# extra positional argument raised a TypeError against its one-parameter signature.
conditions = download_data('CONDITION')
progress = download_data('PROGRESS')

conditions.to_csv('../../data/crop_progress/conditions.csv', index=False)
progress.to_csv('../../data/crop_progress/progress.csv', index=False)

Grain Stocks

In [79]:
endpoint = 'https://esmis.nal.usda.gov'

# Month-end dates every three months from June 2010; December dates are moved one
# month later (presumably because that report is published in January -- TODO confirm).
quarter_ends = pd.date_range(start=datetime(2010, 6, 1), end=datetime(2026, 1, 1), freq='3ME')
is_december = quarter_ends.month == 12
report_dates = quarter_ends.where(~is_december, quarter_ends + pd.DateOffset(months=1))

# For each report month, keep the first link whose <time> stamp falls in that
# month and whose target is a .txt file.
urls = []
for report_date in report_dates:
    report_year_month = report_date.strftime("%Y-%m")

    page = requests.get(f'{endpoint}/publication/grain-stocks?date={report_year_month}')
    soup = BeautifulSoup(page.text, "html.parser")

    for anchor in soup.find_all('a'):
        if not anchor.time:
            continue  # only links carrying a <time> element are candidates
        published = datetime.fromisoformat(anchor.time['datetime'].replace("Z", "+00:00"))
        if published.strftime("%Y-%m") != report_year_month:
            continue
        if anchor['href'].endswith('.txt'):
            urls.append(anchor['href'])
            break

In [80]:
# Download the text of the first collected report only (same scope as before:
# the previous loop stopped after its first iteration).
# An empty `urls` list now fails loudly instead of leaving `data` bound to
# whatever an earlier cell assigned to it.
if not urls:
    raise RuntimeError("No grain-stocks report URLs were collected; nothing to download.")
url = urls[0]
data = requests.get(f'{endpoint}{url}').text

In [None]:
# Remove carriage returns, split on newlines, and keep every non-empty line
# with its surrounding whitespace trimmed. The emptiness check runs before
# trimming, so whitespace-only lines remain as empty strings.
data = data.replace('\r', '')
splits = data.split('\n')

lines = []
for raw_line in splits:
    if raw_line:
        lines.append(raw_line.strip())

In [229]:
# The two lines following 'Contents' give the titles of the two sections of
# interest; only the text before the first '.' of each line is kept.
toc_start = lines.index('Contents')
title_domestic = lines[toc_start + 1].partition('.')[0]
title_mt = lines[toc_start + 2].partition('.')[0]

# The first line whose text before the first '.' is 'Information Contacts'
# marks where the search for the titles begins.
toc_end = [entry.partition('.')[0] for entry in lines].index('Information Contacts')
tables = lines[toc_end:]

# The block of interest runs from the first title up to (not including) the second.
first_title_pos = tables.index(title_domestic)
second_title_pos = tables.index(title_mt)
stocks = tables[first_title_pos:second_title_pos]

In [325]:
# Fixed positions inside the stocks block: the year line, the column-header
# line, and the data lines (ten leading and three trailing lines are skipped).
year_line, headers = stocks[3], stocks[5]
stocks_data = stocks[10:-3]

# Both lines are ':'-separated once spaces are removed; the first field of the
# year line is discarded.
year_list = year_line.replace(' ', '').split(':')[1:]
cols = ['Commodity'] + headers.replace(' ', '').split(':')

In [287]:
# Walk the table lines and prefix every data line with the commodity it belongs to.
# A line with text before ':' and nothing after it names a new commodity;
# a line with content after ':' is a data line for the current commodity.
# The loop variable is deliberately not called `data`, so the raw report text
# held in `data` is not overwritten and earlier cells stay re-runnable.
commodity = None
filtered_data = []
for stock_line in stocks_data:
    fields = stock_line.split(':')
    if len(fields) < 2:
        # No ':' separator at all: neither a commodity header nor a data line.
        continue
    label, values = fields[0], fields[1]
    if not values and len(label) >= 2:
        commodity = label.strip()
    elif values:
        filtered_data.append(f'{commodity} {stock_line}')

In [289]:
# Splits "<product> <date>" where the date starts with one of the four
# month names followed by digits.
date_pattern = re.compile(r'(.+?)\s+((?:March|June|September|December)\s+\d+.*)')

result = []
for line in filtered_data:
    # Text before the first ':' is the label; the field after it holds the figures.
    parts = line.split(':')
    name_part = parts[0].strip().replace('.', '')

    # Separate the product name from the date; without a match the whole
    # label is treated as the product and the date is left empty.
    match = date_pattern.match(name_part)
    if match is None:
        product, date = name_part, ''
    else:
        product, date = match.group(1).strip(), match.group(2).strip()

    # Whitespace-separated figures, converted to integers after dropping
    # the thousands separators.
    numbers = [int(token.replace(',', '')) for token in parts[1].split()]

    # One record per line: product, date, then the figures.
    result.append([product, date, *numbers])

In [327]:
# Reshape each parsed record into one row per year: the first three figures go
# with the first year of the header, the next three (when present) with the second.
rows = []
for commodity, date, *values in result:
    if len(values) >= 3:
        rows.append([commodity, date, *values[0:3], year_list[0]])
    if len(values) >= 6:
        rows.append([commodity, date, *values[3:6], year_list[1]])

# Build the DataFrame
df = pd.DataFrame(rows, columns=['Commodity', 'Date', 'On', 'Off', 'Totalall', 'Year'])


In [328]:
# Display the grain-stocks frame (one row per commodity, date and year).
df

Unnamed: 0,Commodity,Date,On,Off,Totalall,Year
0,Corn,March 1,4085000,2869145,6954145,2009
1,Corn,March 1,4548000,3145787,7693787,2010
2,Corn,June 1,2205400,2056027,4261427,2009
3,Corn,June 1,2131400,2178644,4310044,2010
4,Corn,September 1,607500,1065811,1673311,2009
5,Corn,December 1,7405000,3497460,10902460,2009
6,Sorghum,March 1,32200,173650,205850,2009
7,Sorghum,March 1,23680,151873,175553,2010
8,Sorghum,June 1,12000,90215,102215,2009
9,Sorghum,June 1,10700,77154,87854,2010


River Water Levels

In [None]:
# Human-readable site name -> identifier passed as `monitoring_location_id`
# to the USGS water-data API by download_rivers.
location_dict = {
    "Mississippi River at St. Louis, MO": "USGS-07010000",
    "Ohio River at Cincinnati, OH": "USGS-03255000",
    "Illinois River at Meredosia, IL": "USGS-05585500",
    "Ohio River at Louisville, KY": "USGS-03294500",
}

In [None]:
def download_rivers(location_id, parameter_code="00065", timeout=60):
    """Download daily records for one monitoring location from the USGS water-data API.

    Parameters
    ----------
    location_id : str
        Value sent as ``monitoring_location_id`` (e.g. ``"USGS-07010000"``).
    parameter_code : str, optional
        Value sent as ``parameter_code``. Defaults to ``"00065"``, the code that
        used to be hard-coded (the notebook saves the result as gage height).
    timeout : float, optional
        Seconds to wait for the server before giving up. requests has no
        default timeout, so one is always set here.

    Returns
    -------
    pandas.DataFrame
        One row per feature of the response, built from each feature's
        ``properties`` mapping.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    url = "https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items"
    # NOTE(review): a single request is made with limit=50000 and offset=0;
    # no further pages are requested -- confirm that is enough for every site.
    params = {
        "f": "json",
        "lang": "en-US",
        "limit": 50000,
        "skipGeometry": "false",
        "sortby": "time",
        "offset": 0,
        "monitoring_location_id": location_id,
        "parameter_code": parameter_code,
    }
    r = requests.get(url, params=params, timeout=timeout)
    r.raise_for_status()  # surface HTTP errors instead of failing later on a missing 'features' key
    features = r.json()['features']
    return pd.DataFrame([feature['properties'] for feature in features])

In [100]:
# Download one frame per monitoring site, tag it with the site's readable name,
# and stack all sites into a single table.
data_list = [
    download_rivers(site_id).assign(location=site_name)
    for site_name, site_id in location_dict.items()
]
rivers = pd.concat(data_list)

rivers.to_csv('../../data/rivers/rivers_gage_height.csv', index=False)