In [None]:
# ---------------- Imports ----------------
import requests
import json
import os
import sys
import datetime

from time import sleep

import pickle
import pandas as pd
import yaml

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright



In [None]:
# ---------------- Config ----------------
# Read the project-wide YAML configuration (path is relative to the working directory).
with open("../../../config/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Root of the project's data tree, taken from the `paths.proj_store` config entry.
data_folder = os.path.join(config["paths"]["proj_store"], "data")

# Everything this script reads or writes lives under this Oyez-specific folder.
data_location = f'{data_folder}/raw_data/machine_collected/oyez_supreme_court'



In [None]:
# ---------------- Setup ----------------
# Create every output directory up front: saved term pages, transcripts, case lists.
for output_subdir in ('html_years', 'data/transcripts', 'data/case_list'):
    os.makedirs(os.path.join(data_location, output_subdir), exist_ok=True)

class case_extractor:
    """Build the case list for one Oyez term and download its transcripts.

    On construction the instance loads two optional inputs from
    ``data_location``: the saved term page
    (``html_years/<year>-<year+1> Term _ Oyez.html``) and a previously
    generated case list (``data/case_list/<year>_list.csv``). Either may be
    absent, in which case ``year_soup`` / ``df`` is ``None`` and the
    corresponding method prints a message and returns without doing work.
    """

    # Seconds to wait on a single HTTP request. Without a timeout a stalled
    # connection would block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, year):
        """Load the saved term page and any existing case list for ``year``.

        Args:
            year: First calendar year of the term (int or numeric string).
        """
        self.year = str(year)
        self.next_year = str(int(year) + 1)

        # Load the HTML for case list scraping
        html_path = os.path.join(
            data_location, 'html_years', f'{self.year}-{self.next_year} Term _ Oyez.html'
        )
        try:
            with open(html_path, 'r', encoding="UTF-8") as f:
                self.year_soup = BeautifulSoup(f.read(), 'html.parser')
        except FileNotFoundError:
            self.year_soup = None
            print(f'HTML file for case list of {self.year} is missing.')

        # Load or initialize the case list DataFrame. An empty CSV (e.g. left
        # behind by an interrupted run) is treated like a missing one.
        try:
            self.df = pd.read_csv(self._case_list_path())
        except (FileNotFoundError, pd.errors.EmptyDataError):
            self.df = None

    def _case_list_path(self):
        """Return the CSV path where this term's case list is stored."""
        return os.path.join(data_location, 'data/case_list', f'{self.year}_list.csv')

    @staticmethod
    def _split_case_link(link):
        """Return ``(year, case_num)`` from a case link, or ``None`` if malformed.

        The last two non-empty path segments are used, so a trailing slash
        does not shift the result. Non-string values (such as NaN from an
        empty CSV cell) yield ``None``.
        """
        if not isinstance(link, str):
            return None
        segments = [segment for segment in link.split('/') if segment]
        if len(segments) < 2:
            return None
        return segments[-2], segments[-1]

    def _fetch_json(self, url):
        """GET ``url`` and return the decoded JSON body.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx status.
            ValueError: If the body is not valid JSON.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail on HTTP errors so an error payload is never stored as data.
        response.raise_for_status()
        return response.json()

    def get_case_list(self):
        """Extract (title, link) pairs from the term page and save them as CSV.

        Sets ``self.df`` and writes the CSV only when at least one case is
        found. Does nothing but print a message when the term page is missing.
        """
        if self.year_soup is None:
            print(f'Cannot generate case list for {self.year}. HTML file is missing.')
            return

        case_list_soup = self.year_soup.find('ul', attrs={'class': "index ng-scope"})
        case_items = case_list_soup.find_all('li') if case_list_soup else []

        case_list = []
        for item in case_items:
            link = item.find('a')
            # Skip list items with no anchor, and anchors that carry no href.
            if link is not None and link.get('href'):
                case_list.append((link.text.strip(), link['href']))

        if case_list:
            self.df = pd.DataFrame(case_list, columns=['case_title', 'link'])
            self.df.to_csv(self._case_list_path(), index=False)
            print(f'Saved case list for {self.year}.')
        else:
            print(f'No cases found for {self.year}.')

    def download_transcripts(self):
        """Download every oral-argument transcript for the cases in ``self.df``.

        For each case the Oyez API is queried for metadata; each entry under
        ``oral_argument_audio`` is then fetched and written to
        ``data/transcripts/<year>/<case_num>_<index>.json``. Failures for a
        single case or transcript are printed and skipped so the remaining
        downloads still run.
        """
        if self.df is None:
            print(f'No case list found for {self.year}. Skipping transcript download.')
            return

        for _, row in self.df.iterrows():
            parsed_link = self._split_case_link(row['link'])
            if parsed_link is None:
                print(f"Skipping malformed case link: {row['link']!r}")
                continue
            year, case_num = parsed_link

            transcript_folder = os.path.join(data_location, 'data/transcripts', year)
            os.makedirs(transcript_folder, exist_ok=True)

            case_link = f'https://api.oyez.org/cases/{year}/{case_num}?labels=true'
            try:
                resp_json = self._fetch_json(case_link)
            except (requests.RequestException, ValueError) as e:
                print(f"Failed to fetch metadata for case {case_num}: {e}")
                continue

            # Cases without recorded oral argument have nothing to download.
            audio_items = resp_json.get('oral_argument_audio') if isinstance(resp_json, dict) else None
            if not audio_items:
                continue

            case_ids = [str(audio['id']) for audio in audio_items]
            for count, case_id in enumerate(case_ids):
                transcript_url = f'https://api.oyez.org/case_media/oral_argument_audio/{case_id}'
                print(f'Downloading transcript: {transcript_url}')
                # Pause between requests to stay polite towards the API.
                sleep(3)

                try:
                    # Fetch fully before opening the file so a failed request
                    # never leaves a truncated transcript on disk.
                    transcript_json = self._fetch_json(transcript_url)
                    transcript_path = os.path.join(transcript_folder, f'{case_num}_{count}.json')
                    with open(transcript_path, 'w', encoding='utf-8') as f:
                        json.dump(transcript_json, f, indent=4)
                except (requests.RequestException, ValueError, OSError) as e:
                    print(f"Failed to download transcript for case {case_num}, ID {case_id}: {e}")


async def download_htmls(start_year, end_year):
    """Save the rendered Oyez term page for each year in ``[start_year, end_year)``.

    Each page is loaded in headless Chromium, given time to reach network
    idle, and written to ``html_years/<year>-<year+1> Term _ Oyez.html``
    under ``data_location``. A failure for one year is printed and does not
    stop the remaining years.

    Args:
        start_year: First term year to download (inclusive).
        end_year: Year at which to stop (exclusive).
    """
    html_folder = os.path.join(data_location, 'html_years')

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            for year in range(start_year, end_year):
                next_year = year + 1
                print(f"Downloading HTML for {year}-{next_year}...")
                html_url = f'https://www.oyez.org/cases/{year}'
                try:
                    os.makedirs(html_folder, exist_ok=True)

                    page = await browser.new_page()
                    try:
                        await page.goto(html_url)
                        await page.wait_for_load_state('networkidle')
                        html = await page.content()
                    finally:
                        # Close the page on every path; otherwise one page per
                        # year stays open for the lifetime of the browser.
                        await page.close()

                    # Write only after the content was obtained, so a failed
                    # render cannot truncate a previously saved file.
                    with open(os.path.join(html_folder, f'{year}-{next_year} Term _ Oyez.html'), 'w', encoding='utf-8') as f:
                        f.write(html)
                    print(f"Saved HTML for {year}-{next_year}.")
                except Exception as e:
                    # Per-year best effort: report and move on to the next term.
                    print(f"Failed to download HTML for {year}-{next_year}: {e}")
        finally:
            await browser.close()



In [None]:
# ---------------- Main ----------------
async def main():
    """Download term pages from 2014 up to (not including) the current year,
    then build each term's case list if it is not on disk yet and fetch its
    transcripts."""
    first_year = 2014
    stop_year = datetime.datetime.now().year

    await download_htmls(first_year, stop_year)

    case_list_dir = os.path.join(data_location, 'data/case_list')
    for term_year in range(first_year, stop_year):
        print(f"Processing year: {term_year}")
        extractor = case_extractor(term_year)

        # Only scrape the term page when no CSV exists for this term yet.
        csv_path = os.path.join(case_list_dir, f'{term_year}_list.csv')
        csv_missing = not os.path.exists(csv_path)
        if csv_missing:
            print(f"Generating case list for {term_year}.")
            extractor.get_case_list()

        extractor.download_transcripts()


# Entry point: run the full download pipeline.
# NOTE(review): a bare top-level `await` is only accepted by IPython/Jupyter,
# which this file appears to be exported from. In a plain Python script it is
# a SyntaxError; there the coroutine would have to be started with
# `asyncio.run(main())` instead — confirm how this file is meant to be run.
if __name__ == '__main__':
    await main()



