In [14]:
import pandas as pd

In [15]:
scotus_df = pd.read_csv('../SCDB_2024_01_caseCentered_Citation.csv')

In [19]:
scotus_df['usCite'].value_counts().head(10)

usCite
347 U.S. 909     3
352 U.S. 1020    2
352 U.S. 1027    2
348 U.S. 956     2
352 U.S. 862     2
349 U.S. 901     2
346 U.S. 932     2
415 U.S. 289     2
346 U.S. 906     2
350 U.S. 869     2
Name: count, dtype: int64

In [20]:
#https://tile.loc.gov/storage-services/service/ll/usrep/usrep467/usrep467649/usrep467649.pdf
#https://tile.loc.gov/storage-services/service/ll/usrep/usrep{}/usrep{}{}/usrep{}{}.pdf.format(volume,volume,item,volume,item)
# scotus_df['usCite'].split(' ')[0] is volume, [2] is item
# skip NAs in usCite
scotus_df['loc_url'] = scotus_df['usCite'].apply(lambda x: f"https://tile.loc.gov/storage-services/service/ll/usrep/usrep{x.split(' ')[0]}/usrep{x.split(' ')[0]}{x.split(' ')[2]}/usrep{x.split(' ')[0]}{x.split(' ')[2]}.pdf" if pd.notna(x) else None)
# format for https://www.loc.gov/item/usrep{}{}
scotus_df['loc_details'] = scotus_df['usCite'].apply(lambda x: f"https://www.loc.gov/item/usrep{x.split(' ')[0]}{x.split(' ')[2]}" if pd.notna(x) else None)
#

In [21]:
scotus_df[['loc_url', 'loc_details', 'usCite','docket','caseName']][1000:1025]

Unnamed: 0,loc_url,loc_details,usCite,docket,caseName
1000,https://tile.loc.gov/storage-services/service/...,https://www.loc.gov/item/usrep35011,350 U.S. 11,3,"UNITED STATES ex rel. TOTH v. QUARLES, SECRETA..."
1001,https://tile.loc.gov/storage-services/service/...,https://www.loc.gov/item/usrep35046,350 U.S. 46,20,CORN PRODUCTS REFINING CO. v. COMMISSIONER OF ...
1002,https://tile.loc.gov/storage-services/service/...,https://www.loc.gov/item/usrep35055,350 U.S. 55,26,"UNITED STATES v. ANDERSON, CLAYTON & CO."
1003,https://tile.loc.gov/storage-services/service/...,https://www.loc.gov/item/usrep35061,350 U.S. 61,8,"INDIAN TOWING CO., INC., et al. v. UNITED STATES"
1004,https://tile.loc.gov/storage-services/service/...,https://www.loc.gov/item/usrep35077,350 U.S. 77,28,"NEESE, ADMINISTRATOR, v. SOUTHERN RAILWAY CO."
1005,https://tile.loc.gov/storage-services/service/...,https://www.loc.gov/item/usrep35079,350 U.S. 79,71,AFFRONTI v. UNITED STATES
1006,https://tile.loc.gov/storage-services/service/...,https://www.loc.gov/item/usrep35085,350 U.S. 85,112,REECE v. GEORGIA
1007,https://tile.loc.gov/storage-services/service/...,https://www.loc.gov/item/usrep35091,350 U.S. 91,32,MICHEL v. LOUISIANA
1008,https://tile.loc.gov/storage-services/service/...,https://www.loc.gov/item/usrep350107,350 U.S. 107,27,NATIONAL LABOR RELATIONS BOARD v. WARREN COMPA...
1009,https://tile.loc.gov/storage-services/service/...,https://www.loc.gov/item/usrep350114,350 U.S. 114,10 ORIG,ARIZONA v. CALIFORNIA et al.


In [30]:
print(scotus_df['loc_url'].dropna().sample(1).values[0])
print(scotus_df['loc_url'].dropna().values[0])

https://tile.loc.gov/storage-services/service/ll/usrep/usrep536/usrep536214/usrep536214.pdf
https://tile.loc.gov/storage-services/service/ll/usrep/usrep329/usrep3291/usrep3291.pdf


# loop through to try and download all pdfs from LOC.gov

- proceed only for educational purposes!
- review terms and use of loc.gov

## Loc.gov collections by volume 

https://www.loc.gov/collections/united-states-reports/about-this-collection/united-states-reports-by-volume/

In [None]:
# Enhanced downloader with TQDM progress bar and hidden print output
import pandas as pd
import requests
import os
import time
from tqdm import tqdm
output_folder = '../scotus_pdfs/'
os.makedirs(output_folder, exist_ok=True)

def format_usrep_url(usCite):
    if pd.isna(usCite):
        return None
    parts = usCite.split(' ')
    if len(parts) < 3:
        return None
    volume = parts[0]
    item = parts[2].zfill(3)  # Pad item to 3 digits
    return f'https://tile.loc.gov/storage-services/service/ll/usrep/usrep{volume}/usrep{volume}{item}/usrep{volume}{item}.pdf'

scotus_df['loc_url'] = scotus_df['usCite'].apply(format_usrep_url)

skip_files = set(['usrep336198.pdf','usrep338304.pdf'])
urls = [row['loc_url'] for _, row in scotus_df.iterrows() if pd.notna(row['loc_url'])]
total_files = len(urls)
already_downloaded = sum(os.path.exists(os.path.join(output_folder, url.split('/')[-1])) for url in urls)
# print(f'Total URLs: {total_files}, Already downloaded: {already_downloaded}')

downloaded_this_run = 0

for row in tqdm(scotus_df.itertuples(), total=len(scotus_df), desc='Downloading PDFs'):
    pdf_url = getattr(row, 'loc_url', None)
    if pdf_url and pdf_url.split('/')[-1] in skip_files:
        # print(f'Skipping {pdf_url}, in skip list.')
        continue
    if pd.notna(pdf_url):
        filename = pdf_url.split('/')[-1]  # usrepVVVNNN.pdf
        pdf_path = os.path.join(output_folder, filename)
        if os.path.exists(pdf_path):
            continue
        try:
            # print(f'Trying to download: {pdf_url}')
            response = requests.get(pdf_url)
            if response.status_code == 200:
                with open(pdf_path, 'wb') as f:
                    f.write(response.content)
                downloaded_this_run += 1
                # print(f'Downloaded {pdf_path}')
            else:
                # print(f'Failed to download {pdf_url}')
                pass
            time.sleep(3)  # Wait 3 seconds between requests
        except Exception as e:
            # print(f'Error downloading {pdf_url}: {e}')
            pass
# print(f'Newly downloaded this run: {downloaded_this_run}')

Downloading PDFs:  42%|████▏     | 3866/9277 [3:47:35<11:13:59,  7.47s/it]  

# still trying to find other refs

- seeing that volume 334 has
  - `usrep334statement`
  - `usrep334inmemoriam`
  - `usrep334decisionspercuriam`
