**PROGRAM OVERVIEW**

Use this program to perform the following steps in sequence:

1. **Retrieve** a set of PDFs from an API endpoint.
2. **Optionally subset** the list of PDFs if only certain documents are needed.
3. **Extract text** from the selected PDFs and **save** the output to a text file.

**LIBRARIES AND PACKAGES**

In [1]:
import requests # type: ignore
import os
from PyPDF2 import PdfReader # type: ignore
from datetime import datetime

Fetch the **PDF LINKS** from the API **ENDPOINT** and store them in a **PYTHON LIST**

In [None]:
# Set the API endpoint
#
# This cell queries the USAspending agency endpoint, collects every PDF link
# found under the 'def_codes' entries into `pdf_list`, and prints the result.
# `url` and `pdf_list` are reused by the download/extraction cell below.

#url = "https://api.usaspending.gov/api/v2/agency/012/?fiscal_year=2023"
url = "https://api.usaspending.gov/api/v2/agency/012/?fiscal_year=2024"

# Send a request to the API. The timeout stops a stalled connection from
# hanging the notebook indefinitely.
response = requests.get(url, timeout=30)

# Check if the response is successful
if response.status_code == 200:
    try:
        data = response.json()  # Parse the JSON response
        pdf_list = []

        # Extract 'def_codes' if present in the response
        def_codes = data.get('def_codes', [])
        if def_codes:
            # Iterate through 'def_codes' and extract 'urls'
            for code in def_codes:
                urls = code.get('urls')
                if not urls:  # Skip entries whose 'urls' is missing, None or empty
                    continue
                # A single string may hold several links separated by '|'
                if isinstance(urls, str):
                    pdf_list.extend(urls.split('|'))
                # If 'urls' is a list, split each element the same way.
                # The loop variable must not be called `url`: that would
                # overwrite the API endpoint the next cell requests again.
                elif isinstance(urls, list):
                    for entry in urls:
                        pdf_list.extend(entry.split('|'))

            # Display the consolidated list of PDF URLs
            if pdf_list:
                print("List of PDF URLs:")
                for pdf_url in pdf_list:
                    print(pdf_url)
            else:
                print("No PDF URLs found in the 'def_codes' key.")
        else:
            print("'def_codes' key not found in the response.")
    except ValueError:
        # response.json() raises ValueError when the body is not valid JSON
        print("Error: Unable to parse JSON response.")
else:
    print("Response content:", response.text)
    print(f"Failed to fetch data from API. Status code: {response.status_code}")

List of PDF URLs:
http://www.govinfo.gov/content/pkg/PLAW-117publ58/pdf/PLAW-117publ58.pdf
http://www.govinfo.gov/content/pkg/PLAW-117publ103/pdf/PLAW-117publ103.pdf
http://www.govinfo.gov/content/pkg/PLAW-115publ56/pdf/PLAW-115publ56.pdf
http://www.govinfo.gov/content/pkg/PLAW-117publ328/pdf/PLAW-117publ328.pdf
http://www.govinfo.gov/content/pkg/PLAW-117publ328/pdf/PLAW-117publ328.pdf
http://www.govinfo.gov/content/pkg/PLAW-115publ123/pdf/PLAW-115publ123.pdf
http://www.govinfo.gov/content/pkg/PLAW-116publ20/pdf/PLAW-116publ20.pdf
http://www.govinfo.gov/content/pkg/PLAW-116publ94/pdf/PLAW-116publ94.pdf
http://www.govinfo.gov/content/pkg/PLAW-116publ127/pdf/PLAW-116publ127.pdf
http://www.govinfo.gov/content/pkg/PLAW-116publ136/pdf/PLAW-116publ136.pdf
http://www.govinfo.gov/content/pkg/PLAW-116publ136/pdf/PLAW-116publ136.pdf
http://www.govinfo.gov/content/pkg/PLAW-116publ139/pdf/PLAW-116publ139.pdf
http://www.govinfo.gov/content/pkg/PLAW-116publ260/pdf/PLAW-116publ260.pdf
http://www.govi

**CONSOLIDATED TEXT FILE**

Read the list of PDF URLs, download each PDF to a local directory, and then extract the text from each PDF. Finally, compile the extracted text into a single consolidated text file.

In [None]:
# If you want to subset the PDFs from API EndPoint, review and update the following list
#
# This cell re-queries the API endpoint (`url` from the previous cell), downloads
# every PDF whose link is present in `pdf_list` into a timestamped folder, and
# appends the text of each page to one consolidated UTF-8 text file.

# pdf_list = [
#      "http://www.govinfo.gov/content/pkg/PLAW-117publ58/pdf/PLAW-117publ58.pdf"
#  ]
#pdf_list = ["http://www.govinfo.gov/content/pkg/PLAW-115publ56/pdf/PLAW-115publ56.pdf"]

# Get the current date and timestamp
current_datetime = datetime.now().strftime('%Y%m%d_%H%M%S')

# Folder to save PDFs with date and timestamp
pdf_folder = f'USSP_DOWNLOAD_{current_datetime}' #change the prefix as per your requirement
os.makedirs(pdf_folder, exist_ok=True)  # Create the folder if it doesn't exist

# Text file to store extracted content with date and timestamp in the path. Change the directory as per your requirement.
output_text_file = f'/Users/arnabraychaudhari/Documents/6317/Project_LLM_and_RAG_2024_GWU/{pdf_folder}/USSP_Consolidated_{current_datetime}.txt'

os.makedirs(os.path.dirname(output_text_file), exist_ok=True)

# Build the lookup set once so each membership test below is O(1)
allowed_pdfs = set(pdf_list)

# Fetch data from API (the timeout prevents an indefinite hang)
response = requests.get(url, timeout=30)

# Work out which 'def_codes' entries to process; stays empty on any failure
def_codes = []
if response.status_code == 200:
    try:
        data = response.json()  # Parse the JSON data from the API response
    except ValueError:
        # Same handling as the link-listing cell: report instead of crashing
        data = None
        print("Error: Unable to parse JSON response.")
    if data is not None:
        # Extract the "def_codes" key containing the PDF URLs
        if 'def_codes' in data:
            def_codes = data['def_codes']
        else:
            print("No def_codes or URLs found in the API response.")
else:
    print(f"Failed to retrieve data from API. Status code: {response.status_code}")

# Open the text file in write mode. An explicit UTF-8 encoding is required
# because extracted PDF text routinely contains non-ASCII characters that the
# platform default encoding may be unable to represent.
with open(output_text_file, 'w', encoding='utf-8') as text_file:
    for code in def_codes:
        urls = code.get('urls')
        if not urls:  # 'urls' missing, None or empty
            print(f"No valid 'urls' found in def_codes entry: {code}")
            continue

        # 'urls' may be one '|'-separated string or a list of such strings
        raw_entries = [urls] if isinstance(urls, str) else urls
        pdf_urls = []
        for entry in raw_entries:
            pdf_urls.extend(entry.split('|'))

        for i, pdf_url in enumerate(pdf_urls):
            # Ensure the URL starts with http or https
            if not pdf_url.startswith('http'):
                print(f"Invalid URL: {pdf_url}")
                continue

            # Check if the PDF URL is in the predefined list
            if pdf_url not in allowed_pdfs:
                print(f"Skipping PDF not in the list: {pdf_url}")
                continue

            pdf_name = f"{pdf_folder}/{code['code']}_document_{i+1}.pdf"  # Save PDFs with a name based on the code and index

            # Step 1: download and save. Best effort: a failure on one PDF
            # is reported and the loop moves on to the next link.
            try:
                pdf_response = requests.get(pdf_url, timeout=60)
                if pdf_response.status_code != 200:
                    print(f"Failed to download {pdf_url}, Status Code: {pdf_response.status_code}")
                    continue
                # Write the PDF content to the file
                with open(pdf_name, 'wb') as pdf_file:
                    pdf_file.write(pdf_response.content)
                print(f"Downloaded PDF: {pdf_name}")
            except Exception as e:
                print(f"Error downloading {pdf_url}: {e}")
                continue

            # Step 2: extract text. Kept in its own try so that a parsing
            # problem is not misreported as a download error.
            try:
                with open(pdf_name, 'rb') as file:
                    reader = PdfReader(file)
                    for page_num, page in enumerate(reader.pages, start=1):
                        text = page.extract_text()
                        if text:  # Only write if text was extracted
                            text_file.write(f"\n\n[Extract from {pdf_name} - Page {page_num}]\n")
                            text_file.write(text)
            except Exception as e:
                print(f"Error extracting text from {pdf_name}: {e}")

print(f"All PDF content has been written to {output_text_file}.")

Downloaded PDF: USSP_DOWNLOAD_20241031_015055/1_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/3_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/A_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/AAB_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/AAC_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/C_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/E_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/I_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/M_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/N_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/O_document_1.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/O_document_2.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/O_document_3.pdf
Downloaded PDF: USSP_DOWNLOAD_20241031_015055/O_document_4.pdf
No valid 'urls' found in def_codes entry: {'code': 'Q', 'public_law': 'Not Designated Nonemergency/Emergency/Disas