In [2]:
def download_pdfs(pdf_links, folder_name):
    # Download each PDF
    for pdf_url in pdf_links:
        pdf_name = pdf_url.split("/")[-1]
        file_path = os.path.join(folder_name, pdf_name)

        if os.path.exists(file_path):
            print(f"✅ Already downloaded: {pdf_name}")
            continue

        try:
            pdf_res = requests.get(pdf_url)
            pdf_res.raise_for_status()
            with open(file_path, "wb") as f:
                f.write(pdf_res.content)
            print(f"📥 Downloaded: {pdf_name}")
        except Exception as e:
            print(f"❌ Failed to download {pdf_name}: {e}")

    # Optional: Delay to be polite
    time.sleep(1)

In [None]:
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

# Folder to save PDFs
os.makedirs("irs_eng_pdfs", exist_ok=True)
os.makedirs("irs_multilang_pdfs", exist_ok=True)

base_url = "https://www.irs.gov"
start_url = "https://www.irs.gov/downloads/irs-pdf?page={}"

headers = {
    "User-Agent": "Mozilla/5.0"
}

for page_num in range(58):  # Pages 0 to 57
    print(f"\n🔍 Scraping page {page_num}...")
    url = start_url.format(page_num)

    try:
        res = requests.get(url, headers=headers)
        res.raise_for_status()
    except Exception as e:
        print(f"❌ Failed to fetch page {page_num}: {e}")
        continue

    soup = BeautifulSoup(res.text, "html.parser")

    #Find all elements from the table
    table = soup.find("table", attrs={"class": "table table-striped tablesaw tablesaw-stack cols-4"})

    tbody = table.findChild("tbody", recursive=False)
    form_rows = tbody.findChildren("tr", recursive=False)

    single_language_regex = r"\((sp|ru|vie|zh-s|zh-t|de|ko|ht)\)+"
    filtered_forms = []
    for form in form_rows:
            form_desc_element = form.findChild("td", attrs={"headers": "view-name-table-column"})
            form_desc = form_desc_element.contents[0]
            if not bool(re.search(single_language_regex, form_desc)):
                filtered_forms.append(form)

    multi_language_regex = r"\(en-(sp|ru|vie|zh-s|zh-t|de|ko|ht)\)+"

    english_forms = []
    multilang_forms = []

    for form in filtered_forms:
            form_desc_element = form.findChild("td", attrs={"headers": "view-name-table-column"})
            form_desc = form_desc_element.contents[0]
            if not bool(re.search(single_language_regex, form_desc)):
                english_forms.append(form.findChild("td", attrs={"headers": "view-uri-table-column"}))
            else:
                multilang_forms.append(form.findChild("td", attrs={"headers": "view-uri-table-column"}))

    eng_pdf_links = [
            urljoin(base_url, form.findChild("a")["href"])
            for form in english_forms
        ]

    multilang_pdf_links = [
            urljoin(base_url, form.findChild("a")["href"])
            for form in multilang_forms
        ]

download_pdfs(eng_pdf_links, "irs_eng_pdfs")
download_pdfs(multilang_pdf_links, "irs_multilang_pdfs")

print("\n🎉 Done downloading all PDFs.")



🔍 Scraping page 0...

🔍 Scraping page 1...


  tbody = table.findChild("tbody", recursive=False)
  form_rows = tbody.findChildren("tr", recursive=False)
  form_desc_element = form.findChild("td", attrs={"headers": "view-name-table-column"})
  form_desc_element = form.findChild("td", attrs={"headers": "view-name-table-column"})
  english_forms.append(form.findChild("td", attrs={"headers": "view-uri-table-column"}))
  urljoin(base_url, form.findChild("a")["href"])



🔍 Scraping page 2...

🔍 Scraping page 3...

🔍 Scraping page 4...

🔍 Scraping page 5...

🔍 Scraping page 6...

🔍 Scraping page 7...

🔍 Scraping page 8...

🔍 Scraping page 9...

🔍 Scraping page 10...

🔍 Scraping page 11...

🔍 Scraping page 12...

🔍 Scraping page 13...

🔍 Scraping page 14...

🔍 Scraping page 15...

🔍 Scraping page 16...

🔍 Scraping page 17...

🔍 Scraping page 18...

🔍 Scraping page 19...

🔍 Scraping page 20...

🔍 Scraping page 21...

🔍 Scraping page 22...

🔍 Scraping page 23...

🔍 Scraping page 24...

🔍 Scraping page 25...

🔍 Scraping page 26...

🔍 Scraping page 27...

🔍 Scraping page 28...

🔍 Scraping page 29...

🔍 Scraping page 30...

🔍 Scraping page 31...

🔍 Scraping page 32...

🔍 Scraping page 33...

🔍 Scraping page 34...

🔍 Scraping page 35...

🔍 Scraping page 36...

🔍 Scraping page 37...

🔍 Scraping page 38...

🔍 Scraping page 39...

🔍 Scraping page 40...

🔍 Scraping page 41...

🔍 Scraping page 42...

🔍 Scraping page 43...

🔍 Scraping page 44...

🔍 Scraping page 45

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import shutil

# Copy folder to a new location in your Google Drive
shutil.copytree('irs_all_pdfs', '/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/irs_all_pdfs')


'/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/irs_all_pdfs'

In [None]:
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Folder to save PDFs
os.makedirs("irs_all_pdfs", exist_ok=True)

base_url = "https://www.irs.gov"
start_url = "https://www.irs.gov/downloads/irs-pdf?page={}"

headers = {
    "User-Agent": "Mozilla/5.0"
}

for page_num in range(58):  # Pages 0 to 57
    print(f"\n🔍 Scraping page {page_num}...")
    url = start_url.format(page_num)

    try:
        res = requests.get(url, headers=headers)
        res.raise_for_status()
    except Exception as e:
        print(f"❌ Failed to fetch page {page_num}: {e}")
        continue

    soup = BeautifulSoup(res.text, "html.parser")

    # Find all <a> tags with href ending in .pdf
    links = soup.find_all("a", href=True)

print(links[0])


🔍 Scraping page 0...

🔍 Scraping page 1...

🔍 Scraping page 2...

🔍 Scraping page 3...

🔍 Scraping page 4...

🔍 Scraping page 5...

🔍 Scraping page 6...

🔍 Scraping page 7...

🔍 Scraping page 8...

🔍 Scraping page 9...

🔍 Scraping page 10...

🔍 Scraping page 11...

🔍 Scraping page 12...

🔍 Scraping page 13...

🔍 Scraping page 14...

🔍 Scraping page 15...

🔍 Scraping page 16...

🔍 Scraping page 17...

🔍 Scraping page 18...

🔍 Scraping page 19...

🔍 Scraping page 20...

🔍 Scraping page 21...

🔍 Scraping page 22...

🔍 Scraping page 23...

🔍 Scraping page 24...

🔍 Scraping page 25...

🔍 Scraping page 26...

🔍 Scraping page 27...

🔍 Scraping page 28...

🔍 Scraping page 29...

🔍 Scraping page 30...

🔍 Scraping page 31...

🔍 Scraping page 32...

🔍 Scraping page 33...

🔍 Scraping page 34...

🔍 Scraping page 35...

🔍 Scraping page 36...

🔍 Scraping page 37...

🔍 Scraping page 38...

🔍 Scraping page 39...

🔍 Scraping page 40...

🔍 Scraping page 41...

🔍 Scraping page 42...

🔍 Scraping page 43..