In [1]:
def download_pdfs(pdf_links, folder_name, headers=None, delay=1, timeout=30):
    """Download every PDF in ``pdf_links`` into ``folder_name``.

    Files that already exist in the folder are skipped without a network
    request. A failed download is reported and skipped so that one bad link
    does not abort the rest of the batch.

    The function uses ``os``, ``time`` and ``requests`` from the notebook's
    global namespace, so the import cell must have run before it is called.

    Args:
        pdf_links: Iterable of PDF URLs. The local file name is the last
            ``/``-separated segment of each URL.
        folder_name: Destination directory; created if it does not exist.
        headers: Optional dict of HTTP headers sent with every request.
        delay: Seconds to pause after each download attempt. A falsy value
            disables the pause.
        timeout: Seconds to wait on the server before giving up on a request.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    os.makedirs(folder_name, exist_ok=True)

    for pdf_url in pdf_links:
        pdf_name = pdf_url.split("/")[-1]
        file_path = os.path.join(folder_name, pdf_name)

        if os.path.exists(file_path):
            print(f"Already downloaded: {pdf_name}")
            continue

        # Write to a sibling temp file and rename it afterwards, so an
        # interrupted write never leaves a truncated PDF that a later run
        # would mistake for a finished download.
        tmp_path = file_path + ".part"
        try:
            pdf_res = requests.get(pdf_url, headers=headers, timeout=timeout)
            pdf_res.raise_for_status()
            with open(tmp_path, "wb") as f:
                f.write(pdf_res.content)
            os.replace(tmp_path, file_path)
            print(f"Downloaded: {pdf_name}")
        except Exception as e:
            # Deliberately broad: this is a best-effort batch download.
            print(f"Failed to download {pdf_name}: {e}")
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        # Pause after every request (not once per call) to throttle the
        # request rate against the server.
        if delay:
            time.sleep(delay)

In [2]:
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

# Folders to save PDFs: English-only forms and bilingual (en-xx) forms.
os.makedirs("irs_eng_pdfs", exist_ok=True)
os.makedirs("irs_multilang_pdfs", exist_ok=True)

base_url = "https://www.irs.gov"
start_url = "https://www.irs.gov/downloads/irs-pdf?page={}"

headers = {
    "User-Agent": "Mozilla/5.0"
}

# Number of listing pages to walk (pages 0 to 57).
PAGE_COUNT = 58
# Seconds to wait for a listing page before giving up on it.
REQUEST_TIMEOUT = 30

# A description tagged with a single language code, e.g. "(sp)": the form is
# skipped entirely.
single_language_regex = r"\((sp|SP|ru|RU|vie|VIE|zh-s|ZH-S|zh-t|ZH-T|de|DE|ko|KO|ht|HT)\)+"
# A description tagged "(en-xx)": the form goes to the multi-language folder.
multi_language_regex = r"\(en-(sp|SP|ru|RU|vie|VIE|zh-s|ZH-S|zh-t|ZH-T|de|DE|ko|KO|ht|HT)\)+"


def _pdf_links(link_cells):
    """Return absolute URLs for the first link in each cell, skipping cells without one."""
    anchors = (cell.find("a") for cell in link_cells)
    return [
        urljoin(base_url, anchor["href"])
        for anchor in anchors
        if anchor is not None and anchor.get("href")
    ]


for page_num in range(PAGE_COUNT):
    print(f"\nScraping page {page_num}...")
    url = start_url.format(page_num)

    try:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except Exception as e:
        print(f"Failed to fetch page {page_num}: {e}")
        continue

    soup = BeautifulSoup(res.text, "html.parser")

    # Find all rows of the forms table. If the page layout is not what we
    # expect, skip this page instead of crashing and losing the remaining ones.
    table = soup.find("table", attrs={"class": "table table-striped tablesaw tablesaw-stack cols-4"})
    tbody = table.find("tbody", recursive=False) if table is not None else None
    if tbody is None:
        print(f"No forms table found on page {page_num}; skipping.")
        continue
    form_rows = tbody.find_all("tr", recursive=False)

    english_forms = []
    multilang_forms = []

    # Single pass: drop single-language forms, then split what is left into
    # English-only and bilingual.
    for form in form_rows:
        form_desc_element = form.find("td", attrs={"headers": "view-name-table-column"})
        link_cell = form.find("td", attrs={"headers": "view-uri-table-column"})
        if form_desc_element is None or link_cell is None:
            continue

        # get_text() copes with empty cells and with cells whose first child
        # is a tag rather than a bare string.
        form_desc = form_desc_element.get_text()
        if re.search(single_language_regex, form_desc):
            continue

        if re.search(multi_language_regex, form_desc):
            multilang_forms.append(link_cell)
        else:
            english_forms.append(link_cell)

    eng_pdf_links = _pdf_links(english_forms)
    multilang_pdf_links = _pdf_links(multilang_forms)

    download_pdfs(eng_pdf_links, "irs_eng_pdfs")
    download_pdfs(multilang_pdf_links, "irs_multilang_pdfs")

print("\nDone downloading all PDFs.")



Scraping page 0...
Downloaded: f1098.pdf
Downloaded: f1099h.pdf
Downloaded: i1099sb.pdf
Downloaded: i1097btc.pdf
Downloaded: p5883.pdf
Downloaded: p6081.pdf
Downloaded: f1098q.pdf
Downloaded: f1099sa.pdf
Downloaded: p4190.pdf
Downloaded: f1099a.pdf
Downloaded: i1099ptr.pdf
Downloaded: i1098et.pdf
Downloaded: f15674.pdf
Downloaded: i1098q.pdf
Downloaded: i1099qa.pdf
Downloaded: f1099c.pdf
Downloaded: i1099ac.pdf
Downloaded: i1098f.pdf
Downloaded: p5878.pdf
Downloaded: p16.pdf
Downloaded: i1099s.pdf
Downloaded: i1099h.pdf
Downloaded: i1099cap.pdf
Downloaded: p6072.pdf
Downloaded: f1099qa.pdf
Downloaded: i720.pdf
Downloaded: f1099ptr.pdf
Downloaded: f3922.pdf
Downloaded: p590b.pdf
Downloaded: f1099ltc.pdf
Downloaded: p6058.pdf
Downloaded: p907.pdf
Downloaded: f1099ls.pdf
Downloaded: f1098e.pdf
Downloaded: i1099ltc.pdf
Downloaded: p6054.pdf
Downloaded: p505.pdf
Downloaded: p560.pdf
Downloaded: p6059.pdf
Downloaded: f1098t.pdf
Downloaded: p962esp.pdf

Scraping page 1...
Downloaded: p519.pd