## Ingestion
This notebook downloads the 2008 and 2025 USCIS civics test question PDFs and parses them into question-and-answer pairs, saved as JSON.

In [93]:
import os
import re
import json
import requests
from PyPDF2 import PdfReader


In [94]:

# Step 1: Download the PDF
# Source documents: the official USCIS question-and-answer PDFs for each test edition.
url_2008 = "https://www.uscis.gov/sites/default/files/document/questions-and-answers/100q.pdf"
url_2025 = "https://www.uscis.gov/sites/default/files/document/questions-and-answers/2025-Civics-Test-128-Questions-and-Answers.pdf"

# One record per test edition. "test_type" doubles as the base name of the
# files written by the cells below; "url" is where the PDF is fetched from.
tests = [
    {"test_type": test_type, "url": source_url}
    for test_type, source_url in (
        ("2008_civics_test", url_2008),
        ("2025_civics_test", url_2025),
    )
]

# Fetch each test PDF and store it next to the notebook as "<test_type>.pdf".
for test in tests:
    url = test["url"]
    filename = test["test_type"]+".pdf"

    # Download and save.
    # The timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of writing an HTML error page to disk as a
    # ".pdf", which would only surface later as a confusing PDF parsing error.
    response.raise_for_status()
    with open(filename, "wb") as f:
        f.write(response.content)

    print(f"PDF downloaded and saved as {filename}")



PDF downloaded and saved as 2008_civics_test.pdf
PDF downloaded and saved as 2025_civics_test.pdf


In [95]:
# Characters that introduce an answer line in the extracted PDF text.
bullet_chars = ("•", "▪")

# For each downloaded PDF: extract its text, split it into numbered question
# blocks, and write the resulting question/answer pairs to
# "<test_type>_qa_pairs.json". Tests are processed in reverse list order.
for test in reversed(tests):
    filename = test["test_type"]+".pdf"
    # Step 2: Parse text from the PDF
    reader = PdfReader(filename)

    # Concatenate the text of every page, one trailing newline per page.
    # Pages for which extract_text() returns nothing are skipped.
    page_texts = [page.extract_text() for page in reader.pages]
    all_text = "".join(text + "\n" for text in page_texts if text)

    # Clean up a bit: tabs become spaces, then every run of spaces collapses
    # to a single space (a plain .replace("  ", " ") only halves long runs).
    all_text = re.sub(r" {2,}", " ", all_text.replace("\t", " "))

    # Split the text into question blocks on every "<number>. " marker.
    # NOTE: the pattern is not anchored to the start of a line, so such a
    # marker is matched wherever it occurs in the text.
    blocks = re.split(r"\n?\s*\d+\.\s+", all_text)

    qa_pairs = []

    for block in blocks[1:]: # we skip the first block since it is just intro data
        block = block.strip()
        if not block:
            continue

        # First line is the question, the rest are candidate answer lines.
        # NOTE(review): only the first line is kept as the question and only
        # bullet lines are kept as answers, so text that the PDF extraction
        # wraps onto a continuation line is dropped - verify against the JSON.
        lines = [line.strip() for line in block.splitlines()]
        question = lines[0]

        # Keep lines starting with • or ▪, remove the bullet and extra spaces
        answers = [
            line[1:].strip()
            for line in lines[1:]
            if line.startswith(bullet_chars)
        ]

        qa_pairs.append({
            "question": question,
            "answers": answers
        })

    # Save to JSON (ensure_ascii=False keeps non-ASCII characters readable)
    savefile = test["test_type"]+"_qa_pairs.json"
    with open(savefile, "w", encoding="utf-8") as f:
        json.dump(qa_pairs, f, ensure_ascii=False, indent=2)

    print(f"Extracted {len(qa_pairs)} QnAs into {savefile}")


Extracted 128 QnAs into 2025_civics_test_qa_pairs.json
Extracted 100 QnAs into 2008_civics_test_qa_pairs.json
