## Ingestion
This notebook downloads the latest U.S. citizenship (civics) test PDFs and parses them into structured question-and-answer pairs that are saved as JSON.

In [1]:
import os
import re
import json
import requests
from typing import Dict
from PyPDF2 import PdfReader

### Downloading PDF of the USA civics tests

In [2]:
# Step 1: Download the civics-test PDFs
url_2008 = "https://www.uscis.gov/sites/default/files/document/questions-and-answers/100q.pdf"
url_2025 = "https://www.uscis.gov/sites/default/files/document/questions-and-answers/2025-Civics-Test-128-Questions-and-Answers.pdf"

# One entry per test edition. "test_type" is also used as the base name of
# the downloaded PDF and of the JSON file written at the end of the notebook.
tests = [
    {
        "test_type": "2008_civics_test",
        "url": url_2008
    },
    {
        "test_type": "2025_civics_test",
        "url": url_2025
    }
]

for test in tests:
    url = test["url"]
    filename = test["test_type"]+".pdf"

    # Download and save. The timeout keeps the cell from hanging forever on a
    # stalled connection, and raise_for_status() stops us from silently saving
    # an HTTP error page under a .pdf name.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(filename, "wb") as f:
        f.write(response.content)

    print(f"PDF downloaded and saved as {filename}")



PDF downloaded and saved as 2008_civics_test.pdf
PDF downloaded and saved as 2025_civics_test.pdf


## Functions to populate missing questions
Some questions have no fixed answer in the PDF because they ask about current officeholders; the PDF only points the reader to another website. We replace those placeholder answers with current data.

In [29]:
import os
import ast
from dotenv import load_dotenv
from openai import OpenAI
from datetime import date
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Load environment variables (including OPENAI_API_KEY) from a local .env file
load_dotenv()
# Shared OpenAI client used by ask_gpt(); the key is read from the environment
# rather than being hard-coded in the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [4]:
def ask_gpt(prompt: str) -> dict:
    """
    Send ``prompt`` as a single user message to gpt-4o-mini and return the
    reply parsed as JSON.

    The request asks for a JSON-object response format, and the content of
    the first returned choice is decoded with ``json.loads``.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[user_message],
    )
    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)

In [None]:
def _claim_entity_id(claim: dict):
    """Return the entity ID a claim points to, or None if the claim carries no entity value."""
    value = claim.get("mainsnak", {}).get("datavalue", {}).get("value")
    if isinstance(value, dict):
        return value.get("id")
    return None


def get_officeholder(qid: str) -> str:
    """
    Given a Wikidata entity QID (e.g., President of the US = Q11696),
    return the current officeholder's name.

    Returns:
        - the holder's English label when it can be resolved;
        - the holder's QID when the holder's own entity cannot be fetched
          or has no English label;
        - "Error: <status>" when the first request does not return HTTP 200;
        - "No officeholder found" when no candidate claim is present.
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    headers = {"User-Agent": "CitizenshipTestApp/0.1 (your_email@example.com)"}
    # Timeout so a stalled connection cannot hang the notebook cell forever.
    r = requests.get(url, headers=headers, timeout=30)
    if r.status_code != 200:
        return f"Error: {r.status_code}"

    # The response may not contain an entry keyed by the requested QID;
    # treat that like an entity without claims instead of raising KeyError.
    entity = r.json().get("entities", {}).get(qid)
    if not entity:
        return "No officeholder found"

    claims = entity.get("claims", {})

    # Try different properties that might indicate the current holder
    holder_id = None
    for prop in ("P1308", "P488", "P35"):
        officeholders = claims.get(prop, [])
        if not officeholders:
            continue

        # Walk from the most recent claim backwards; a claim without an
        # end-date qualifier (P582) is taken to be the current holder.
        for holder in reversed(officeholders):
            if "P582" in holder.get("qualifiers", {}):
                continue
            # Claims whose mainsnak has no datavalue yield None here and are
            # skipped instead of raising KeyError.
            holder_id = _claim_entity_id(holder)
            if holder_id:
                break

        # If every claim has an end date, fall back to the last claim that
        # actually points at an entity.
        if not holder_id:
            holder_id = next(
                (cid for cid in map(_claim_entity_id, reversed(officeholders)) if cid),
                None,
            )

        if holder_id:
            break

    if not holder_id:
        return "No officeholder found"

    # Fetch the officeholder's data separately to resolve a readable name
    holder_url = f"https://www.wikidata.org/wiki/Special:EntityData/{holder_id}.json"
    holder_r = requests.get(holder_url, headers=headers, timeout=30)
    if holder_r.status_code != 200:
        return holder_id

    holder_data = holder_r.json()
    holder_name = (
        holder_data.get("entities", {})
        .get(holder_id, {})
        .get("labels", {})
        .get("en", {})
        .get("value")
    )

    return holder_name or holder_id

In [None]:
def get_current_governors() -> list:
    """
    Scrape Wikipedia for current US governors.

    Returns a list of strings in format "STATE: Governor Name". If the page
    request does not return HTTP 200, the list holds a single
    "Error: <status>" string; if no 'wikitable' is found, the list is empty.
    """
    url = "https://en.wikipedia.org/wiki/List_of_current_United_States_governors"
    headers = {"User-Agent": "CitizenshipTestApp/0.1 (your_email@example.com)"}

    # Timeout so a stalled connection cannot hang the notebook cell forever.
    r = requests.get(url, headers=headers, timeout=30)
    if r.status_code != 200:
        return [f"Error: {r.status_code}"]

    soup = BeautifulSoup(r.content, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})
    if not tables:
        return []

    governors = []
    # The first wikitable is taken to be the governors list; the first row is
    # its header and is skipped.
    for row in tables[0].find_all('tr')[1:]:
        cols = row.find_all(['td', 'th'])
        if len(cols) < 3:
            continue

        # Column 0: State (with the "(list)" link text stripped), Column 2: Governor name
        state = cols[0].get_text(strip=True).replace('(list)', '').strip()
        governor = cols[2].get_text(strip=True)

        if state and governor:
            governors.append(f"{state}: {governor}")

    return governors

In [45]:
def get_current_senators() -> list:
    """
    Scrape Wikipedia for current US senators.

    Returns a list of strings in format "STATE: Senator1, Senator2", sorted by
    state name. If the page request does not return HTTP 200, the list holds
    a single "Error: <status>" string; if the page has fewer than five
    'wikitable' tables, the list is empty.
    """
    url = "https://en.wikipedia.org/wiki/List_of_current_United_States_senators"
    headers = {"User-Agent": "CitizenshipTestApp/0.1 (your_email@example.com)"}

    # Timeout so a stalled connection cannot hang the notebook cell forever.
    r = requests.get(url, headers=headers, timeout=30)
    if r.status_code != 200:
        return [f"Error: {r.status_code}"]

    soup = BeautifulSoup(r.content, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})
    senators_by_state = {}

    # The 5th wikitable (index 4) is taken to be the full senator list.
    # NOTE(review): this index depends on the current page layout.
    if len(tables) >= 5:
        current_state = None

        for row in tables[4].find_all('tr')[1:]:  # Skip header row
            cols = row.find_all(['td', 'th'])
            if len(cols) < 3:
                continue

            first_col = cols[0].get_text(strip=True)
            if first_col:
                # Text in the first column: a new state with its first senator
                current_state = first_col
                senator = cols[2].get_text(strip=True)
            else:
                # Empty first column: second senator for the current state
                senator = cols[1].get_text(strip=True)

            if current_state and senator:
                senators_by_state.setdefault(current_state, []).append(senator)

    # Format as a list sorted by state
    return [
        f"{state}: {', '.join(names)}"
        for state, names in sorted(senators_by_state.items())
    ]

In [46]:
# Usage: fetch the current senators and print one "State: Senator1, Senator2"
# line per state as a quick visual check of the scraper.
senators_list = get_current_senators()
for senator_info in senators_list:
    print(senator_info)

Alabama: Tommy Tuberville, Katie Britt
Alaska: Lisa Murkowski, Dan Sullivan
Arizona: Mark Kelly, Ruben Gallego
Arkansas: John Boozman, Tom Cotton
California: Alex Padilla, Adam Schiff
Colorado: Michael Bennet, John Hickenlooper
Connecticut: Richard Blumenthal, Chris Murphy
Delaware: Chris Coons, Lisa Blunt Rochester
Florida: Rick Scott, Ashley Moody
Georgia: Jon Ossoff, Raphael Warnock
Hawaii: Brian Schatz, Mazie Hirono
Idaho: Mike Crapo, Jim Risch
Illinois: Dick Durbin, Tammy Duckworth
Indiana: Todd Young, Jim Banks
Iowa: Chuck Grassley, Joni Ernst
Kansas: Jerry Moran, Roger Marshall
Kentucky: Mitch McConnell, Rand Paul
Louisiana: Bill Cassidy, John Kennedy
Maine: Susan Collins, Angus King
Maryland: Chris Van Hollen, Angela Alsobrooks
Massachusetts: Elizabeth Warren, Ed Markey
Michigan: Gary Peters, Elissa Slotkin
Minnesota: Amy Klobuchar, Tina Smith
Mississippi: Roger Wicker, Cindy Hyde-Smith
Missouri: Josh Hawley, Eric Schmitt
Montana: Steve Daines, Tim Sheehy
Nebraska: Deb Fischer,

In [28]:
# Usage: fetch the current governors and print the raw list of
# "State: Governor Name" strings as a quick visual check of the scraper.
governors_list = get_current_governors()
print(governors_list)

['Alabama: Kay Ivey', 'Alaska: Mike Dunleavy', 'Arizona: Katie Hobbs', 'Arkansas: Sarah Huckabee Sanders', 'California: Gavin Newsom', 'Colorado: Jared Polis', 'Connecticut: Ned Lamont', 'Delaware: Matt Meyer', 'Florida: Ron DeSantis', 'Georgia: Brian Kemp', 'Hawaii: Josh Green', 'Idaho: Brad Little', 'Illinois: JB Pritzker', 'Indiana: Mike Braun', 'Iowa: Kim Reynolds', 'Kansas: Laura Kelly', 'Kentucky: Andy Beshear', 'Louisiana: Jeff Landry', 'Maine: Janet Mills', 'Maryland: Wes Moore', 'Massachusetts: Maura Healey', 'Michigan: Gretchen Whitmer', 'Minnesota: Tim Walz', 'Mississippi: Tate Reeves', 'Missouri: Mike Kehoe', 'Montana: Greg Gianforte', 'Nebraska: Jim Pillen', 'Nevada: Joe Lombardo', 'New Hampshire: Kelly Ayotte', 'New Jersey: Phil Murphy', 'New Mexico: Michelle Lujan Grisham', 'New York: Kathy Hochul', 'North Carolina: Josh Stein', 'North Dakota: Kelly Armstrong', 'Ohio: Mike DeWine', 'Oklahoma: Kevin Stitt', 'Oregon: Tina Kotek', 'Pennsylvania: Josh Shapiro', 'Rhode Island

In [None]:
# Example usage: look up the current holder of several federal offices by
# passing each office's Wikidata QID to get_officeholder().
print("President:", get_officeholder("Q11696"))
print("Speaker:", get_officeholder("Q912994"))  # Q912994 replaces the previously used Q11005
print("Vice President:", get_officeholder("Q11699"))
print("Chief Justice:", get_officeholder("Q11201"))


President: Donald Trump
Speaker: Mike Johnson
Vice President: JD Vance
Chief Justice: John Roberts


In [None]:
# Prompt template used by populate_missing_questions(): {today} and {question}
# are filled in with str.format(). Doubled braces ({{ }}) come out as literal
# braces in the formatted prompt, which is how the JSON examples are written.
main_prompt = """
For the given question, return the most recent applicable response as of {today}.

Rules:
- Always return ONLY a valid JSON object of the form:
  {{"answers": ["answer1", "answer2", "..."]}}
- Do not include code fences, explanations, or any text outside of the JSON.
- If the answer is independent of location, put acceptable variants in "answers".
- If the answer is dependent on location, each entry in "answers" must be
  "XX: Answer", where XX is the two-letter state/territory abbreviation.
- Include ALL U.S. states, the District of Columbia, and all U.S. territories
  (PR, GU, AS, VI, MP).
- If a location does not have an applicable official (e.g., DC or territories
  for U.S. Senators), list it explicitly, e.g. "DC: no Senators".
- If you do not know the latest information as of {today}, give the most recent
  information available to you.

Examples:

question: What is the name of the President of the United States now?
response: {{"answers": ["Donald J. Trump", "Donald Trump", "Trump"]}}

question: Who is one of your state’s U.S. Senators now?
response: {{"answers": [
    "AL: Katie Britt", "AL: Tommy Tuberville",
    "AK: Lisa Murkowski", "AK: Dan Sullivan",
    "AZ: Mark Kelly", "AZ: Ruben Gallego",
    "CA: Alex Padilla", "CA: Adam Schiff",
    "DC: no Senators",
    "PR: no Senators", "GU: no Senators",
    "AS: no Senators", "VI: no Senators", "MP: no Senators"
]}}
    
This is the question you are to retrieve an answer to:
<question>
{question}
</question>
"""


In [None]:
def populate_missing_questions(qa_pairs: list) -> list:
    """
    Fill in placeholder answers using gpt-4o-mini (via ask_gpt).

    A pair is treated as a placeholder when its first answer contains
    "answers will vary" or "visit uscis.gov" (case-insensitive). For example,
    this gets the name of the current president, senators, etc.

    Args:
        qa_pairs: list of {"question": str, "answers": list} dicts. The dicts
            are updated in place.

    Returns:
        The same ``qa_pairs`` list. Pairs with no answers are left untouched,
        and a placeholder is kept when the model reply has no usable
        "answers" list.
    """
    # get today's date so the prompt can ask for information "as of" today
    today = date.today()

    # keywords that flag that the answer is a placeholder
    keywords = ["answers will vary", "visit uscis.gov"]

    for qa_pair in qa_pairs:
        answers = qa_pair.get("answers") or []
        # A pair with no parsed answers has nothing to inspect; skip it
        # instead of raising IndexError on answers[0].
        if not answers:
            continue

        first_answer = answers[0].lower()
        if not any(word in first_answer for word in keywords):
            continue

        question = qa_pair["question"]
        print('variable response found, extracting latest answer for...')
        print(f"question: {question}")

        # setup prompt and call chatGPT
        prompt = main_prompt.format(today=today, question=question)
        data = ask_gpt(prompt)

        # Only overwrite the placeholder when the reply has the expected
        # shape; otherwise keep the original text so nothing is lost.
        new_answers = data.get("answers") if isinstance(data, dict) else None
        if isinstance(new_answers, list) and new_answers:
            qa_pair["answers"] = new_answers
        else:
            print(f"no usable answers returned, keeping placeholder for: {question}")

    return qa_pairs

### parsing/cleanup functions
We parse the text of each PDF and produce a cleaned-up list of question/answer pairs, ready to be saved as JSON.

In [None]:
def parse_clean_pdf(filename: str) -> list:
    """
    Read a civics-test PDF and extract its question/answer pairs.

    The text of all pages is concatenated, tabs and runs of spaces are
    collapsed to a single space, and the text is split wherever a number
    followed by a dot and whitespace appears. For each resulting block the
    first line is used as the question (with '*' removed) and every later
    line that starts with a "•" or "▪" bullet becomes an answer.

    Args:
        filename: path of the PDF to read.

    Returns:
        A list of {"question": str, "answers": list of str} dicts, in the
        order the blocks appear in the PDF text.
    """
    # Parse text from the PDF, skipping pages that yield no text
    reader = PdfReader(filename)
    page_texts = (page.extract_text() for page in reader.pages)
    all_text = "".join(text + "\n" for text in page_texts if text)

    # Clean up a bit: tabs become spaces, then every run of spaces is
    # collapsed to one (a plain replace("  ", " ") leaves runs of 3+ behind).
    all_text = re.sub(r" {2,}", " ", all_text.replace("\t", " "))

    # Split into question blocks on "<number>. " markers.
    # NOTE(review): the pattern is not anchored to the start of a line, so a
    # number followed by a dot and whitespace anywhere in the text also
    # starts a new block.
    blocks = re.split(r"\n?\s*\d+\.\s+", all_text)

    qa_pairs = []

    for block in blocks[1:]:  # we skip the first block since it is just intro data
        block = block.strip()
        if not block:
            continue

        # First line is the question; later lines are candidate answers
        lines = block.splitlines()
        question = lines[0].strip().replace('*', '')

        # Keep only lines starting with a supported bullet, dropping the
        # bullet character and surrounding whitespace
        answers = [
            line.strip()[1:].strip()
            for line in lines[1:]
            if line.strip().startswith(("•", "▪"))
        ]

        qa_pairs.append({
            "question": question,
            "answers": answers
        })

    return qa_pairs

In [None]:
def save_to_json(savefile: str, qa_pairs: Dict) ->None:
    """
    Write ``qa_pairs`` to ``savefile`` as JSON.

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are kept as-is rather than escaped.
    """
    with open(savefile, mode="w", encoding="utf-8") as out_file:
        json.dump(qa_pairs, out_file, indent=2, ensure_ascii=False)

## MAIN FOR LOOP

In [None]:
# MAIN: for each test edition (last entry of `tests` first), parse the
# downloaded PDF, fill in placeholder answers, and write the result to JSON.

for test in tests[::-1]:

    # the PDF saved by the download step is named after the test type
    filename = f"{test['test_type']}.pdf"

    # parse the QnA pairs, then replace placeholder answers with the latest info
    qa_pairs = populate_missing_questions(qa_pairs=parse_clean_pdf(filename))

    # save to JSON next to the PDF
    savefile = f"{test['test_type']}_qa_pairs.json"
    _ = save_to_json(savefile, qa_pairs)

    print(f"Extracted {len(qa_pairs)} QnAs into {savefile}")
