## Civics guide Ingestion
This notebook downloads the civics guide PDF, parses it into text, and splits that text into chunks.

In [174]:
import re
import requests
from PyPDF2 import PdfReader

## Ingestion of raw data

### PDF downloading

In [175]:
# Download the civics guide PDF and store it locally for the parsing step.
url = "https://www.uscis.gov/sites/default/files/document/brochures/OOC_M-1175_CivicsTextbook_8.5x11_V7_RGB_English_508.pdf"

filename = "../documents/civics_guide.pdf"

# Fetch the document. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() aborts on an HTTP error so that
# an error page is never written to disk under a .pdf name.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Save the raw bytes ("wb": the payload is binary, not text).
with open(filename, "wb") as f:
    f.write(response.content)

print(f"PDF downloaded and saved as {filename}")

PDF downloaded and saved as ../documents/civics_guide.pdf


### Parsing and slight text cleanup

In [176]:
# Number of pages at the start and at the end of the PDF that are not ingested.
SKIP_FIRST_PAGES = 4
SKIP_LAST_PAGES = 7

# A line containing any of these substrings opens a skipped region: the
# instruction boxes about bold text and red boxes, as well as most image footers.
BOILERPLATE_PHRASES = (
    "Within each chapter there are some",
    "sentences and phrases that are written",
    "in bold font",
    "number in a red box",
    "Civics Test Questions",
    "For example, the following sentence",
    "This sentence is from Question",
    "Photo by",
    "Courtesy of",
    "Associate Justice Sonia",
    "President George W. Bush",
    "President Obama",
)

# Lower-cased sentence; the line containing it closes a TEST YOUR KNOWLEDGE section.
TEST_SECTION_END = "you may study just the questions that have been marked with an asterisk"

# A skipped boilerplate region ends at the first line whose stripped length
# exceeds this value (or at a line starting with "CHAPTER").
RESUME_LINE_LENGTH = 50

TEST_HEADING_RE = re.compile(r'TEST YOUR KNOWLEDGE', flags=re.IGNORECASE)
BOOK_TITLE_RE = re.compile(r'ONE NATION, ONE PEOPLE:?\s*(THE USCIS CIVICS TEST TEXTBOOK)?', flags=re.IGNORECASE)

# (pattern, replacement) pairs applied in this order to every kept line.
# They remove 1-3 digit numbers that appear right before a period or colon,
# and blank out lines that consist of such a number only.
# NOTE(review): the first pattern also rewrites ordinary text such as
# "July 4." to "July." - confirm that this loss is acceptable.
NUMBER_CLEANUPS = (
    (re.compile(r'\s\d{1,3}\.'), '.'),
    (re.compile(r'\s\d{1,3}\:'), ':'),
    (re.compile(r'^\d{1,3}\.\s*'), ''),
    (re.compile(r'^\d{1,3}\:\s*'), ':'),
    (re.compile(r'^\s*\d{1,3}\s*$'), ''),
)


def clean_page_text(text):
    """Return the text of one page with repeating boilerplate removed.

    The page is processed line by line. Dropped content:
      * everything from a "TEST YOUR KNOWLEDGE" heading up to and including
        the line containing TEST_SECTION_END (text before the heading on the
        same line is kept as-is);
      * lines containing one of BOILERPLATE_PHRASES, plus the lines that
        follow until a line starts with "CHAPTER" or is longer than
        RESUME_LINE_LENGTH characters after stripping;
      * lines that start with "CHAPTER" after cleanup;
      * lines that are empty after cleanup.
    Kept lines have the book title and the numbers matched by
    NUMBER_CLEANUPS removed.

    Both skip flags start as False on every call, so a skipped region never
    carries over from one page to the next.

    Args:
        text: Extracted text of a single page, lines separated by '\n'.

    Returns:
        The kept lines joined with '\n' (an empty string if none are kept).
    """
    filtered_lines = []
    skip_section = False       # inside an instruction box / image footer region
    skip_test_section = False  # inside a TEST YOUR KNOWLEDGE section

    for line in text.split('\n'):
        # Start of a TEST YOUR KNOWLEDGE section
        if "TEST YOUR KNOWLEDGE" in line.upper():
            # Keep text before "TEST YOUR KNOWLEDGE" if any
            before_test = TEST_HEADING_RE.split(line)[0]
            if before_test.strip():
                filtered_lines.append(before_test)
            skip_test_section = True
            continue

        # End of the TEST YOUR KNOWLEDGE section; the closing line is dropped too
        if skip_test_section and TEST_SECTION_END in line.lower():
            skip_test_section = False
            continue

        # Inside the TEST YOUR KNOWLEDGE section everything is dropped
        if skip_test_section:
            continue

        # Boilerplate line: drop it and start skipping what follows
        if any(phrase in line for phrase in BOILERPLATE_PHRASES):
            skip_section = True
            continue

        # Resume at the next chapter heading or the next long line of content
        if skip_section and (line.startswith("CHAPTER") or len(line.strip()) > RESUME_LINE_LENGTH):
            skip_section = False

        if skip_section:
            continue

        # Remove the book title from the line (case insensitive)
        line = BOOK_TITLE_RE.sub('', line)

        for pattern, replacement in NUMBER_CLEANUPS:
            line = pattern.sub(replacement, line)

        # Remove leftover info from the example question boxes in red
        line = line.replace('written? 1787  66  ', '')

        # Skip lines that start with CHAPTER
        if line.strip().startswith("CHAPTER"):
            continue

        # Only keep non-empty lines
        if line.strip():
            filtered_lines.append(line)

    return '\n'.join(filtered_lines)


reader = PdfReader(filename)
all_text = []

total_pages = len(reader.pages)

for i, page in enumerate(reader.pages):
    # Skip the leading and trailing pages that are not ingested
    if i < SKIP_FIRST_PAGES or i >= total_pages - SKIP_LAST_PAGES:
        continue

    text = page.extract_text()
    # Pages without extractable text contribute no entry to all_text
    if text:
        all_text.append(clean_page_text(text))

### Manual verification: saving to txt file for quick check

In [177]:
# Stitch the cleaned pages together with a visible marker between them,
# so page boundaries are easy to find when proofreading.
page_separator = "\n\n================NEW PAGE================\n\n"
text_2_save = page_separator.join(all_text)

In [178]:
# Dump the joined, cleaned text to disk so the parsing result can be proofread.
output_filename = '../documents/parsed_civics_guide.txt'

# UTF-8 is set explicitly so the output does not depend on the platform default.
with open(output_filename, mode='w', encoding='utf-8') as out_file:
    out_file.write(text_2_save)

print(f"Text saved to {output_filename}")

Text saved to ../documents/parsed_civics_guide.txt


## Qdrant

### Setup of qdrant