In [None]:
# pip install pdfplumber

Defaulting to user installation because normal site-packages is not writeable
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 206 kB/s eta 0:00:01
Collecting pypdfium2>=4.18.0
  Downloading pypdfium2-4.30.1-py3-none-macosx_11_0_arm64.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 694 kB/s eta 0:00:01
[?25hCollecting pdfminer.six==20231228
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 731 kB/s eta 0:00:01
Collecting cryptography>=36.0.0
  Downloading cryptography-44.0.1-cp39-abi3-macosx_10_9_universal2.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 877 kB/s eta 0:00:01
Installing collected packages: cryptography, pypdfium2, pdfminer.six, pdfplumber
Successfully installed cryptography-44.0.1 pdfminer.six-20231228 pdfplumber-0.11.5 pypdfium2-4.30.1
You should consider upgrading via the '/Library/Developer/Command

In [None]:
import pdfplumber

pdf_path = "book1.pdf"

# Pull the text of every page. A page without a text layer (blank or
# scanned page) may give back nothing; some pdfplumber versions return None in
# that case, so fall back to "" instead of letting the concatenation raise
# TypeError.
with pdfplumber.open(pdf_path) as pdf:
    page_texts = [page.extract_text() or "" for page in pdf.pages]

# Each page contributes its text followed by one newline (same layout as
# before); building the string in one join avoids re-copying a growing string
# on every page.
text = "".join(f"{page_text}\n" for page_text in page_texts)

# Save extracted text for further processing
with open("extracted_text.txt", "w", encoding="utf-8") as f:
    f.write(text)


In [4]:
import re

# Read back the text dump produced from the PDF.
with open("extracted_text.txt", "r", encoding="utf-8") as source_file:
    text = source_file.read()

# A "Problem Set" section runs (lazily, across newlines) up to a blank line
# followed by "Chapter" or by another "Problem Set", or to the end of the text.
problem_set_pattern = r"Problem Set.*?(?=\n\nChapter|\n\nProblem Set|\Z)"

problem_sets = [
    section.group(0)
    for section in re.finditer(problem_set_pattern, text, flags=re.DOTALL)
]

# Persist the sections, separated by a blank line.
joined_sections = "\n\n".join(problem_sets)
with open("problem_sets.txt", "w", encoding="utf-8") as sink_file:
    sink_file.write(joined_sections)


In [5]:
# Define question patterns.
# The verb alternation is a NON-capturing group on purpose: with a capturing
# group, re.findall returns only the group (just "Find", "Solve", ...) instead
# of the question text. The second pattern also accepts end-of-text (\Z) as a
# terminator, like the first one, so a trailing problem is not skipped.
question_patterns = [
    r"\d+\..*?(?=\n\d+\.|\n\n|\nProblem Set|\Z)",  # Matches "1. Question" until next number
    r"(?:Find|Solve|Prove|Compute|Evaluate).*?(?=\n\n|\n[A-Z]|\Z)",  # Matches standalone problems
]

# NOTE(review): a numbered question that contains one of the verbs is matched
# by both patterns, so its text can appear twice in `questions` — confirm
# whether de-duplication is wanted.
questions = []
for problem_set in problem_sets:
    for pattern in question_patterns:
        # group(0) is always the whole match, regardless of groups in the pattern.
        matches = [m.group(0) for m in re.finditer(pattern, problem_set, re.DOTALL)]
        questions.extend(matches)

# Save extracted questions
with open("questions_extracted.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(questions))


In [32]:
import re

# Define patterns
page_header_pattern = r"Chapter\d+\s+[A-Za-z]+"  # Matches "Chapter1 Matrices"
problem_set_pattern = r"Problem Set\s*\d+"  # Matches "Problem Set 1"
question_pattern = r"\d+\..*"  # Matches a question starting with a number (e.g., "1. What is...")

# Storage
questions_with_chapters = []  # (chapter header, question text) tuples, in document order
current_chapter = "Unknown"   # Label used until the first chapter header is seen
question_chapter = current_chapter  # Chapter in effect when the pending question started
inside_problem_set = False
current_question = ""         # Text of the question currently being assembled

# Load extracted text
with open("extracted_text.txt", "r", encoding="utf-8") as f:
    text = f.read()

lines = text.split("\n")  # Process text line by line

for line in lines:
    line = line.strip()

    # Check for chapter header (searched anywhere in the line). A header line
    # only updates the chapter label; it neither ends the problem set nor is
    # appended to the pending question.
    match = re.search(page_header_pattern, line)
    if match:
        current_chapter = match.group()  # Get the exact matched string

    # If "Problem Set" is found, mark that we are inside a problem set
    elif re.match(problem_set_pattern, line):
        if current_question:
            # Keep the question still pending from the previous problem set
            # instead of discarding it when the new heading arrives.
            questions_with_chapters.append((question_chapter, current_question.strip()))
        inside_problem_set = True  # Start collecting questions
        current_question = ""  # Reset the current question

    # An empty line ends the problem set. (Chapter-header lines never reach
    # this branch: the re.search above already claimed them.)
    elif inside_problem_set and line == "":
        if current_question:
            questions_with_chapters.append((question_chapter, current_question.strip()))
        inside_problem_set = False
        current_question = ""  # Reset current question

    # Collect question content if inside a problem set and a question is detected
    elif inside_problem_set and re.match(question_pattern, line):
        if current_question:
            # If a previous question is collected, save it before starting a new one
            questions_with_chapters.append((question_chapter, current_question.strip()))
        current_question = line  # Start a new question
        # Remember the chapter now, so a header seen before the question is
        # saved cannot re-label it.
        question_chapter = current_chapter

    # If we're inside a problem set and the line doesn't match a new question,
    # add the line to the current question (for multi-line questions).
    elif inside_problem_set and current_question:
        current_question += " " + line  # Append the line to the current question

# Flush a question still pending when the input ends without a blank line.
if current_question:
    questions_with_chapters.append((question_chapter, current_question.strip()))
    current_question = ""

# Save the extracted questions with chapters
with open("questions_with_chapters.txt", "w", encoding="utf-8") as f:
    for chapter, question in questions_with_chapters:
        f.write(f"{chapter}\n{question}\n\n")


In [33]:
def convert_to_latex(question: str) -> str:
    """Return ``question`` as a bold LaTeX fragment with basic math markup.

    The following plain-text constructs are rewritten, each wrapped in its own
    inline math ``$...$`` because ``^``, ``\\frac``, ``\\leq`` and ``\\geq``
    are only valid in math mode:

    * ``base^exp`` (optionally signed exponent) -> ``$base^{exp}$``
    * ``a/b`` with integer ``a`` and ``b``       -> ``$\\frac{a}{b}$``
    * ``≤`` / ``≥``                              -> ``$\\leq$`` / ``$\\geq$``

    An exponent that is already braced (``x^{2}``) is left untouched.

    Args:
        question: Plain-text question as extracted from the PDF.

    Returns:
        The converted text wrapped in ``\\textbf{...}``.
    """
    # Exponents first: brace the actual exponent rather than inserting an empty
    # "{}" after the caret, which would typeset "x^{}2".
    question = re.sub(r"(\w+)\^(-?\w+)", r"$\1^{\2}$", question)
    # Integer fractions. Text produced by the exponent step is not re-matched
    # because a "$" now sits next to the slash.
    question = re.sub(r"(\d+)/(\d+)", r"$\\frac{\1}{\2}$", question)
    # Relation symbols; the closing "$" also keeps the control word from
    # fusing with a following letter (e.g. "\leqb").
    question = question.replace("≤", r"$\leq$")
    question = question.replace("≥", r"$\geq$")
    return f"\\textbf{{{question}}}"

# Write every extracted question to the .tex file as its own unnumbered
# section holding a one-item list.
latex_entry_template = (
    "\\section*{{{heading}}}\n"
    "\\begin{{itemize}}\n"
    "    \\item {body}\n"
    "\\end{{itemize}}\n\n"
)

with open("questions_latex.tex", "w", encoding="utf-8") as f:
    for chapter, question in questions_with_chapters:
        rendered_entry = latex_entry_template.format(
            heading=chapter,
            body=convert_to_latex(question),
        )
        f.write(rendered_entry)


In [38]:
import json

# Save the extracted questions with chapters to JSON.
# The leading "ChapterN " token is removed with a regex (re is imported in an
# earlier cell) instead of the fixed slice chapter[9:], which assumed a
# one-digit chapter number followed by exactly one whitespace character and
# turned the "Unknown" placeholder into an empty string. Labels that do not
# start with "ChapterN " are kept unchanged.
questions_with_chapters_dict = [
    {"chapter": re.sub(r"^Chapter\d+\s+", "", chapter), "question": question}
    for chapter, question in questions_with_chapters
]

with open("questions_with_chapters.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(questions_with_chapters_dict, f, indent=4, ensure_ascii=False)

# Save questions in LaTeX format: one unnumbered section with a one-item list
# per question, headed by the stripped chapter title.
with open("questions_latex.tex", "w", encoding="utf-8") as f:
    for entry in questions_with_chapters_dict:
        chapter = entry["chapter"]
        question = entry["question"]
        f.write(f"\\section*{{{chapter}}}\n")
        f.write("\\begin{itemize}\n")
        f.write(f"    \\item {convert_to_latex(question)}\n")
        f.write("\\end{itemize}\n\n")


In [39]:
import json

# Read the exported question records back from disk.
with open("questions_with_chapters.json", "r", encoding="utf-8") as json_file:
    questions_with_chapters_dict = json.load(json_file)

# Distinct chapter names found across all records, and how many there are.
unique_chapters = {record["chapter"] for record in questions_with_chapters_dict}
num_unique_chapters = len(unique_chapters)

# Report the set of chapters and its size.
print("Unique Chapters:", unique_chapters)
print("Number of unique chapters:", num_unique_chapters)


Unique Chapters: {'Orthogonality', 'EigenvaluesandEigenvectors', 'ComputationswithMatrices', 'VectorSpaces', 'LinearProgrammingandGameTheory', 'PositiveDefiniteMatrices', 'MatricesandGaussianElimination', 'Determinants'}
Number of unique chapters: 8
