In [89]:
import json
import fitz
import re #library for regex
import os
from tqdm import tqdm

In [90]:
# Heading patterns used to locate each proposal section in the extracted PDF text.
# Every pattern is case-insensitive and wraps its alternatives in one
# non-capturing group so the word boundaries apply to *every* alternative
# (in `\bA|B|C\b` the anchors bind only to A and to C respectively).
# Longer headings are listed before their shorter variants so that a full
# heading such as "Hull Design and Structural Analysis" is consumed as a
# single match instead of being split at "Hull Design".
# NOTE(review): the bare "Analysis", "Development" and "Testing" alternatives
# match those words anywhere in the text, not only in headings -- confirm that
# this breadth is intended.
Sections = {
    "Executive Summary": r"(?i)\bExecutive Summary\b",
    "Project and Quality Management": r"(?i)\b(?:Project and Quality Management|Project Management|Quality Management)\b",
    "Hull Design and Structural Analysis": r"(?i)\b(?:Hull Design and Structural Analysis|Hull Design|Structural Analysis|Analysis)\b",
    "Development and Testing": r"(?i)\b(?:Development and Testing|Development|Testing)\b",
    "Construction": r"(?i)\bConstruction\b",
}

# Instruction prompt stored with every extracted chunk of a given section.
# The keys are the same section names used as keys of `Sections`.
Prompts = dict([
    ("Executive Summary",
     "Write the executive summary for an ASCE Concrete Canoe Competition."),
    ("Project and Quality Management",
     "Write the project management section for an ASCE Concrete Canoe Competition."),
    ("Hull Design and Structural Analysis",
     "Write the hull design and structural analysis section for an ASCE Concrete Canoe Competition."),
    ("Development and Testing",
     "Write the development and testing section for an ASCE Concrete Canoe Competition."),
    ("Construction",
     "Write the construction section for an ASCE Concrete Canoe Competition."),
])

In [91]:
def pdf_to_text(path):
    """Return the text of every page of the PDF at ``path``, concatenated in page order.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        One string holding the text of all pages; nothing is inserted
        between consecutive pages.

    Raises:
        Whatever ``fitz.open`` raises when the file cannot be opened;
        errors are not caught here.
    """
    # The context manager closes the document even when text extraction
    # fails, so handles are not leaked while looping over many PDFs.
    with fitz.open(path) as doc:
        return "".join(page.get_text() for page in doc)

In [92]:
def extract_text(text, sections):
    """Split ``text`` into chunks at every occurrence of a section heading.

    All heading patterns are combined into one case-insensitive regex. Each
    match marks the start of a chunk that runs up to the next match (or to the
    end of ``text`` for the last match). The heading itself is not part of
    the chunk.

    Args:
        text: The full document text to split.
        sections: Mapping of section name -> regex pattern string. A leading
            ``(?i)`` on a pattern is accepted and removed, because the
            combined regex is always compiled with ``re.IGNORECASE``.

    Returns:
        A list of ``(section_name, section_text)`` tuples in order of
        appearance. Chunks that are empty after stripping whitespace are
        skipped. A section name can appear several times.

    Raises:
        re.error: If a pattern in ``sections`` is not a valid regex.
    """
    if not sections:
        # An empty alternation would match the empty string at every position.
        return []

    flag_prefix = "(?i)"
    group_to_section = {}
    alternatives = []
    for index, (name, pattern) in enumerate(sections.items()):
        # Remove only the exact "(?i)" prefix. str.lstrip treats its argument
        # as a character set and would also eat a leading "(", "?", "i" or ")"
        # that belongs to the pattern itself.
        if pattern.startswith(flag_prefix):
            pattern = pattern[len(flag_prefix):]
        group = f"_sec{index}"
        group_to_section[group] = name
        # One named group per section lets the match itself report which
        # section fired, instead of re-matching the heading out of context.
        alternatives.append(f"(?P<{group}>{pattern})")

    combined = re.compile("|".join(alternatives), re.IGNORECASE)
    matches = list(combined.finditer(text))

    extracted = []
    for i, match in enumerate(matches):
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)

        section_text = text[start:end].strip()
        if section_text:
            extracted.append((group_to_section[match.lastgroup], section_text))

    return extracted

In [93]:
def build_json(pdf_dir, output_json, min_chars=50):
    """Write one JSON Lines record per extracted proposal section.

    Each PDF directly inside ``pdf_dir`` is converted to text with
    ``pdf_to_text`` and split with ``extract_text`` using the module-level
    ``Sections`` patterns. Every chunk that is long enough becomes one line
    of ``output_json`` with the keys ``system``, ``prompt`` (looked up in the
    module-level ``Prompts``) and ``output``.

    Args:
        pdf_dir: Directory that contains the PDF files.
        output_json: Path of the JSON Lines file to create; an existing file
            is overwritten.
        min_chars: Chunks shorter than this many characters (after stripping
            whitespace) are dropped. Defaults to 50.

    Returns:
        None. Any exception raised while processing a single PDF is printed
        together with the file name and that PDF is skipped; records already
        written for it stay in the output.
    """
    # List the inputs before opening the output so that a missing input
    # directory does not truncate an existing output file. Sorting makes the
    # record order reproducible (os.listdir returns entries in arbitrary
    # order) and the lower-casing also accepts ".PDF" extensions.
    pdf_names = sorted(
        name for name in os.listdir(pdf_dir) if name.lower().endswith(".pdf")
    )

    with open(output_json, 'w', encoding="utf-8") as outfile:
        for filename in tqdm(pdf_names):
            try:
                path = os.path.join(pdf_dir, filename)
                raw_text = pdf_to_text(path)
                section_entries = extract_text(raw_text, Sections)

                for section, content in section_entries:
                    content = content.strip()
                    if len(content) < min_chars:
                        continue

                    entry = {
                        "system": "You are a helpful assistant that writes proposals for the ASCE Concrete Canoe Competition.",
                        "prompt": Prompts[section],
                        "output": content
                    }

                    outfile.write(json.dumps(entry) + "\n")

            except Exception as e:
                # Deliberate best-effort: one unreadable PDF must not abort
                # the whole run, but the failure is reported.
                print(f"Error processing {filename}: {e}")

In [94]:
# print(pdf_to_text("./pdfs/2010-UAH.pdf"))

In [95]:
# text = pdf_to_text("./pdfs/2010-UAH.pdf")
# sections = extract_text(text, Sections)  

# for i, (section_name, content) in enumerate(sections):
#     print("=" * 80)
#     print(f"SECTION: {section_name.upper()} (#{i+1})")
#     print("=" * 80)
#     print(content)
#     print("\n\n")

In [96]:
# Folder holding the proposal PDFs, and the JSON Lines file generated from them.
pdf_dir = "./pdfs"
output_path = "RAW_DATA.jsonl"

# Keyword arguments make explicit which path is the source and which the destination.
build_json(pdf_dir=pdf_dir, output_json=output_path)

 15%|█▍        | 23/155 [00:08<00:47,  2.78it/s]

Error processing #2 Virginia Polytechnic Institute and State University - Apex Predator - Project Proposal - 2024 - Society-Wide.pdf: Cannot open empty file: filename='./pdfs\\#2 Virginia Polytechnic Institute and State University - Apex Predator - Project Proposal - 2024 - Society-Wide.pdf'.


 21%|██        | 32/155 [00:11<00:41,  2.99it/s]

Error processing #4 California Polytechnic State University, San Luis Obispo Project Proposal - 2024 Nationals.pdf: Cannot open empty file: filename='./pdfs\\#4 California Polytechnic State University, San Luis Obispo Project Proposal - 2024 Nationals.pdf'.


 66%|██████▌   | 102/155 [00:50<00:11,  4.78it/s]

MuPDF error: format error: No default Layer config



 99%|█████████▉| 154/155 [01:33<00:00,  2.11it/s]

MuPDF error: format error: cannot find object in xref (292 0 R)



100%|██████████| 155/155 [01:33<00:00,  1.65it/s]
