# In [7]:
import pandas as pd
import fitz
import re
import csv
from pathlib import Path

def extract_name(file_stem: str) -> str:
    """Derive a normalized id from a PDF file stem.

    Keeps the text before the first "(" (the whole stem when there is no
    parenthesis), strips it, lower-cases it and collapses every run of
    whitespace into a single space.

    Args:
        file_stem: File name without its extension.

    Returns:
        The normalized name stored in the "id" column.
    """
    # partition() returns the whole stem when "(" is absent; slicing with the
    # -1 that str.find() returns would silently drop the last character.
    name_part = file_stem.partition("(")[0]
    return re.sub(r"\s+", " ", name_part.strip().lower())


# Matches "<scale> <raw score> <T score>" triples in the extracted page text.
# Compiled once because it does not depend on the file being processed.
pattern = re.compile(r"([A-Za-z0-9\-]+)\s+(\d+)\s+(\d+)")

input_dir = Path("./data/mmpia/vega")
output_path = Path("./analysis/mmpia.xlsx")

# One dict per PDF; the DataFrame is built once at the end instead of
# re-concatenating (and re-copying) the accumulated frame for every file.
records = []

# sorted() makes the processing order (and therefore the column order when
# files expose different scales) independent of the filesystem.
for pdf_document in sorted(input_dir.glob("*.pdf")):
    data_dict = {"id": extract_name(pdf_document.stem)}

    # The context manager closes the document even if parsing a page fails.
    with fitz.open(pdf_document) as doc:
        # Pages from 2 to 6 (zero-indexed pages 1 to 5), capped at the
        # document length so shorter PDFs do not raise in load_page().
        for page_number in range(1, min(6, doc.page_count)):
            text = doc.load_page(page_number).get_text()
            # NOTE(review): a scale name matched more than once overwrites
            # the earlier value (last match wins), as in the original code.
            for scale, raw_score, t_score in pattern.findall(text):
                data_dict[f"{scale}_raw"] = int(raw_score)
                data_dict[f"{scale}_scale_t"] = int(t_score)

    records.append(data_dict)

# Each record becomes one row; scales missing from a file are left as NaN.
mmpia_df = pd.DataFrame.from_records(records)

if not records:
    # Fail with an explicit message (instead of KeyError: 'id' from
    # sort_values) and before touching any previously written output file.
    raise FileNotFoundError(f"No PDF files found in {input_dir}")

# Make sure the destination folder exists before writing the workbook.
output_path.parent.mkdir(parents=True, exist_ok=True)
mmpia_df.sort_values(by="id").to_excel(output_path, index=False)