# Convert metadata and PDFs to LLM dataset

This notebook will process the already downloaded PDF files and convert them to a data set suitable for fine-tuning and evaluating LLMs.

A new field "content" will be added to each record. It holds an object with two fields: "pdfinfo", the metadata extracted from the PDF file, and "pages", the text extracted from selected pages of the PDF file.

In [2]:
%%time

import os.path
import glob
import json
import re

import fitz
import regex  # has better Unicode support than standard library re module

PAGES = [0, 1, 2, 3, 4, 5, 6, 7, -1]  # page indexes to analyze: first 8 pages + last page (-1)
THRESHOLD = 100                       # paragraphs shorter than this (in characters) are kept, unless they look like ToC entries
LONG_PARAGRAPH_PAGES = [0, 1]         # on first two pages, some long paragraphs are accepted
LONG_PARAGRAPH_MAX = 2                # how many long paragraphs to keep on each of the first two pages
MAX_LENGTH = 2000                     # maximum length of a combined paragraph, in characters (split_text measures it with len())
PDF_METADATA_SKIP = {'format', 'creator', 'producer'}  # PDF metadata fields not to include in extracted text

# metadata files to convert, one JSON record per line
metadata_files = glob.glob("../metadata/article-en-*.jsonl")

def id_to_fn(identifier):
    """Map a URI identifier to the path of its PDF file under ../pdfs/.

    The 'https://' scheme prefix is dropped and every slash becomes an
    underscore, so the result is a flat filename ending in '.pdf'.
    """
    stem = identifier.replace('https://', '')
    stem = stem.replace('/', '_')
    return f"../pdfs/{stem}.pdf"

def split_text(text, max_length=2000):
    """Split text into paragraph chunks of roughly at most max_length characters.

    The text is first cut at runs of newlines that are followed by an ASCII
    uppercase letter (A-Z). Consecutive pieces are then merged, separated by
    a newline, as long as the merged chunk stays within max_length
    characters. A single piece that is longer than max_length is not cut
    further; it becomes a chunk of its own.

    Returns a list of stripped, non-empty chunks; the list is empty when the
    text contains nothing but whitespace.
    """
    # Split paragraphs based on newlines followed by uppercase letters
    paragraphs = re.split(r'\n+(?=[A-Z])', text)

    # Combine paragraphs to ensure each part is within the max_length
    combined_paragraphs = []
    current_paragraph = ""
    for paragraph in paragraphs:
        if len(current_paragraph) + len(paragraph) <= max_length:
            current_paragraph += paragraph + '\n'
            continue
        # The accumulator is still empty when the very first piece is already
        # too long; only emit chunks that actually contain text.
        chunk = current_paragraph.strip()
        if chunk:
            combined_paragraphs.append(chunk)
        current_paragraph = paragraph + '\n'

    chunk = current_paragraph.strip()
    if chunk:
        combined_paragraphs.append(chunk)

    return combined_paragraphs

def extract_content(fn):
    """Extract the pdfinfo metadata and the text of selected pages from a PDF file.

    The pages listed in PAGES are read (negative entries count from the end
    of the document). Entries outside the document are ignored and a page
    that is selected more than once is only read once. For each page the
    text is split into paragraphs with split_text and filtered:

    * paragraphs that look like table-of-contents entries are dropped
    * paragraphs shorter than THRESHOLD characters are kept
    * on the pages in LONG_PARAGRAPH_PAGES, up to LONG_PARAGRAPH_MAX longer
      paragraphs per page are kept as well
    * all other long paragraphs are dropped

    Returns a dict with the keys "pdfinfo" (non-empty metadata fields, minus
    those in PDF_METADATA_SKIP) and "pages" (a list of dicts with the
    1-based "page" number and the filtered "text"; pages without any
    remaining text are left out).
    """

    pdfinfo = {}
    pages = []

    with fitz.open(fn) as pdf:

        # guard against a missing metadata dict
        for key, value in (pdf.metadata or {}).items():
            if key not in PDF_METADATA_SKIP and value:
                pdfinfo[key] = value

        # Resolve PAGES to real page indexes first. Handling the last page
        # under its real index (rather than as -1) lets it be recognized as
        # one of the LONG_PARAGRAPH_PAGES in one- and two-page documents.
        page_count = len(pdf)
        page_indexes = []
        for page in PAGES:
            index = page + page_count if page < 0 else page
            if 0 <= index < page_count and index not in page_indexes:
                page_indexes.append(index)

        for index in page_indexes:
            pdf_page = pdf[index]
            texts = []
            text = pdf_page.get_text(sort=True)
            # split the page text into paragraph chunks of at most
            # MAX_LENGTH characters
            paragraphs = split_text(text, MAX_LENGTH)
            long_paragraph_count = 0

            for paragraph in paragraphs:
                # collapse all runs of whitespace into single spaces
                paragraph = " ".join(paragraph.strip().split())

                if '.....' in paragraph or '. . . . .' in paragraph:  # looks like a ToC entry, skip it
                    continue
                elif len(paragraph) < THRESHOLD:  # short paragraph, keep it
                    texts.append(paragraph)
                elif index in LONG_PARAGRAPH_PAGES and long_paragraph_count < LONG_PARAGRAPH_MAX:
                    # allow some long paragraphs on the first two pages
                    long_paragraph_count += 1
                    texts.append(paragraph)
                # anything else is a long paragraph we do not want, skip it

            text = '\n'.join(texts)
            if text:
                pages.append({"page": pdf_page.number + 1, "text": text})
    return {"pdfinfo": pdfinfo, "pages": pages}


# Convert every metadata file into an LLM dataset file: each input record is
# extended with the content extracted from its PDF and written out as one
# JSON object per line.
for mdfile in sorted(metadata_files):
    out_path = mdfile.replace('metadata', 'llm-dataset')
    print(f"converting {mdfile} to {out_path}")
    # The records contain non-ASCII characters, so read and write UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with open(mdfile, encoding="utf-8") as infile, \
            open(out_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            rec = json.loads(line)
            print(rec)
            pdf_path = id_to_fn(rec["id"])
            print(pdf_path)
            content = extract_content(pdf_path)
            outrec = {"id": rec["id"], "url": rec["url"], "content": content, "ground_truth": rec["ground_truth"]}
            json.dump(outrec, outfile)
            outfile.write("\n")

converting ../metadata/article-en-test.jsonl to ../llm-dataset/article-en-test.jsonl
{'doctype': 'article', 'subset': 'test', 'repository': 'Doria', 'url': 'https://www.doria.fi/bitstream/handle/10024/182782/107883-Article Text-213924-1-10-20211120 Ostling.pdf', 'id': 'https://www.doria.fi/handle/10024/182782', 'rowid': 'article3', 'ground_truth': {'language': 'en', 'title': '‘The wrath of God on children of disobedience’', 'alt_title': ['COVID-19 in the theology and ideology of the Westboro Baptist Church {en}'], 'creator': ['Östling, Erik'], 'year': '2021', 'doi': '10.30664/ar.107883', 'type_coar': 'journal article'}}
../pdfs/www.doria.fi_handle_10024_182782.pdf
{'doctype': 'article', 'subset': 'test', 'repository': 'Taju', 'url': 'https://taju.uniarts.fi/bitstream/handle/10024/6005/Järvinen_Great_Horizons_AM.pdf', 'id': 'https://taju.uniarts.fi/handle/10024/6005', 'rowid': 'article5', 'ground_truth': {'language': 'en', 'title': '"Great horizons flooded with the alien light of the Su