In [None]:
import os
import json
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from tqdm import tqdm
import configparser

In [None]:
# Load the directory locations used by the rest of the notebook from the
# [paths] section of config.ini (resolved against the current working directory).
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail early with a clear message instead of
# letting a missing file surface later as an opaque KeyError on config["paths"].
if not config.read("config.ini"):
    raise FileNotFoundError("config.ini not found in the current working directory")

# Access values
metadata_path = config["paths"]["metadata"]
pdfs_path = config["paths"]["pdfs"]
markdowns_path = config["paths"]["markdowns"]

# os.listdir() returns entries in arbitrary order; sort them so that
# metadata[i] refers to the same record on every run and on every machine.
metadata_files = sorted(os.listdir(metadata_path))
metadata = []

# Read the JSON metadata files. The encoding is set explicitly so that
# accented characters decode the same way regardless of the platform's
# default locale encoding.
for file in tqdm(metadata_files):
    with open(os.path.join(metadata_path, file), 'r', encoding="utf-8") as f:
        metadata.append(json.load(f))
        


In [None]:
# Build the PDF converter once and reuse it for every document; the artifact
# dictionary comes from create_model_dict() (presumably the loaded marker
# models -- TODO confirm against the marker documentation).
converter = PdfConverter(artifact_dict=create_model_dict())

In [15]:
# Display the first loaded record to inspect the keys available in a metadata entry.
metadata[0]

{'Titre': 'Absence de brises soleil et de stores',
 'Texte': "Des feuilles de papier sont collées sur le vitrage en guide de protection contre l'éblouissement.",
 'Images': ['https://www.dispositif-rexbp.com/sites/default/files/2023-11/D%C3%A9faut%20de%20protections%20solaires%20chaleur%20et%20%C3%A9blouissement.jpg'],
 'Videos': [None],
 'PDFs': ['https://www.dispositif-rexbp.com/sites/default/files/2023-11/Eblouissement%20et%20surchauffe_1.pdf'],
 'Thématique': ['Éclairage et électricité spécifique'],
 'Type_document': ['Photos commentées'],
 'Lien': 'https://www.dispositif-rexbp.com/ressource/absence-de-brises-soleil-et-de-stores'}

In [4]:
def pdf_name_to_path(pdf_name):
    """Return the path of ``<pdf_name>.pdf`` inside the ``pdfs_path`` directory.

    Args:
        pdf_name: Base name of the PDF, without the ``.pdf`` extension.

    Returns:
        ``pdfs_path`` joined with the file name. The file is not checked
        for existence here.
    """
    filename = "{}.pdf".format(pdf_name)
    return os.path.join(pdfs_path, filename)

def transform_pdf_to_markdown(converter,metadata):
    """Convert the PDF named after ``metadata["Titre"]`` into markdown and images.

    The markdown is written to ``<markdowns_path>/<title>.md`` and the extracted
    images to ``<markdowns_path>/images/<title>/``. Image references inside the
    markdown are rewritten to ``<images/<title>/<image file name>>``.

    The conversion is skipped, with a printed message, when the markdown file
    already exists or when the source PDF cannot be found.

    Args:
        converter: Callable that takes a PDF path and returns an object accepted
            by ``text_from_rendered``.
        metadata: Mapping with at least a ``"Titre"`` key; the title is used as
            the base name of the PDF, the markdown file and the image folder.

    Returns:
        None.
    """
    title = metadata["Titre"]
    pdf_path = pdf_name_to_path(title)
    output_file_path = os.path.join(markdowns_path, f"{title}.md")
    images_output_path = os.path.join(markdowns_path, "images", title)

    if os.path.exists(output_file_path):
        print(f"markdown {output_file_path} already exists, skipping ... ")
        return
    if not os.path.exists(pdf_path):
        print(f"pdf {pdf_path} not found, skipping ... ")
        return

    print(f"converting {pdf_path} to markdown ...")
    rendered = converter(pdf_path)

    # The middle element (output extension) is not needed here.
    text, _, images = text_from_rendered(rendered)

    # Point the image references inside the text at their new location.
    for img in images:
        text = text.replace(img, f"<images/{title}/{os.path.basename(img)}>")

    # Save the images BEFORE writing the markdown: the markdown file doubles as
    # the "already converted" marker checked above, so it must only appear once
    # the images have been handled. Otherwise an interrupted run would leave a
    # document that is skipped forever with its images missing.
    os.makedirs(images_output_path, exist_ok=True)
    for img, image in images.items():
        img_path = os.path.join(images_output_path, os.path.basename(img))
        try:
            # presumably a PIL image; only its .save(path) method is relied upon
            image.save(img_path)
        except Exception as exc:
            # Best effort: one bad image must not abort the document. Catch
            # Exception (not a bare except) so Ctrl-C still interrupts the run,
            # and report the cause.
            print(f"error saving {img_path}: {exc}")

    # Explicit UTF-8 so non-ASCII text is written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(output_file_path, 'w', encoding="utf-8") as f:
        f.write(text)

        


In [None]:
# Convert the PDF of every loaded metadata record, with a progress bar.
# transform_pdf_to_markdown returns early for records whose markdown already
# exists or whose PDF is missing, so this cell can be re-run to resume.
for m in tqdm(metadata):
    transform_pdf_to_markdown(converter,m)