In [115]:
import textract
import subprocess
import re
import os

In [129]:
import os

def list_pdf_files(directory):
    """Recursively collect the paths of all PDF files below a directory.

    Args:
        directory: Root folder to search; every sub-folder is visited
            through ``os.walk``.

    Returns:
        list[str]: One path per file whose name ends with ``.pdf``
        (case-sensitive match), built by joining the containing folder with
        the file name. Entries appear in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory)
        for name in names
        if name.endswith(".pdf")
    ]
def pdf_text(pdf_path):
    """Return the text content of a PDF file as a string.

    Args:
        pdf_path: Path of the PDF file handed to ``textract``.

    Returns:
        str: The bytes returned by ``textract.process`` (called with the
        ``pdftotext`` method) decoded as UTF-8.
    """
    raw_bytes = textract.process(pdf_path, method='pdftotext')
    return raw_bytes.decode('utf-8')

def clean_efsa_journal_references(text):
    """Strip EFSA Journal running headers from extracted text.

    Every line that starts with ``EFSA Journal <4 digits>;`` is removed
    together with the three lines that follow it. Afterwards, each newline
    that is followed by only whitespace up to another newline is collapsed
    into a single newline.

    Args:
        text: Text to clean.

    Returns:
        str: The text with the matched header blocks removed and the blank
        lines squeezed out.
    """
    # Header line plus exactly three trailing lines; a header followed by
    # fewer than three lines does not match and is left untouched.
    header_block = re.compile(
        r"^EFSA Journal \d{4};.*(?:\n.*){3}",
        flags=re.MULTILINE
    )
    without_headers = header_block.sub('', text)

    # Removing the blocks leaves empty lines behind; squeeze them out.
    return re.sub(r'\n\s*\n', '\n', without_headers)

def extract_section(text, start_keyword, end_keyword):
    """Return the first span of text running from one keyword to another.

    Both keywords are matched literally (they are regex-escaped). The match
    is non-greedy, so the span stops at the first ``end_keyword`` that
    follows the matched ``start_keyword``. The span may cross line breaks.

    Args:
        text: Text to search.
        start_keyword: Literal string marking the beginning of the span.
        end_keyword: Literal string marking the end of the span.

    Returns:
        str | None: The span including both keywords, with empty and
        whitespace-only lines removed, or ``None`` when no span is found.
    """
    section_re = "(" + re.escape(start_keyword) + ".*?" + re.escape(end_keyword) + ")"
    found = re.search(section_re, text, flags=re.DOTALL)
    if found is None:
        return None

    # Drop every line that is empty or holds only whitespace.
    return re.sub(r'^\s*\n', '', found.group(1), flags=re.MULTILINE)
def save_text_to_file(cleaned_text, file_path):
    """Write text to a UTF-8 encoded file and print the destination.

    Args:
        cleaned_text: String to write.
        file_path: Destination path. Missing parent directories are created;
            an existing file is overwritten.

    Raises:
        TypeError: If ``cleaned_text`` is not a string. Raised before the
            file is opened so an existing file is not truncated by a call
            that could never succeed.
    """
    if not isinstance(cleaned_text, str):
        raise TypeError(
            f"cleaned_text must be str, got {type(cleaned_text).__name__}"
        )

    # open(..., 'w') does not create folders; do it here so a fresh output
    # location works. A bare file name has no parent to create.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)
    print(f"Text successfully saved to {file_path}")



def run_ontogpt_evidence(input_file, element):
    """Run ``ontogpt extract`` on a text file and parse what it prints.

    Args:
        input_file: Path of the text file passed to ``ontogpt`` via ``-i``.
        element: Template name passed to ``ontogpt`` via ``-t``.

    Returns:
        list: Whatever ``process_output`` builds from the command's standard
        output. An empty list is returned when the command prints nothing
        or when running it fails; failures are printed, not raised.
    """
    print(f"Processing {input_file}...")
    evidence = []
    full_command = ["ontogpt", "extract", "-t", element, "-i", input_file]
    print("Running command:", ' '.join(full_command))

    try:
        result = subprocess.run(full_command, check=True, text=True, capture_output=True)
        if result.stdout:
            # Turn the raw output into claim/evidence/cites records.
            evidence = process_output(result.stdout, input_file)
        if result.stderr:
            print("Errors:", result.stderr)
    except subprocess.CalledProcessError as e:
        print("An error occurred:", e)
        # The output streams are captured, so the tool's own diagnostics are
        # only visible if they are echoed here.
        if e.stderr:
            print("Errors:", e.stderr)
    except Exception as e:
        # Best effort: e.g. the executable is missing. Report and move on.
        print("An unexpected error occurred:", e)

    return evidence

def process_output(output, input_file):
    """Parse ``claim:`` / ``evidence:`` / ``cites:`` lines into records.

    Lines are examined after stripping surrounding whitespace. A line that
    starts with ``claim:`` opens a new record; ``evidence:`` is attached only
    when a record is open, and ``cites:`` only once that record already has
    its evidence. All other lines are ignored.

    Args:
        output: Text to parse.
        input_file: Value stored under ``"Path_to_text"`` in every record.

    Returns:
        list[dict]: One dict per ``claim:`` line, in order of appearance,
        with the keys ``"Claim"`` and ``"Path_to_text"`` and, when present,
        ``"Supporting Evidence"`` and ``"Cites"``.
    """
    results = []
    current = {}

    for raw_line in output.strip().split('\n'):
        line = raw_line.strip()

        # Only the leading label is cut off. Splitting on the label and taking
        # the last piece would truncate values that mention the label again
        # (e.g. "claim: health claim: ...").
        if line.startswith("claim:"):
            if current:  # Save the previous entry if exists
                results.append(current)
            current = {"Claim": line[len("claim:"):].strip(), "Path_to_text": input_file}
        elif line.startswith("evidence:") and "Claim" in current:
            current["Supporting Evidence"] = line[len("evidence:"):].strip()
        elif line.startswith("cites:") and "Supporting Evidence" in current:
            current["Cites"] = line[len("cites:"):].strip()

    if current:  # Add the last processed entry
        results.append(current)
    return results
def process(folder_path):
    """Run evidence extraction on each ``.txt`` file of a folder and print it.

    Only regular files directly inside ``folder_path`` whose name ends with
    ``.txt`` are handled. Each one is passed to ``run_ontogpt_evidence`` with
    the ``food_ingredient_ontology.Evidence`` template, and every returned
    record is printed field by field, followed by a separator line.

    Args:
        folder_path: Folder whose entries are listed with ``os.listdir``.
    """
    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)
        # Skip sub-folders and anything that is not a .txt file.
        if not (os.path.isfile(candidate) and entry.endswith('.txt')):
            continue

        records = run_ontogpt_evidence(candidate, "food_ingredient_ontology.Evidence")
        for number, record in enumerate(records, start=1):
            print(f"Evidence {number}:")
            for field, content in record.items():
                print(f"{field}: {content}")
            print("\n" + "-"*50 + "\n")  # Separator between evidences


In [130]:
# Specify the directory to search for PDF files
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
directory_path = '/Users/AliTarik/Documents/EFSA_pdf'
directory_path_out = '/Users/AliTarik/Documents/EFSA_txt'
# The output folder must exist before files are written into it and before
# process() lists it (os.listdir raises on a missing folder).
os.makedirs(directory_path_out, exist_ok=True)

# Get the list of PDF file paths
pdf_paths = list_pdf_files(directory_path)

# Convert each PDF into a cleaned text file holding only its assessment section
for path in pdf_paths:
    print(path)
    text = pdf_text(path)
    extracted_text = extract_section(text, "ASSESSMENT", "DOCUMENTATION PROVIDED TO EFSA")
    if extracted_text is None:
        # extract_section returns None when the markers are not found; cleaning
        # None would raise TypeError and abort the remaining files.
        print(f"Skipping {path}: section markers not found")
        continue

    # Clean the text
    cleaned_text = clean_efsa_journal_references(extracted_text)
    # splitext swaps only the extension; str.replace would rewrite every
    # '.pdf' occurring anywhere in the file name.
    txt_name = os.path.splitext(os.path.basename(path))[0] + '.txt'
    save_text_to_file(cleaned_text, os.path.join(directory_path_out, txt_name))

process(directory_path_out)

/Users/AliTarik/Documents/EFSA_pdf/EFSA Journal - 2009 -  - Opinion on the substantiation of health claims related to alpha linolenic acid and maintenance of.pdf
Text successfully saved to /Users/AliTarik/Documents/EFSA_txt/EFSA Journal - 2009 -  - Opinion on the substantiation of health claims related to alpha linolenic acid and maintenance of.txt
/Users/AliTarik/Documents/EFSA_pdf/EFSA Journal - 2011 -  - Scientific Opinion on the substantiation of health claims related to activated charcoal and.pdf
Text successfully saved to /Users/AliTarik/Documents/EFSA_txt/EFSA Journal - 2011 -  - Scientific Opinion on the substantiation of health claims related to activated charcoal and.txt
/Users/AliTarik/Documents/EFSA_pdf/EFSA Journal - 2009 -  - Scientific Opinion on the substantiation of health claims related to biotin and energy‐yielding.pdf
Text successfully saved to /Users/AliTarik/Documents/EFSA_txt/EFSA Journal - 2009 -  - Scientific Opinion on the substantiation of health claims relat