In [2]:
import subprocess
import pandas as pd
import re
import datetime
import json
import shutil
import os
import textract

In [4]:
def list_pdf_files(directory):
    
    # Initialize an empty list to store the paths of PDF files
    pdf_files = []

    # Walk through the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                # Construct the full path and add it to the list
                pdf_files.append(os.path.join(root, file))

    return pdf_files
def pdf_text(pdf_path):

    # Extract text from the PDF file
    text = textract.process(
    pdf_path,
    method='pdftotext'
    )
    text = text.decode('utf-8')
    return text

def clean_efsa_journal_references(text):
    pattern = re.compile(
        r"^EFSA Journal \d{4};.*(?:\n.*){3}",
        flags=re.MULTILINE
    )
    # pattern = re.compile(
    #     r"EFSA Journal \d{4}; \d+\(\d+\):\d+(?:\n\d{1,3}\n.*\n.*|\n.*\n.*)?",
    #     flags=re.MULTILINE
    # )

    # Replace matched patterns with an empty string
    try:
        cleaned_text = re.sub(pattern, '', text)
    except Exception as e:
        print(e)
        print(text)
        cleaned_text = text
    # cleaned_text = re.sub(pattern, '', text)

    # Clean up extra newlines left after removal
    try:
        cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text)
    except Exception as e:  
        print(e)
        print(cleaned_text)
        cleaned_text = " "
    # cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text)

    return cleaned_text

def extract_section(text, start_keyword, end_keyword, mid_keyword = "", end_keyword_2="", multiple=False):
    if multiple:
       pattern_1 = f"({re.escape(start_keyword)}.*?{re.escape(end_keyword)})"
       pattern_2 = f"({re.escape(mid_keyword)}.*?{re.escape(end_keyword_2)})"
       match_1 = re.search(pattern_1, text, flags=re.DOTALL)
       match_2 = re.search(pattern_2, text, flags=re.DOTALL)
       if match_1 and match_2:
           text_1 = re.sub(r'^\s*\n', '', match_1.group(1), flags=re.MULTILINE)
           text_2 = re.sub(r'^\s*\n', '', match_2.group(1), flags=re.MULTILINE)

           return text_1+text_2
    
    else:
        pattern = f"({re.escape(start_keyword)}.*?{re.escape(end_keyword)})"
        match = re.search(pattern, text, flags=re.DOTALL)

        if match:
            # Remove all empty lines
            text = re.sub(r'^\s*\n', '', match.group(1), flags=re.MULTILINE)
            # print(text)
            return text  # Return the matched text including the start and end keywords
        else:
            return None  # Return None if no match was found
def save_text_to_file(cleaned_text, file_path):

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)
    print(f"Text successfully saved to {file_path}")



def run_ontogpt_evidence(input_file, element):
    print(f"Processing {input_file}...")
    evidence_dict=[]
    # Define the command and arguments
    command = "ontogpt"
    args = ["extract", "-t", element, "-i", input_file ]#, "-m", "gpt-4o"]

    # Combine the command and arguments
    full_command = [command] + args
    print("Running command:", ' '.join(full_command))
    # Execute the command
    try:
        result = subprocess.run(full_command, check=True, text=True, capture_output=True)
        if result.stdout:
            print("Output:", result.stdout)
            # Process the output to extract evidence and cites
            # evidence_dict = process_output(result.stdout, input_file)
        if result.stderr:
            print("Errors:", result.stderr)
    except subprocess.CalledProcessError as e:
        print("An error occurred:", e)
    except Exception as e:
        print("An unexpected error occurred:", e)
    
    return evidence_dict

def process_output(output, input_file):
    results = []
    lines = output.strip().split('\n')
    current = {}

    for line in lines:
        line = line.strip()

        if line.startswith("claim:"):
            if current:  # Save the previous entry if exists
                results.append(current)
                current = {}  # Reset current after appending to results
            current = {"Claim": line.split("claim:")[-1].strip(), "Path_to_text": input_file}
        elif line.startswith("evidence:") and "Claim" in current:
            current["Supporting Evidence"] = line.split("evidence:")[-1].strip()
        elif line.startswith("cites:") and "Supporting Evidence" in current:
            current["Cites"] = line.split("cites:")[-1].strip()
            

    if current:  # Add the last processed entry
        results.append(current)
    return results
def process(folder_path, element):
    
    # List all files in the given directory
    for filename in os.listdir(folder_path):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)
        # Check if the file is a .txt file
        if os.path.isfile(file_path) and filename.endswith('.txt'):
            print(f"Processing {file_path} / {filename}...")
            # Call the processing function on each .txt file
            evidence = run_ontogpt_evidence(file_path,element)
            for i, k in enumerate(evidence):
                print(f"Evidence {i + 1}:")
                for key, value in k.items():
                    print(f"{key}: {value}")
                print("\n" + "-"*50 + "\n")  # Separator between evidences


In [3]:
# Specify the directory to search for PDF files
directory_path = '/Users/AliTarik/Documents/EFSA_pdf'
directory_path_out = '/Users/AliTarik/OntoGPT_FHC_EFSA/text_v3'

In [24]:

# Get the list of PDF file paths
pdf_paths = list_pdf_files(directory_path)

# Print the list of PDF paths
for path in pdf_paths:
    print(path)
    text = pdf_text(path)
    extracted_text = extract_section(text, "ASSESSMENT", "CONCLUSIONS")

    # Clean the text
    cleaned_text = clean_efsa_journal_references(extracted_text)
    save_text_to_file(cleaned_text, os.path.join(directory_path_out, os.path.basename(path).replace('.pdf', '.txt')))
    # print(cleaned_text)


/Users/AliTarik/Documents/EFSA_pdf/EFSA Journal - 2009 -  - Opinion on the substantiation of health claims related to alpha linolenic acid and maintenance of.pdf
Text successfully saved to /Users/AliTarik/OntoGPT_FHC_EFSA/text_v3/EFSA Journal - 2009 -  - Opinion on the substantiation of health claims related to alpha linolenic acid and maintenance of.txt
/Users/AliTarik/Documents/EFSA_pdf/EFSA Journal - 2011 -  - Scientific Opinion on the substantiation of health claims related to activated charcoal and.pdf
Text successfully saved to /Users/AliTarik/OntoGPT_FHC_EFSA/text_v3/EFSA Journal - 2011 -  - Scientific Opinion on the substantiation of health claims related to activated charcoal and.txt
/Users/AliTarik/Documents/EFSA_pdf/EFSA Journal - 2009 -  - Scientific Opinion on the substantiation of health claims related to biotin and energy‐yielding.pdf
Text successfully saved to /Users/AliTarik/OntoGPT_FHC_EFSA/text_v3/EFSA Journal - 2009 -  - Scientific Opinion on the substantiation of h

In [15]:
directory_path_out = '/Users/AliTarik/Documents/EFSA_claimbyclaim/2009_1209/Claim_Doc'


In [17]:
process(directory_path_out, "schema_claim_v1.Information")


Processing /Users/AliTarik/Documents/EFSA_claimbyclaim/2009_1209/Claim_Doc/ID_117_114.txt / ID_117_114.txt...
Processing /Users/AliTarik/Documents/EFSA_claimbyclaim/2009_1209/Claim_Doc/ID_117_114.txt...
Running command: ontogpt extract -t schema_claim_v1.Information -i /Users/AliTarik/Documents/EFSA_claimbyclaim/2009_1209/Claim_Doc/ID_117_114.txt
Output: ---
input_text: --
raw_completion_output: |-
  claim: Biotin contributes to normal energy-yielding metabolism.

  evidence: Biotin is a cofactor for four carboxylase enzymes which are also involved in energy-yielding metabolism (IoM, 1998; Stryer, 1988).
prompt: |+
  From the text below, extract the following entities in the following format:

  text: <Supporting evidence of a health claim.>


  Text:
  Biotin is a cofactor for four carboxylase enzymes which are also involved in energy-yielding metabolism (IoM, 1998; Stryer, 1988).

  ===

extracted_object:
  claim:
    text: Biotin contributes to normal energy-yielding metabolism.
  e

In [19]:
import os
import subprocess
import json
import re

def run_ontogpt_evidence(input_file, element):
    print(f"Processing {input_file}...")
    # Define the command and arguments
    command = "ontogpt"
    args = ["extract", "-t", element, "-i", input_file]

    # Combine the command and arguments
    full_command = [command] + args
    print("Running command:", ' '.join(full_command))
    
    # Execute the command
    try:
        result = subprocess.run(full_command, check=True, text=True, capture_output=True)
        if result.stdout:
            print("Output:", result.stdout)
            return result.stdout
        if result.stderr:
            print("Errors:", result.stderr)
    except subprocess.CalledProcessError as e:
        print("An error occurred:", e)
    except Exception as e:
        print("An unexpected error occurred:", e)
    
    return None

def extract_claim_and_evidence(output):
    # Extract the claim text
    claim_match = re.search(r'claim:\s*(.+?)\s*evidence:', output, re.DOTALL)
    # Extract the evidence text
    evidence_match = re.search(r'evidence:\s*(.+?)\s*prompt:', output, re.DOTALL)
    
    if claim_match and evidence_match:
        claim = claim_match.group(1).strip()
        evidence = evidence_match.group(1).strip()
        return {
            'claim': claim,
            'evidence': evidence
        }
    return None

def save_claim_evidence_to_json(claim_evidence, output_dir, filename):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    json_path = os.path.join(output_dir, f"{filename}.json")
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(claim_evidence, json_file, indent=4)
    print(f"Saved JSON to {json_path}")

def sanitize_filename(filename):
    return re.sub(r'[^\w\-_\. ]', '', filename)

def process_claim_doc_folder(claim_doc_folder, output_folder, element):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # List all files in the Claim_Doc directory
    for filename in os.listdir(claim_doc_folder):
        file_path = os.path.join(claim_doc_folder, filename)
        # Check if the file is a .txt file
        if os.path.isfile(file_path) and filename.endswith('.txt'):
            print(f"Processing {file_path} / {filename}...")
            # Call the processing function on each .txt file
            output = run_ontogpt_evidence(file_path, element)
            if output:
                claim_evidence = extract_claim_and_evidence(output)
                if claim_evidence:
                    sanitized_claim = sanitize_filename(claim_evidence['claim'])
                    json_filename = sanitized_claim if sanitized_claim else 'claim_evidence'
                    save_claim_evidence_to_json(claim_evidence, output_folder, json_filename)

def process_all_folders(base_folder, element):
    # Walk through all directories and subdirectories in the base folder
    for root, dirs, files in os.walk(base_folder):
        # Check if the current directory is a document ID folder
        if root.endswith('Claim_Doc'):
            doc_id_folder = os.path.dirname(root)
            claim_extracted_json_folder = os.path.join(doc_id_folder, 'Claim_extracted_Json')
            print(f"Creating Claim_extracted_Json in {doc_id_folder}")
            process_claim_doc_folder(root, claim_extracted_json_folder, element)

# Example usage
base_folder = '/Users/AliTarik/Documents/EFSA_claimbyclaim'
element = 'schema_claim_v1.Information'
process_all_folders(base_folder, element)


Creating Claim_extracted_Json in /Users/AliTarik/Documents/EFSA_claimbyclaim/2009_1210
Processing /Users/AliTarik/Documents/EFSA_claimbyclaim/2009_1210/Claim_Doc/ID_234.txt / ID_234.txt...
Processing /Users/AliTarik/Documents/EFSA_claimbyclaim/2009_1210/Claim_Doc/ID_234.txt...
Running command: ontogpt extract -t schema_claim_v1.Information -i /Users/AliTarik/Documents/EFSA_claimbyclaim/2009_1210/Claim_Doc/ID_234.txt
Output: ---
input_text: --
raw_completion_output: |-
  claim: Calcium contributes to normal energy-yielding metabolism

  evidence: The evidence provided by consensus opinions/reports from authoritative bodies and reviews shows that there is good consensus on the role of calcium in the stabilisation and activity of certain enzymes involved in energy metabolism, such as glyceraldehyde phosphate dehydrogenase, pyruvate dehydrogenase, and -ketoglutarate dehydrogenase. However, the normal activity of these enzymes is not significantly affected by changes in extracellular calciu

In [3]:
def driversetup():
    options = webdriver.ChromeOptions()
    #run Selenium in headless mode
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")
    #overcome limited resource problems
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("lang=en")
    #open Browser in maximized mode
    options.add_argument("start-maximized")
    #disable infobars
    options.add_argument("disable-infobars")
    #disable extension
    options.add_argument('ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors=yes')
    options.add_argument("--disable-extensions")
    options.add_argument("--incognito")
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(options=options)

    return driver
def setup_directories(base_dir='pdfs'):
    if os.path.exists(base_dir):
        shutil.rmtree(base_dir)
    os.makedirs(base_dir)
    os.makedirs(base_dir + '/json', exist_ok=True)
    return base_dir

def getpage(url, c, pdf_dir):
    log = ''
    pdf_name = f'efsa-{datetime.datetime.now().strftime("%Y-%m-%d-at-%H-%M-")}{c}.pdf'
    pdf_path = os.path.join(pdf_dir, pdf_name)

    if os.path.exists(pdf_path):
        log += f'\tFile n {c} skipped, already present. Filename: {pdf_name}\n'
        return log

    driver = driversetup()
    driver.get(url)
    driver.close()

    try:
        filename = max([f for f in os.listdir('.') if f.endswith('.pdf')], key=os.path.getctime)
        shutil.move(filename, pdf_path)
    except Exception as e:
        log += f'File n {c} failed, refer to the following URL: {url}\nError: {str(e)}\n'
    return log
def get_pdfs():
    # Set up directories
    pdfs_dir = setup_directories()

    # Load claims data
    claims = pd.read_excel('data/tree.xlsx')
    res = []
    for k in tqdm(range(len(claims['EFSA Opinion Reference']))):
        i = claims['EFSA Opinion Reference'][k]
        claim = claims['Claim'][k]
        if pd.isna(i):
            continue
        for j in re.split(',', str(i)):
            args = re.split(';|:', j)
            if len(args) == 3:
                res.append('https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.' + args[0].strip() + '.' + args[2].strip() + '#' + claim + '#' + str(k))

    log = ''
    counter = 0

    for i in tqdm(res):
        sep = i.split('#')
        log += getpage(sep[0], counter, pdfs_dir)
        claim_data = {
            'Claim': sep[1],
            'Supporting Evidences': {}
        }
        
        for n in range(1, 9):
            text_key = f'Supporting Evidence Text {n}'
            ref_key = f'Supporting Evidence Reference {n}'
            if not pd.isna(claims.get(text_key, [None])[int(sep[2])]):
                claim_data['Supporting Evidences'][text_key] = claims[text_key][int(sep[2])]
                claim_data['Supporting Evidences'][ref_key] = claims[ref_key][int(sep[2])]

        json_path = os.path.join(pdfs_dir, 'json', f'claim_{counter}.json')
        with open(json_path, 'w') as f:
            json.dump(claim_data, f, indent=4)
        counter += 1

    print(log)
def pdf_text(pdf_path):

    # Extract text from the PDF file
    text = textract.process(
    pdf_path,
    method='pdftotext'
    )
    text = text.decode('utf-8')
    return text
def save_text_to_file(cleaned_text, file_path):
    
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)
    print(f"Text successfully saved to {file_path}")

def read_text_files(input_dir):
    text_data = {}
    for file_name in os.listdir(input_dir):
        if file_name.endswith(".txt"):
            with open(os.path.join(input_dir, file_name), "r", encoding='utf-8') as file:
                text_data[file_name] = file.read()
    return text_data

def parse_text(text):
    # Find the "Relevance of the claimed effect to human health" section
    pattern = r'Relevance of the claimed effect to human health(.*?)DOCUMENTATION PROVIDED TO EFSA'
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1)
    else:
        return None

def extract_id_sections(text):
    # Define all possible stopping points
    stop_patterns = [
        r'Characterisation of the food/constituent',
        r'Relevance of the claimed effect to human health',
        r'Scientific substantiation of the claimed effect',
        r'Panel’s comments on the proposed wording',
        r'Conditions and restrictions of use',
        r'Conditions and possible restrictions of use',
        r'CONCLUSION',
        r'DOCUMENTATION PROVIDED TO EFSA',
        r'[a-zA-Z\s\(\)]+\s*\(ID[\s\d,]+\)'  # New ID headers
    ]
    stop_pattern = '|'.join(stop_patterns)
    
    # Extract subsections with exact sets of IDs and their content
    # pattern = rf'([a-zA-Z\s\(\)\-]+)\s*\((ID[\s\d,]+)\)(.*?)(?=\n(?:{stop_pattern})|\n(?:[a-zA-Z\s\(\)\-]+\s*\(ID[\s\d,]+\))|\Z)'

    pattern = rf'([a-zA-Z\s\(\)\-,]+)\s*\((ID[\s\d,]+)\)(.*?)(?=\n(?:{stop_pattern})|\n(?:[a-zA-Z\s\(\)\-,]+\s*\(ID[\s\d,]+\))|\Z)'

    matches = re.findall(pattern, text, re.DOTALL)
    
    id_sections = {}
    for match in matches:
        title, ids, content = match[0], match[1], match[2]  # Unpack the first three elements
        ids_set = tuple(ids.replace("ID", "").strip().split(","))
        ids_set = tuple(id.strip() for id in ids_set)  # Ensure all IDs are stripped of whitespace
        if ids_set not in id_sections:
            id_sections[ids_set] = []
        id_sections[ids_set].append((title.strip(), content.strip()))
    
    return id_sections

def save_id_sections(id_sections, output_dir, doc_name):
    doc_output_dir = os.path.join(output_dir, doc_name)
    if not os.path.exists(doc_output_dir):
        os.makedirs(doc_output_dir)
    
    for ids_set, sections in id_sections.items():
        ids_str = "_".join(ids_set)
        file_name = f"ID_{ids_str}.txt"
        with open(os.path.join(doc_output_dir, file_name), "w", encoding='utf-8') as file:
            for title, content in sections:
                file.write(f"{title}\n")
                file.write(f"{content}\n\n")

def main(input_dir, output_dir):
    text_data = read_text_files(input_dir)
    for file_name, text in text_data.items():
        relevant_text = parse_text(text)
        if relevant_text:
            id_sections = extract_id_sections(relevant_text)
            doc_name = os.path.splitext(file_name)[0]  # Get the document name without extension
            save_id_sections(id_sections, output_dir, doc_name)
        else:
            print(f"Relevant section not found in the document: {file_name}")

main('/Users/AliTarik/Documents/EFSA_claimbyclaim/2011_2076/parsed.txt' ,'/Users/AliTarik/Documents/EFSA_claimbyclaim' )

NotADirectoryError: [Errno 20] Not a directory: '/Users/AliTarik/Documents/EFSA_claimbyclaim/2011_2076/parsed.txt'

In [None]:
def process_all_folders(base_folder, element):
    # Walk through all directories and subdirectories in the base folder
    for root, dirs, files in os.walk(base_folder):
        # Check if the current directory is a document ID folder
        if root.endswith('Claim_Doc'):
            doc_id_folder = os.path.dirname(root)
            claim_extracted_json_folder = os.path.join(doc_id_folder, 'Claim_extracted_Json')
            print(f"Creating Claim_extracted_Json in {doc_id_folder}")
            process_claim_doc_folder(root, claim_extracted_json_folder, element)