In [21]:
import re
import json
import textract
import os
import openai
import fitz

In [68]:
def extract_text_by_coordinates(pdf_path):
    """Extracts text from a PDF within the specified coordinates on a specific page.
    Coordinates must be a tuple (x0, y0, x1, y1) representing the rectangle from which to extract text.
    """
    doc = fitz.open(pdf_path)
    all_text = []
    coordinates = (10, 75, 550, 750) 
    for page_number in range(doc.page_count):  # Loop through all pages
        page = doc.load_page(page_number)
        text = page.get_textbox(coordinates)  # Get text within the defined rectangle
        if text:  # Check if there is any text extracted
            all_text.append(text)
    
    doc.close()
    return "\n".join(all_text)  # Join all text pieces with a newline

# Example usage
pdf_path = '/Users/AliTarik/Desktop/pdfs/2009_1209.pdf'
coordinates = (10, 75, 550, 750)  # Define your coordinates based on the PDF's layout
text = extract_text_by_coordinates(pdf_path)
print(text)

  
EFSA Journal 2009; 7(9):1209 
 
For citation purposes: EFSA Panel on Dietetic Products, Nutrition and Allergies (NDA); Scientific opinion on the 
substantiation of health claims related to biotin and energy-yielding metabolism (ID 114, 117), macronutrient metabolism 
(ID 113, 114, 117), maintenance of skin and mucous membranes (ID 115), maintenance of hair (ID 118, 2876) and function 
SCIENTIFIC OPINION 
Scientific Opinion on the substantiation of health claims related to biotin 
and energy-yielding metabolism (ID 114, 117), macronutrient metabolism 
(ID 113, 114, 117), maintenance of skin and mucous membranes (ID 115), 
maintenance of hair (ID 118, 2876) and function of the nervous system 
(ID 116) pursuant to Article 13(1) of Regulation (EC) No 1924/20061 
EFSA Panel on Dietetic Products, Nutrition and Allergies (NDA)2 
European Food Safety Authority (EFSA), Parma, Italy 
SUMMARY 
Following a request from the European Commission, the Panel on Dietetic Products, Nutrition and 
Alle

In [69]:

def pdf_text(pdf_path):
    doc = fitz.open(pdf_path)
    all_text = []
    coordinates = (10, 75, 550, 750)
    for page_number in range(doc.page_count):  # Loop through all pages
        page = doc.load_page(page_number)
        text = page.get_textbox(coordinates)  # Get text within the defined rectangle
        if text:  # Check if there is any text extracted
            all_text.append(text)
    
    doc.close()
    return "\n".join(all_text)  # Join all text pieces with a newline

def extract_sections(text):
    # Extract from ASSESSMENT to APPENDICES
    assessment_to_appendices_pattern = re.compile(r'ASSESSMENT(.*?APPENDICES)', re.DOTALL)
    assessment_to_appendices_match = assessment_to_appendices_pattern.search(text)
    if not assessment_to_appendices_match:
        return None, None, None, None
    assessment_to_appendices = assessment_to_appendices_match.group(1)

    # Check for the existence of "Scientific substantiation of the claimed effect"
    sci_substantiation_pattern = re.compile(r'Scientific substantiation of the claimed effect(.*?)(Panel’s comments on the proposed wording|Conditions and possible restrictions of use|CONCLUSIONS)', re.DOTALL)
    sci_substantiation_match = sci_substantiation_pattern.search(assessment_to_appendices)
    if not sci_substantiation_match:
        return None, None, None, None
    scientific_subs = sci_substantiation_match.group(1)

    # Check if "Conditions and possible restrictions of use" exists
    conditions_restrictions_pattern = re.compile(r'Conditions and possible restrictions of use(.*?)CONCLUSIONS', re.DOTALL)
    conditions_restrictions_match = conditions_restrictions_pattern.search(assessment_to_appendices)

    if conditions_restrictions_match:
        conditions_restrictions = conditions_restrictions_match.group(1)
        # conclusions_pattern = re.compile(r'Conditions and possible restrictions of use(.*?)DOCUMENTATION PROVIDED TO EFSA', re.DOTALL)
    else:
        conditions_restrictions = "None"
        # conclusions_pattern = re.compile(r'CONCLUSIONS(.*?)DOCUMENTATION PROVIDED TO EFSA', re.DOTALL)
    conclusions_pattern = re.compile(r'CONCLUSIONS(.*?)DOCUMENTATION PROVIDED TO EFSA', re.DOTALL)
    conclusions_match = conclusions_pattern.search(assessment_to_appendices)
    if conclusions_match:
        conclusions = conclusions_match.group(1)
    else: return None, None, None, None

    references_pattern = re.compile(r'REFERENCES(.*?)APPENDICES', re.DOTALL)
    references_match = references_pattern.search(assessment_to_appendices)
    references = references_match.group(1) if references_match else ""

    return scientific_subs, conditions_restrictions, conclusions, references

def save_to_json(data, output_directory, filename):
    os.makedirs(output_directory, exist_ok=True)
    file_path = os.path.join(output_directory, filename)
    with open(file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

def process_document(file_path,output_dir, file_name):
    text = pdf_text(file_path)
    scientific_subs, conditions_restrictions, conclusions, references = extract_sections(text)

    if scientific_subs is None:
        print("Scientific substantiation section not found.")
        return
    
    data = {
        "scientific_substantiation": scientific_subs,
        "conditions_restrictions": conditions_restrictions,
        "conclusions": conclusions,
        "references": references
    }
    save_to_json(data, output_dir,f'{file_name}.json')

def preprocess_all_pdfs(input_dir, output_dir):
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.pdf'):
                pdf_path = os.path.join(root, file)
                print(f"Processing {pdf_path}...")
                output_filename = os.path.splitext(file)[0]
                process_document(pdf_path, output_dir, output_filename)

# Usage example
file_path = '/Users/AliTarik/Desktop/pdfs'
output = '/Users/AliTarik/data/EFSA_jsons'
preprocess_all_pdfs(file_path,output)


Processing /Users/AliTarik/Desktop/pdfs/2009_1215.pdf...
Processing /Users/AliTarik/Desktop/pdfs/2009_1216.pdf...
Processing /Users/AliTarik/Desktop/pdfs/2010_1754.pdf...
Scientific substantiation section not found.
Processing /Users/AliTarik/Desktop/pdfs/2011_2079.pdf...
Processing /Users/AliTarik/Desktop/pdfs/2009_1209.pdf...


In [72]:


def load_text_from_json(json_file):
    """Load text data from a JSON file."""
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data  # Adjust 'text' depending on your JSON structure.

def query_chatgpt(text, api_key):
    """Send a text query to ChatGPT and return the response."""
    openai.api_key = api_key

    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",  # Ensure this is the correct model for chat completions
      messages=[
          {"role": "user", "content": text}
      ]
    )
    return response['choices'][0]['message']['content'].strip() 

def run_for_scientific(text):
      # Change to your actual JSON file path
    api_key = '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'  # Set your OpenAI API key

    
    prompt = f"""Extract the following without changing the words as I instruct from the given text:\n
                Commmon text:  Piece of text given before the first title. there may exist no text before the first title and in that case write "Not Founded" in the data part.Keep the citations \n
                 Content: This is structure that may occur multiple times in given text. Extract each seperately as the combination of title and context.   
                    Title: The piece of text given as a subtitle in the text. Example titles include "Energy-yielding metabolism (ID 114, 117)" and " Maintenance of skin and mucous membranes (ID 115)."
                    Context: Piece of text provided under the title, offering detailed information related to the title. Citations should be kept. It must end my as example: The Panel concludes that a cause and effect relationship has been established between the dietary intake of biotin and normal macronutrient metabolism. However, the evidence provided does not establish that inadequate intake of biotin leading to impaired macronutrient metabolism occurs in the general EU population.
                Commmon text:  Piece of text given before the first title. there might be none. In that case return Not founded in the dictionary

                Return a dictionary format as follows in the example: 
                    Oxygen transport (ID 250, ID 254, ID 256): In humans, iron is mainly found in porphyrins. In haemproteins (haemoglobin and myoglobin) iron is found in its ferrous state (Fe2+) which allows it to bind oxygen reversibly. Haemoglobin transports oxygen in the erythrocytes to the tissues (Hunt, 2005). It is well established that inadequate dietary iron intake in humans leads to hypochromic and microcytic anemia. The Panel concludes that a cause and effect relationship has been established between the dietary intake of iron and normal oxygen transport to tissues.
                    Formation of red blood cells and haemoglobin (ID 249, ID 1589): In humans, iron is mainly found in porphyrins. In haemproteins (haemoglobin and myoglobin) iron is found in its ferrous state (Fe2+) which allows it to bind oxygen reversibly. Haemoglobin transports oxygen in the erythrocytes to the tissues (Hunt, 2005). It is well established that inadequate dietary iron intake in humans leads to hypochromic and microcytic anaemia. The Panel concludes that a cause and effect relationship has been established between the intake of iron and normal formation of red blood cells and haemoglobin.
                    and so on 
            \n\n + {text}"""
    
    extracted_text = query_chatgpt(prompt, api_key)
    print(extracted_text)

def run_for_conclusion(text):
      # Change to your actual JSON file path
    api_key = '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'  # Set your OpenAI API key

    
    prompt = f"""Extract the following without changing the words as I instruct from the given text:\n
                Commmon text:  Piece of text given before the first title. there may exist no text before the first title and in that case write "Not Founded" in the data part.Keep the citations \n
                 Content: This is structure that may occur multiple times in given text. Extract each seperately as the combination of title and context.   
                    Title: The piece of text given as a subtitle in the text. Example titles include "Energy-yielding metabolism (ID 114, 117)" and " Maintenance of skin and mucous membranes (ID 115)."
                    Context: Piece of text provided under the title, offering detailed information related to the title. Citations should be kept.
                Commmon text:  Piece of text given before the first title. there might be none. In that case return Not founded in the dictionary

                Return a dictionary format as follows in the example: 
                    Oxygen transport (ID 250, ID 254, ID 256): In humans, iron is mainly found in porphyrins. In haemproteins (haemoglobin and myoglobin) iron is found in its ferrous state (Fe2+) which allows it to bind oxygen reversibly. Haemoglobin transports oxygen in the erythrocytes to the tissues (Hunt, 2005). It is well established that inadequate dietary iron intake in humans leads to hypochromic and microcytic anemia. The Panel concludes that a cause and effect relationship has been established between the dietary intake of iron and normal oxygen transport to tissues.
                    Formation of red blood cells and haemoglobin (ID 249, ID 1589): In humans, iron is mainly found in porphyrins. In haemproteins (haemoglobin and myoglobin) iron is found in its ferrous state (Fe2+) which allows it to bind oxygen reversibly. Haemoglobin transports oxygen in the erythrocytes to the tissues (Hunt, 2005). It is well established that inadequate dietary iron intake in humans leads to hypochromic and microcytic anaemia. The Panel concludes that a cause and effect relationship has been established between the intake of iron and normal formation of red blood cells and haemoglobin.
                    and so on 
            \n\n + {text}"""
    
    extracted_text = query_chatgpt(prompt, api_key)
    print(extracted_text)


def main():
    json_file = '/Users/AliTarik/data/EFSA_jsons/2009_1209.json'
    data = load_text_from_json(json_file)
    run_for_scientific(data["scientific_substantiation"])
    run_for_conclusion(data["conclusions"])

if __name__ == '__main__':
    main()

{
    "Energy-yielding metabolism (ID 114, 117)": "Biotin is a cofactor for four carboxylase enzymes which are also involved in energy-yielding metabolism (IoM, 1998; Stryer, 1988). The Panel concludes that a cause and effect relationship has been established between the dietary intake of biotin and normal energy-yielding metabolism. However, the evidence provided does not establish that inadequate intake of biotin leading to impaired energy-yielding metabolism occurs in the general EU population.",
    "Macronutrient metabolism (ID 113, 114, 117)": "Biotin is a cofactor for four carboxylase enzymes which are also involved in macronutrient metabolism (IoM, 1998; Stryer, 1988). The Panel concludes that a cause and effect relationship has been established between the dietary intake of biotin and normal macronutrient metabolism. However, the evidence provided does not establish that inadequate intake of biotin leading to impaired macronutrient metabolism occurs in the general EU populatio