In [6]:
# pip install openai==0.28

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting openai==0.28
  Using cached openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Using cached openai-0.28.0-py3-none-any.whl (76 kB)
[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.31.0
    Uninstalling openai-1.31.0:
      Successfully uninstalled openai-1.31.0


In [26]:
import re
import json
import textract
import os
import openai
import fitz

In [68]:

def pdf_text(pdf_path, coordinates=(10, 75, 550, 750)):
    """Extract the text inside a fixed rectangle from every page of a PDF.

    Args:
        pdf_path: Path of the PDF; passed straight to ``fitz.open``.
        coordinates: Rectangle handed to ``page.get_textbox`` on every page.
            The default is the clip box this notebook has always used
            (presumably chosen to skip running headers/footers -- TODO confirm).

    Returns:
        The non-empty per-page texts joined with newlines, or an empty string
        when no page yields any text.
    """
    doc = fitz.open(pdf_path)
    try:
        page_texts = []
        for page_number in range(doc.page_count):
            text = doc.load_page(page_number).get_textbox(coordinates)
            if text:  # skip pages with nothing inside the rectangle
                page_texts.append(text)
    finally:
        # Release the document even if a page fails to load or extract.
        doc.close()
    return "\n".join(page_texts)  # Join all text pieces with a newline

def extract_sections(text):
    """Slice the report text into the four sections the pipeline keeps.

    Only the span from 'INFORMATION AS PROVIDED IN THE CONSOLIDATED LIST'
    up to and including the first following 'APPENDICES' is searched.

    Args:
        text: Full text of one document.

    Returns:
        A 4-tuple ``(scientific_subs, conditions_restrictions, conclusions,
        references)``. All four are ``None`` when the outer span, the
        scientific-substantiation section or the conclusions section cannot
        be found. A missing conditions section is reported as the string
        ``"None"`` and a missing references section as ``""``.
    """
    not_found = (None, None, None, None)

    # Outer window: everything after the consolidated-list heading, through APPENDICES.
    window_match = re.search(
        r'INFORMATION AS PROVIDED IN THE CONSOLIDATED LIST(.*?APPENDICES)',
        text,
        flags=re.DOTALL,
    )
    if window_match is None:
        return not_found
    window = window_match.group(1)

    # The substantiation section ends at whichever following heading comes first.
    substantiation_match = re.search(
        r'Scientific substantiation of the claimed effect(.*?)(Panel’s comments on the proposed wording|Conditions and possible restrictions of use|CONCLUSIONS)',
        window,
        flags=re.DOTALL,
    )
    if substantiation_match is None:
        return not_found

    conclusions_match = re.search(
        r'CONCLUSIONS(.*?)DOCUMENTATION PROVIDED TO EFSA',
        window,
        flags=re.DOTALL,
    )
    if conclusions_match is None:
        return not_found

    # Optional sections: fall back to placeholder values when absent.
    conditions_match = re.search(
        r'Conditions and possible restrictions of use(.*?)CONCLUSIONS',
        window,
        flags=re.DOTALL,
    )
    references_match = re.search(r'REFERENCES(.*?)APPENDICES', window, flags=re.DOTALL)

    conditions_restrictions = conditions_match.group(1) if conditions_match else "None"
    references = references_match.group(1) if references_match else ""

    return (
        substantiation_match.group(1),
        conditions_restrictions,
        conclusions_match.group(1),
        references,
    )

def save_to_json(data, output_directory, filename):
    """Write ``data`` as indented JSON to ``output_directory/filename``.

    The directory is created if it does not exist. The file is written as
    UTF-8 so that it matches the encoding ``load_text_from_json`` reads with,
    instead of depending on the platform default.

    Args:
        data: JSON-serialisable object.
        output_directory: Directory that will contain the file.
        filename: Name of the JSON file inside ``output_directory``.
    """
    os.makedirs(output_directory, exist_ok=True)
    file_path = os.path.join(output_directory, filename)
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

def process_document(pdf_path):
    """Extract the report sections from one PDF and save them as JSON beside it.

    The JSON file takes the PDF's base name with a ``.json`` extension and
    holds the keys ``scientific_substantiation``, ``conditions_restrictions``,
    ``conclusions`` and ``references``. Nothing is written when
    ``extract_sections`` reports a missing section.

    Args:
        pdf_path: Path of the PDF to process.
    """
    # dirname() is '' for a bare filename, and save_to_json's os.makedirs('')
    # raises FileNotFoundError, so fall back to the current directory.
    output_dir = os.path.dirname(pdf_path) or '.'
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    output_filename = stem + '.json'

    text = pdf_text(pdf_path)
    scientific_subs, conditions_restrictions, conclusions, references = extract_sections(text)

    if scientific_subs is None:
        # extract_sections returns all-None whenever any required section is absent.
        print("Scientific substantiation section not found.")
        return

    data = {
        "scientific_substantiation": scientific_subs,
        "conditions_restrictions": conditions_restrictions,
        "conclusions": conclusions,
        "references": references
    }
    save_to_json(data, output_dir, output_filename)

def preprocess_all_pdfs(input_dir):
    """Run ``process_document`` on every ``.pdf`` file found under ``input_dir``.

    The tree is traversed with ``os.walk``, so files in nested folders are
    included. Each path is printed before it is processed.

    Args:
        input_dir: Root directory to search.
    """
    for folder, _subfolders, filenames in os.walk(input_dir):
        pdf_names = (name for name in filenames if name.endswith('.pdf'))
        for pdf_name in pdf_names:
            pdf_path = os.path.join(folder, pdf_name)
            print(f"Processing {pdf_path}...")
            process_document(pdf_path)

def main(input_dir='/EFSA_DOCUMENTATION'):
    """Preprocess every PDF under ``input_dir`` into per-document JSON files.

    Args:
        input_dir: Root directory to scan. Defaults to the location this
            notebook was written against; pass a different path instead of
            editing the function.
    """
    preprocess_all_pdfs(input_dir)



In [69]:
# Kick off the run. NOTE(review): `main` is defined twice in this notebook, so
# this call executes whichever definition was run most recently in the kernel.
main()

Processing /Users/AliTarik/Documents/EFSA_DOCUMENTATION/2010_1759/2010_1759.pdf...
Processing /Users/AliTarik/Documents/EFSA_DOCUMENTATION/2010_1760/2010_1760.pdf...
Processing /Users/AliTarik/Documents/EFSA_DOCUMENTATION/2010_1758/2010_1758.pdf...
Processing /Users/AliTarik/Documents/EFSA_DOCUMENTATION/2010_1798/2010_1798.pdf...
Processing /Users/AliTarik/Documents/EFSA_DOCUMENTATION/2010_1796/2010_1796.pdf...
Processing /Users/AliTarik/Documents/EFSA_DOCUMENTATION/2010_1764/2010_1764.pdf...
Processing /Users/AliTarik/Documents/EFSA_DOCUMENTATION/2010_1763/2010_1763.pdf...
Processing /Users/AliTarik/Documents/EFSA_DOCUMENTATION/2010_1797/2010_1797.pdf...
Scientific substantiation section not found.


In [104]:


def load_text_from_json(json_file):
    """Read a UTF-8 JSON file and return its parsed content unchanged.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever object the file decodes to.
    """
    with open(json_file, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def query_chatgpt(text, api_key):
    """Send ``text`` as a single user message and return the model's reply.

    Sets ``openai.api_key`` as a side effect before issuing the request.

    Args:
        text: Content of the user message.
        api_key: OpenAI API key to use for the call.

    Returns:
        The content of the first choice, stripped of surrounding whitespace.
    """
    openai.api_key = api_key

    request = {
        "model": "gpt-4o",  # Ensure this is the correct model for chat completions
        "messages": [{"role": "user", "content": text}],
        "max_tokens": 4096,
    }
    completion = openai.ChatCompletion.create(**request)

    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

def run_for_scientific(text):
    """Ask the model to split a scientific-substantiation section by claim title.

    The API key is taken from the ``OPENAI_API_KEY`` environment variable so
    that no secret has to be pasted into the notebook; when the variable is
    unset or empty the original placeholder string is used.

    Args:
        text: Scientific-substantiation text of one document.

    Returns:
        The raw string returned by ``query_chatgpt``. The prompt asks for a
        dictionary mapping titles to their text.
    """
    api_key = os.environ.get("OPENAI_API_KEY") or 'set your api key'

    # NOTE(review): the example dictionary in the prompt has no comma between
    # its entries, while process_folder parses the reply with json.loads.
    prompt = f"""Extract the following without changing the words as I instruct from the given text:\n
                
                    Title: The piece of text given as a subtitle in the text. Example titles include "Energy-yielding metabolism (ID 114, 117)" and " Maintenance of skin and mucous membranes (ID 115)". If there is no title matches the description, then set the title as "Conclusion". Title either must be in the format of "Title (ID 123, ID 456)" which exists in the text or if there is no titles "Conclusion".\n
                    Context: Piece of text provided under the title, offering detailed information related to the title. Citations should be kept. It must end by similar example: "The Panel concludes that a cause and effect relationship has been established between the dietary intake of biotin and normal macronutrient metabolism. However, the evidence provided does not establish that inadequate intake of biotin leading to impaired macronutrient metabolism occurs in the general EU population" or like "The Panel concludes that a cause and effect relationship has been established between the consumption of live yoghurt cultures in yoghurt and improved digestion of lactose in yoghurt in individuals with lactose maldigestion.". 
                                In case there is no detected title, return the whole text privided for you as input from the first word to last. take the text until the end in that case without eny restriction.


                Do not write antyhing else other than the text you are asked to extract.
                do not put anything like ``` or "json"
                Return a dictionary format as follows in the example and nothing else: 
                {{
                    "title you find": "In humans, iron is mainly found in porphyrins. In haemproteins (haemoglobin and myoglobin) iron is found in its ferrous state (Fe2+) which allows it to bind oxygen reversibly. Haemoglobin transports oxygen in the erythrocytes to the tissues (Hunt, 2005). It is well established that inadequate dietary iron intake in humans leads to hypochromic and microcytic anemia. The Panel concludes that a cause and effect relationship has been established between the dietary intake of iron and normal oxygen transport to tissues."
                    "title you find": "In humans, iron is mainly found in porphyrins. In haemproteins (haemoglobin and myoglobin) iron is found in its ferrous state (Fe2+) which allows it to bind oxygen reversibly. Haemoglobin transports oxygen in the erythrocytes to the tissues (Hunt, 2005). It is well established that inadequate dietary iron intake in humans leads to hypochromic and microcytic anaemia. The Panel concludes that a cause and effect relationship has been established between the intake of iron and normal formation of red blood cells and haemoglobin."
                    and so on 
                    }}
            \n\n + {text}"""

    return query_chatgpt(prompt, api_key)

def run_for_conclusion(text):
    """Ask the model to split a conclusions section by claim title.

    The API key is taken from the ``OPENAI_API_KEY`` environment variable so
    that no secret has to be pasted into the notebook; when the variable is
    unset or empty the original placeholder string is used.

    Args:
        text: Conclusions text of one document.

    Returns:
        The raw string returned by ``query_chatgpt``. The prompt asks for a
        dictionary mapping titles to their text.
    """
    api_key = os.environ.get("OPENAI_API_KEY") or 'set your api key'

    # NOTE(review): the example dictionary in the prompt has no comma between
    # its entries, while process_folder parses the reply with json.loads.
    prompt = f"""Extract the following without changing the words as I instruct from the given text:\n
                 
                    Title: The piece of text given as a subtitle in the text. Example titles include "Energy-yielding metabolism (ID 114, 117)" and " Maintenance of skin and mucous membranes (ID 115)" . If there is no title return "Conclusion" as title\n
                    Context: Piece of text provided under the title, offering detailed information related to the title. Citations should be kept.In case there is no detected title, return the full text given to you as context. it ususally ends by the information about the target population. in case there is no detected title, return the whole text privided for you as input from the first word to last. take the text until the end in that case.
                
                Do not write antyhing else other than the text you are asked to extract.
                do not put anything like ``` or "json"
                Return a dictionary format as follows in the example and nothing else: 
                {{
                    "Oxygen transport (ID 250, ID 254, ID 256)": "In humans, iron is mainly found in porphyrins. In haemproteins (haemoglobin and myoglobin) iron is found in its ferrous state (Fe2+) which allows it to bind oxygen reversibly. Haemoglobin transports oxygen in the erythrocytes to the tissues (Hunt, 2005). It is well established that inadequate dietary iron intake in humans leads to hypochromic and microcytic anemia. The Panel concludes that a cause and effect relationship has been established between the dietary intake of iron and normal oxygen transport to tissues."
                    "Formation of red blood cells and haemoglobin (ID 249, ID 1589)": "In humans, iron is mainly found in porphyrins. In haemproteins (haemoglobin and myoglobin) iron is found in its ferrous state (Fe2+) which allows it to bind oxygen reversibly. Haemoglobin transports oxygen in the erythrocytes to the tissues (Hunt, 2005). It is well established that inadequate dietary iron intake in humans leads to hypochromic and microcytic anaemia. The Panel concludes that a cause and effect relationship has been established between the intake of iron and normal formation of red blood cells and haemoglobin."
                    and so on 
                    }}
            \n\n + {text}"""

    return query_chatgpt(prompt, api_key)

def save_output_to_file(directory,key, scientific_text, conditions_text, conclusion_text, references_text):
    """Write one claim's four text sections to ``<directory>/claims/<key>.txt``.

    Spaces, slashes and colons in ``key`` are replaced with underscores to
    form the file name. The ``claims`` sub-folder is created on demand and
    the file is written as UTF-8.

    Args:
        directory: Folder that holds (or will hold) the ``claims`` sub-folder.
        key: Claim title used to derive the file name.
        scientific_text: Text placed under "Scientific Substantiation:".
        conditions_text: Text placed under "Conditions and Restrictions:".
        conclusion_text: Text placed under "Conclusions:".
        references_text: Text placed under "References:".
    """
    claims_directory = os.path.join(directory, 'claims')
    os.makedirs(claims_directory, exist_ok=True)  # Create the subfolder if it does not exist

    # Map every character that is awkward in a file name to an underscore.
    safe_key = key.translate(str.maketrans({' ': '_', '/': '_', ':': '_'}))
    target_path = os.path.join(claims_directory, f"{safe_key}.txt")

    with open(target_path, 'w', encoding='utf-8') as handle:
        handle.writelines((
            f"Scientific Substantiation:\n{scientific_text}\n\n",
            f"Conditions and Restrictions:\n{conditions_text}\n\n",
            f"Conclusions:\n{conclusion_text}\n\n",
            f"References:\n{references_text}",
        ))

def process_folder(folder_path):
    """Turn every JSON file in ``folder_path`` into per-claim text files.

    For each ``.json`` file the scientific-substantiation and conclusions
    texts are sent to the model, the two replies are parsed as JSON
    dictionaries, and one file per claim is written via
    ``save_output_to_file``. A file whose reply cannot be parsed is reported
    and skipped, so a bad reply can neither crash the run nor be silently
    replaced by the dictionaries left over from the previous file.

    When the conclusions reply carries a single "Conclusion" entry (matched
    case-insensitively, because the prompts ask for a capitalised
    "Conclusion"), it is paired with the first scientific entry and saved
    under the key ``"conclusion"``. Otherwise each scientific title is paired
    with the conclusion stored under the same title.

    Args:
        folder_path: Directory containing the JSON files; only its direct
            entries are inspected.
    """
    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.json'):
            continue

        json_file_path = os.path.join(folder_path, file_name)
        data = load_text_from_json(json_file_path)

        output_scientific = run_for_scientific(data["scientific_substantiation"])
        output_conclusion = run_for_conclusion(data["conclusions"])

        # Parse both replies before deciding, so both problems get reported.
        dictionary_scientific = _parse_model_dict(output_scientific, "Error in scientific substantiation")
        dictionary_conclusion = _parse_model_dict(output_conclusion, "Error in conclusion")
        if dictionary_scientific is None or dictionary_conclusion is None:
            continue

        conditions_text = data['conditions_restrictions']
        references_text = data['references']

        conclusion_key = next(
            (title for title in dictionary_conclusion if title.strip().lower() == "conclusion"),
            None,
        )

        if conclusion_key is not None:
            # Untitled document: pair the conclusion with the first scientific
            # entry; tolerate an empty scientific dictionary.
            scientific_text = next(
                iter(dictionary_scientific.values()),
                "No scientific substantiation data available",
            )
            conclusion_text = dictionary_conclusion[conclusion_key]
            save_output_to_file(folder_path, "conclusion", scientific_text, conditions_text, conclusion_text, references_text)
        else:
            for key, scientific_text in dictionary_scientific.items():
                conclusion_text = dictionary_conclusion.get(key, "No conclusion data available")
                save_output_to_file(folder_path, key, scientific_text, conditions_text, conclusion_text, references_text)


def _parse_model_dict(raw_output, error_label):
    """Parse a model reply as a JSON object; print a report and return None on failure."""
    try:
        parsed = json.loads(raw_output)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        parsed = None
    if not isinstance(parsed, dict):
        print(error_label)
        print(raw_output)
        return None
    return parsed


def main(root_directory='/EFSA_DOCUMENTATION'):
    """Run ``process_folder`` on every immediate sub-folder of ``root_directory``.

    Args:
        root_directory: Directory whose direct sub-folders each hold the JSON
            files of one document. Defaults to the location this notebook
            was written against.
    """
    # os.walk yields nothing for a missing or unreadable directory, so give
    # next() a default instead of letting StopIteration escape.
    _, subdirs, _ = next(os.walk(root_directory), (root_directory, [], []))
    for subdir in subdirs:
        process_folder(os.path.join(root_directory, subdir))

# Run the claim-extraction pass; this calls the `main` defined just above,
# which shadows the earlier `main` defined in the preprocessing cell.
main()
