In [1]:
%%capture
%pip install -r requirements.txt

In [12]:
import os
import openai
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep
import fitz

#### Text Preprocessing Action

In [None]:
# Function to convert any PDF to a txt file (same folder, same name)
def convert_pdf_to_text(pdf_folder, pdf_name):
    """Extract the text of a PDF and save it next to the PDF as a ``.txt`` file.

    Args:
        pdf_folder: Directory that contains the PDF; the text file is written
            into the same directory.
        pdf_name: File name of the PDF inside ``pdf_folder``.

    Raises:
        ValueError: If the derived text path would be the same file as the
            input (e.g. ``pdf_name`` already ends in ``.txt``).
    """
    # Swap only the final extension. A plain str.replace(".pdf", ".txt") leaves
    # names such as "Report.PDF" untouched, which makes the output path equal
    # to the input path and overwrites the PDF itself.
    stem, _extension = os.path.splitext(pdf_name)
    txt_filename = stem + ".txt"

    pdf_path = os.path.join(pdf_folder, pdf_name)
    txt_path = os.path.join(pdf_folder, txt_filename)

    if os.path.abspath(txt_path) == os.path.abspath(pdf_path):
        raise ValueError(f"Refusing to overwrite the input file: {pdf_path}")

    doc = fitz.open(pdf_path)
    try:
        # One text chunk per page, joined with newlines.
        text = "\n".join(page.get_text("text") for page in doc)
    finally:
        # Release the underlying file handle even if extraction fails.
        doc.close()

    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Text extracted and saved to {txt_path}")

In [15]:
# Convert Apple Legislation
# Extracts the text of the Apple specification PDF and writes a .txt file with
# the same base name into pdf_folder.
# NOTE(review): pdf_folder is an absolute path on one developer's machine —
# point it at your local copy of the data before running.
pdf_folder = "/Users/beatweichsler/Documents/temp/chemicalgraph_v1/data/apple_regulation"
pdf_name = "Apple_Regulated_Substances_Specification.pdf"
convert_pdf_to_text(pdf_folder, pdf_name)

Text extracted and saved to /Users/beatweichsler/Documents/temp/chemicalgraph_v1/data/apple_regulation/Apple_Regulated_Substances_Specification.txt


#### OpenAI Action

In [None]:
# Load variables from a local .env file into the environment so the API key
# can be read with os.getenv below.
load_dotenv() 
# Module-level OpenAI client; process_gpt reads this global.
# os.getenv returns None when OPENAI_API_KEY is not set.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# Function to call the OpenAI API
def process_gpt(file_prompt, system_msg, model="gpt-3.5-turbo"):
    """Send one system/user message pair to the chat completions API in JSON mode.

    Args:
        file_prompt: Full user prompt (instructions plus the document text).
        system_msg: System message that sets the assistant's role.
        model: Chat model to use. Defaults to ``"gpt-3.5-turbo"`` (the value
            that used to be hard-coded); pass a larger-context model for long
            documents.

    Returns:
        The content of the first choice's message, as returned by the API.
        JSON mode is requested, so this is expected to be a JSON object
        serialized as a string.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": file_prompt},
        ],
        # JSON mode: the reply is constrained to a JSON *object*.
        # NOTE(review): the API documentation asks that the prompt itself
        # mention JSON when this mode is enabled — confirm for new prompts.
        response_format={"type": "json_object"},
    )
    return completion.choices[0].message.content

# Function to take one file and a prompt template, and return a json_object of all the CAS numbers
def extract_cas_numbers(file, prompt_template):
    """Ask the model to extract chemicals and CAS numbers from one text file.

    Args:
        file: Path of the UTF-8 text file to process.
        prompt_template: ``string.Template`` source containing a ``$ctext``
            placeholder, which is replaced by the file's text.

    Returns:
        The raw string returned by ``process_gpt``, or ``None`` when anything
        fails (the error is printed, not raised), so callers must check for
        ``None`` before parsing.
    """
    system_msg = "You are a helpful AI assistant who extracts CAS numbers from documents."
    try:
        # Read explicitly as UTF-8: convert_pdf_to_text writes UTF-8, while the
        # platform default encoding may be something else.
        with open(file, "r", encoding="utf-8") as f:
            text = f.read().rstrip()
        # The file is closed before the (slow) API call is made.
        prompt = Template(prompt_template).substitute(ctext=text)
        return process_gpt(prompt, system_msg=system_msg)
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None.
        print(f"Error processing {file}: {e}")
        return None

# function to create a json file containing the regulation name and the extracted cas numbers
def create_json(file, regulation_name, prompt_template, output_path='output.json'):
    """Extract chemicals from ``file`` and save them under a regulation record.

    Args:
        file: Path of the text file passed to ``extract_cas_numbers``.
        regulation_name: Name stored as ``regulation.name`` in the output.
        prompt_template: Prompt template passed to ``extract_cas_numbers``.
        output_path: Where the JSON document is written. Defaults to
            ``'output.json'`` in the current working directory.

    Returns:
        The dictionary that was written:
        ``{"regulation": {"name": ..., <keys of the model's JSON object>}}``.

    Raises:
        ValueError: If extraction produced no output, the output is not valid
            JSON, or the JSON is neither an object nor a list.
    """
    raw = extract_cas_numbers(file, prompt_template)
    if raw is None:
        # extract_cas_numbers reports failures by returning None.
        raise ValueError(f"No model output for {file}; cannot build the regulation JSON.")

    cas_data = json.loads(raw)
    if isinstance(cas_data, list):
        # A bare list of entries is wrapped under the "chemicals" key.
        cas_data = {"chemicals": cas_data}
    elif not isinstance(cas_data, dict):
        raise ValueError(
            f"Expected a JSON object or list from the model, got {type(cas_data).__name__}."
        )

    # The regulation name always wins over a stray "name" key in the model output.
    regulation = {"name": regulation_name}
    regulation.update((key, value) for key, value in cas_data.items() if key != "name")
    json_output = {"regulation": regulation}

    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_output, json_file, indent=4)
    return json_output

In [74]:
# Prompt template for the extraction call; $ctext is replaced with the document text.
# The example output is a JSON object with a single "chemicals" list: the API call
# requests JSON-object mode and the Neo4j import reads regulation["chemicals"], so a
# bare top-level array (or any other key name) would not fit the pipeline.
regulation_prompt = """
Extract all chemical substances and their CAS numbers from the text below. Follow these strict rules:

### **Rules for Extraction:**
0. **ALWAYS FINISH THE OUTPUT.** Never send partial responses or cut off the JSON structure.
1. **Extract ONLY exact matches from the provided text.** 
2. **Do NOT infer missing CAS numbers or chemical names.**
3. **If a chemical name has multiple CAS numbers, or a CAS number is associated with multiple chemical names, create separate entries for every valid combination.**
4. **Each extracted entry must contain three fields:** 
   - `"name"`: Chemical name (as written in the document).  
   - `"cas"`: CAS number (exact match from the text).  
   - `"ec"`: Leave as an empty string `""` (EC numbers are not provided).
5. **STRICTLY follow the JSON format.** Return a single JSON object whose only key is `"chemicals"`, holding the list of entries. Do NOT merge or group entries. Each unique (name, CAS) combination must be a separate object.

---

### **Beginning of Output Format Example:**

{
    "chemicals": [
        {"name": "Arsenic", "cas": "7440-38-2", "ec": ""},
        {"name": "Nickel", "cas": "7440-02-0", "ec": ""}
    ]
}

### **End of Output Format Example:**

Text to process:
$ctext
"""

In [75]:
# Run the extraction on a subset of the Apple specification text. create_json
# also writes the result to output.json in the current working directory.
# NOTE(review): absolute path on one developer's machine — adjust before running.
file = "/Users/beatweichsler/Documents/temp/chemicalgraph_v1/data/apple_regulation/apple_regulation_subset.txt"
regulation_name = "Apple Regulated Substances Specification 069-0135-M"
# Alternative input: the full specification text written by convert_pdf_to_text.
#file = "/Users/beatweichsler/Documents/temp/chemicalgraph_v1/data/apple_regulation/Apple_Regulated_Substances_Specification.txt"
result = create_json(file, regulation_name, regulation_prompt)
print(result)

{'regulation': {'name': 'Apple Regulated Substances Specification 069-0135-M', 'chemicals': [{'name': 'Antimony', 'cas': '1309-64-4', 'ec': ''}, {'name': 'Arsenic compounds', 'cas': '7440-38-2', 'ec': ''}, {'name': 'Asbestos', 'cas': '1332-21-4', 'ec': ''}, {'name': 'Asbestos', 'cas': '12001-28-4', 'ec': ''}, {'name': 'Asbestos', 'cas': '12001-29-5', 'ec': ''}, {'name': 'Asbestos', 'cas': '12172-73-5', 'ec': ''}, {'name': 'Asbestos', 'cas': '77536-66-4', 'ec': ''}, {'name': 'Asbestos', 'cas': '77536-67-5', 'ec': ''}, {'name': 'Asbestos', 'cas': '77536-68-6', 'ec': ''}, {'name': 'Asbestos', 'cas': '132207-32-0', 'ec': ''}, {'name': 'Benzene', 'cas': '71-43-2', 'ec': ''}, {'name': 'Beryllium compounds', 'cas': '7440-41-7', 'ec': ''}, {'name': 'Bisphenol A (BPA)', 'cas': '80-05-7', 'ec': ''}, {'name': 'Bromine', 'cas': '7726-95-6', 'ec': ''}, {'name': 'Cadmium compounds', 'cas': '7440-43-9', 'ec': ''}, {'name': 'Chlorine', 'cas': '7782-50-5', 'ec': ''}, {'name': 'Dimethylfumarate (DMFu)',

#### Neo4j Action

In [79]:
class Neo4jChemicalGraph:
    """Small wrapper around a Neo4j driver for loading regulations and chemicals.

    Graph written by this class:
        (:ChemicalName {name})-[:BELONGS_TO]->(:Chemical {CAS, EC})
        (:Chemical)-[:REGULATED_BY]->(:Regulation {name})
        (:ChemicalName)-[:REGULATED_BY]->(:Regulation {name})

    Instances can be used as context managers so the driver is closed on exit.
    """

    def __init__(self, uri, user, password):
        """Open a driver for ``uri`` using basic ``(user, password)`` auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never suppress exceptions raised inside the with-block

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    @staticmethod
    def _clean(value):
        """Strip surrounding whitespace from strings and map empty values to None."""
        if isinstance(value, str):
            value = value.strip()
        return value or None

    def _run(self, query, **params):
        """Run one Cypher statement in its own session."""
        with self.driver.session() as session:
            session.run(query, **params)

    def create_regulation(self, name):
        """Create a regulation node."""
        query = """
        MERGE (r:Regulation {name: $name})
        RETURN r
        """
        self._run(query, name=name)

    def create_chemical(self, name, cas=None, ec=None):
        """Create a Chemical node if CAS is provided, otherwise just create a ChemicalName node.

        Args:
            name: Chemical name; always merged as a ``ChemicalName`` node.
            cas: CAS number. When truthy, a ``Chemical`` node keyed by it is
                merged and the name is linked to it with ``BELONGS_TO``.
            ec: Optional EC number. Empty strings are treated as missing so no
                blank placeholder is stored; an existing node without an EC
                value gets it filled in on a later import.
        """
        ec = self._clean(ec)
        if cas:
            query = """
            MERGE (c:Chemical {CAS: $cas})
            ON CREATE SET c.EC = $ec
            ON MATCH SET c.EC = coalesce(c.EC, $ec)
            MERGE (cn:ChemicalName {name: $name})
            MERGE (cn)-[:BELONGS_TO]->(c)
            RETURN c, cn
            """
            self._run(query, name=name, cas=cas, ec=ec)
        else:
            # Only create ChemicalName if CAS is missing
            query = """
            MERGE (cn:ChemicalName {name: $name})
            RETURN cn
            """
            self._run(query, name=name)

    def link_regulation_to_chemical(self, reg_name, cas):
        """Link a regulation to an existing chemical."""
        query = """
        MATCH (c:Chemical {CAS: $cas}), (r:Regulation {name: $reg_name})
        MERGE (c)-[:REGULATED_BY]->(r)
        """
        self._run(query, cas=cas, reg_name=reg_name)

    def link_regulation_to_chemical_name(self, reg_name, chem_name):
        """Link a regulation to an existing chemical name."""
        query = """
        MATCH (cn:ChemicalName {name: $chem_name}), (r:Regulation {name: $reg_name})
        MERGE (cn)-[:REGULATED_BY]->(r)
        """
        self._run(query, chem_name=chem_name, reg_name=reg_name)

    def import_json(self, json_data):
        """Import JSON into Neo4j while handling cases with missing CAS/EC numbers.

        Args:
            json_data: ``{"regulation": {"name": ..., "chemicals": [...]}}``
                where each chemical is a dict with ``name`` and optional
                ``cas`` / ``ec`` keys. A missing or empty ``chemicals`` list
                imports only the regulation node.

        Entries without a usable name are skipped with a printed message,
        because a ``ChemicalName`` node cannot be merged without one.
        """
        regulation = json_data["regulation"]
        regulation_name = regulation["name"]
        self.create_regulation(regulation_name)

        for chemical in regulation.get("chemicals") or []:
            chem_name = self._clean(chemical.get("name"))
            cas = self._clean(chemical.get("cas"))  # CAS might be missing
            ec = self._clean(chemical.get("ec"))    # EC might be missing

            if not chem_name:
                print(f"Skipping entry without a chemical name: {chemical}")
                continue

            # Create chemical and chemical name if they do not exist
            self.create_chemical(chem_name, cas, ec)

            # Link regulation to existing chemical (if CAS is provided) or chemical name (in case only name is provided)
            if cas:
                self.link_regulation_to_chemical(regulation_name, cas)
            self.link_regulation_to_chemical_name(regulation_name, chem_name)

In [81]:
# NEO4J AURA configuration
# Connection settings are read from environment variables; os.getenv returns
# None for any variable that is not set.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Optional connectivity check, currently disabled.
# # Verify Connectivity to NEO4J
# with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) as driver:
#     driver.verify_connectivity()

# Earlier raw-driver set-up, kept for reference; the wrapper class below is used instead.
# # Set Up Database
# db = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Graph wrapper used by the import cell; call db.close() when finished.
db = Neo4jChemicalGraph(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

#### Helper Functions

In [83]:
# Load the extraction result previously written by create_json ...
with open('output.json', 'r') as json_file:
    json_data = json.load(json_file)

# ... and merge the regulation, its chemicals and their links into Neo4j.
db.import_json(json_data)