In [1]:
%%capture
%pip install -r requirements.txt

In [2]:
import os
import openai
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep
import fitz
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_neo4j import Neo4jGraph, GraphCypherQAChain

## 1. Text Preprocessing Action

#### Defining Functions

In [4]:
# Function to convert any PDF to a txt file (same folder, same name)
def convert_pdf_to_text(pdf_folder, pdf_name):
    txt_filename = pdf_name.replace(".pdf", ".txt")

    pdf_path = os.path.join(pdf_folder, pdf_name)
    txt_path = os.path.join(pdf_folder, txt_filename)

    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])

    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)
    
    print(f"Text extracted and saved to {txt_path}")



# Function to split one file into files with <40k characters each
def split_files(file_path):

    # Get the filename without the extension
    folder_name = os.path.splitext(file_path)[0]

    # Create a new folder with the same name as the file
    os.makedirs(folder_name, exist_ok=True)

    # Read the content of the original text file
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    
    print(text[100])

    # Define the max characters per file
    max_chars = 10000

    # Split the text into chunks of max_chars
    for i in range(0, len(text), max_chars):
        chunk = text[i:i + max_chars]
        output_file = os.path.join(folder_name, f"part_{i//max_chars + 1}.txt")
        with open(output_file, "w", encoding="utf-8") as out_f:
            out_f.write(chunk)

    print(f"Split {file_path} into {folder_name}/part_X.txt files successfully.")



# Function to find all files of one type in a folder
def find_files(folder_path, type):
    pdf_files = []
    
    # Check if the folder path exists
    if not os.path.exists(folder_path):
        print(f"Error: Folder '{folder_path}' does not exist.")
        return pdf_files
    
    # List all files in the directory
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        
        # Check if it's a file (not a directory) and has .pdf extension
        if os.path.isfile(file_path) and file.lower().endswith(type):
            pdf_files.append(file_path)
    
    return pdf_files



# Function to create sub-folders with smaller text files <40k characters for each PDF file 
def split_pdfs(folder_path):
    pdf_files = find_files(folder_path=folder_path, type=".pdf")

    for file in pdf_files:
        convert_pdf_to_text(folder_path, file)

        file_txt = os.path.splitext(file)[0] + ".txt"
        print(file_txt)

        split_files(file_txt)

        print("Created subfolders with smaller text files for file:" + file)

#### Calling all functions

This will search for all PDFs in the data/regulations folder and create sub-folders for each. Within each sub-folder, the text is split up into smaller .txt files with <40k characters so it can be sent to GPT-3.5 turbo in one call.

In [5]:
folder_path = "/Users/oskarribbe/Documents/Master Thesis/ChemicalGraphRAG/data/regulations"
split_pdfs(folder_path)

Text extracted and saved to /Users/oskarribbe/Documents/Master Thesis/ChemicalGraphRAG/data/regulations/REACH.txt
/Users/oskarribbe/Documents/Master Thesis/ChemicalGraphRAG/data/regulations/REACH.txt
d
Split /Users/oskarribbe/Documents/Master Thesis/ChemicalGraphRAG/data/regulations/REACH.txt into /Users/oskarribbe/Documents/Master Thesis/ChemicalGraphRAG/data/regulations/REACH/part_X.txt files successfully.
Created subfolders with smaller text files for file:/Users/oskarribbe/Documents/Master Thesis/ChemicalGraphRAG/data/regulations/REACH.pdf
Text extracted and saved to /Users/oskarribbe/Documents/Master Thesis/ChemicalGraphRAG/data/regulations/Apple_069-0135-M.txt
/Users/oskarribbe/Documents/Master Thesis/ChemicalGraphRAG/data/regulations/Apple_069-0135-M.txt
o
Split /Users/oskarribbe/Documents/Master Thesis/ChemicalGraphRAG/data/regulations/Apple_069-0135-M.txt into /Users/oskarribbe/Documents/Master Thesis/ChemicalGraphRAG/data/regulations/Apple_069-0135-M/part_X.txt files successf

## 2. OpenAI Action

#### Key Validation

In [None]:
load_dotenv() 
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

#### Helper Functions

In [79]:
# Function to call the OpenAI API
def process_gpt(file_prompt, system_msg):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": file_prompt},
        ],
        temperature=0,
        response_format={"type": "json_object"},
    )
    return completion.choices[0].message.content

# Function to take one file and a prompt template, and return a json_object of all the CAS numbers
def extract_cas_numbers(file, prompt_template, regulation_name):
    system_msg = "You are a helpful AI assistant who extracts CAS numbers from documents."
    try:
        with open(file, "r") as f:
            text = f.read().rstrip()
            prompt = Template(prompt_template).substitute(ctext=text, cregulation_name=regulation_name)
            result = process_gpt(prompt, system_msg=system_msg)
            return result
    except Exception as e:
        print(f"Error processing {file}: {e}")

def create_json(file, regulation_name, prompt_template):
    # extract_cas_numbers is already returning a parsed dictionary
    cas_data = extract_cas_numbers(file, prompt_template)
    
    json_output = {
        "regulation": {
            "name": regulation_name,
            "chemicals": cas_data  # Add the parsed data as "chemicals" field
        }
    }

    with open('output.json', 'w') as json_file:
        json.dump(json_output, json_file, indent=4)
    return json_output

def extract_cas_from_folder(folder, regulation_prompt, regulation_name, output_file="output.json"):
    chemicals = {'chemicals': []}
    for file in glob.glob(folder + "/*.txt"):
        print(file)
        try:
            latest_result = extract_cas_numbers(file, regulation_prompt, regulation_name)
            latest_result_dict = json.loads(latest_result)

            print(type(latest_result_dict))
            print(latest_result_dict)

            chemicals = {
                'chemicals': latest_result_dict['chemicals'] + chemicals['chemicals']
            }

            print(type(chemicals))
            print(chemicals)

        except Exception as e:
            print(e)

    with open(output_file, 'w') as f:
        json.dump(chemicals, f, indent=4)

#### Prompts

*this one works for the apple legislation, might need to be adjusted for different docs*

In [80]:
regulation_prompt = """
Extract all chemical names and their CAS numbers from the text below. Follow these strict rules:

### **Rules for Extraction:**
0. **NEVER STOP EARLY**. Never stop extracting before the end of the text, and never stop generating the JSON before mentioning all pairs.
1. **NEVER HALLUCINATE**. Only find real matches between **chemical names** and **CAS numbers**.
2. **Do NOT infer missing CAS numbers or chemical names.**
3. **FIND AS MANY TUPLES AS POSSIBLE**. Your goal is to maximise the number of correct (name, CAS) combinations. Find all of them!
4. If a chemical name has multiple CAS numbers create separate entries for every valid combination.
5. If a CAS number has multiple chemical names (or names are indicated by "OR"), create separate entries for every valid combination.
6. **STRICTLY follow the JSON format.** Do NOT merge or group entries. Each unique (name, CAS) combination must be a separate object.
7. The regulation name for all entries is "$cregulation_name".
---

### **Beginning of Output Format Example:**

{
    "chemicals": [
        {
            "chemical_name": "Benzene",
            "CAS": "71-43-2",
            "regulation": "$cregulation_name"
        },
        {
            "chemical_name": "Mercury",
            "CAS": "7439-97-6",
            "regulation": "$cregulation_name"
        },
        {
            "chemical_name": "Lead chromate",
            "CAS": "7758-97-6",
            "regulation": "$cregulation_name"
        }
    ]
}

### **End of Output Format Example:**

Text to process:
$ctext
"""

#### Pipeline for one doc

Just specify the file and the regulation name and running this cell will produce an output.json

In [52]:
file = "/Users/beatweichsler/Documents/temp/chemicalgraph_v1/data/regulations/Apple_069-0135-M/part_3.txt"
regulation_name = "Apple Regulated Substances Specification 069-0135-M"
#file = "/Users/beatweichsler/Documents/temp/chemicalgraph_v1/data/apple_regulation/Apple_Regulated_Substances_Specification.txt"
result = extract_cas_numbers(file, regulation_prompt, regulation_name)
print(result)

result_json = json.loads(result)
print(type(result))
print(type(result_json))

{
    "chemicals": [
        {
            "chemical_name": "Lead",
            "CAS": "7439-92-1",
            "regulation": "Apple Regulated Substances Specification 069-0135-M"
        },
        {
            "chemical_name": "Lead compounds",
            "CAS": "7439-92-1",
            "regulation": "Apple Regulated Substances Specification 069-0135-M"
        },
        {
            "chemical_name": "Mercury",
            "CAS": "7439-97-6",
            "regulation": "Apple Regulated Substances Specification 069-0135-M"
        },
        {
            "chemical_name": "Mercury compounds",
            "CAS": "7439-97-6",
            "regulation": "Apple Regulated Substances Specification 069-0135-M"
        },
        {
            "chemical_name": "Methyl-phenol compounds",
            "CAS": "95-48-7",
            "regulation": "Apple Regulated Substances Specification 069-0135-M"
        },
        {
            "chemical_name": "Methyl-phenol compounds",
            "CAS": "

#### Pipeline for the entire data/regulations folder

Pipeline scans all .txt files in the folder and concatenates all chemicals 

In [81]:
folder = "/Users/beatweichsler/Documents/temp/chemicalgraph_v1/data/regulations/Apple_069-0135-M"
regulation_name = "Apple Regulated Substances Specification 069-0135-M"

chemicals = {'chemicals': []}
output_file = 'outputc.json'

extract_cas_from_folder(folder, regulation_prompt, regulation_name, output_file)


/Users/beatweichsler/Documents/temp/chemicalgraph_v1/data/regulations/Apple_069-0135-M/part_2.txt
<class 'dict'>
{'chemicals': [{'chemical_name': 'Antimony', 'CAS': '1309-64-4', 'regulation': 'Apple Regulated Substances Specification 069-0135-M'}, {'chemical_name': 'Arsenic compounds', 'CAS': '7440-38-2', 'regulation': 'Apple Regulated Substances Specification 069-0135-M'}, {'chemical_name': 'Benzene', 'CAS': '71-43-2', 'regulation': 'Apple Regulated Substances Specification 069-0135-M'}, {'chemical_name': 'Beryllium compounds', 'CAS': '7440-41-7', 'regulation': 'Apple Regulated Substances Specification 069-0135-M'}, {'chemical_name': 'Bisphenol A (BPA)', 'CAS': '80-05-7', 'regulation': 'Apple Regulated Substances Specification 069-0135-M'}, {'chemical_name': 'Bromine', 'CAS': '7726-95-6', 'regulation': 'Apple Regulated Substances Specification 069-0135-M'}, {'chemical_name': 'Cadmium compounds', 'CAS': '7440-43-9', 'regulation': 'Apple Regulated Substances Specification 069-0135-M'}, 

## 3. NEO4J Action

Class for interacting with the ChemicalDatabase
-> Currently only loads data

In [3]:
class ChemicalDatabase:
    def __init__(self, URI, AUTH):
        self._driver = GraphDatabase.driver(URI, auth=AUTH)

    def close(self):
        self._driver.close()

    def import_json(self, chemicals_data_json):
        """
        Inserts chemical data from a JSON-like structure into Neo4j.

        Args:
            chemicals_data (dict): A dictionary containing a "chemicals" key with a list of chemical dictionaries.
        """
        if not isinstance(chemicals_data_json, dict) or "chemicals" not in chemicals_data_json or not isinstance(chemicals_data_json["chemicals"], list):
            raise ValueError("Invalid chemicals data format. Expected a dictionary with a 'chemicals' list.")

        with self._driver.session() as session:
            for chemical in chemicals_data_json["chemicals"]:
                self._insert_chemical(session, chemical)

    @staticmethod
    def _insert_chemical(session, chemical):
        """
        Inserts a single chemical into Neo4j, handling cases with missing CAS or chemical name.

        Args:
            session (neo4j.Session): The Neo4j session.
            chemical (dict): A dictionary representing a single chemical.
        """
        chemical_name = chemical.get("chemical_name")
        cas = chemical.get("CAS")
        regulation = chemical.get("regulation")

        if not regulation:
            print(f"Skipping chemical due to missing regulation: {chemical}")
            return

        if chemical_name and cas:
            query = """
            MERGE (c:Chemical {cas: $cas})
            MERGE (cn:ChemicalName {name: $name})
            MERGE (r:Regulation {name: $regulation})
            MERGE (cn)-[:IS_NAME_OF]->(c)
            MERGE (cn)-[:IS_REGULATED]->(r)
            MERGE (c)-[:IS_REGULATED]->(r)
            """
            session.run(query, name=chemical_name, cas=cas, regulation=regulation)

        elif chemical_name:
            query = """
            MERGE (cn:ChemicalName {name: $name})
            MERGE (r:Regulation {name: $regulation})
            MERGE (cn)-[:IS_REGULATED]->(r)
            """
            session.run(query, name=chemical_name, regulation=regulation)

        elif cas:
            query = """
            MERGE (c:Chemical {cas: $cas})
            MERGE (r:Regulation {name: $regulation})
            MERGE (c)-[:IS_REGULATED]->(r)
            """
            session.run(query, cas=cas, regulation=regulation)

        else:
            print(f"Skipping chemical due to missing chemical name and CAS: {chemical}")

## Load Data into database

In [4]:
load_status = load_dotenv("auraconnection.txt")
if load_status is False:
    raise RuntimeError('Environment variables not loaded.')

URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))

chemicals_data_json = {
        "chemicals": [
            {
                "chemical_name": "Lead",
                "CAS": "7439-92-1",
                "regulation": "Apple Regulated Substances Specification 069-0135-M"
            },
            {
                "chemical_name": "Lead compounds",
                "CAS": "7439-92-1",
                "regulation": "Apple Regulated Substances Specification 069-0135-M"
            },
            {
                "chemical_name": "Mercury",
                "CAS": "7439-97-6",
                "regulation": "Apple Regulated Substances Specification 069-0135-M"
            },
            {
                "chemical_name": "Mercury compounds",
                "CAS": "7439-97-6",
                "regulation": "Apple Regulated Substances Specification 069-0135-M"
            },
            {
                "chemical_name": "Arsenic",
                "regulation": "Apple Regulated Substances Specification 069-0135-M"
            },
            {
                "CAS": "123-45-6",
                "regulation": "Another regulation"
            },
            {
                "CAS": "7439-97-6",
                "regulation": "Another regulation"
            },
        ]
    }

db = ChemicalDatabase(URI, AUTH)
try:
    db.import_json(chemicals_data_json)
    print("Chemicals inserted successfully.")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    db.close()

Chemicals inserted successfully.


## RAG Test

- uses the Langchain Neo4J wrapper to get a graph object
- Use the Google API (also works with ChatOpenAI) to generate Cypher Queries

In [None]:
load_status = load_dotenv("googleapikey.txt")
if load_status is False:
    raise RuntimeError('Environment variables not loaded.')

API_KEY = os.getenv("API_KEY")

graph = Neo4jGraph(
    url=URI,
    username=AUTH[0],
    password=AUTH[1]
)

chain = GraphCypherQAChain.from_llm(
    ChatGoogleGenerativeAI(temperature=0, model="gemini-2.0-flash",google_api_key=API_KEY, allow_dangerous_requests=True), graph=graph, verbose=True, allow_dangerous_requests=True
)

chain.run("What regulations is chemical 7439-97-6 regulated by?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Chemical {cas: "7439-97-6"})-[:IS_REGULATED]->(r:Regulation)
RETURN r.name
[0m
Full Context:
[32;1m[1;3m[{'r.name': 'Apple Regulated Substances Specification 069-0135-M'}, {'r.name': 'Another regulation'}][0m

[1m> Finished chain.[0m


'Apple Regulated Substances Specification 069-0135-M, Another regulation.'

## Legacy Neo4J functions

In [None]:
class Neo4jChemicalGraph:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()

    def create_regulation(self, name):
        """Create a regulation node."""
        query = """
        MERGE (r:Regulation {name: $name})
        RETURN r
        """
        with self.driver.session() as session:
            session.run(query, name=name)

    def create_chemical(self, name, cas=None, ec=None):
        """Create a Chemical node if CAS is provided, otherwise just create a ChemicalName node."""
        if cas:
            query = """
            MERGE (c:Chemical {CAS: $cas})
            ON CREATE SET c.EC = $ec
            MERGE (cn:ChemicalName {name: $name})
            MERGE (cn)-[:BELONGS_TO]->(c)
            RETURN c, cn
            """
            with self.driver.session() as session:
                session.run(query, name=name, cas=cas, ec=ec)
        else:
            # Only create ChemicalName if CAS is missing
            query = """
            MERGE (cn:ChemicalName {name: $name})
            RETURN cn
            """
            with self.driver.session() as session:
                session.run(query, name=name)

    def link_regulation_to_chemical(self, reg_name, cas):
        """Link a regulation to an existing chemical."""
        query = """
        MATCH (c:Chemical {CAS: $cas}), (r:Regulation {name: $reg_name})
        MERGE (c)-[:REGULATED_BY]->(r)
        """
        with self.driver.session() as session:
            session.run(query, cas=cas, reg_name=reg_name)

    def link_regulation_to_chemical_name(self, reg_name, chem_name):
        """Link a regulation to an existing chemical name."""
        query = """
        MATCH (cn:ChemicalName {name: $chem_name}), (r:Regulation {name: $reg_name})
        MERGE (cn)-[:REGULATED_BY]->(r)
        """
        with self.driver.session() as session:
            session.run(query, chem_name=chem_name, reg_name=reg_name)

    def import_json(self, json_data):
        """Import JSON into Neo4j while handling cases with missing CAS/EC numbers."""
        regulation_name = json_data["regulation"]["name"]
        self.create_regulation(regulation_name)

        for chemical in json_data["regulation"]["chemicals"]:
            chem_name = chemical["name"]
            cas = chemical.get("cas")  # CAS might be missing
            ec = chemical.get("ec")   # EC might be missing

            # Create chemical and chemical name if they do not exist
            self.create_chemical(chem_name, cas, ec)
            
            # Link regulation to existing chemical (if CAS is provided) or chemical name (in case only name is provided)
            if cas:
                self.link_regulation_to_chemical(regulation_name, cas)
            self.link_regulation_to_chemical_name(regulation_name, chem_name)

In [None]:
# NEO4J AURA configuration
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# # Verify Connectivity to NEO4J
# with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) as driver:
#     driver.verify_connectivity()

# # Set Up Database
# db = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

db = Neo4jChemicalGraph(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

#### Helper Functions

In [None]:
with open('output.json', 'r') as json_file:
    json_data = json.load(json_file)

db.import_json(json_data)