In [1]:
import os
import pandas as pd
import numpy as np
from langchain_community.graphs import Neo4jGraph
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Neo4jVector
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI

In [4]:
#Initial the Neo4j dataset
NEO4J_URL = os.getenv('NEO4J_URL')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

kg = Neo4jGraph(
    url=NEO4J_URL, username=NEO4J_USERNAME, password=NEO4J_PASSWORD
)

In [5]:
#initial the openai_api_key
openai_api_key= os.environ.get("OPENAI_API_KEY_SBR")

**1. Querying the knowledge graph using Cypher**

In [6]:
#calculate total nodes
cypher = """
  MATCH (n) 
  RETURN count(n) AS numberOfNodes
  """
kg.query(cypher)

[{'numberOfNodes': 6161}]

In [7]:
#calculate total activities
cypher = """
  MATCH (A:Activity) 
  RETURN count(A) AS numberOfActivity
  """
kg.query(cypher)

[{'numberOfActivity': 132}]

In [8]:
#list all chemicals in the database
cypher = """
    MATCH (c:Target_Chemical)
    return c
    """
data = kg.query(cypher)
for chemical in data:
    print(chemical['c']['name'])

Methanol
Formic acid
Ethylene
Ethylene oxide
Ethylene glycol
Acetic acid
Propylene
Propylene oxide
Propylene glycol
Lactic acid
Acrylic acid
Acrolein
Acrylonitrile
Succinic acid
Butanol
1,3-Butadiene
Isobutylene
Itaconic acid
Xylitol
Furfural 
Levulinic acid
FDCA
Citric acid 
Sorbitol
p-Xylene


In [9]:
#list all LCA papers for methanol
cypher = """
    MATCH (tc:Target_Chemical {name: $Chemical})-[:Has_LCA_Studies]->(t:Paper_Title)
    RETURN t.name as Paper_title, t.author as Author, t.published_year as Published_year
  """
kg.query(cypher, params={"Chemical":"Methanol"})

[{'Paper_title': 'Techno-economic and environmental assessments for sustainable bio-methanol production as landfill gas valorization',
  'Author': 'Choe et al.',
  'Published_year': 2022},
 {'Paper_title': 'From Secondary Biomass to Bio-Methanol through CONVERGE Technology: An Environmental Analysis',
  'Author': 'Galusnyak et al.',
  'Published_year': 2023},
 {'Paper_title': 'Life Cycle Assessment of Bio-methanol Derived from Various Raw-materials',
  'Author': 'Galusnyak et al.',
  'Published_year': 2021},
 {'Paper_title': 'A LCA (life cycle assessment) of the methanol production from sugarcane bagasse',
  'Author': 'Reno et al.',
  'Published_year': 2011},
 {'Paper_title': 'A Comprehensive Assessment of the Carbon Footprint of the Coal-to-Methanol Process Coupled with Carbon Capture-, Utilization-, and Storage-Enhanced Oil Recovery Technology',
  'Author': 'Li et al.',
  'Published_year': 2024},
 {'Paper_title': 'Transition into Net-Zero Carbon Community from Fossil Fuels: Life Cycl

In [10]:
#list all synthesis pathways for methanol
cypher = """
    MATCH (tc:Target_Chemical {name: $Chemical})-[*]->(p:Pathway)
    RETURN p.name AS Pathway
  """
kg.query(cypher, params={"Chemical":"Methanol"})

[{'Pathway': 'Sugarcane bagasse-based pathway'},
 {'Pathway': 'Coal-to-methanol pathway'},
 {'Pathway': 'Conventional pathway (steam methane reforming)'},
 {'Pathway': 'Photocatalytic pathway'},
 {'Pathway': 'Natural gas to methanol pathway'},
 {'Pathway': 'Solid Recovered Fuel and Lignite to methanol pathway (case3)'},
 {'Pathway': 'Solid Recovered Fuel and Lignite to methanol pathway (case2)'},
 {'Pathway': 'Solid Recovered Fuel and Lignite to methanol pathway (case1)'},
 {'Pathway': 'PV/CCU-CH3OH technical pathway'},
 {'Pathway': 'NG-CH3OH technical pathway'},
 {'Pathway': 'Coal-CH3OH technical pathway'},
 {'Pathway': 'Natural gas-based pathway'},
 {'Pathway': 'Coal-based pathway'},
 {'Pathway': 'Coke oven gas (COG)-based pathway'},
 {'Pathway': 'Electrochemical CO2 reduction pathway (with recycling)'},
 {'Pathway': 'Thermochemical hydrogenation of CO2 pathway (with recycling)'},
 {'Pathway': 'Conventional pathway (natural gas-based)'},
 {'Pathway': 'Coal gasification pathway'},
 {'

In [11]:
#query inventory data for the specific synthesis pathway (e.g., Sugarcane bagasse-based pathway)
cypher = """
    MATCH (tc:Target_Chemical {name: $Chemical})-[*]->(p:Pathway {name:$Pathway}),
          (p)-[:Has_Activity]->(a:Activity)-[r]->(f:Flow), (f)-[:Has_Value]-(q:Amount)-[:Has_Unit]->(u:Unit)
    RETURN r,f,q,u
  """
data = kg.query(cypher, params={"Chemical":"Methanol", "Pathway": "Sugarcane bagasse-based pathway"})

if not data:
    print("Not found LCI data")
else:
    df = pd.DataFrame({
        'Flow name': [item['f']['name'] for item in data],
        'Type': [item['r'][1] for item in data],
        'Quantity': [item['q']['name'] for item in data],
        'Unit': [item['u']['name'] for item in data],
        'Is Reference Flow': [item['f']['is_reference_flow'] for item in data]
    })
    print(df)

                                   Flow name             Type   Quantity Unit  \
0                                       Slag  Has_Output_Flow   410.0000    g   
1                                        SO2  Has_Output_Flow     0.5240    g   
2                                       PM10  Has_Output_Flow     0.9300    g   
3                                        NOx  Has_Output_Flow     0.9280    g   
4                                        CO2  Has_Output_Flow  1830.0000    g   
5                                        CH4  Has_Output_Flow     0.2340    g   
6                                 Others VOC  Has_Output_Flow     0.0248    g   
7                                CH3OH (VOC)  Has_Output_Flow     0.0649    g   
8                                         CO  Has_Output_Flow     1.1200    g   
9                               Residual gas  Has_Output_Flow    90.0000    g   
10                                  Methanol  Has_Output_Flow     1.0000   kg   
11                          

**2. Node embedding similar search**

In [13]:
#create an embedding index for the selected node
index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(openai_api_key=openai_api_key),
    url=NEO4J_URL,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name='pathway',
    node_label="Pathway",  
    text_node_properties=['name', 'target_product'],
    embedding_node_property='pathway_embedding', 
)



In [14]:
print("Node label:", index.node_label)
print("Embedding property:", index.embedding_node_property)

Node label: Pathway
Embedding property: pathway_embedding


In [18]:
response = index.similarity_search("biomass to methanol")
response

[Document(page_content='\nname: Biomass-to-methanol (BTM) pathway\ntarget_product: ', metadata={'Target_product': 'Methanol', 'Inventory_ID': '[10-2]'}),
 Document(page_content='\nname: Biomass to MeOH (BOTM)\ntarget_product: ', metadata={'Target_product': 'Methanol', 'Inventory_ID': '[2-4]'}),
 Document(page_content='\nname: Natural gas to methanol pathway\ntarget_product: ', metadata={'Target_product': 'Methanol', 'Inventory_ID': '[21-4]'}),
 Document(page_content='\nname: Solid Recovered Fuel and Lignite to methanol pathway (case2)\ntarget_product: ', metadata={'Target_product': 'Methanol', 'Inventory_ID': '[21-2]'})]

**3. Q & A system**

In [19]:
#Cypher generation prompt
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to 
query a graph database.
Instructions:
[1] Use only the provided relationship types and properties in the 
schema. Do not use any other relationship types or properties that 
are not provided.
Schema:
{schema}
Note: 
[1] Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than 
for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
[2] The functional unit includes FunctionalUnit_Quantity, FunctionalUnit_Unit, and FunctionalUnit_Product.
[3] The inventory (or input-output) data represents all input and output Flows of Activity. They includes Flow name, Amount, Unit, and 'Has_input/Has_output' relationship.
Given the returned text, please categorize the following flows as input and output (based on 'Has_input/Has_output' relationship), along with their respective amounts and units.

Examples: Here are a few examples of generated Cypher 
statements for particular questions:
# What chemicals are in the CLCIKG database?
MATCH (d:Database)-[:Has_Chemical_Names]->(c:Target_Chemical)
    WHERE d.name = 'C-LCIKG'
RETURN c.name

#The inventory data for 'Lebedev process pathway' pathway of the '1,3-Butadiene production' activity.
MATCH (p:Pathway {{name: 'Lebedev process pathway'}})-[:Has_Activity]->(a:Activity {{name: '1,3-Butadiene production'}})-[o:Has_Output_Flow|Has_Input_Flow]->(f:Flow), (f)-[:Has_Value]-(q:Amount)-[:Has_Unit]->(u:Unit)
RETURN f.name, q.name, u.name, o.name

The question is:
{question}"""

In [20]:
CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], 
    template=CYPHER_GENERATION_TEMPLATE
)

In [21]:
cypherChain = GraphCypherQAChain.from_llm(
    ChatOpenAI( model="gpt-4o", temperature=0, openai_api_key=openai_api_key),
    graph=kg,
    verbose=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
)

In [22]:
question = "The inventory table for 'Natural gas to methanol pathway' of the 'Methanol production' activity"
response = cypherChain.run(question)
response

  warn_deprecated(




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Pathway {name: 'Natural gas to methanol pathway'})-[:Has_Activity]->(a:Activity {name: 'Methanol production'})-[o:Has_Output_Flow|Has_Input_Flow]->(f:Flow), (f)-[:Has_Value]-(q:Amount)-[:Has_Unit]->(u:Unit)
RETURN f.name, q.name, u.name, o.name[0m
Full Context:
[32;1m[1;3m[{'f.name': 'Methanol', 'q.name': 1.0, 'u.name': 'kg', 'o.name': 'Has_Output_Flow'}, {'f.name': 'Nitrogen', 'q.name': 0.0042, 'u.name': 'kg', 'o.name': 'Has_Output_Flow'}, {'f.name': 'Methanol', 'q.name': 0.0004, 'u.name': 'kg', 'o.name': 'Has_Output_Flow'}, {'f.name': 'Hydrogen', 'q.name': 0.0791, 'u.name': 'kg', 'o.name': 'Has_Output_Flow'}, {'f.name': 'Carbon dioxide', 'q.name': 0.0052, 'u.name': 'kg', 'o.name': 'Has_Output_Flow'}, {'f.name': 'Carbon monoxide', 'q.name': 0.1224, 'u.name': 'kg', 'o.name': 'Has_Output_Flow'}, {'f.name': 'Electricity', 'q.name': 0.447, 'u.name': 'MJ', 'o.name': 'Has_Input_Flow'}, {'f.name':

"The inventory table for 'Natural gas to methanol pathway' of the 'Methanol production' activity includes the following:\n\n**Output Flows:**\n- Methanol: 1.0 kg\n- Nitrogen: 0.0042 kg\n- Methanol: 0.0004 kg\n- Hydrogen: 0.0791 kg\n- Carbon dioxide: 0.0052 kg\n- Carbon monoxide: 0.1224 kg\n\n**Input Flows:**\n- Electricity: 0.447 MJ\n- Clean gas: 1.2116 kg"