# Using OpenAI & a Custom Ontology to Build a Knowledge Graph for People Experiencing Homelessness

## Objective:
Preprocess shelter and food bank data and create an ontology using [Web Protege](https://webprotege.stanford.edu/). Both are fed into an LLM (gpt-4o), which returns a Cypher query that is then used to create a knowledge graph. The knowledge graph is created on a Neo4j Aura cloud instance.

In [None]:
#!pip install neo4j openai rdflib
#!pip install utils
#!pip install nltk
#!pip install spacy
#!pip install owlready2

## Preparing the Data to Feed into the LLM

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the three source CSV files into DataFrames.
CSV_PATHS = {
    "food": '/content/food_banks.csv',
    "food_ca": '/content/food_ca.csv',
    "shelters": '/content/shelter_service.csv',
}

food = pd.read_csv(CSV_PATHS["food"])
food_ca = pd.read_csv(CSV_PATHS["food_ca"])
shelters = pd.read_csv(CSV_PATHS["shelters"])

In [None]:
# Show the column index of each table, in load order, to pick the fields to keep.
for table in (food, food_ca, shelters):
    print(table.columns)

In [None]:
#Concatenating important columns together to save as a text file the LLM can parse through

# Columns pulled from each table, listed in the order their values are joined.
FOOD_COLUMNS = [
    "FCLTY_NM",
    "ST_ADDRESS",
    "WEBSITE",
    "BUS_CAT_CL",
    "BUS_CAT_DS",
    "OCCPNT_TYP",
]
FOOD_CA_COLUMNS = [
    "Name",
    "Description",
    "Resource Type",
    "Street Address",
    "Web Link",
    "City",
]
SHELTER_COLUMNS = [
    "service_name",
    "narrative",
    "schedule",
    "address",
    "recommended_for",
    "neighborhood",
]

# Each table is paired explicitly with its column list instead of being
# recognised through DataFrame.equals(), which compares full contents on every
# pass and would select the wrong columns if two tables happened to be equal.
documents = []
for df, columns in [
    (food, FOOD_COLUMNS),
    (food_ca, FOOD_CA_COLUMNS),
    (shelters, SHELTER_COLUMNS),
]:
    for _, row in df.iterrows():
        # One space-separated text record per row. Every table, including
        # food_ca, contributes its rows to `documents`.
        documents.append(" ".join(str(row[column]) for column in columns))

In [None]:
#Uncomment if needing to write out the above file

#with open("documents.txt", "w") as file:  # Open file in write mode
#    for item in documents:
#        file.write(item + "\n")  # Write each item to a new line

## Generating Cypher Code with an LLM

In [None]:
#!pip install neo4j openai rdflib owlready2 nltk spacy

In [2]:
from neo4j import GraphDatabase
from openai import OpenAI
from rdflib import Graph
from rdflib.namespace import RDF, OWL, RDFS
from owlready2 import get_ontology
import nltk
import spacy

In [3]:
#Creating an environment using OpenAI Key
# A key that is already configured in the environment is kept. The placeholder
# is only written when the variable is missing or empty, so a real key is never
# overwritten by "YOUR_KEY".

import os

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "YOUR_KEY"

In [4]:
import os
from openai import OpenAI
from rdflib import Graph

#Chunking the text (breaking it up) for the LLM

def chunk_text(text, chunk_size=4000, overlap=500):
    """Split ``text`` into character chunks whose neighbours overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the one before it.
    Chunking stops as soon as a chunk reaches the end of ``text``; no further
    chunk that would only repeat part of that final chunk is produced.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            at least 0 and smaller than ``chunk_size``.

    Returns:
        A list of substrings of ``text`` in order. Empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the start position stand still or move
    # backwards, so the loop would never finish.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # The text is fully covered; anything after this would be a
            # fragment already contained in the chunk just added.
            break
    return chunks

# Load ontology that was created -- THIS IS A CUSTOM ONTOLOGY
# The Turtle file is parsed and serialized back to Turtle so that it can be
# embedded in the prompt below.
g = Graph()
g.parse("knowhax_ontology_turtle.ttl")
ontology = g.serialize(format="ttl")

# Read text document
# Line breaks are replaced by a single space (not removed outright) so the last
# word of one line is not fused with the first word of the next line.
with open('documents.txt', 'r') as file:
    text_content = file.read().replace('\n', ' ')

# Chunk the text
text_chunks = chunk_text(text_content)

# Initialize OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

system_message = (
    "You are an expert in extracting structured information out of natural language text. "
    "You extract entities with their attributes and relationships between entities. "
    "You can produce the output as Cypher write statements on request. "
    "You use correct Cypher syntax. "
    "You use correct variable naming conventions and do not duplicate variable names. "
)

# Process each chunk
# The response for each chunk is collected separately and joined with newlines
# at the end, so the last statement of one response never runs into the first
# statement of the next.
cypher_parts = []

for chunk in text_chunks:
    prompt = f"""Given the owl/rdf ontology below run your best entity extraction over the content.
    The extracted entities and relationships must be described using exclusively the terms in the ontology
    and in the way they are defined. This means that for attributes and relationships you will respect the domain and range constraints.
    You will never use terms not defined in the ontology.
    Return just the Cypher query itself. No other output.
    Absolutely no comments on the output.  Don't add any markdown content. Just the structured code output.
    Using merge to allow for linkage of nodes from multiple passes. Don't add commas in merge
    statements between labels and properties. Make sure node labels are not duplicated.  Do not add any unnecesaary punctuation.
    Use correct cypher syntax. Make sure variable names are unique when they have to be unique. Do not repeat variable names from
    statement to statement.

    ONTOLOGY:
    {ontology}

    CONTENT:
    {chunk}"""

    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt}
        ],
        model="gpt-4o"
    )

    generated = response.choices[0].message.content
    # NOTE(review): the SDK types message.content as optional; an empty or
    # missing answer is skipped instead of breaking the accumulation.
    if generated:
        cypher_parts.append(generated.strip())

# Accumulated Cypher statements, one model response per block of lines
cypher_script = "\n".join(cypher_parts)

# Print or use the accumulated cypher_script
print(cypher_script)

# (Rest of your code for connecting to Neo4j and running the Cypher script)

merge (org1:Organization {name: 'Castlegar Community Harvest Food Bank'})
merge (location1:Location {address: '614 Christina Pl'})
merge (org1)-[:locatedAT]->(location1)
merge (org2:Organization {name: 'Prince Rupert Salvation Army Family Services'})
merge (org3:Organization {name: "St. Mark's Food Bank"})
merge (org4:Organization {name: 'Richmond Food Bank Society'})
merge (website1:Website {url: 'http://richmondfoodbank.org/'})
merge (org4)-[:hasWebsite]->(website1)
merge (org5:Organization {name: 'Surrey Food Bank'})
merge (location5:Location {address: '10732 City Parkway'})
merge (website2:Website {url: 'https://www.surreyfoodbank.org/'})
merge (org5)-[:locatedAT]->(location5)
merge (org5)-[:hasWebsite]->(website2)
merge (org6:Organization {name: 'Tansi Friendship Centre'})
merge (location6:Location {address: '5301 South Access Road'})
merge (org6)-[:locatedAT]->(location6)
merge (org7:Organization {name: 'Cowichan Valley Basket Society'})
merge (location7:Location {address: '5810 

In [5]:
#Write out above Cypher Code just in case Neo4j Aura Cloud Instance isn't responding below
# The encoding is fixed to UTF-8 so non-ASCII characters in the generated
# script are written the same way regardless of the platform default.

with open('cypher_script_jellyv6.txt', 'w', encoding='utf-8') as f:
    f.write(cypher_script)

In [None]:
# Open a driver for the Neo4j Aura instance (replace the password placeholder first)

NEO4J_URI = "neo4j+s://eba7e457.databases.neo4j.io"
NEO4J_AUTH = ("neo4j", "YOUR PASSWORD")

driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)

In [None]:
# Submit the accumulated Cypher script through a session on the driver.
# NOTE(review): the whole script goes out as one query; presumably variable
# names reused across separate LLM responses could clash -- confirm.
with driver.session() as neo4j_session:
    neo4j_session.run(cypher_script)

In [None]:
# Close the Neo4j driver once the Cypher script has been submitted.
driver.close()