# Constructing knowledge graphs from text, tables, and images using OpenAI functions

In [None]:
!pip install langchain neo4j openai python-dotenv unstructured[all-docs] pydantic lxml

In [None]:
!conda install -c conda-forge poppler -y

In [None]:
!conda install -c conda-forge tesseract -y

Import important and necessary libraries to run the entire project

In [5]:
import os
import base64
import requests
from langchain.graphs import Neo4jGraph
from dotenv import load_dotenv
from tqdm import tqdm
from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship
)
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel
from unstructured.partition.auto import partition_pdf
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain.chat_models import ChatOpenAI, ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel
from langchain.schema.messages import HumanMessage, SystemMessage

Load the env variables securely from .env file. Note we are using a different Neo4j instance for the graph creation and infformation retrieval. This allows us to make changes/manipulate the vector index and the graph database separately with no dependencies.

In [6]:
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
os.environ["NEO4J_URI"] = os.getenv('NEO4J_URI_2')
os.environ["NEO4J_USERNAME"] = os.getenv('NEO4J_USERNAME')
os.environ["NEO4J_PASSWORD"] = os.getenv('NEO4J_PASSWORD')

llm_name = os.getenv('LLM')
ollama_base_url = os.getenv('OLLAMA_BASE_URL')

In [7]:
print(os.getenv('NEO4J_USERNAME'))

neo4j


In [8]:
graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

In [9]:
class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

In [10]:
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [11]:
def format_property_key(s: str) -> str:
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)

def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
      return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

In [12]:
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    prompt = ChatPromptTemplate.from_messages(
    [(
      "system",
      f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Identifying and Processing Tables
- **Table Detection**: Identify tables by the keyword "Table" in text document.
- **Entity and Relationship Extraction**: From tables, extract entities and their relationships. Consider rows, columns, and headers for contextual understanding.
## 4. Handling Image URI/Links
Follow these rules if the text contains the keyword Image URI
- **Mandatory Image URI in Each Node**: Each node in the document must include an 'ImageURI' attribute. This applies to all nodes, regardless of their type or content.
- **Direct URI Integration**: Attach the Image URI directly as an attribute within each node. Do not create separate nodes for URIs.
- **Consistent Key-Value Format**: Use a uniform key-value pair for the Image URI attribute across all nodes. The key should be 'ImageURI', and the value should be the actual URI link.
- **No Exclusions**: Ensure every node, including text, image, sketch, etc., has an 'ImageURI' attribute. This is essential for visual representation and consistency across the knowledge graph.
- **Specificity for Image Nodes**: For nodes starting with the 'Image' keyword, the 'ImageURI' attribute should directly reference the relevant image.
## 5. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 6. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), 
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.  
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial. 
## 7. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination."""),
        ("human", "Use the given format to extract information from the following input: {input}"),
        ("human", "Tip: Make sure to answer in the correct format"),
    ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [87]:
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    prompt = ChatPromptTemplate.from_messages(
    [(
      "system",
      f"""# Anweisungen für den Wissensgraphen für GPT-4 in der Sprache Deutsch
## 1. Überblick
Du bist ein erstklassiger Algorithmus, der darauf ausgelegt ist, Informationen in strukturierten Formaten zu extrahieren, um einen Wissensgraphen aufzubauen.
- **Knoten** repräsentieren Entitäten und Konzepte. Sie sind vergleichbar mit Wikipedia-Knoten.
- Das Ziel ist es, Einfachheit und Klarheit im Wissensgraphen zu erreichen, um ihn für ein breites Publikum zugänglich zu machen.
## 2. Beschriftung von Knoten
- **Konsistenz**: Stelle sicher, dass du grundlegende oder elementare Typen für die Beschriftung von Knoten verwendest.
  - Zum Beispiel, wenn du eine Entität identifizierst, die eine Person repräsentiert, beschrifte sie immer als **"person"**. Vermeide die Verwendung spezifischerer Begriffe wie "Mathematiker" oder "Wissenschaftler".
- **Knoten-IDs**: Verwende niemals Ganzzahlen als Knoten-IDs. Knoten-IDs sollten Namen oder im Text gefundene menschenlesbare Identifikatoren sein.
{'- **Erlaubte Knoten-Beschriftungen:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Erlaubte Beziehungstypen:**' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Identifizierung und Verarbeitung von Tabellen
- **Tabellenerkennung**: Identifiziere Tabellen durch das Stichwort "Table" im Textdokument.
- **Extraktion von Entitäten und Beziehungen**: Extrahiere aus Tabellen Entitäten und ihre Beziehungen. Berücksichtige dabei Zeilen, Spalten und Überschriften für ein kontextuelles Verständnis.
## 4. Umgang mit Bild-URI/Links
Befolge diese Regeln, wenn der Text das Stichwort Bild-URI enthält
- **Obligatorische Bild-URI in jedem Knoten**: Jeder Knoten im Dokument muss ein 'ImageURI'-Attribut enthalten. Dies gilt für alle Knoten, unabhängig von ihrem Typ oder Inhalt.
- **Direkte URI-Integration**: Hänge die Bild-URI direkt als Attribut in jeden Knoten an. Erstelle keine separaten Knoten für URIs.
- **Einheitliches Schlüssel-Wert-Format**: Verwende ein einheitliches Schlüssel-Wert-Paar für das Bild-URI-Attribut in allen Knoten. Der Schlüssel sollte 'ImageURI' sein, und der Wert sollte der tatsächliche URI-Link sein.
- **Keine Ausnahmen**: Stelle sicher, dass jeder Knoten, einschließlich Text, Bild, Skizze usw., ein 'ImageURI'-Attribut hat. Dies ist entscheidend für die visuelle Darstellung und Konsistenz im Wissensgraphen.
- **Spezifität für Bildknoten**: Für Knoten, die mit dem Stichwort 'Image' beginnen, sollte das 'ImageURI'-Attribut direkt auf das relevante Bild verweisen.
## 5. Umgang mit numerischen Daten und Datum
- Numerische Daten wie Alter oder andere verwandte Informationen sollten als Attribute oder Eigenschaften der entsprechenden Knoten eingebunden werden.
- **Keine separaten Knoten für Daten/Zahlen**: Erstelle keine separaten Knoten für Daten oder numerische Werte. Hänge sie immer als Attribute oder Eigenschaften von Knoten an.
- **Eigenschaftsformat**: Eigenschaften müssen in einem Schlüssel-Wert-Format vorliegen.
- **Anführungszeichen**: Verwende niemals escapeierte einfache oder doppelte Anführungszeichen innerhalb von Eigenschaftswerten.
- **Namenskonvention**: Verwende camelCase für Eigenschaftsschlüssel, z. B. `birthDate`.
## 6. Coreference Resolution
- **Erhaltung der Entitätskonsistenz**: Bei der Extraktion von Entitäten ist es wichtig, Konsistenz sicherzustellen.
Wenn eine Entität wie "John Doe" mehrmals im Text erwähnt wird, aber mit unterschiedlichen Namen oder Pronomen (z. B. "Joe", "er") bezeichnet wird,
verwende immer die vollständigste Bezeichnung für diese Entität im gesamten Wissensgraphen. In diesem Beispiel verwende "John Doe" als Entitäts-ID.
Denke daran, der Wissensgraph sollte kohärent und leicht verständlich sein, daher ist die Aufrechterhaltung der Konsistenz bei Entitätsreferenzen entscheidend.
## 7. Strikte Einhaltung
Halte dich streng an die Regeln. Nichtkonformität führt zur Beendigung."""),
        ("human", "Verwende das gegebene Format, um Informationen aus der folgenden Eingabe zu extrahieren: {input}"),
        ("human", "Tipp: Stelle sicher, die Antwort im korrekten Format zu geben"),
    ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [13]:
def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:
    # Extract graph data using OpenAI functions
    extract_chain = get_extraction_chain(nodes, rels)
    data = extract_chain.run(document.page_content)
    # Construct a graph document
    graph_document = GraphDocument(
      nodes = [map_to_base_node(node) for node in data.nodes],
      relationships = [map_to_base_relationship(rel) for rel in data.rels],
      source = document
    )
    # Store information into a graph
    graph.add_graph_documents([graph_document])

In [14]:
path = "pdf/"

In [15]:
raw_pdf_elements = partition_pdf(
    filename=path + "DIA_Dino_Celi_druck.pdf",
    # Using pdf format to find embedded image blocks
    # extract_images_in_pdf=True,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=path+'img',
)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
class Element(BaseModel):
    type: str
    text: Any


# Categorize by type
categorized_elements = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]

In [17]:
texts = [i.text for i in text_elements]
tables = [i.text for i in table_elements]

In [75]:
import boto3

s3_client = boto3.client('s3', region_name='eu-north-1')
bucket_name = 'neo4j-rag-img-s3'
for img_file in sorted(os.listdir('./img')):
    if img_file.endswith('.jpg'):
        img_path = os.path.join('./img', img_file)
        s3_client.upload_file(img_path, bucket_name, img_file)

PartialCredentialsError: Partial credentials found in env, missing: AWS_SECRET_ACCESS_KEY

In [14]:
import boto3

s3_client = boto3.client('s3', region_name='us-east-1')
bucket_name = 'neo4j-rag-img-s3'
response = s3_client.list_objects_v2(Bucket=bucket_name)
image_uris = []
if 'Contents' in response:
    for item in response['Contents']:
        file_name = item['Key']
        if file_name.endswith('.jpg'):
            # Construct the URI for each image file
            image_uri = f"https://{bucket_name}.s3.amazonaws.com/{file_name}"
            print(image_uri)
            image_uris.append(image_uri)

https://neo4j-rag-img-s3.s3.amazonaws.com/figure-2-1.jpg
https://neo4j-rag-img-s3.s3.amazonaws.com/figure-4-2.jpg


In [76]:
def encode_image(image_path):
    ''' Getting the base64 string '''
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
    
def encode_image_from_uri(image_uri):
    ''' Getting the base64 string from an image URI '''
    response = requests.get(image_uri)
    if response.status_code == 200:
        return base64.b64encode(response.content).decode('utf-8')
    else:
        raise Exception(f"Failed to process image.")

def image_summarize(img_base64,prompt):
    ''' Image summary '''
    chat = ChatOpenAI(model="gpt-4-vision-preview",
                      max_tokens=1024)

    msg = chat.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text":prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{img_base64}"
                        },
                    },
                ]
            )
        ]
    )
    return msg.content

# Store base64 encoded images
img_base64_list = []
# Store image summaries
image_summaries = []
# Prompt
prompt = "Describe the image in detail. Be specific about graphs, such as bar plots."
# Read images, encode to base64 strings
for img_uri in image_uris:
    base64_image = encode_image_from_uri(img_uri)
    img_base64_list.append(base64_image)
    image_summaries.append((image_summarize(base64_image,prompt), img_uri))

NameError: name 'image_uris' is not defined

In [18]:
documents = []
for text in texts:
    val = Document(page_content=text)
    documents.append(val)
    
for text in tables:
    # add table for table identification
    text = "Table: Numerical values are important and must be treated as properties of associated nodes depending upon the context of rows and column: " + text
    val = Document(page_content=text)
    documents.append(val)
    
#for text, uri in image_summaries:
#    # add image and image uri for image summary identification
#    text = "Image: Add ImageURI property " + uri + " to all the nodes extracted from this document: " + text
#    val = Document(page_content=text)
#    documents.append(val)

In [20]:
len(documents)

61

In [21]:
documents

[Document(page_content='DIPLOMARBEIT\n\nVAMED BIM Wiki Konzept eines unternehmensweiten Wissensmanagementsystems fiir das Building Information Modeling\n\neingereicht an der FH JOANNEUM Gesellschaft mbH Fachhochschulstudiengang Baumanagement und Ingenieurbau\n\nvorgelegt von: Dino Maximilian Celi, BSc 1810234003\n\nBetreuer: Thomas Mathoi, Bmstr. DI Dr.\n\nBMI 18\n\nGraz, 2020\n\nSeite II\n\nEhrenwortliche Erklarung\n\nIch erklare ehrenwértlich, dass ich die vorliegende Diplomarbeit selbststandig angefertigt und die mit ihr verbundenen Tatigkeiten selbst erbracht habe. Ich erklare weiters, dass ich kei- ne anderen als die angegebenen Hilfsmittel benutzt habe. Alle ausgedruckten, ungedruckten oder dem Internet im Wortlaut oder im wesentlichen Inhalt ibernommenen Formulierungen und Konzepte sind gema& den Regeln fiir gutes wissenschaftliches Arbeiten zitiert und durch Fu&noten bzw. durch andere genaue Quellenangaben gekennzeichnet.\n\nDie vorliegende Originalarbeit ist in dieser Form zur

In [93]:
for i, d in tqdm(enumerate(documents), total=len(documents)):
    clean_doc = d.page_content.replace("\\", "")
    d = Document(page_content=clean_doc)
    extract_and_store_graph(d)

 25%|██▍       | 15/61 [15:53<48:44, 63.58s/it]  


ValidationError: 1 validation error for _OutputFormatter
output -> rels
  field required (type=value_error.missing)