In [1]:
import sys
from graph_maker import GraphMaker, Ontology, GroqClient, OpenAIClient
from graph_maker import Document
import re
import json
import os
from openai import OpenAI
from graph_maker import Document
import pandas as pd
import pyvis
from pyvis.network import Network
from falkordb import FalkorDB
from langchain_core.prompts import PromptTemplate

In [2]:
# Module-level OpenAI client referenced as the global `client` by the classes
# defined in this notebook. OpenAI() is called without arguments, so the API key
# is presumably read from the environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()

class ProcessConversation:
    """Pre-process a speaker-tagged conversation transcript into ``Document`` objects.

    Two entry points exist:

    * ``create_docs(infile_file_path, speaker_mapping)`` - replaces speaker
      placeholders, runs coreference resolution and returns a single Document.
    * ``run(infile_file_path, speaker_mapping=None)`` - writes a renamed copy of
      the transcript, resolves coreferences, splits into sentences, clumps them
      per speaker and returns one Document per clump.

    All methods are classmethods; ``run`` stores the input/output paths on the
    class itself (``infile_file_path`` / ``outfile_file_path``).
    """

    CORESOLUTION_SYSTEM_PROMPT = """
                                    You are a coresolution resolver, which is to say you replace all pronouns referencing an object
                                    with it's proper noun based on the context.
                                    Ensure to also infer who 'I' refers to based on the context and replace all Is witht the proper noun.
                                    Example input:
                                    `Elon Musk was born in South Africa. There, he briefly attended classes at the University of Pretoria`

                                    Example output:
                                    `Elon Musk was born in South Africa. In South Africa, Elon briefly attended classes at the University of Pretoria`'

                                    Keep the same structure of the text as passed to you in the text body.
                                """

    @classmethod
    def _text_from_file(cls, file):
        """Return the full contents of ``file`` as a single string."""
        with open(file) as f:
            return f.read()

    @classmethod
    def create_outfile_path(cls, infile_file_path):
        """Return the input path with its extension replaced by ``_renamed.txt``."""
        return os.path.splitext(infile_file_path)[0] + "_renamed.txt"

    @classmethod
    def sentence_splitter(cls, text):
        """Split ``text`` on whitespace that follows a '.' or '?'.

        The look-behinds skip the split after patterns such as ``e.g.`` or
        ``Mr.``. The whitespace itself is consumed by the split.
        """
        return re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

    @classmethod
    def _docs_from_sentences(cls, sentences):
        """Build one Document per ``{"text", "speaker"}`` dict.

        This used to be a first ``create_docs`` definition that was silently
        shadowed by the file-based ``create_docs`` below, which made ``run``
        fail with a TypeError. It now has its own name.
        """
        return [
            Document(
                text=sentence["text"],
                metadata={"num": str(idx), "speaker": sentence["speaker"]},
            )
            for idx, sentence in enumerate(sentences)
        ]

    @classmethod
    def coreference_resolution(cls, text):
        """Send ``text`` to the chat model and return the coreference-resolved text."""
        # TODO: entire text is dumped for the time being, need to split it into
        # chunks if it exceeds the model's context limit.
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": cls.CORESOLUTION_SYSTEM_PROMPT},
                {"role": "user", "content": f"{text}"},
            ],
        )
        return response.choices[0].message.content

    @classmethod
    def _replace_speaker_with_mapping(cls, line, speaker_mapping):
        """Replace every key of ``speaker_mapping`` found in ``line`` with its value.

        ``speaker_mapping`` may be ``None`` or empty, in which case ``line`` is
        returned unchanged.
        """
        for placeholder, name in (speaker_mapping or {}).items():
            line = line.replace(placeholder, name)
        return line

    @classmethod
    def _extract_speaker_and_text(cls, sentences):
        """Convert raw sentences into ``{"text", "speaker"}`` dicts.

        A sentence may carry a ``speaker:<name>|`` tag and a ``text:<body>`` tag.
        Because the transcript is split into sentences after the tags were
        written, only the first sentence of a turn is tagged; untagged sentences
        inherit the most recent speaker (empty string if none was seen yet)
        instead of raising ``AttributeError``. Blank untagged sentences are
        skipped.
        """
        speaker_text_store = []
        current_speaker = ""
        for sentence in sentences:
            speaker_match = re.search(r'speaker:(.*?)\|', sentence)
            text_match = re.search(r'text:(.*)', sentence)
            if speaker_match:
                current_speaker = speaker_match.group(1).strip()
            if text_match:
                text = text_match.group(1).strip()
            else:
                # No explicit text tag: use whatever follows the speaker tag,
                # or the whole sentence when there is no tag at all.
                remainder = sentence[speaker_match.end():] if speaker_match else sentence
                text = remainder.strip()
                if not text:
                    continue
            speaker_text_store.append({"text": text, "speaker": current_speaker})
        return speaker_text_store

    @classmethod
    def clump_sentences(cls, sentences, chunk_limit):
        """Merge consecutive sentences of the same speaker, at most ``chunk_limit`` per clump.

        A new clump starts when the speaker changes or the current clump already
        holds ``chunk_limit`` sentences. Sentences inside a clump are joined with
        a single space (the splitter consumed the original whitespace).
        Returns ``[]`` for empty input.
        """
        if not sentences:
            return []
        clumped_sentences = []
        current_speaker = sentences[0]["speaker"]
        parts = []
        for sentence in sentences:
            if sentence["speaker"] != current_speaker or len(parts) >= chunk_limit:
                clumped_sentences.append({"text": " ".join(parts), "speaker": current_speaker})
                current_speaker = sentence["speaker"]
                parts = []
            parts.append(sentence["text"])
        # Flush the clump that was still being built when the input ended.
        clumped_sentences.append({"text": " ".join(parts), "speaker": current_speaker})
        return clumped_sentences

    @classmethod
    def process_sentences(cls, sentences, chunk_limit):
        """Extract speaker/text pairs from ``sentences`` and clump them per speaker."""
        speaker_text_store = cls._extract_speaker_and_text(sentences)
        return cls.clump_sentences(speaker_text_store, chunk_limit)

    @classmethod
    def write_conversation_to_file(cls, speaker_mapping=None):
        """Write the transcript with speaker placeholders replaced to ``cls.outfile_file_path``.

        Reads ``cls.infile_file_path`` line by line. Nothing is written when the
        output file already exists. ``speaker_mapping`` maps placeholder ->
        real name; with ``None`` the lines are copied unchanged. (The mapping
        was previously not passed on at all, which raised a TypeError.)
        """
        if os.path.exists(cls.outfile_file_path):
            return
        with open(cls.infile_file_path, 'r') as infile, open(cls.outfile_file_path, 'w') as outfile:
            for line in infile:
                outfile.write(cls._replace_speaker_with_mapping(line, speaker_mapping))

    @classmethod
    def run(cls, infile_file_path, speaker_mapping=None):
        """Run the sentence-level pipeline and return a list of Documents.

        Steps: write renamed transcript -> read it back -> coreference
        resolution -> sentence split -> speaker clumping (3 sentences max) ->
        Documents.
        """
        cls.infile_file_path = infile_file_path
        cls.outfile_file_path = cls.create_outfile_path(infile_file_path)
        cls.write_conversation_to_file(speaker_mapping)
        text = cls._text_from_file(cls.outfile_file_path)
        coresolved_text = cls.coreference_resolution(text)
        sentences = cls.sentence_splitter(coresolved_text)
        sentences_processed = cls.process_sentences(sentences, chunk_limit=3)
        return cls._docs_from_sentences(sentences_processed)

    @classmethod
    def line_generator(cls, file_path):
        """Yield each line of ``file_path`` with surrounding whitespace stripped."""
        with open(file_path, 'r') as file:
            for line in file:
                yield line.strip()

    @classmethod
    def aggregate_text(cls, text):
        """Join the bodies of all ``text:`` fields in ``text`` with single spaces."""
        pattern = r'text:\s*(.*?)(?=\n|$)'
        return ' '.join(re.findall(pattern, text))

    @classmethod
    def create_docs(cls, infile_file_path, speaker_mapping):
        """Return a one-element Document list holding the whole resolved conversation.

        Each line of the file gets its speaker placeholders replaced, the
        combined text is coreference-resolved, and the ``text:`` fields are
        concatenated into a single Document with ``metadata={"num": "1"}``.
        """
        text = "".join(
            f"{cls._replace_speaker_with_mapping(line, speaker_mapping)}\n"
            for line in cls.line_generator(infile_file_path)
        )
        coresolution_resolved_text = cls.coreference_resolution(text)
        all_text = cls.aggregate_text(coresolution_resolved_text)
        print(all_text)
        return [Document(text=all_text, metadata={"num": "1"})]


In [3]:
class CreateGraph:
    """Build a knowledge graph from Documents and load it into FalkorDB.

    The pipeline (see ``run``) extracts edges with ``GraphMaker``, renders them
    as one verbose Cypher ``CREATE`` statement, asks the chat model to simplify
    the relationship names, and executes the result against the graph database.
    Optional helpers render the graph with pyvis and ask the model to check for
    / add missing nodes.
    """

    GRAPH_CLEANER_PROMPT = """
                            You are an expert at generating open cypher queries.
                            You are provided with a list of cypher queries for creation of new nodes and edges.
                            Your duty is as follows:
                            - Make the relationships between the nodes simpler and less verbose without losing the meaning.
                            - Do not miss out any nuances or details that are present in the relationship part of the query.
                            
                            YOU WILL BE PENALIZED FOR CHANGING THE MEANING IN ANY WAY.
                            DO NOT ADD OR REMOVE ANY NODES OR RELATIONSHIPS.

                            Ensure to use metadata in the relationship elements to capture any details that cannot be captured otherwise

                            Examples:
                            CREATE (charlie:Person:Actor {name: 'Charlie Sheen'})-[:ACTED_IN {role: 'Bud Fox'}]->(wallStreet:Movie {title: 'Wall Street'})<-[:DIRECTED]-(oliver:Person:Director {name: 'Oliver Stone'})

                            Notice how in the relationship 'ACTED_IN' the role of 'Bud fox' is present.
                            """
    EXAMPLE_PROMPT = """
                        CREATE
                        (:Person {name:"Lex Fridman's grandfather"})-[:Lex_Fridmans_grandfather_bought_a_d6_bulldozer]->(:Object {name:"d6 bulldozer"}),
                        (:Object {name:"d6 bulldozer"})-[:The_d6_bulldozer_is_described_as_a_big_bulldozer]->(:Miscellanous {name:"big bulldozer"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:Lex_Fridmans_grandfather_got_the_bulldozer_for_five_thousand_dollars]->(:Object {name:"d6 bulldozer"}),
                        (:Object {name:"d6 bulldozer"})-[:The_d6_bulldozer_is_specifically_a_1955_caterpillar_d6_bulldozer]->(:Miscellanous {name:"1955 caterpillar d6 bulldozer"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:Lex_Fridmans_grandfather_spent_an_entire_summer_fixing_the_bulldozer]->(:Object {name:"d6 bulldozer"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:Lex_Fridmans_grandfather_used_mail_order_to_buy_big_gears_for_the_bulldozers_transmission]->(:Object {name:"big gears"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:Lex_Fridmans_grandfather_built_a_crane_to_move_heavy_objects]->(:Object {name:"crane"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:Lex_Fridmans_grandfather_exhibited_a_strong_problem_solving_mentality]->(:Miscellanous {name:"problem solving mentality"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:Lex_Fridmans_grandfather_did_his_own_veterinary_work]->(:Action {name:"veterinary work"}),
                        (:Person {name:"Jeff Bezos"})-[:Jeff_Bezos_fell_in_love_with_the_idea_of_space_exploration_at_the_age_of_five]->(:Miscellanous {name:"space exploration"}),
                        (:Person {name:"Jeff Bezos"})-[:Jeff_Bezos_was_inspired_by_watching_Neil_Armstrong_walking_on_the_moon]->(:Event {name:"Neil Armstrong walking on the moon"}),
                        (:Event {name:"space race from 1957 to 1969"})-[:The_space_race_involved_the_Soviet_Union]->(:Place {name:"Soviet Union"}),
                        (:Event {name:"space race from 1957 to 1969"})-[:The_space_race_involved_the_US]->(:Place {name:"US"})
                    """
    EXAMPLE_RESULT = """
                        CREATE
                        (:Person {name:"Lex Fridman's grandfather"})-[:BOUGHT {price: 'five thousand dollars'}]->(:Object {name:"d6 bulldozer"}),
                        (:Object {name:"d6 bulldozer"})-[:DESCRIBED_AS]->(:Miscellanous {name:"big bulldozer"}),
                        (:Object {name:"d6 bulldozer"})-[:SPECIFIC_TYPE]->(:Miscellanous {name:"1955 caterpillar d6 bulldozer"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:FIXED {duration: 'entire summer'}]->(:Object {name:"d6 bulldozer"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:ORDERED_PARTS {method: 'mail order'}]->(:Object {name:"big gears"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:BUILT {objective: 'To move heavy objects'}]->(:Object {name:"crane"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:EXHIBITED]->(:Miscellanous {name:"problem solving mentality"}),
                        (:Person {name:"Lex Fridman's grandfather"})-[:PERFORMED]->(:Action {name:"veterinary work"}),
                        (:Person {name:"Jeff Bezos"})-[:FELL_IN_LOVE_WITH {age: 'five'}]->(:Miscellanous {name:"space exploration"}),
                        (:Person {name:"Jeff Bezos"})-[:INSPIRED_BY {event: 'Neil Armstrong walking on the moon'}]->(:Person {name:"Neil Armstrong"}),
                        (:Event {name:"space race from 1957 to 1969"})-[:INVOLVED {country: 'Soviet Union'}]->(:Place {name:"Soviet Union"}),
                        (:Event {name:"space race from 1957 to 1969"})-[:INVOLVED {country: 'US'}]->(:Place {name:"US"})
                    """

    ontology = Ontology(
        # labels of the entities to be extracted. Can be a string or an object, like the following.
        labels=[
            {"Person": "Person name without any adjectives, Remember a person may be references by their name or using a pronoun"},
            {"Object": "Do not add the definite article 'the' in the object name"},
            {"Event": "Event event involving multiple people. Do include qualifiers or verbs like gives, leaves, works etc."},
            "Place",
            "Document",
            "Organisation",
            "Action",
            {"Miscellanous": "Any important concept can not be categorised with any other given label"},
        ],
        # Relationships that are important for your application.
        # These are more like instructions for the LLM to nudge it to focus on specific relationships.
        # There is no guarantee that only these relationships will be extracted, but some models do a good job overall at sticking to these relations.
        relationships=[
            "Relation between any pair of Entities",
            ],
    )
    model = "gpt-4-turbo"
    llm = OpenAIClient(model=model, temperature=0.1, top_p=0.5)
    graph_maker = GraphMaker(ontology=ontology, llm_client=llm, verbose=False)
    # Shared pyvis network: nodes/edges accumulate across build_graph_image calls.
    net = Network(notebook=True, cdn_resources='in_line')

    @classmethod
    def _chat(cls, messages):
        """Send ``messages`` to the chat model (``cls.model``, temperature 0) and return the reply text."""
        response = client.chat.completions.create(
            model=cls.model,
            temperature=0.0,
            messages=messages,
        )
        return response.choices[0].message.content

    @staticmethod
    def _strip_code_fence(text):
        """Return the contents of the first ``` fenced block in ``text``, or ``text`` stripped if there is none."""
        fenced = re.search(r'```[^\n`]*\n(.*?)```', text, re.DOTALL)
        return fenced.group(1).strip() if fenced else text.strip()

    @staticmethod
    def _escape_cypher_string(value):
        """Escape backslashes and double quotes so ``value`` is safe inside a double-quoted Cypher string."""
        return str(value).replace("\\", "\\\\").replace('"', '\\"')

    @classmethod
    def create_graph(cls, documents):
        """Extract edges from ``documents`` with GraphMaker.

        Returns the edge list, or ``None`` when nothing was extracted.
        """
        graph = cls.graph_maker.from_documents(
            list(documents),
            delay_s_between=10,  # pause between documents to stay under the LLM API rate limit
        )
        if not graph:
            return None
        print("Graph creatd with total number of Edges:", len(graph))
        return graph

    @classmethod
    def node_exists(cls, net, node_id):
        """Return True if ``net`` already contains a node whose id equals ``node_id``."""
        return any(node['id'] == node_id for node in net.nodes)

    @classmethod
    def create_joined_string(cls, input_string):
        """Replace spaces with underscores and drop apostrophes."""
        return input_string.replace(' ', '_').replace("'", "")

    @classmethod
    def build_graph_image(cls, graph):
        """Add every edge of ``graph`` (and any missing endpoint nodes) to ``cls.net``."""
        for chunk in graph:
            labels = [chunk.node_1.label, chunk.node_2.label]
            names = [chunk.node_1.name, chunk.node_2.name]
            relationship = chunk.relationship
            print(f"{labels[0]}: {names[0]} --> {labels[1]}:{names[1]}; rel:{relationship}")

            for label, node_name in zip(labels, names):
                if not cls.node_exists(cls.net, node_id=node_name):
                    cls.net.add_node(node_name, label=f"{label}:{node_name}")

            cls.net.add_edge(names[0], names[1], title=relationship, arrows='to')

    @classmethod
    def save_graph_image(cls, filename):
        """Render ``cls.net`` to ``filename``; the name must end in ``.html``.

        Previously the graph was rendered twice for valid names and was still
        rendered after the warning for invalid ones. Now an invalid name only
        prints the warning and nothing is written.
        """
        if not filename.endswith('.html'):
            print("Filename should have a .html extension")
            return
        cls.net.show_buttons(filter_=['physics'])
        cls.net.show(filename)

    @classmethod
    def get_create_graph_query(cls, graph):
        """Render ``graph`` as a single Cypher ``CREATE`` statement.

        Each edge becomes ``(:Label {name:"..."})-[:REL]->(:Label {name:"..."})``;
        the relationship type is the first sentence of the edge description with
        spaces turned into underscores. Node names are escaped for use inside
        double quotes. Returns ``""`` for an empty or ``None`` graph (the old
        code produced the truncated string ``"CREAT"``).
        """
        if not graph:
            return ""
        patterns = []
        for chunk in graph:
            relationship = cls.create_joined_string(chunk.relationship.split('.')[0])
            name_1 = cls._escape_cypher_string(chunk.node_1.name)
            name_2 = cls._escape_cypher_string(chunk.node_2.name)
            patterns.append(
                f'(:{chunk.node_1.label} {{name:"{name_1}"}})'
                f'-[:{relationship}]->'
                f'(:{chunk.node_2.label} {{name:"{name_2}"}})'
            )
        return "CREATE\n" + ",\n".join(patterns)

    @classmethod
    def clean_create_graph_query(cls, query):
        """Ask the model to shorten the relationship names in ``query`` (one-shot prompted)."""
        return cls._chat([
            {"role": "system", "content": cls.GRAPH_CLEANER_PROMPT},
            {"role": "user", "content": cls.EXAMPLE_PROMPT},
            {"role": "assistant", "content": cls.EXAMPLE_RESULT},
            {"role": "user", "content": f"{query}"},
        ])

    @classmethod
    def check_missing_nodes(cls, query, text):
        """Ask the model which nodes/relationships of ``text`` are absent from ``query``.

        The model is instructed to answer ``'none'`` when nothing is missing.
        """
        sys_prompt = """
                    You are an expert at analysing cypher queries.
                    You are provided with text and a cypher query. 
                    Your job is to determine if the cypher query is missing any nodes or relationships that are present in the text.
                    Return the aspects that are missing. If nothing is missing, return 'none'.
                    """
        user_prompt = f"""
                    The text is:{text};
                    The query is:{query};
                    """
        return cls._chat([
            {"role": "system", "content": sys_prompt},
            # The payload is user input; it was previously sent with role "system".
            {"role": "user", "content": user_prompt},
        ])

    @classmethod
    def fix_missing_nodes(cls, query, text, missing):
        """Ask the model to add the ``missing`` nodes/relationships to ``query`` and return the new query."""
        sys_prompt = """
                    You are an expert at analysing cypher queries.
                    You are provided with text, a cypher query and what is missing in the cypher query.
                    Your job is to add any missing nodes or relationships to the cypher query based on the text and return the modified query.
                    Only add the missing nodes or relationships.
                    Do not change the existing query in any way.
                    """
        user_prompt = f"""
                    The text is:{text};
                    The query is:{query};
                    Missing nodes/relationships are: {missing};
                    """
        return cls._chat([
            {"role": "system", "content": sys_prompt},
            # The payload is user input; it was previously sent with role "system".
            {"role": "user", "content": user_prompt},
        ])

    @classmethod
    def run(cls, docs, graph_name):
        """Extract a graph from ``docs`` and write it to the FalkorDB graph ``graph_name``.

        Nothing is written when no edges could be extracted. The model's reply
        is stripped of a surrounding ``` code fence before it is executed.
        ``build_graph_image`` / ``save_graph_image`` and ``check_missing_nodes``
        / ``fix_missing_nodes`` are available but intentionally not part of this
        pipeline.
        """
        # Connect to FalkorDB
        db = FalkorDB(host='localhost', port=6379)
        g = db.select_graph(graph_name)
        graph = cls.create_graph(documents=docs)
        if not graph:
            print("No edges were extracted; nothing written to graph:", graph_name)
            return
        query = cls.get_create_graph_query(graph=graph)
        print("FIRST CREATION QUERY:", query)
        modified_query = cls._strip_code_fence(cls.clean_create_graph_query(query=query))
        print("MODIFIED QUERY:", modified_query)
        g.query(modified_query)

In [4]:
class RetrieveFromGraph:
    """Answer natural-language questions against a FalkorDB graph.

    The graph's nodes and distinct relationship signatures are described to the
    chat model, which is asked to produce a Cypher query; that query is then
    executed and its first result row returned.
    """

    def __init__(self, graph_name) -> None:
        """Connect to FalkorDB on localhost:6379 and select ``graph_name``."""
        self.db = FalkorDB(host='localhost', port=6379)
        self.graph = self.db.select_graph(graph_name)
        # The node-count sentence used to mention a "Module label", which does
        # not exist in this graph; the count is the total number of nodes.
        self.QUESTION_GEN_PROMPT = """
        You’re a Cypher expert, with access to the following graph:
        The knowledge graph schema is as follows:
        The graph contains the following node labels:
        {node_text}
        The graph contains {node_length} nodes in total.
        The graph contains the following relationship types:
        {relationship_text}
        This is the end of the knowledge graph schema description.
        Ensure to generate the cypher query for the question passed to you
        """
        self.question_prompt_template = PromptTemplate.from_template(self.QUESTION_GEN_PROMPT)

    def create_query(self, question):
        """Return the model's raw reply (expected to contain a Cypher query) for ``question``."""
        system_prompt = self.question_prompt_template.format(
            node_text=self.view_nodes(),
            node_length=self.get_total_number_of_nodes(),
            relationship_text=self.view_edges(),
        )
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            temperature=0.0,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{question}"},
            ],
        )
        return response.choices[0].message.content

    def view_nodes(self):
        """Return one line per node in the graph (``str()`` of each node)."""
        nodes = self.graph.query("""MATCH (n) RETURN n""")
        return "".join(f"{row[0]}\n" for row in nodes.result_set)

    def get_total_number_of_nodes(self):
        """Return the number of nodes, counted by the database rather than by fetching every node."""
        return self.graph.query("""MATCH (n) RETURN count(n)""").result_set[0][0]

    def get_unique_edge_set(self):
        """Return the set of distinct ``TYPE {key: value}...`` relationship signatures."""
        edges = self.graph.query("""MATCH ()-[r]->() RETURN type(r) AS type, properties(r) AS properties""")
        edge_set = set()
        for rel_type, properties in edges.result_set:
            signature = str(rel_type) + "".join(
                f" {{{key}: {value}}}" for key, value in properties.items()
            )
            edge_set.add(signature)
        return edge_set

    def view_edges(self):
        """Return one relationship signature per line, sorted so the prompt is deterministic."""
        return "".join(f"{signature}\n" for signature in sorted(self.get_unique_edge_set()))

    def get_cypher_text_from_output(self, text):
        """Extract the Cypher query from a model reply.

        Prefers a ```cypher fenced block, then any fenced block, and finally
        falls back to the whole reply stripped (instead of raising
        ``AttributeError`` when no fence is present).
        """
        cypher_block = re.search(r'```cypher\n(.*?)\n```', text, re.DOTALL)
        if cypher_block:
            return cypher_block.group(1)
        any_block = re.search(r'```[^\n`]*\n(.*?)```', text, re.DOTALL)
        if any_block:
            return any_block.group(1).strip()
        return text.strip()

    def get_total_number_of_edges(self):
        """Return the number of distinct relationship signatures (not the raw edge count)."""
        return len(self.get_unique_edge_set())

    def get_answer(self, question):
        """Generate and run a Cypher query for ``question``.

        Returns the first result row, or ``None`` when the query yields no rows.
        """
        query = self.create_query(question)
        print("QUERY:", query)
        cypher_query = self.get_cypher_text_from_output(query)
        rows = self.graph.query(cypher_query).result_set
        return rows[0] if rows else None

In [3]:
class TextToCypher:
    """Convert free text into an OpenCypher graph-creation script in two LLM passes.

    The first pass drafts a script from the text; the second pass is given the
    text plus the draft and asked to add implicit relationships. ``run``
    returns the Cypher extracted from the second reply.
    """

    MODEL = "gpt-4-turbo"

    TRIPLET_PROMPT = """
                    You are an expert at creating graphs from text using OpenCypher.
                    You are provided with a text and your job is to create a graph from the text.
                    Ensure to create the graph based on the relationships between the entities in the text.
                    Ensure too set nouns as nodes and verbs as relationships.

                    ## Ensure to not use metadata unless necessary and ensure to normalize the relationships between the nodes.:
                    For example:
                    Incorrect:
                    ```
                    MERGE (p1:Person { name: "David", interest: ["Guitar"] })
                    MERGE (p2:Person { name: "Sarah", interest: ["Guitar"] })
                    ```
                    Correct(normalised relationships):
                    ```
                    MERGE (s:Interest { name: "Guitar" })
                    MERGE (p1:Person { name: "David" })
                    MERGE (p1)-[:HAS]->(s)
                    MERGE (p2:Person { name: "Sarah" })
                    MERGE (p2)-[:HAS]->(s)
                    ```

                    ##Ensure to capture possessive relationships as well:
                    John's car is red: should generate a node for John and a node for car with property color as red.
                    
                    ##Ensure to not be ambiguous in the relationships between the nodes:
                    Ensure to capture the relationships between the nodes with enough detail to not be ambiguous.

                    ##Ensure to Reify the relationships between the nodes:
                    To reify a relationship is to make it an entity in its own right. 
                    For example, if you have a relationship between a person and a car, 
                    you can reify that relationship into a new entity called "owns" and connect the person and the car to the "owns" entity.

                    ##Capture all details in the text within the graph:
                    """

    @classmethod
    def _chat(cls, user_prompt):
        """Send ``user_prompt`` with ``TRIPLET_PROMPT`` as system message; return the reply text."""
        response = client.chat.completions.create(
            model=cls.MODEL,
            temperature=0.0,
            messages=[
                {"role": "system", "content": cls.TRIPLET_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
        )
        return response.choices[0].message.content

    @classmethod
    def cypher_from_text_first_pass(cls, text):
        """Return the model's first-draft reply for ``text`` (raw, still fenced)."""
        # TODO: entire text is dumped for the time being, need to split it into
        # chunks if it exceeds the model's context limit.
        USER_PROMPT = """
                    The text is:
                    {text} 
                    Try to not use any metadata unless necessary and ensure to normalize the relationships between the nodes.
                    Ensure the relationships between the nodes are not ambiguous and capture all details in the text within the graph and make sense.
                    """
        return cls._chat(USER_PROMPT.format(text=text))

    @classmethod
    def get_cypher_text_from_result(cls, text):
        """Extract the Cypher script from a model reply.

        Prefers a ```cypher fenced block, then any fenced block, and finally
        falls back to the whole reply stripped (instead of raising
        ``AttributeError`` when the model omits the fence).
        """
        cypher_block = re.search(r'```cypher\n(.*?)\n```', text, re.DOTALL)
        if cypher_block:
            return cypher_block.group(1)
        any_block = re.search(r'```[^\n`]*\n(.*?)```', text, re.DOTALL)
        if any_block:
            return any_block.group(1).strip()
        return text.strip()

    @classmethod
    def cypher_from_text_second_pass(cls, text, cypher):
        """Return the model's refined reply given ``text`` and the first-pass ``cypher``."""
        USER_PROMPT = """
                        The text is: {text}. The cypher query is: {cypher}, modify the cypher query to even include implicit relationships between the nodes. 
                        Do not add in any nodes or relationships that are not present in the text. Ensure to capture all relationships between the nodes in the text.
                    """
        return cls._chat(USER_PROMPT.format(text=text, cypher=cypher))

    @classmethod
    def run(cls, text):
        """Run both passes on ``text`` and return the final Cypher script."""
        first_pass = cls.cypher_from_text_first_pass(text)
        first_cypher = cls.get_cypher_text_from_result(first_pass)
        second_pass = cls.cypher_from_text_second_pass(text, first_cypher)
        return cls.get_cypher_text_from_result(second_pass)


In [6]:
# Initialise the graph handle used by this notebook session: connect to the
# FalkorDB server on localhost:6379 and select the graph named "Conv1".
# NOTE(review): `g` is not referenced in the cells visible here — confirm it is
# still needed by later cells.
graph_name = "Conv1"
db = FalkorDB(host='localhost', port=6379)
g = db.select_graph(graph_name)

In [4]:
# The bare string literal below is a disabled call to
# ProcessConversation.create_docs, kept for reference; it has no runtime effect.
"""
doc = ProcessConversation.create_docs(infile_file_path="./docs/bezos.txt", 
                                    speaker_mapping={"SPEAKER_00": "Jeff Bezos", "SPEAKER_01": "Lex Fridman"})
"""
# Hard-coded fixture: six Documents, numbered '1'-'6' in their metadata, whose
# text appears to be transcript chunks with pronouns already replaced by names.
doc = [
       Document(text="The following is a conversation with Jeff Bezos, founder of Amazon and Blue Origin. This is Jeff Bezos's first time doing a conversation of this kind and of this length. And as Jeff Bezos told Lex Fridman, it felt like Lex Fridman and Jeff Bezos could have easily talked for many more hours, and Lex Fridman is sure Lex Fridman and Jeff Bezos will.", metadata={'num': '1'}),
       Document(text="This is the Lex Friedman Podcast. And now, dear friends, here's Jeff Bezos. Jeff Bezos spent a lot of Jeff Bezos's childhood with Jeff Bezos's grandfather on a ranch here in Texas.", metadata={'num': '2'}),
       Document(text="And Lex Fridman heard Jeff Bezos had a lot of work to do around the ranch. So what's the coolest job Jeff Bezos remembers doing there? Wow, coolest.", metadata={'num': '3'}),
       Document(text="Most interesting. Most memorable. Most memorable.", metadata={'num': '4'}),
       Document(text="Most impactful. It's a real working ranch. And Jeff Bezos spent all Jeff Bezos's summers on that ranch from age four to 16. And Jeff Bezos's grandfather was really taking Jeff Bezos in the summers and the In the early summers, Jeff Bezos's grandfather was letting Jeff Bezos pretend to help on the ranch because, of course, a four-year-old is a burden, not a help in real life.", metadata={'num': '5'}),
       Document(text="Jeff Bezos's grandfather was really just watching Jeff Bezos and taking care of Jeff Bezos. Jeff Bezos's grandfather was doing that because Jeff Bezos's mom was so young. She had Jeff Bezos when she was 17, and so Jeff Bezos's grandfather was sort of giving her a break, and Jeff Bezos's grandmother and Jeff Bezos's grandfather would take Jeff Bezos for the summers.", metadata={'num': '6'})
       ]

In [5]:
# Pick which fixture Document to convert (index into `doc`) and show its text.
i = 0
text = doc[i].text
print(text)   

The following is a conversation with Jeff Bezos, founder of Amazon and Blue Origin. This is Jeff Bezos's first time doing a conversation of this kind and of this length. And as Jeff Bezos told Lex Fridman, it felt like Lex Fridman and Jeff Bezos could have easily talked for many more hours, and Lex Fridman is sure Lex Fridman and Jeff Bezos will.


In [6]:
# Run the two-pass text -> Cypher conversion on the selected text; this issues
# two chat-completion requests (see the two HTTP log lines in the output).
cypher = TextToCypher.run(text) 
print(cypher)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


// Create nodes for the people and organizations
MERGE (jeff:Person {name: "Jeff Bezos"})
MERGE (lex:Person {name: "Lex Fridman"})
MERGE (amazon:Company {name: "Amazon"})
MERGE (blueOrigin:Company {name: "Blue Origin"})

// Create relationships for founding
MERGE (jeff)-[:FOUNDED]->(amazon)
MERGE (jeff)-[:FOUNDED]->(blueOrigin)

// Create a node for the conversation event
MERGE (conversation:Conversation {type: "First long conversation"})

// Connect Jeff and Lex to the conversation
MERGE (jeff)-[:PARTICIPATED_IN]->(conversation)
MERGE (lex)-[:PARTICIPATED_IN]->(conversation)

// Express the potential for future conversations
MERGE (futureConversations:FutureConversations {description: "Potential for many more hours of conversation"})

// Connect Jeff and Lex to future conversations
MERGE (jeff)-[:MAY_HAVE]->(futureConversations)
MERGE (lex)-[:MAY_HAVE]->(futureConversations)

// Reify the relationship of telling
MERGE (telling:Event {description: "Jeff Bezos told Lex Fridman"})
MERGE 

In [30]:
def parse_cypher_to_json(cypher):
    """Convert a Cypher ``MERGE`` script into a JSON list of relationship triples.

    Nodes are read from ``MERGE (var:Label {props})`` statements (the property
    block is optional) and relationships from ``MERGE (a)-[:TYPE]->(b)``
    statements (an optional ``{...}`` property block on the relationship is
    accepted and ignored).

    Args:
        cypher: Cypher script as a string.

    Returns:
        A JSON string (``indent=4``) holding a list of objects with keys
        ``node_1``, ``node_2`` (each ``{"label", "name"}``) and ``relationship``
        (``"<name_1> <type lower-cased, underscores as spaces> <name_2>"``).
        A node's name is its ``name`` property, else its ``kind`` property,
        else ``"Unknown"``. A relationship that references a variable never
        declared with a label gets a placeholder node
        ``{"label": "Unknown", "name": <variable>}`` instead of raising KeyError.
    """
    node_pattern = re.compile(r'MERGE \((\w+):(\w+)(?:\s*\{([^}]*)\})?\)')
    relationship_pattern = re.compile(
        r'MERGE \((\w+)\)-\[:(\w+)(?:\s*\{[^}]*\})?\]->\((\w+)\)'
    )
    # key: value pairs; quoted values may contain commas and colons, which the
    # previous split(", ") / split(": ") parsing could not handle.
    property_pattern = re.compile(
        r'''(\w+)\s*:\s*("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'|[^,]*)'''
    )

    def parse_properties(block):
        """Parse the inside of a ``{...}`` block into a dict of string values."""
        props = {}
        for key, raw in property_pattern.findall(block or ""):
            raw = raw.strip()
            if len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in "\"'":
                raw = raw[1:-1]
            props[key] = raw
        return props

    nodes = {}
    for match in node_pattern.finditer(cypher):
        var, label, attrs = match.groups()
        attr_dict = parse_properties(attrs)
        name = attr_dict.get('name', attr_dict.get('kind', 'Unknown'))
        nodes[var] = {"label": label, "name": name}

    relationships = []
    for match in relationship_pattern.finditer(cypher):
        var_1, rel, var_2 = match.groups()
        node_1 = nodes.get(var_1, {"label": "Unknown", "name": var_1})
        node_2 = nodes.get(var_2, {"label": "Unknown", "name": var_2})
        relationships.append({
            "node_1": node_1,
            "node_2": node_2,
            "relationship": f"{node_1['name']} {rel.lower().replace('_', ' ')} {node_2['name']}",
        })

    return json.dumps(relationships, indent=4)

In [31]:
# Convert the generated Cypher into JSON relationship triples and show them.
# `f` holds the JSON string returned by parse_cypher_to_json.
f = parse_cypher_to_json(cypher)
print(f)

[
    {
        "node_1": {
            "label": "Person",
            "name": "Jeff Bezos"
        },
        "node_2": {
            "label": "Company",
            "name": "Amazon"
        },
        "relationship": "Jeff Bezos founded Amazon"
    },
    {
        "node_1": {
            "label": "Person",
            "name": "Jeff Bezos"
        },
        "node_2": {
            "label": "Company",
            "name": "Blue Origin"
        },
        "relationship": "Jeff Bezos founded Blue Origin"
    },
    {
        "node_1": {
            "label": "Person",
            "name": "Jeff Bezos"
        },
        "node_2": {
            "label": "Conversation",
            "name": "Unknown"
        },
        "relationship": "Jeff Bezos participated in Unknown"
    },
    {
        "node_1": {
            "label": "Person",
            "name": "Lex Fridman"
        },
        "node_2": {
            "label": "Conversation",
            "name": "Unknown"
        },
        "relation

In [10]:
def extract_relationships_and_nodes_with_properties(cypher_str):
    """Group the relationships found in a Cypher script by relationship type.

    Node declarations of the form ``(var:Label {key: "value", ...})`` are
    collected first. Relationship statements of the form
    ``MERGE (a)-[:TYPE]->(b)`` or ``CREATE (a)-[:TYPE]->(b)`` are then
    resolved against those declarations.

    Args:
        cypher_str: Cypher text containing node declarations and
            relationship statements.

    Returns:
        A dict mapping each relationship type to a list of
        ``{"node_1": node, "node_2": node}`` entries, where ``node`` is
        ``{"label": str, "properties": dict}``. Property values are returned
        as strings with their surrounding quotes removed.

    Raises:
        KeyError: If a relationship refers to a variable that was never
            declared together with a label.
    """
    # Matches node declarations such as (jeff:Person {name: "Jeff Bezos"}).
    node_pattern = re.compile(r'\((\w+):(\w+)(?: \{([^\}]*)\})?\)')
    # Both MERGE and CREATE relationship statements are accepted so the
    # function works on either flavour of generated Cypher.
    relationship_pattern = re.compile(r'(?:CREATE|MERGE) \((\w+)\)-\[:(\w+)\]->\((\w+)\)')
    # One `key: value` pair. Quoted values are matched as a unit so commas or
    # colons inside the quotes do not split the pair; the space after the
    # colon is optional.
    property_pattern = re.compile(
        r'''(\w+)\s*:\s*("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'|[^,]+)'''
    )

    def parse_properties(properties_str):
        """Turn the text between ``{`` and ``}`` into a dict of strings."""
        if not properties_str:
            return {}
        prop_dict = {}
        for prop in property_pattern.finditer(properties_str):
            key = prop.group(1)
            value = prop.group(2).strip()
            # Remove the matching pair of quotes from string values.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
                value = value[1:-1]
            prop_dict[key] = value
        return prop_dict

    nodes = {}
    relationships = {}

    # Extract nodes and their properties, keyed by the Cypher variable name.
    for match in node_pattern.finditer(cypher_str):
        node_var, label, properties_str = match.groups()
        nodes[node_var] = {"label": label, "properties": parse_properties(properties_str)}

    # Extract relationships and attach the node records they refer to.
    for match in relationship_pattern.finditer(cypher_str):
        node1, rel_type, node2 = match.groups()
        relationships.setdefault(rel_type, []).append({
            "node_1": nodes[node1],
            "node_2": nodes[node2]
        })

    return relationships

In [11]:
# Group the relationships in `cypher` by relationship type and print the
# resulting dict.
rel_dict = extract_relationships_and_nodes_with_properties(cypher)
print(rel_dict)

{'FOUNDED': [{'node_1': {'label': 'Person', 'properties': {'name': 'Jeff Bezos'}}, 'node_2': {'label': 'Organization', 'properties': {'name': 'Amazon'}}}, {'node_1': {'label': 'Person', 'properties': {'name': 'Jeff Bezos'}}, 'node_2': {'label': 'Organization', 'properties': {'name': 'Blue Origin'}}}], 'PARTICIPATED_IN': [{'node_1': {'label': 'Person', 'properties': {'name': 'Jeff Bezos'}}, 'node_2': {'label': 'Conversation', 'properties': {'kind': 'first time', 'length': 'long'}}}, {'node_1': {'label': 'Person', 'properties': {'name': 'Lex Fridman'}}, 'node_2': {'label': 'Conversation', 'properties': {'kind': 'first time', 'length': 'long'}}}], 'TOLD': [{'node_1': {'label': 'Person', 'properties': {'name': 'Jeff Bezos'}}, 'node_2': {'label': 'Person', 'properties': {'name': 'Lex Fridman'}}}], 'COULD_HAVE_TALKED_FOR_MORE_HOURS_WITH': [{'node_1': {'label': 'Person', 'properties': {'name': 'Jeff Bezos'}}, 'node_2': {'label': 'Person', 'properties': {'name': 'Lex Fridman'}}}], 'SURE_THAT_W

In [36]:
# Run the Cypher text in `cypher` against the graph `g`, then print every
# node the graph holds afterwards.
g.query(cypher)
nodes = g.query("""MATCH (n) RETURN n""")
for node in nodes.result_set:
    # The query returns a single column (`n`), so index 0 is the node itself.
    print(node[0])

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


first cypher: // Create nodes
MERGE (podcast:Podcast {name: "Lex Friedman Podcast"})
MERGE (jeff:Person {name: "Jeff Bezos"})
MERGE (grandfather:Person {name: "Grandfather"}) // Assuming the name is not specified
MERGE (ranch:Location {name: "Ranch", state: "Texas"})
MERGE (childhood:Period {name: "Childhood"})

// Create relationships
MERGE (jeff)-[:GUEST_ON]->(podcast)
MERGE (childhood)-[:SPENT_WITH]->(grandfather)
MERGE (childhood)-[:OCCURRED_AT]->(ranch)
MERGE (jeff)-[:HAD]->(childhood)
second cypher: // Create nodes
MERGE (podcast:podcast {name: "Lex Friedman Podcast"})
MERGE (jeff:person {name: "Jeff Bezos"})
MERGE (grandfather:person {name: "Grandfather"}) // Assuming the name is not specified
MERGE (ranch:location {name: "Ranch", state: "Texas"})
MERGE (childhood:period {name: "Childhood"})

// Create relationships
MERGE (jeff)-[:GUEST_ON]->(podcast)
MERGE (jeff)-[:SPENT]->(childhood)
MERGE (childhood)-[:WITH]->(grandfather)
MERGE (childhood)-[:AT]->(ranch)
MERGE (jeff)-[:HAD]-

In [37]:
# Run the Cypher text in `cypher` against the graph `g`; the query result
# object is shown as the cell output.
g.query(cypher)

<falkordb.query_result.QueryResult at 0x782ca956bb10>

In [38]:
# Print every node currently stored in the graph `g`.
nodes = g.query("""MATCH (n) RETURN n""")
for node in nodes.result_set:
    # The query returns a single column (`n`), so index 0 is the node itself.
    print(node[0])

(:person{name:"Jeff Bezos"})
(:person{name:"Lex Fridman"})
(:company{name:"Amazon"})
(:company{name:"Blue Origin"})
(:conversation{firstTime:True,length:"long",type:"Interview"})
(:futureConversations{description:"Potential continuation"})
(:podcast{name:"Lex Friedman Podcast"})
(:person{name:"Grandfather"})
(:location{name:"Ranch",state:"Texas"})
(:period{name:"Childhood"})


In [47]:
# Module-level pyvis network; save_graph_image and create_graph_image read
# and mutate this shared object.
net = Network(notebook=True, cdn_resources='in_line')

def node_exists(net, node_id):
    """Report whether ``net.nodes`` already holds an entry with this id.

    Args:
        net: Object exposing a ``nodes`` iterable of dicts with an ``'id'`` key.
        node_id: Identifier to look for.

    Returns:
        True if some entry's ``'id'`` equals ``node_id``, otherwise False.
    """
    for existing in net.nodes:
        if existing['id'] == node_id:
            return True
    return False

def save_graph_image(filename):
    """Render the module-level ``net`` to an HTML file with physics controls.

    Args:
        filename: Output path; it must end in ``.html``.

    Returns:
        None. When ``filename`` does not end in ``.html`` a message is
        printed and nothing is rendered.
    """
    # Guard first: previously `net.show` ran unconditionally after the
    # extension check, so a valid file was rendered twice and an invalid
    # name was still handed to `net.show` after the warning.
    if not filename.endswith('.html'):
        print("Filename should have a .html extension")
        return
    net.show_buttons(filter_=['physics'])
    net.show(filename)

def create_graph_image(graph, filename):
    """Add every edge of ``graph`` to the module-level ``net`` and save it.

    Args:
        graph: Iterable of items exposing ``node_1``, ``node_2`` (each with
            ``label`` and ``name``) and ``relationship``.
        filename: Path handed on to ``save_graph_image``.
    """
    for edge in graph:
        src_label, dst_label = edge.node_1.label, edge.node_2.label
        src_name, dst_name = edge.node_1.name, edge.node_2.name
        rel_text = edge.relationship
        print(f"{src_label}: {src_name} --> {dst_label}:{dst_name}; rel:{rel_text}")

        # Node names double as node ids, so each name is added only once.
        for node_label, node_name in ((src_label, src_name), (dst_label, dst_name)):
            if node_exists(net, node_id=node_name):
                continue
            net.add_node(node_name, label=f"{node_label}:{node_name}")

        net.add_edge(src_name, dst_name, title=rel_text, arrows='to')

    save_graph_image(filename)


def extract_relationships_and_nodes_with_properties(cypher_str):
    """Group the relationships found in a Cypher script by relationship type.

    Node declarations of the form ``(var:Label {key: "value", ...})`` are
    collected first. Relationship statements of the form
    ``CREATE (a)-[:TYPE]->(b)`` or ``MERGE (a)-[:TYPE]->(b)`` are then
    resolved against those declarations.

    Args:
        cypher_str: Cypher text containing node declarations and
            relationship statements.

    Returns:
        A dict mapping each relationship type to a list of
        ``{"node_1": node, "node_2": node}`` entries, where ``node`` is
        ``{"label": str, "properties": dict}``. Property values are returned
        as strings with their surrounding quotes removed.

    Raises:
        KeyError: If a relationship refers to a variable that was never
            declared together with a label.
    """
    # Matches node declarations such as (jeff:Person {name: "Jeff Bezos"}).
    node_pattern = re.compile(r'\((\w+):(\w+)(?: \{([^\}]*)\})?\)')
    # Both CREATE and MERGE relationship statements are accepted so the
    # function works on either flavour of generated Cypher.
    relationship_pattern = re.compile(r'(?:CREATE|MERGE) \((\w+)\)-\[:(\w+)\]->\((\w+)\)')
    # One `key: value` pair. Quoted values are matched as a unit so commas or
    # colons inside the quotes do not split the pair; the space after the
    # colon is optional.
    property_pattern = re.compile(
        r'''(\w+)\s*:\s*("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'|[^,]+)'''
    )

    def parse_properties(properties_str):
        """Turn the text between ``{`` and ``}`` into a dict of strings."""
        if not properties_str:
            return {}
        prop_dict = {}
        for prop in property_pattern.finditer(properties_str):
            key = prop.group(1)
            value = prop.group(2).strip()
            # Remove the matching pair of quotes from string values.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
                value = value[1:-1]
            prop_dict[key] = value
        return prop_dict

    nodes = {}
    relationships = {}

    # Extract nodes and their properties, keyed by the Cypher variable name.
    for match in node_pattern.finditer(cypher_str):
        node_var, label, properties_str = match.groups()
        nodes[node_var] = {"label": label, "properties": parse_properties(properties_str)}

    # Extract relationships and attach the node records they refer to.
    for match in relationship_pattern.finditer(cypher_str):
        node1, rel_type, node2 = match.groups()
        relationships.setdefault(rel_type, []).append({
            "node_1": nodes[node1],
            "node_2": nodes[node2]
        })

    return relationships

In [48]:
# Group the relationships in `cyp_text_2` by relationship type and print the
# resulting dict.
rel = extract_relationships_and_nodes_with_properties(cyp_text_2)
print(rel)

{'TOOK_CARE_OF_JB_DURING_SUMMERS': [{'node_1': {'label': 'Person', 'properties': {'name': "Jeff Bezos's Grandfather"}}, 'node_2': {'label': 'Person', 'properties': {'name': 'Jeff Bezos'}}}], 'INVOLVED': [{'node_1': {'label': 'Activity', 'properties': {'description': 'pretend to help on the ranch'}}, 'node_2': {'label': 'Person', 'properties': {'name': 'Jeff Bezos'}}}]}


In [17]:
# NOTE(review): the recorded output of this cell is
# "TypeError: 'Graph' object is not iterable". create_graph_image loops over
# its first argument, so `g` cannot be passed directly; it presumably needs
# an iterable of edges with node_1/node_2/relationship attributes — confirm.
create_graph_image(g, "graph.html")

TypeError: 'Graph' object is not iterable

In [19]:
# NOTE(review): the recorded output of this cell is a NameError. `graph` is
# only assigned by the `CreateGraph.run(...)` cell, which has to run first.
print(graph)

NameError: name 'graph' is not defined

In [23]:
# Hand-written CREATE query that is checked against the text of the first
# document for information it fails to capture.
query = """
CREATE
(:Person {name:"Jeff Bezos"})-[:FOUNDED]->(:Organisation {name:"Amazon"}),
(:Person {name:"Jeff Bezos"})-[:FOUNDED]->(:Organisation {name:"Blue Origin"}),
(:Person {name:"Jeff Bezos"})-[:HAD_CONVERSATION_WITH {details: "could have talked for many more hours, will talk again", first_time: "true", feeling: "felt like"}]->(:Person {name:"Lex Fridman"}),
(:Person {name:"Lex Fridman"})-[:HAD_CONVERSATION_WITH {details: "could have talked for many more hours, will talk again", mutual_assurance: "true"}]->(:Person {name:"Jeff Bezos"}),
(:Person {name:"Jeff Bezos"})-[:SPENT_CHILDHOOD_IN {location: "ranch", details: "spent a lot of time, did a lot of work, remembers doing the coolest job"}]->(:Place {name:"Texas"}),
(:Person {name:"Jeff Bezos"})-[:SPENT_SUMMERS_WITH {age_range: "from age four to 16"}]->(:Person {name:"Jeff Bezos's grandfather"}),
(:Person {name:"Jeff Bezos"})-[:GRANDFATHER {relationship: "took care during summers, watched over, gave breaks to mom"}]->(:Person {name:"Jeff Bezos's grandfather"}),
(:Person {name:"Jeff Bezos's grandfather"})-[:TOOK_CARE_WITH]->(:Person {name:"grandmother"}),
(:Person {name:"Jeff Bezos"})-[:BORN_TO {age_of_mother: "17"}]->(:Person {name:"mom"});
"""
# `missing` is used as a string below; judging by the recorded output it is
# free-form model text describing what the query leaves out.
missing = CreateGraph.check_missing_nodes(query, doc[0].text)
print("MISSING NODES:",missing)

# Only request a repaired query when the check returned something other than
# the literal word "none" (compared case-insensitively).
if missing.lower() != "none":
    modified_query = CreateGraph.fix_missing_nodes(query, doc[0].text, missing)
    print("FIXED QUERY:", modified_query)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


MISSING NODES: The provided cypher query captures several aspects of Jeff Bezos's life and interactions, including his founding of Amazon and Blue Origin, his conversation with Lex Fridman, and some details about his family and childhood. However, upon reviewing the text, it appears that the query does not include all the relevant information:

1. **Missing Relationship Details in Conversation**: The query mentions the conversation between Jeff Bezos and Lex Fridman, including some details about the conversation's nature and mutual feelings. However, it does not explicitly state that this was Jeff Bezos's first time doing a conversation of this kind and length, which is a significant aspect of the conversation's uniqueness.

2. **Missing Details about Lex Fridman's Assurance**: While the query mentions mutual assurance in the conversation, it specifically highlights that "Lex Fridman is sure Lex Fridman and Jeff Bezos will" talk again. This assurance from Lex Fridman's perspective is n

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


FIXED QUERY: CREATE
(:Person {name:"Jeff Bezos"})-[:FOUNDED]->(:Organisation {name:"Amazon"}),
(:Person {name:"Jeff Bezos"})-[:FOUNDED]->(:Organisation {name:"Blue Origin"}),
(:Person {name:"Jeff Bezos"})-[:HAD_CONVERSATION_WITH {details: "could have talked for many more hours, will talk again", first_time: "true", feeling: "felt like", first_of_kind_and_length: "true"}]->(:Person {name:"Lex Fridman"}),
(:Person {name:"Lex Fridman"})-[:HAD_CONVERSATION_WITH {details: "could have talked for many more hours, will talk again", mutual_assurance: "true", assured_future_conversation: "true"}]->(:Person {name:"Jeff Bezos"}),
(:Person {name:"Jeff Bezos"})-[:SPENT_CHILDHOOD_IN {location: "ranch", details: "spent a lot of time, did a lot of work, remembers doing the coolest job"}]->(:Place {name:"Texas"}),
(:Person {name:"Jeff Bezos"})-[:SPENT_SUMMERS_WITH {age_range: "from age four to 16"}]->(:Person {name:"Jeff Bezos's grandfather"}),
(:Person {name:"Jeff Bezos"})-[:GRANDFATHER {relationship: 

In [8]:
# Build the graph named "Convo-1" from the documents in `doc`; the recorded
# output shows the generated and modified CREATE queries being logged.
graph = CreateGraph.run(docs=doc, graph_name="Convo-1")

[92m[39m
[92m▶︎ GRAPH MAKER LOG - 2024-06-02 18:19:03 - INFO [39m
[92mDocument: 1[39m
[92m[39m


Using Model:  gpt-4-turbo


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m[39m
[92m▶︎ GRAPH MAKER LOG - 2024-06-02 18:19:23 - INFO [39m
[92mTrying JSON Parsing: 
[
    {
        "node_1": {"label": "Person", "name": "Jeff Bezos"},
        "node_2": {"label": "Organisation", "name": "Amazon"},
        "relationship": "Jeff Bezos is the founder of Amazon."
    },
    {
        "node_1": {"label": "Person", "name": "Jeff Bezos"},
        "node_2": {"label": "Organisation", "name": "Blue Origin"},
        "relationship": "Jeff Bezos is the founder of Blue Origin."
    },
    {
        "node_1": {"label": "Person", "name": "Jeff Bezos"},
        "node_2": {"label": "Person", "name": "Lex Fridman"},
        "relationship": "Jeff Bezos had a conversation with Lex Fridman."
    },
    {
        "node_1": {"label": "Person", "name": "Lex Fridman"},
        "node_2": {"label": "Document", "name": "Lex Friedman Podcast"},
        "relationship": "Lex Fridman hosts the Le

Graph creatd with total number of Edges: 10
FIRST CREATION QUERY: CREATE
(:Person {name:"Jeff Bezos"})-[:Jeff_Bezos_is_the_founder_of_Amazon]->(:Organisation {name:"Amazon"}),
(:Person {name:"Jeff Bezos"})-[:Jeff_Bezos_is_the_founder_of_Blue_Origin]->(:Organisation {name:"Blue Origin"}),
(:Person {name:"Jeff Bezos"})-[:Jeff_Bezos_had_a_conversation_with_Lex_Fridman]->(:Person {name:"Lex Fridman"}),
(:Person {name:"Lex Fridman"})-[:Lex_Fridman_hosts_the_Lex_Friedman_Podcast]->(:Document {name:"Lex Friedman Podcast"}),
(:Person {name:"Jeff Bezos"})-[:Jeff_Bezos_spent_a_lot_of_his_childhood_with_his_grandfather_on_a_ranch_in_Texas]->(:Place {name:"Texas"}),
(:Person {name:"Jeff Bezos"})-[:Jeff_Bezos_spent_his_summers_from_age_four_to_16_on_a_ranch_under_the_care_of_his_grandfather]->(:Person {name:"Jeff Bezos's grandfather"}),
(:Person {name:"Jeff Bezos's grandfather"})-[:Jeff_Bezoss_grandfather_took_care_of_him_during_the_summers,_allowing_him_to_pretend_to_help_on_the_ranch]->(:Person {

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


MODIFIED QUERY: CREATE
(:Person {name:"Jeff Bezos"})-[:FOUNDER_OF]->(:Organisation {name:"Amazon"}),
(:Person {name:"Jeff Bezos"})-[:FOUNDER_OF]->(:Organisation {name:"Blue Origin"}),
(:Person {name:"Jeff Bezos"})-[:HAD_CONVERSATION_WITH]->(:Person {name:"Lex Fridman"}),
(:Person {name:"Lex Fridman"})-[:HOSTS]->(:Document {name:"Lex Friedman Podcast"}),
(:Person {name:"Jeff Bezos"})-[:SPENT_CHILDHOOD_AT {location: 'ranch', state: 'Texas'}]->(:Place {name:"Texas"}),
(:Person {name:"Jeff Bezos"})-[:CARED_FOR_BY {age_range: 'four to 16', during: 'summers'}]->(:Person {name:"Jeff Bezos's grandfather"}),
(:Person {name:"Jeff Bezos's grandfather"})-[:CARED_FOR {activity: 'pretend help on ranch', during: 'summers'}]->(:Person {name:"Jeff Bezos"}),
(:Person {name:"Jeff Bezos's mom"})-[:HAD_AT_AGE {age: '17'}]->(:Person {name:"Jeff Bezos"}),
(:Person {name:"Jeff Bezos's grandfather"})-[:GAVE_BREAK_TO {during: 'summers'}]->(:Person {name:"Jeff Bezos's mom"}),
(:Person {name:"Jeff Bezos's grandmo

In [9]:
# Open the "Convo-1" graph for querying and keep the results of its node and
# edge views for later use.
graph_to_query = RetrieveFromGraph(graph_name="Convo-1")
nodes  = graph_to_query.view_nodes()
edges = graph_to_query.view_edges()

In [13]:
# Ask a natural-language question against the graph and print the answer.
# The recorded output shows a generated Cypher query followed by the result.
question = "How old was Jeff Bezos when his grandfather took care of him?"
ans = graph_to_query.get_answer(question)
print(ans)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


QUERY: To find out the age range during which Jeff Bezos's grandfather took care of him, we can use the relationship type `CARED_FOR_BY` which includes an age range attribute. Here's the Cypher query to retrieve this information:

```cypher
MATCH (p:Person {name: "Jeff Bezos"})-[r:CARED_FOR_BY]->(g:Person {name: "Jeff Bezos's grandfather"})
RETURN r.age_range AS AgeRange
```

This query matches the nodes representing Jeff Bezos and his grandfather connected by the `CARED_FOR_BY` relationship and returns the age range during which his grandfather took care of him.
['four to 16']
