In [11]:
from dotenv import load_dotenv
import os

import pandas as pd

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load from environment
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
NEO4J_URL = os.getenv('NEO4J_URL')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# # Note the code below is unique to this course environment, and not a 
# # standard part of Neo4j's integration with OpenAI. Remove if running 
# # in your own environment.
# OPENAI_ENDPOINT = os.getenv('OPENAI_BASE_URL') + '/embeddings'

# Global constants
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

In [5]:
AUTH = (os.getenv('NEO4J_USERNAME'), os.getenv('NEO4J_PASSWORD'))
with GraphDatabase.driver(NEO4J_URI, auth = AUTH) as driver:
    driver.verify_connectivity()
    

In [6]:
graph = Neo4jGraph(url=NEO4J_URI, 
                   username=NEO4J_USERNAME, 
                   password=NEO4J_PASSWORD,
                   refresh_schema=False)

def clean_graph():
    query = """
    MATCH (n)
    DETACH DELETE n
    """
    graph.query(query)

In [7]:
from langchain_core.documents import Document

text = """
Marie Curie, 7 November 1867 – 4 July 1934, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
""" 
# She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
# Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
# She was, in 1906, the first woman to become a professor at the University of Paris.
# Also, Robin Williams!
# """
documents = [Document(page_content=text)]

In [8]:
from langchain_openai import ChatOpenAI
import getpass
import os

# os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI api key")

llm = ChatOpenAI(model='gpt-4o-mini', max_tokens=50,api_key=OPENAI_API_KEY )

In [9]:


no_schema = LLMGraphTransformer(llm=llm)
no_schema_prompt = LLMGraphTransformer(llm=llm, ignore_tool_usage=True)

In [10]:
clean_graph()

data = await no_schema_prompt.aconvert_to_graph_documents(documents)
print(data)
graph.add_graph_documents(data)

[GraphDocument(nodes=[], relationships=[], source=Document(metadata={}, page_content='\nMarie Curie, 7 November 1867 – 4 July 1934, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.\n'))]


# this be kinda working

In [18]:
chunk = 1000
data = pd.read_json('data/test/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=chunk)

for chunk in data:
    data_crop = chunk
    break

In [53]:
documents = [Document(page_content=data_crop['abstract'][i], metadata={"authors":data_crop["authors"][i], "category":data_crop["categories"][i]}) for i in range(10)]

In [49]:
clean_graph()
allowed_nodes = ["authors", "category"]
allowed_relations = [("authors", "FIELD", "category")]

llm = ChatOpenAI(temperature=0, model_name="gpt-4-turbo")
llm_transformer = LLMGraphTransformer(llm=llm, strict_mode = True)

data = await llm_transformer.aconvert_to_graph_documents(documents)
print(data)
graph.add_graph_documents(data)

[GraphDocument(nodes=[Node(id='John Doe', type='Person', properties={}), Node(id='Jane Smith', type='Person', properties={}), Node(id='Acme Corporation', type='Organization', properties={})], relationships=[Relationship(source=Node(id='John Doe', type='Person', properties={}), target=Node(id='Jane Smith', type='Person', properties={}), type='COLLEAGUE', properties={}), Relationship(source=Node(id='John Doe', type='Person', properties={}), target=Node(id='Acme Corporation', type='Organization', properties={}), type='EMPLOYEE', properties={}), Relationship(source=Node(id='Jane Smith', type='Person', properties={}), target=Node(id='Acme Corporation', type='Organization', properties={}), type='EMPLOYEE', properties={})], source=Document(metadata={'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan", 'category': 'hep-ph'}, page_content='')), GraphDocument(nodes=[Node(id='John Doe', type='Person', properties={}), Node(id='Jane Smith', type='Person', properties={}), Node(id='Ac