In [1]:
# Standard library imports
import os
import time
from collections import OrderedDict

from grag.create_graph import (
    create_driver, 
    initialize_embeddings,
    initialize_llm,
    initialize_vector_store,
    create_vector_index,
    create_documents_from_df,
    create_batches,
    delete_all_nodes_and_indexes,
    embed_and_add_batch,
    qa_pipeline
)

# Third-party imports
import logging
from dotenv import load_dotenv
import polars as pl
from tqdm import tqdm

# LangChain imports
from langchain.chains import RetrievalQA


In [2]:
# Load environment variables
# (done before the os.getenv lookups in the configuration cell so that values
# supplied through python-dotenv are visible to them)
load_dotenv()

# Configure logging
# Root logger is set to INFO; `logger` is this notebook's own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Variables
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_URI = os.getenv("NEO4J_URI")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Define max tokens per batch
max_tokens_per_batch = 100000

In [4]:
# Cypher statement that creates the vector index named `vector_index` on the
# `embedding` property of `Chunk` nodes (1536 dimensions, cosine similarity).
# `IF NOT EXISTS` makes the statement safe to run again when the index is
# already present, so re-executing the notebook does not fail on this step.
CREATE_VECTOR_INDEX_CYPHER = """
CREATE VECTOR INDEX vector_index IF NOT EXISTS FOR (c:Chunk) ON (c.embedding)
OPTIONS {
  indexConfig:{
    `vector.dimensions`: 1536,
    `vector.hnsw.m`: 48,
    `vector.hnsw.ef_construction`: 200,
    `vector.quantization.enabled`: false,
    `vector.similarity_function`: 'cosine'
  }
};
"""

In [None]:
# Project root that holds the pipeline outputs and evaluation data.
# Override it with the GRAG_PROJECT_DIR environment variable to run the
# notebook on another machine; the default is the original location, so the
# resolved paths are unchanged when the variable is not set.
PROJECT_DIR = os.getenv(
    "GRAG_PROJECT_DIR",
    "/Users/borja/Documents/Somniumrema/projects/genai/grag",
)

# Import data
chunk_df = pl.read_parquet(os.path.join(PROJECT_DIR, "pipeline_outcomes", "chunks_df.parquet"))

# Import questions (the CSV is decoded with the "latin" codec, not UTF-8)
questions = pl.read_csv(os.path.join(PROJECT_DIR, "data", "acquired-qa-evaluation.csv"),
                        encoding="latin")

In [4]:
# Initialize Neo4j driver
driver = create_driver(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)

# Delete all nodes and indexes
# (left disabled; uncomment to wipe the database before re-ingesting)
# delete_all_nodes_and_indexes(driver)

# Initialize embeddings
embeddings = initialize_embeddings()
        
# Initialize language model
# temperature = 0 is passed to the chat model
llm = initialize_llm(model_name = "gpt-3.5-turbo", temperature = 0)
        
# Initialize vector store
# Uses the same Neo4j URI and credentials as the driver above.
vector_store = initialize_vector_store(
    embeddings=embeddings,
    uri=NEO4J_URI,
    user=NEO4J_USERNAME,
    password=NEO4J_PASSWORD
        )

# Initialize the RetrievalQA Chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # Simple concatenation of retrieved docs into the prompt
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 relevant documents
    return_source_documents=True
)

INFO:grag.create_graph:Neo4j driver created successfully.
INFO:grag.create_graph:Embeddings initialized with model: text-embedding-ada-002
INFO:grag.create_graph:Language model initialized with model: gpt-3.5-turbo
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:grag.create_graph:Neo4jVector store initialized with index: vector_index


In [65]:
# Initialize the RetrievalQA Chain
# NOTE(review): this repeats the chain construction from the initialization
# cell with the same arguments; re-running it simply rebinds `qa`.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # Simple concatenation of retrieved docs into the prompt
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 relevant documents
    return_source_documents=True
)

In [8]:
# Create the vector index
# Passes the Neo4j driver and the statement held in CREATE_VECTOR_INDEX_CYPHER
# to the project helper.
# NOTE(review): presumably the helper executes the Cypher as-is — confirm in
# grag.create_graph.
create_vector_index(driver, CREATE_VECTOR_INDEX_CYPHER)

INFO:grag.create_graph:Index 'vector_index' created successfully.


In [9]:
# Create documents from DataFrame
# `chunk_df` is the parquet file loaded in the data-import cell.
documents = create_documents_from_df(chunk_df)
# Log how many documents were produced as a quick sanity check.
logger.info(f"Total documents created: {len(documents)}")

INFO:grag.create_graph:Starting to create Document objects from DataFrame.
INFO:grag.create_graph:Created 1304 Document objects.
INFO:__main__:Total documents created: 1304


In [10]:
# Group the documents into batches, passing max_tokens_per_batch as the
# per-batch limit
batches = create_batches(documents, max_tokens_per_batch=max_tokens_per_batch)
total_batches = len(batches)
print(f"Total Batches to Process: {total_batches}")


# Embed and store the batches one at a time, numbering them from 1
progress = tqdm(batches, desc="Processing Batches")
for batch_num, batch in enumerate(progress, start=1):
    embed_and_add_batch(batch, embeddings, vector_store, batch_num=batch_num)
    time.sleep(1)  # Pause for one second before moving to the next batch

Total Batches to Process: 45


Processing Batches:   0%|          | 0/45 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 1: Successfully added to Neo4jVector.


Processing Batches:   2%|▏         | 1/45 [00:05<04:02,  5.52s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 2: Successfully added to Neo4jVector.


Processing Batches:   4%|▍         | 2/45 [00:09<03:29,  4.88s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 3: Successfully added to Neo4jVector.


Processing Batches:   7%|▋         | 3/45 [00:14<03:22,  4.81s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 4: Successfully added to Neo4jVector.


Processing Batches:   9%|▉         | 4/45 [00:18<03:00,  4.39s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 5: Successfully added to Neo4jVector.


Processing Batches:  11%|█         | 5/45 [00:22<02:49,  4.23s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 6: Successfully added to Neo4jVector.


Processing Batches:  13%|█▎        | 6/45 [00:26<02:40,  4.12s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 7: Successfully added to Neo4jVector.


Processing Batches:  16%|█▌        | 7/45 [00:29<02:29,  3.92s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 8: Successfully added to Neo4jVector.


Processing Batches:  18%|█▊        | 8/45 [00:33<02:24,  3.91s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 9: Successfully added to Neo4jVector.


Processing Batches:  20%|██        | 9/45 [00:38<02:34,  4.29s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 10: Successfully added to Neo4jVector.


Processing Batches:  22%|██▏       | 10/45 [00:43<02:36,  4.48s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 11: Successfully added to Neo4jVector.


Processing Batches:  24%|██▍       | 11/45 [00:48<02:33,  4.52s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 12: Successfully added to Neo4jVector.


Processing Batches:  27%|██▋       | 12/45 [00:53<02:34,  4.69s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 13: Successfully added to Neo4jVector.


Processing Batches:  29%|██▉       | 13/45 [00:58<02:30,  4.69s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 14: Successfully added to Neo4jVector.


Processing Batches:  31%|███       | 14/45 [01:02<02:20,  4.54s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 15: Successfully added to Neo4jVector.


Processing Batches:  33%|███▎      | 15/45 [01:06<02:12,  4.40s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 16: Successfully added to Neo4jVector.


Processing Batches:  36%|███▌      | 16/45 [01:10<02:03,  4.25s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 17: Successfully added to Neo4jVector.


Processing Batches:  38%|███▊      | 17/45 [01:15<02:03,  4.41s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 18: Successfully added to Neo4jVector.


Processing Batches:  40%|████      | 18/45 [01:18<01:53,  4.19s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 19: Successfully added to Neo4jVector.


Processing Batches:  42%|████▏     | 19/45 [01:22<01:48,  4.17s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 2.533000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 20: Successfully added to Neo4jVector.


Processing Batches:  44%|████▍     | 20/45 [01:30<02:11,  5.25s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 1.841000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 21: Successfully added to Neo4jVector.


Processing Batches:  47%|████▋     | 21/45 [01:38<02:21,  5.90s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 1.684000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 22: Successfully added to Neo4jVector.


Processing Batches:  49%|████▉     | 22/45 [01:43<02:14,  5.87s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 3.135000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 23: Successfully added to Neo4jVector.


Processing Batches:  51%|█████     | 23/45 [01:50<02:17,  6.23s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 3.412000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 24: Successfully added to Neo4jVector.


Processing Batches:  53%|█████▎    | 24/45 [01:59<02:26,  6.96s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 2.073000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 25: Successfully added to Neo4jVector.


Processing Batches:  56%|█████▌    | 25/45 [02:07<02:23,  7.18s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 1.536000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 26: Successfully added to Neo4jVector.


Processing Batches:  58%|█████▊    | 26/45 [02:12<02:07,  6.72s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 2.691000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 27: Successfully added to Neo4jVector.


Processing Batches:  60%|██████    | 27/45 [02:20<02:06,  7.02s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 2.692000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 28: Successfully added to Neo4jVector.


Processing Batches:  62%|██████▏   | 28/45 [02:28<02:02,  7.20s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 2.316000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 29: Successfully added to Neo4jVector.


Processing Batches:  64%|██████▍   | 29/45 [02:35<01:53,  7.07s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 2.769000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 30: Successfully added to Neo4jVector.


Processing Batches:  67%|██████▋   | 30/45 [02:42<01:49,  7.31s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 2.121000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 31: Successfully added to Neo4jVector.


Processing Batches:  69%|██████▉   | 31/45 [02:51<01:48,  7.76s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 0.522000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 32: Successfully added to Neo4jVector.


Processing Batches:  71%|███████   | 32/45 [02:57<01:32,  7.12s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 1.995000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 33: Successfully added to Neo4jVector.


Processing Batches:  73%|███████▎  | 33/45 [03:05<01:28,  7.42s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 1.261000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 34: Successfully added to Neo4jVector.


Processing Batches:  76%|███████▌  | 34/45 [03:11<01:18,  7.14s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 1.800000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 35: Successfully added to Neo4jVector.


Processing Batches:  78%|███████▊  | 35/45 [03:19<01:12,  7.20s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 1.743000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 36: Successfully added to Neo4jVector.


Processing Batches:  80%|████████  | 36/45 [03:24<01:00,  6.71s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 3.230000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 37: Successfully added to Neo4jVector.


Processing Batches:  82%|████████▏ | 37/45 [03:36<01:04,  8.08s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 38: Successfully added to Neo4jVector.


Processing Batches:  84%|████████▍ | 38/45 [03:40<00:48,  6.92s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 2.298000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 39: Successfully added to Neo4jVector.


Processing Batches:  87%|████████▋ | 39/45 [03:48<00:44,  7.41s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 0.085000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 40: Successfully added to Neo4jVector.


Processing Batches:  89%|████████▉ | 40/45 [03:54<00:34,  6.96s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 1.989000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 41: Successfully added to Neo4jVector.


Processing Batches:  91%|█████████ | 41/45 [04:02<00:28,  7.09s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 1.838000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 42: Successfully added to Neo4jVector.


Processing Batches:  93%|█████████▎| 42/45 [04:07<00:19,  6.65s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 3.275000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 43: Successfully added to Neo4jVector.


Processing Batches:  96%|█████████▌| 43/45 [04:15<00:13,  6.83s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 3.039000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 44: Successfully added to Neo4jVector.


Processing Batches:  98%|█████████▊| 44/45 [04:23<00:07,  7.22s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
INFO:openai._base_client:Retrying request to /embeddings in 1.683000 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batch 45: Successfully added to Neo4jVector.


Processing Batches: 100%|██████████| 45/45 [04:30<00:00,  6.00s/it]


In [6]:
# Example usage: ask one question and show the answer with its sources
user_question = "When did Airbnb go public?"
answer, sources = qa_pipeline(qa, user_question)

print(f"Question: {user_question}")
print(f"Answer: {answer}\n")


def _describe_source(doc):
    """Return the one-line description printed for a retrieved document.

    Reads four metadata fields from ``doc.metadata``, substituting a
    placeholder string for any field that is absent.
    """
    post_title = doc.metadata.get('post_title', 'No Title')
    series_number = doc.metadata.get('series_number', 'No Series Info')
    file_name = doc.metadata.get('file_name', 'No File Info')
    date = doc.metadata.get('blog_date', 'No Date Info')
    return f"Post title: {post_title} | Series number: {series_number} | File name: {file_name} | Blog date: {date}"


print("Sources:")

# OrderedDict keys keep first-seen order and drop repeated descriptions
unique_sources = OrderedDict.fromkeys(_describe_source(doc) for doc in sources)

# Print each distinct source once
for source in unique_sources:
    print(f"- {source}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Question: When did Airbnb go public?
Answer: Airbnb went public on December 10, 2020.

Sources:
- Post title: Airbnb | Series number: Season 7, Episode 8 | File name: airbnb | Blog date: 2020-12-10
