In [None]:
!pip install llama-index llama-index-llms-groq groq llama-index-embeddings-huggingface ipywidgets

## Without Knowledge Graph

In [4]:
from IPython.display import display
import ipywidgets as widgets
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    Settings,
    load_index_from_storage
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.groq import Groq
import warnings
import os

warnings.filterwarnings('ignore')

# Never store API keys in the notebook itself: anything written here ends up in
# version control and in shared copies. The key is read from the GROQ_API_KEY
# environment variable; if it is missing, the user is prompted for it once
# (the input is not echoed and is not saved in the notebook).
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed and should be revoked in the Groq console.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

# Module-level alias used when constructing the Groq LLM client
GROQ_API_KEY = os.environ["GROQ_API_KEY"]

# Prompt sent to the query engine for every question.
# It has two str.format placeholders, {context} and {question}, which the
# button callback fills in before calling query_engine.query().
prompt_template = """
You are an expert on Harry Potter and the Deathly Hallows book. Users will ask you questions related to it.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Answer the question and provide additional helpful information,
based on the pieces of information, if applicable. Be succinct.

Responses should be properly formatted to be easily read.
"""

# Fixed description substituted verbatim for {context} in the template above
context = "This directory contains the Harry Potter and Deathly Hallows book."

# Data ingestion: load all files from a directory
directory_path = "/kaggle/input/hp-deathly-hallows-book"  # Update this with your directory path
reader = SimpleDirectoryReader(input_dir=directory_path)
documents = reader.load_data()

# Split the documents into overlapping text chunks ("nodes")
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
nodes = text_splitter.get_nodes_from_documents(documents, show_progress=True)

# Set up embedding model and LLM
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
llm = Groq(model="llama3-70b-8192", api_key=GROQ_API_KEY)

# Register both models globally so indexes and query engines pick them up
Settings.llm = llm
Settings.embed_model = embed_model

# Build the index directly from the nodes produced above.
# The previous call, from_documents(..., node_parser=nodes), passed the node
# list as an unrecognised keyword: the pre-split nodes were never used and the
# documents were parsed a second time.
vector_index = VectorStoreIndex(nodes, show_progress=True)
vector_index.storage_context.persist(persist_dir="./storage_mini")

# Load the index back from storage (checks that the persisted copy is usable)
storage_context = StorageContext.from_defaults(persist_dir="./storage_mini")
index = load_index_from_storage(storage_context)

# Interactive widgets: an output pane for answers and a text field for the question
output_area = widgets.Output()

input_box = widgets.Text(
    description='Question:',
    placeholder='Type your question here',
    value='Explain Harry Potter?',
    disabled=False,
)

def on_button_click(b):
    """Answer the question in the text box and print the reply in the output pane.

    `b` is the object passed by the button's click event; it is not used.
    Reads the module-level `input_box`, `prompt_template`, `context`,
    `query_engine` and `output_area`.
    """
    with output_area:
        # Drop the previous answer before showing the new one
        output_area.clear_output()
        filled_prompt = prompt_template.format(
            question=input_box.value,
            context=context,
        )
        answer = query_engine.query(filled_prompt)
        print(answer.response)

# Set up query engine (the click handler looks it up by name when it runs)
query_engine = index.as_query_engine()

# "Ask" button that triggers on_button_click
button = widgets.Button(
    description='Ask',
    tooltip='Ask the question',
    icon='check',
    button_style='',
    disabled=False,
)
button.on_click(on_button_click)

# Show the question field, the button and the answer pane
display(input_box, button, output_area)

Parsing nodes:   0%|          | 0/781 [00:00<?, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Parsing nodes:   0%|          | 0/781 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/774 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Text(value='Explain Harry Potter?', description='Question:', placeholder='Type your question here')

Button(description='Ask', icon='check', style=ButtonStyle(), tooltip='Ask the question')

Output()

## With Knowledge Graph

In [5]:
!pip install neo4j langchain-experimental

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting neo4j
  Downloading neo4j-5.24.0-py3-none-any.whl.metadata (5.7 kB)
Collecting langchain-experimental
  Downloading langchain_experimental-0.3.0-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain-community<0.4.0,>=0.3.0 (from langchain-experimental)
  Downloading langchain_community-0.3.0-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-core<0.4.0,>=0.3.0 (from langchain-experimental)
  Downloading langchain_core-0.3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain<0.4.0,>=0.3.0 (from langchain-community<0.4.0,>=0.3.0->langchain-experimental)
  Downloading langchain-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langsmith<0.2.0,>=0.1.112 (from langchain-community<0.4.0,>=0.3.0->langchain-experimental)
  Downloading langsmith-0.1.125-py3-none-any.whl.metadata (13 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community<0.4.0,>=0.3.0->langchain-experimental)
  Downloading pydantic_settings-2.5.2-py3-none-any.whl.metadata (3.5 kB)
Colle

In [6]:
from IPython.display import display
import ipywidgets as widgets
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.groq import Groq
import warnings
import os
from neo4j import GraphDatabase
import spacy

warnings.filterwarnings('ignore')

# ---- NEO4J SETUP ----
# Connection details come from the environment so that no credential is stored
# in the notebook. NEO4J_URI and NEO4J_USERNAME fall back to the values this
# notebook used before; the password has no fallback and is prompted for when
# NEO4J_PASSWORD is not set (the input is not echoed).
neo4j_uri = os.getenv("NEO4J_URI", "neo4j+s://c6794fc6.databases.neo4j.io")
neo4j_user = os.getenv("NEO4J_USERNAME", "neo4j")
neo4j_password = os.getenv("NEO4J_PASSWORD")
if not neo4j_password:
    from getpass import getpass  # local import: only needed for the interactive fallback
    neo4j_password = getpass("Neo4j password: ")

driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Fail here, in the setup cell, if the URI or credentials are wrong, instead of
# surfacing an AuthError later from the first query.
driver.verify_connectivity()

In [7]:
# ---- ENVIRONMENT VARIABLES ----
# Never store API keys in the notebook itself. The key is read from the
# GROQ_API_KEY environment variable; if it is missing, the user is prompted
# for it once (the input is not echoed and is not saved in the notebook).
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed and should be revoked in the Groq console.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

# Module-level alias used when constructing the Groq LLM client
GROQ_API_KEY = os.environ["GROQ_API_KEY"]

# ---- PROMPT TEMPLATE ----
# Prompt sent to the query engine for every question.
# The only str.format placeholders it declares are {context} and {question}.
prompt_template = """
You are an expert on Harry Potter and the Deathly Hallows book. Users will ask you questions related to it.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Answer the question and provide additional helpful information,
based on the pieces of information, if applicable. Be succinct.

Responses should be properly formatted to be easily read.
"""

In [8]:
# spaCy pipeline used for named-entity extraction (a different model can be substituted)
nlp = spacy.load("en_core_web_sm")

# Fixed description that is substituted for {context} in the prompt
context = "This directory contains the Harry Potter and Deathly Hallows book."

# Data ingestion: read every file found under the dataset directory
directory_path = "/kaggle/input/hp-deathly-hallows-book"
reader = SimpleDirectoryReader(input_dir=directory_path)
documents = reader.load_data()

In [9]:
# Function to extract entities and relationships from documents
def populate_graph(documents, driver, nlp, entity_labels=("ORG", "PRODUCT")):
    """Extract named entities from each document and store them in Neo4j.

    Every entity whose label is in `entity_labels` becomes a `Concept` node
    (keyed by its text), and entities that appear next to each other in the
    same document are linked with a directed `RELATED_TO` relationship.

    Args:
        documents: Iterable of objects exposing the document body as `.text`.
        driver: Object whose `.session()` returns a context manager with a
            `run(query, **params)` method (a Neo4j driver).
        nlp: Callable that takes a string and returns an object whose `.ents`
            items have `.text` and `.label_` (a spaCy pipeline).
        entity_labels: Entity labels to keep. Defaults to the labels this
            notebook has always used; adjust to the entity types of interest.
    """
    wanted_labels = frozenset(entity_labels)

    with driver.session() as session:
        for doc in documents:
            nlp_doc = nlp(doc.text)
            concepts = [ent.text for ent in nlp_doc.ents if ent.label_ in wanted_labels]

            # MERGE is idempotent, so one round trip per distinct name is
            # enough; dict.fromkeys de-duplicates while keeping first-seen order.
            for concept in dict.fromkeys(concepts):
                session.run("MERGE (:Concept {name: $concept})", concept=concept)

            # Link each concept to the one mentioned right after it.
            linked_pairs = set()
            for concept, next_concept in zip(concepts, concepts[1:]):
                pair = (concept, next_concept)
                # Skip self-loops (the same entity mentioned twice in a row)
                # and pairs already merged for this document.
                if concept == next_concept or pair in linked_pairs:
                    continue
                linked_pairs.add(pair)
                session.run(
                    """
                    MATCH (c1:Concept {name: $concept}), (c2:Concept {name: $next_concept})
                    MERGE (c1)-[:RELATED_TO]->(c2)
                    """,
                    concept=concept, next_concept=next_concept
                )

# Populate the Neo4j graph with the concepts extracted from `documents`.
# populate_graph opens a session on `driver`, so this cell fails with an
# authentication error unless the Neo4j credentials configured above are valid.
populate_graph(documents, driver, nlp)

AuthError: {code: Neo.ClientError.Security.Unauthorized} {message: The client is unauthorized due to authentication failure.}

In [None]:
# Split the documents into overlapping text chunks ("nodes")
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
nodes = text_splitter.get_nodes_from_documents(documents, show_progress=True)

# Set up embedding model and LLM
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
llm = Groq(model="llama3-70b-8192", api_key=GROQ_API_KEY)

# Register both models globally through Settings. ServiceContext, which this
# cell used before, was never imported (NameError) and is superseded by
# Settings, which the first section of this notebook already uses.
Settings.llm = llm
Settings.embed_model = embed_model

# Legacy placeholder: kept only so that any later call still written as
# `...(service_context=service_context)` does not hit a NameError.
# NOTE(review): the None value is expected to be ignored downstream - confirm
# against the installed llama-index version, then remove this name.
service_context = None

# Build the index directly from the nodes produced above. Passing them as
# `node_parser=nodes` to from_documents() did not use them and caused the
# documents to be parsed a second time.
vector_index = VectorStoreIndex(nodes, show_progress=True)
vector_index.storage_context.persist(persist_dir="./storage_mini")

# Load the index back from storage (checks that the persisted copy is usable)
storage_context = StorageContext.from_defaults(persist_dir="./storage_mini")
index = load_index_from_storage(storage_context)

In [None]:
# Create the interactive widgets: a text field for the question and an output
# pane for the answer. The default question matches the book this notebook
# indexes ('Explain Python?' was a leftover from another notebook).
input_box = widgets.Text(
    value='Explain Harry Potter?',
    placeholder='Type your question here',
    description='Question:',
    disabled=False
)

output_area = widgets.Output()

In [None]:
#Query Enhancement with Neo4j

def get_graph_insights(question):
    """Look up graph concepts related to `question` and describe them as text.

    Uses the module-level Neo4j `driver`. A `Concept` node matches when its
    name contains the question text or when the question text contains its
    name (both compared case-insensitively), so a full-sentence question such
    as "Who founded Hogwarts?" still finds the `Hogwarts` concept.

    Args:
        question: The user's question (or a keyword).

    Returns:
        One line per matching concept, in the form
        "Concept: <name>, Related Concepts: <a>, <b>", joined with newlines,
        or "No relevant graph insights found." when nothing matches or the
        question is empty.
    """
    no_insights = "No relevant graph insights found."

    # An empty string is contained in every name, which would return the whole graph.
    if not question or not question.strip():
        return no_insights

    with driver.session() as session:
        result = session.run(
            """
            MATCH (c:Concept)
            WHERE toLower(c.name) CONTAINS toLower($question)
               OR toLower($question) CONTAINS toLower(c.name)
            OPTIONAL MATCH (c)-[r:RELATED_TO]->(other:Concept)
            RETURN c.name AS concept, collect(other.name) AS related_concepts
            """,
            question=question,
        )
        # Consume every record while the session is still open. The earlier
        # version returned from inside the loop, so it reported only the first
        # record and returned None when there were no records at all.
        insights = [
            f"Concept: {record['concept']}, Related Concepts: {', '.join(record['related_concepts'])}"
            for record in result
        ]

    return "\n".join(insights) if insights else no_insights


In [None]:
def on_button_click(b):
    """Answer the question in the text box, enriched with knowledge-graph insights.

    `b` is the object passed by the button's click event; it is not used.
    Reads the module-level `input_box`, `prompt_template`, `context`,
    `query_engine` and `output_area`, and calls `get_graph_insights`.
    """
    with output_area:
        # Drop the previous answer before showing the new one
        output_area.clear_output()
        question = input_box.value
        graph_insights = get_graph_insights(question)

        if "{graph_insights}" in prompt_template:
            # The template has a dedicated slot for the insights.
            query_prompt = prompt_template.format(
                context=context, graph_insights=graph_insights, question=question
            )
        else:
            # str.format silently ignores keyword arguments the template does
            # not reference, so passing graph_insights= to a template without
            # that placeholder drops the insights. Append them to the context
            # text so they actually reach the LLM.
            prompt_context = context
            if graph_insights:
                prompt_context = f"{context}\n\nKnowledge graph insights:\n{graph_insights}"
            query_prompt = prompt_template.format(context=prompt_context, question=question)

        resp = query_engine.query(query_prompt)
        print(resp.response)


# ---- QUERY ENGINE SETUP ----
# Built before the UI is shown so the button can never be clicked while
# `query_engine` is still undefined. The LLM and embedding model are taken
# from the global Settings, so no service_context argument is passed.
query_engine = index.as_query_engine()

# "Ask" button that triggers on_button_click.
# This button / display / query-engine cell used to be repeated four times
# verbatim; a single copy is sufficient.
button = widgets.Button(
    description='Ask',
    disabled=False,
    button_style='',
    tooltip='Ask the question',
    icon='check'
)

button.on_click(on_button_click)

# Show the question field, the button and the answer pane
display(input_box, button, output_area)
