In [None]:
import os

# --- Secrets -----------------------------------------------------------------
# Read from the environment so credentials are never committed with the
# notebook. Each falls back to the empty string used originally.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # NOTE(review): not referenced in the visible cells
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

# --- Paths and connection settings -------------------------------------------
# NOTE: despite its name, REQUIREMENTS_FOLDER is opened as a single text file.
REQUIREMENTS_FOLDER = "projects/Aloha/requirements.txt"
NEO4J_USERNAME = "neo4j"
NEO4J_URL = "bolt://localhost:7687"
KG_FOLDER = "kg/aloha"  # directory the index is persisted to / loaded from
INDEX_ID = "4fb4a4b6-12e0-4fa1-8249-6d096d6f3cd8"  # id used when loading the persisted index

# --- Run switches ------------------------------------------------------------
READ = True  # True: load the persisted index; False: rebuild it from the requirements
USE_BACKEND = True  # True: Neo4j graph store; False: in-memory SimpleGraphStore

In [None]:
import nest_asyncio

# Patch asyncio so nested event loops are allowed; presumably needed because the
# notebook kernel already runs a loop while the libraries below start their own.
nest_asyncio.apply()

In [None]:
from llama_index.core.graph_stores import SimpleGraphStore
from llama_index.graph_stores.neo4j import Neo4jGraphStore
from llama_index.core import StorageContext

# Choose the graph backend: a Neo4j server when USE_BACKEND is set, otherwise
# an in-memory SimpleGraphStore.
graph_store = (
    Neo4jGraphStore(
        username=NEO4J_USERNAME,
        password=NEO4J_PASSWORD,
        url=NEO4J_URL,
    )
    if USE_BACKEND
    else SimpleGraphStore()
)

# The persisted folder is only handed to the storage context when an existing
# index will be read back (READ); a fresh build gets persist_dir=None.
storage_context = StorageContext.from_defaults(
    graph_store=graph_store,
    persist_dir=KG_FOLDER if READ else None,
)

# Preprocess requirements file

In [None]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

# Read the raw requirements text. REQUIREMENTS_FOLDER is used as a file path
# here; the encoding is explicit so the result does not depend on the platform
# default.
with open(REQUIREMENTS_FOLDER, "r", encoding="utf-8") as f:
    requirements = f.read()

# Split the text into small overlapping chunks. The paragraph separator is the
# newline character "\n" (the previous value "/n" was a typo that never occurs
# in the text, so paragraph boundaries were never used).
splitter = SentenceSplitter.from_defaults(chunk_size=64, chunk_overlap=10, paragraph_separator="\n")
nodes = splitter.get_nodes_from_documents([Document(text=requirements)])

# Show every chunk so the split can be checked by eye.
for node in nodes:
    print(node.get_content())
    print("---")

# One Document per chunk; these are what the knowledge graph is built from.
documents = [Document(text=node.get_content()) for node in nodes]

In [None]:
from llama_index.llms.groq import Groq
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Groq-hosted chat model used for triplet extraction and answering queries.
llm = Groq(
    model="llama3-70b-8192",
    api_key=GROQ_API_KEY,
    base_url="https://api.groq.com/openai/v1",
)

# Register the embedding model and the LLM as the global LlamaIndex defaults.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
Settings.llm = llm

In [None]:
from llama_index.core import PromptTemplate

# Custom triplet-extraction prompt, passed to KnowledgeGraphIndex.from_documents
# as `kg_triplet_extract_template` when the graph is built. It contains the
# `{max_knowledge_triplets}` and `{text}` placeholders, presumably filled in by
# the index for each chunk.
# NOTE(review): the last instruction reads "extract up to {max_knowledge_triplets}
# from the following text" with no noun after the number; confirm whether
# "triplets" was intended before changing the prompt (it alters LLM output).
# NOTE(review): the name looks like LlamaIndex's own default prompt constant;
# a project-specific name would avoid confusion. TODO confirm.
DEFAULT_KG_TRIPLET_EXTRACT_TMPL = PromptTemplate("""
    Extract knowledge triplets from the text below. Each triplet should be in the format:
    (Entity, Relationship, Entity).

    Guidelines:
    - Entities or actors should be meaningful system concepts. An actor might be a person, a company or organization, a computer program, or a computer system—hardware, software, or both (e.g., "User", "Account", "Profile", "Order").
    - Relationships should be **verbs or actions** connecting entities (e.g., "logs into", "creates", "modifies"). Relationships or interactions between external actors and the system under consideration occur to accomplish a goal.
    - Ensure extracted entities are distinct and avoid duplicates.

    Example:
    Text: "User logs into the system."
    Triplet: (User, logs into, System)

    Now extract up to {max_knowledge_triplets} from the following text:
    {text}
    """
)

# Create KG

In [None]:
from llama_index.core.indices.loading import load_index_from_storage
from llama_index.core import KnowledgeGraphIndex

# Either reload the persisted knowledge graph index or rebuild it from the
# requirement chunks.
if READ:
    index = load_index_from_storage(storage_context=storage_context, index_id=INDEX_ID)
else:
    if USE_BACKEND:
        # DESTRUCTIVE: deletes every node and relationship in the connected
        # Neo4j database so the rebuild starts from an empty graph.
        graph_store.query("MATCH (n) DETACH DELETE n")

    index = KnowledgeGraphIndex.from_documents(
        documents,
        storage_context=storage_context,
        max_triplets_per_chunk=10,
        kg_triplet_extract_template=DEFAULT_KG_TRIPLET_EXTRACT_TMPL,
        include_embeddings=True
    )
    # A new index gets a random id. Store it under INDEX_ID so that a later
    # READ=True run can load exactly this index without editing the config.
    index.set_index_id(INDEX_ID)
    index.storage_context.persist(KG_FOLDER)

# Remove duplicate nodes

In [None]:
import ast
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Entity names are the keys of the index's keyword table; triplets are stored
# as stringified tuples in embedding_dict and parsed back with literal_eval.
nodes = list(index._index_struct.table.keys())
edges = [ast.literal_eval(edge) for edge in index._index_struct.embedding_dict.keys()]

# Embed every entity name so near-identical entities can be found.
node_embeddings = {node: Settings.embed_model.get_text_embedding(node) for node in nodes}
node_names = list(node_embeddings.keys())
embedding_vectors = np.array(list(node_embeddings.values()))

# cosine_similarity rejects empty input, so an empty graph gets an empty matrix.
similarity_matrix = cosine_similarity(embedding_vectors) if node_names else np.zeros((0, 0))

threshold = 0.85
to_remove = set()

# Greedy pass: the first node of each similar group is kept as representative.
# A node that was already removed must not remove others, otherwise a node can
# be dropped without any *kept* node being within the threshold.
for i in range(len(node_names)):
    if node_names[i] in to_remove:
        continue
    for j in range(i + 1, len(node_names)):
        if similarity_matrix[i, j] > threshold:
            to_remove.add(node_names[j])

kept_indices = [i for i, name in enumerate(node_names) if name not in to_remove]
deduplicated_nodes = [node_names[i] for i in kept_indices]
remaining_nodes = deduplicated_nodes
remaining_embeddings = np.array([node_embeddings[node] for node in remaining_nodes])

# Map every removed node to its most similar kept node, reusing the similarity
# matrix instead of recomputing similarities per node.
node_mapping = {}
for i, name in enumerate(node_names):
    if name in to_remove:
        closest_idx = int(np.argmax(similarity_matrix[i, kept_indices]))
        node_mapping[name] = remaining_nodes[closest_idx]

# Rewrite both endpoints of each triplet to their representative.
updated_edges = [
    (node_mapping.get(h, h), r, node_mapping.get(t, t))
    for h, r, t in edges
]

# NOTE(review): only the nodes of the FIRST reference document are fetched, and
# text_nodes is not used in the visible cells. Confirm this is intended.
ref_docs = list(index.ref_doc_info.values())
text_nodes = index.docstore.get_nodes(ref_docs[0].node_ids) if ref_docs else []

In [None]:
import networkx as nx
import gravis as gv


# Directed graph of the deduplicated triplets; each edge stores its relation
# under the `label` attribute.
G = nx.DiGraph()
G.add_edges_from(
    (head, tail, {"label": relation}) for head, relation, tail in updated_edges
)

# Interactive d3 rendering; being the last expression, it is displayed by the cell.
gv.d3(
    G,
    use_node_size_normalization=True,
    node_size_normalization_max=30,
    use_edge_size_normalization=True,
    edge_size_data_source='weight',
    edge_curvature=0.3,
)

# Rebuild index

In [None]:
# Start from an empty knowledge graph index and re-insert the deduplicated triplets.
deduplicated_index = KnowledgeGraphIndex(
    [],
)

triplet_node_mapping = {}  # NOTE(review): never filled or read in the visible cells

# `edges` and `updated_edges` are aligned one-to-one (the latter is derived from
# the former). The source text chunks are looked up with the ORIGINAL entity
# names, because the original index's keyword table only knows those names.
# Looking them up by the remapped names would lose the chunks of every
# merged-away entity.
for (original_a, _, original_b), (entity_a, relationship, entity_b) in zip(edges, updated_edges):
    chunk_ids_a = set(index._index_struct.search_node_by_keyword(original_a))
    chunk_ids_b = set(index._index_struct.search_node_by_keyword(original_b))
    shared_chunk_ids = chunk_ids_a & chunk_ids_b

    deduplicated_index.upsert_triplet((entity_a, relationship, entity_b), True)

    for node_id in shared_chunk_ids:
        node = storage_context.docstore.docs.get(node_id)
        if node is None:
            # The id is in the keyword table but not in the docstore; skip it
            # rather than passing None to add_node.
            continue
        deduplicated_index.add_node([entity_a, entity_b], node)


# Get use cases

In [None]:
# Query engine over the deduplicated graph. The keyword arguments are passed
# through to LlamaIndex; presumably they make it include the source text
# chunks, summarise with the tree strategy, and combine keyword and embedding
# lookup. TODO confirm against the library docs.
query_engine = deduplicated_index.as_query_engine(
    include_text=True,
    response_mode ="tree_summarize",
    embedding_mode="hybrid",
)

In [None]:
from IPython.display import Markdown, display

def query(question):
    """Run `question` through the module-level `query_engine` and show the answer.

    The response text is wrapped in ``<b>`` tags and rendered as Markdown in the
    cell output. Nothing is returned.
    """
    answer = query_engine.query(question)
    rendered = Markdown(f"<b>{answer.response}</b>")
    display(rendered)

In [None]:
# Ask the graph-backed query engine to enumerate the use cases; `query` renders
# the answer as bold Markdown in the cell output.
query("List all use cases as possible. A use case is a list of actor and interactions.")

In [None]:
# Responsibilities / use cases extracted from the knowledge graph, lower-cased
# for matching against the ground truth. One entry per line: the previous
# single-line literal had "Make Personal Information Private" split across two
# physical lines, which is a syntax error.
responsibilities = [x.lower() for x in [
    "Chats with Online friends",
    "Invites Friend to join aloha",
    "Gets Friend suggestions",
    "Unfriends Friend",
    "Searches Friends",
    "Searches Friends on aloha",
    "Resets Password",
    "Clicks Unlock link",
    "Clicks Verification link",
    "Retrieves Chitchat history",
    "Stores Chitchat on exit",
    "Sends Message to Intended user",
    "Receives Chitchat from User in friend list",
    "Initiates Chitchat with User in friend list",
    "Logs in to edit account details",
    "Logs in to add account details",
    "Logs in to add personal details",
    "Logs in to add educational details",
    "Has Verified account to add personal details",
    "Has Verified account to add educational details",
    "Has Option to Accept or ignore friend request",
    "Makes Personal information public or private",
    "Makes Account details public or private",
    "Must be 18 years old or more",
    "Has Options to Change personal settings",
    "Has Options to Search people",
    "Has Options to Logout",
    "Dislikes Scribble",
    "Comments on Scribble",
    "Erases Scribble",
    "Deletes Comment",
    "Deletes Account",
    "Retrieves Chitchat",
    "Stores Chitchat",
    "Receives Chitchat",
    "Sends Text files",
    "Sends Message",
    "Deletes Scribble",
    "Sees Count of liked scribble",
    "Sees Count of disliked scribble",
    "Sees Delete link",
    "Ignores Friend request",
    "Accepts Friend request",
    "Makes Personal information public",
    "Makes Personal information private",
    "Makes Account details public",
    "Makes Account details private",
    "Uploads Profile picture",
    "Edits Account details",
    "Adds Account details",
    "Adds Personal details",
    "Adds Educational details",
    "Logs in with Email-id and password",
    "Creates Account",
    "Create Account",
    "Login",
    "Initiate Chitchat",
    "View Friends",
    "Send Friend Request",
    "Accept Friend Request",
    "Ignore Friend Request",
    "Delete Friend Request",
    "Upload Profile Picture",
    "Edit Account Details",
    "Add Account Details",
    "Add Personal Details",
    "Add Educational Details",
    "Make Personal Information Public",
    "Make Personal Information Private",
    "Make Account Details Public",
    "Make Account Details Private",
    "Like Scribble",
    "Dislike Scribble",
    "Share Scribble",
    "View Online Friends",
    "Receive Friend Request",
    "Receive Chitchat",
    "Receive Verification Link",
    "Receive Unlock Link",
    "Receive Captcha",
]]
print(f"Extracted responsibilities: {len(responsibilities)}")

# Fuzzy match against ground truth

In [None]:
# Ground-truth use-case labels the extracted responsibilities are matched
# against. The strings are kept exactly as given (including spelling).
gt = [
    "accept request",
    "add educational skills",
    "add friend to list",
    "add languages, add gender, add interests",
    "attach text file",
    "change personal settings",
    "chat",
    "check password",
    "comment on scribble",
    "create account",
    "delete comment",
    "delete profile",
    "dislike scribble",
    "display friends",
    "enter account details",
    "enter scribble",
    "erase scribbles",
    "forward chat message",
    "give friend suggestion",
    "give priviledges to account",
    "invite friend",
    "like scribble",
    "lock account",
    "login",
    "notify wrong password",
    "receive chat",
    "receive request",
    "register details",
    "reject request",
    "remove friend",
    "resend verification link",
    "reset password",
    "search for friend",
    "see chat history",
    "see likes count",
    "see online friends",
    "see other profile",
    "see scribbles",
    "send friend request",
    "send verification link",
    "set public private",
    "show captcha",
    "store chats",
    "unfriend",
    "unlock account",
    "upload profile picture",
    "validate user info",
    "verify account",
    "view profile of friend",
]
print(f"Ground truth: {len(gt)}")

In [None]:
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex

# Accumulator for the {responsibility, gt_match} records built by the matching loop.
results = []

# One TextNode per ground-truth label, indexed for embedding-based retrieval.
nodes = [TextNode(text=actual) for actual in gt]
fuzzy_match_index = VectorStoreIndex(nodes=nodes)

# Retriever that returns the ten most similar ground-truth labels per query.
fuzzy_match_retriever = fuzzy_match_index.as_retriever(
    similarity_top_k=10,
    choice_batch_size=1,
)

def get_best_match(sample, threshold=0.8):
    """Return the ground-truth text that best matches `sample`.

    A case-insensitive exact match against the module-level `nodes` wins
    outright. Otherwise the retriever is queried with ``"find <sample>"`` and
    the first hit whose score is at least `threshold` is returned. If no hit
    reaches the threshold, the first retrieved hit is returned anyway; ``None``
    is returned only when nothing was retrieved.
    """
    exact = next(
        (candidate.text for candidate in nodes if candidate.text.lower() == sample.lower()),
        None,
    )
    if exact is not None:
        return exact

    hits = fuzzy_match_retriever.retrieve(f"find {sample}")
    if not hits:
        return None

    # First hit at or above the threshold, falling back to the first hit.
    chosen = next((hit for hit in hits if hit.score >= threshold), hits[0])
    return chosen.node.text

# Match every extracted responsibility to a ground-truth label, print the pair
# followed by a blank line, and record it in `results`.
for responsibility in responsibilities:
    best_match = get_best_match(responsibility, threshold=0.8)
    print(f"{responsibility} : {best_match}", end="\n\n")
    results.append(dict(responsibility=responsibility, gt_match=best_match))

# Alternative retriever strategy

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer


# BM25 retriever over the same ground-truth nodes, returning the single best
# hit per query, as an alternative to the embedding-based matcher.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=1,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

for sample in responsibilities:
    # Use a dedicated name. Assigning to `results` here would overwrite the
    # fuzzy-match records that the "Save results" cell writes to CSV.
    bm25_hits = bm25_retriever.retrieve(f"find {sample}")
    # Print None instead of raising IndexError when nothing is retrieved.
    print(sample,':', bm25_hits[0].node.text if bm25_hits else None)
    print()

# Alternative retriever strategy (repeated BM25 run)

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer


# NOTE(review): this cell repeats the preceding BM25 cell.
# BM25 retriever over the ground-truth nodes, returning the single best hit
# per query.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=1,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

for sample in responsibilities:
    # Use a dedicated name. Assigning to `results` here would overwrite the
    # fuzzy-match records that the "Save results" cell writes to CSV.
    bm25_hits = bm25_retriever.retrieve(f"find {sample}")
    # Print None instead of raising IndexError when nothing is retrieved.
    print(sample,':', bm25_hits[0].node.text if bm25_hits else None)
    print()

# Save results

In [None]:
import pandas as pd

# Write the collected `results` records to a CSV inside the knowledge-graph folder.
results_frame = pd.DataFrame(results)
results_frame.to_csv(f"{KG_FOLDER}/results.csv")


In [None]:
# NOTE(review): this cell repeats an earlier `responsibilities` cell.
# Responsibilities / use cases extracted from the knowledge graph, lower-cased
# for matching against the ground truth. One entry per line: the previous
# single-line literal had "Make Personal Information Private" split across two
# physical lines, which is a syntax error.
responsibilities = [x.lower() for x in [
    "Chats with Online friends",
    "Invites Friend to join aloha",
    "Gets Friend suggestions",
    "Unfriends Friend",
    "Searches Friends",
    "Searches Friends on aloha",
    "Resets Password",
    "Clicks Unlock link",
    "Clicks Verification link",
    "Retrieves Chitchat history",
    "Stores Chitchat on exit",
    "Sends Message to Intended user",
    "Receives Chitchat from User in friend list",
    "Initiates Chitchat with User in friend list",
    "Logs in to edit account details",
    "Logs in to add account details",
    "Logs in to add personal details",
    "Logs in to add educational details",
    "Has Verified account to add personal details",
    "Has Verified account to add educational details",
    "Has Option to Accept or ignore friend request",
    "Makes Personal information public or private",
    "Makes Account details public or private",
    "Must be 18 years old or more",
    "Has Options to Change personal settings",
    "Has Options to Search people",
    "Has Options to Logout",
    "Dislikes Scribble",
    "Comments on Scribble",
    "Erases Scribble",
    "Deletes Comment",
    "Deletes Account",
    "Retrieves Chitchat",
    "Stores Chitchat",
    "Receives Chitchat",
    "Sends Text files",
    "Sends Message",
    "Deletes Scribble",
    "Sees Count of liked scribble",
    "Sees Count of disliked scribble",
    "Sees Delete link",
    "Ignores Friend request",
    "Accepts Friend request",
    "Makes Personal information public",
    "Makes Personal information private",
    "Makes Account details public",
    "Makes Account details private",
    "Uploads Profile picture",
    "Edits Account details",
    "Adds Account details",
    "Adds Personal details",
    "Adds Educational details",
    "Logs in with Email-id and password",
    "Creates Account",
    "Create Account",
    "Login",
    "Initiate Chitchat",
    "View Friends",
    "Send Friend Request",
    "Accept Friend Request",
    "Ignore Friend Request",
    "Delete Friend Request",
    "Upload Profile Picture",
    "Edit Account Details",
    "Add Account Details",
    "Add Personal Details",
    "Add Educational Details",
    "Make Personal Information Public",
    "Make Personal Information Private",
    "Make Account Details Public",
    "Make Account Details Private",
    "Like Scribble",
    "Dislike Scribble",
    "Share Scribble",
    "View Online Friends",
    "Receive Friend Request",
    "Receive Chitchat",
    "Receive Verification Link",
    "Receive Unlock Link",
    "Receive Captcha",
]]
print(f"Extracted responsibilities: {len(responsibilities)}")

# Fuzzy match against ground truth

In [None]:
# NOTE(review): this cell repeats an earlier `gt` cell.
# Ground-truth use-case labels the extracted responsibilities are matched
# against. The strings are kept exactly as given (including spelling).
gt = [
    "accept request",
    "add educational skills",
    "add friend to list",
    "add languages, add gender, add interests",
    "attach text file",
    "change personal settings",
    "chat",
    "check password",
    "comment on scribble",
    "create account",
    "delete comment",
    "delete profile",
    "dislike scribble",
    "display friends",
    "enter account details",
    "enter scribble",
    "erase scribbles",
    "forward chat message",
    "give friend suggestion",
    "give priviledges to account",
    "invite friend",
    "like scribble",
    "lock account",
    "login",
    "notify wrong password",
    "receive chat",
    "receive request",
    "register details",
    "reject request",
    "remove friend",
    "resend verification link",
    "reset password",
    "search for friend",
    "see chat history",
    "see likes count",
    "see online friends",
    "see other profile",
    "see scribbles",
    "send friend request",
    "send verification link",
    "set public private",
    "show captcha",
    "store chats",
    "unfriend",
    "unlock account",
    "upload profile picture",
    "validate user info",
    "verify account",
    "view profile of friend",
]
print(f"Ground truth: {len(gt)}")

In [None]:
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex

# NOTE(review): this cell repeats an earlier fuzzy-match setup cell.
# Accumulator for the {responsibility, gt_match} records built by the matching loop.
results = []

# One TextNode per ground-truth label, indexed for embedding-based retrieval.
nodes = [TextNode(text=actual) for actual in gt]
fuzzy_match_index = VectorStoreIndex(nodes=nodes)

# Retriever that returns the ten most similar ground-truth labels per query.
fuzzy_match_retriever = fuzzy_match_index.as_retriever(
    similarity_top_k=10,
    choice_batch_size=1,
)

def get_best_match(sample, threshold=0.8):
    """Return the ground-truth text that best matches `sample`.

    NOTE(review): this is the second definition of `get_best_match` in the
    notebook; it replaces the earlier one when the cell runs.

    A case-insensitive exact match against the module-level `nodes` wins
    outright. Otherwise the retriever is queried with ``"find <sample>"`` and
    the first hit whose score is at least `threshold` is returned. If no hit
    reaches the threshold, the first retrieved hit is returned anyway; ``None``
    is returned only when nothing was retrieved.
    """
    exact = next(
        (candidate.text for candidate in nodes if candidate.text.lower() == sample.lower()),
        None,
    )
    if exact is not None:
        return exact

    hits = fuzzy_match_retriever.retrieve(f"find {sample}")
    if not hits:
        return None

    # First hit at or above the threshold, falling back to the first hit.
    chosen = next((hit for hit in hits if hit.score >= threshold), hits[0])
    return chosen.node.text

# Match every extracted responsibility to a ground-truth label, print the pair
# followed by a blank line, and record it in `results`.
for responsibility in responsibilities:
    best_match = get_best_match(responsibility, threshold=0.8)
    print(f"{responsibility} : {best_match}", end="\n\n")
    results.append(dict(responsibility=responsibility, gt_match=best_match))

# Alternative retriever strategy

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer


# NOTE(review): this cell repeats an earlier BM25 cell.
# BM25 retriever over the ground-truth nodes, returning the single best hit
# per query, as an alternative to the embedding-based matcher.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=1,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

for sample in responsibilities:
    # Use a dedicated name. Assigning to `results` here would overwrite the
    # fuzzy-match records that the "Save results" cell writes to CSV.
    bm25_hits = bm25_retriever.retrieve(f"find {sample}")
    # Print None instead of raising IndexError when nothing is retrieved.
    print(sample,':', bm25_hits[0].node.text if bm25_hits else None)
    print()

# Alternative retriever strategy (repeated BM25 run)

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer


# NOTE(review): this cell repeats the preceding BM25 cell.
# BM25 retriever over the ground-truth nodes, returning the single best hit
# per query.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=1,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

for sample in responsibilities:
    # Use a dedicated name. Assigning to `results` here would overwrite the
    # fuzzy-match records that the "Save results" cell writes to CSV.
    bm25_hits = bm25_retriever.retrieve(f"find {sample}")
    # Print None instead of raising IndexError when nothing is retrieved.
    print(sample,':', bm25_hits[0].node.text if bm25_hits else None)
    print()

# Save results

In [None]:
import pandas as pd

# Write the collected `results` records to a CSV inside the knowledge-graph folder.
results_frame = pd.DataFrame(results)
results_frame.to_csv(f"{KG_FOLDER}/results.csv")