In [191]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

def generate_embedding(text):
    model_path =  "/Users/abhishekbairagi/Desktop/experiments/devcon/sent-transformer/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
    # model = SentenceTransformer(model_path)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embedding[0]


In [150]:
# Let's define the Python script for ingesting sample graph data into Neo4j with vector embeddings.
# This script assumes you have Neo4j Python driver installed and a running Neo4j instance.

from neo4j import GraphDatabase

# Sample data for Slack product structure
data = {
    "products": [
        {"product_id": "p1", "name": "Slack", "aliases": ["slack", "slackapp"]}
    ],
    "subtopics": [
        {"subtopic_id": "s1", "name": "Channels", "aliases": ["channel", "ch"], "has_further_subtopic": False, "parent_topic_id": None},
        {"subtopic_id": "s2", "name": "Notifications", "aliases": ["alerts", "notifs"], "has_further_subtopic": False, "parent_topic_id": None}
    ],
    "issues": [
        {"issue_id": "i1", "description": "Unable to join Slack channels", "keywords": ["join", "channel", "access"], "embedding": [0.1]*384, "frequency": "high", "severity": "medium"},
        {"issue_id": "i2", "description": "Channels not loading", "keywords": ["load", "channel", "error"], "embedding": [0.2]*384, "frequency": "medium", "severity": "high"},
        {"issue_id": "i3", "description": "Slack notifications delayed", "keywords": ["notifications", "delay", "late"], "embedding": [0.3]*384, "frequency": "high", "severity": "medium"},
        {"issue_id": "i4", "description": "Notifications not appearing on desktop", "keywords": ["notifications", "desktop", "missing"], "embedding": [0.4]*384, "frequency": "low", "severity": "low"}
    ],
    "causes": [
        {"cause_id": "c1", "description": "User lacks permission", "conditions": "user not part of workspace"},
        {"cause_id": "c2", "description": "Slack cache issue", "conditions": "cache corrupted"},
        {"cause_id": "c3", "description": "Do Not Disturb mode enabled", "conditions": "DND enabled on Slack"},
        {"cause_id": "c4", "description": "System notifications disabled", "conditions": "desktop notifications off"}
    ],
    "solutions": [
        {"solution_id": "sol1", "description": "Request access from admin", "related_article_ids": ["a1"]},
        {"solution_id": "sol2", "description": "Clear Slack cache", "related_article_ids": ["a2"]},
        {"solution_id": "sol3", "description": "Disable Do Not Disturb", "related_article_ids": ["a3"]},
        {"solution_id": "sol4", "description": "Enable desktop notifications", "related_article_ids": ["a3"]}
    ],
    "articles": [
        {"article_id": "a1", "title": "Getting Slack Channel Access", "summary": "Guide to requesting access to channels", "url": "https://kb.example.com/slack-access", "tags": ["access", "admin"], "embedding": [0.01]*384},
        {"article_id": "a2", "title": "Fix Slack Cache Issues", "summary": "Steps to clear Slack cache", "url": "https://kb.example.com/slack-cache", "tags": ["cache", "clear"], "embedding": [0.02]*384},
        {"article_id": "a3", "title": "Manage Notification Settings", "summary": "Manage DND and notification preferences", "url": "https://kb.example.com/slack-notifs", "tags": ["dnd", "notifications"], "embedding": [0.03]*384}
    ],
    "user": {"user_id": "u1", "location": "NY", "device_type": "laptop", "issue_history": ["i1", "i3"]},
    "devices": [
        {"device_id": "d1", "type": "Mac", "os_version": "12.6"},
        {"device_id": "d2", "type": "Windows", "os_version": "11"}
    ],
    "outage": {"incident_id": "o1", "description": "Slack outage in US East", "timestamp": "2024-05-12T10:00:00Z", "affected_users": ["u1"]}
}

data.keys()  # Display keys for structure confirmation before writing ingestion logic.


dict_keys(['products', 'subtopics', 'issues', 'causes', 'solutions', 'articles', 'user', 'devices', 'outage'])

In [153]:
from neo4j import GraphDatabase

class KGIngestor:
    def __init__(self, uri, user, password, embedding_function):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        self.generate_embedding = embedding_function

    def close(self):
        self.driver.close()

    def ingest(self, data):
        with self.driver.session() as session:
            session.write_transaction(self._create_products_subtopics, data)
            session.write_transaction(self._create_issues_causes_solutions_articles, data)
            session.write_transaction(self._create_users_devices_outages, data)
            session.write_transaction(self._create_relationships, data)

    def _create_products_subtopics(self, tx, data):
        for product in data["products"]:
            tx.run("""
            MERGE (p:Product {id: $id})
            SET p.name = $name, p.aliases = $aliases
            """, id=product["product_id"], name=product["name"], aliases=product["aliases"])

        for sub in data["subtopics"]:
            tx.run("""
            MERGE (s:Subtopic {id: $id})
            SET s.name = $name, s.aliases = $aliases,
                s.has_further_subtopic = $has_further_subtopic, s.parent_topic_id = $parent_topic_id
            """, id=sub["subtopic_id"], name=sub["name"], aliases=sub["aliases"],
                 has_further_subtopic=sub["has_further_subtopic"], parent_topic_id=sub["parent_topic_id"])

    def _create_issues_causes_solutions_articles(self, tx, data):
        for issue in data["issues"]:
            embedding = self.generate_embedding(issue["description"] + ' ' + ' '.join(issue["keywords"]))
            tx.run("""
            MERGE (i:Issue {id: $id})
            SET i.description = $description, i.keywords = $keywords,
                i.embedding = $embedding, i.frequency = $frequency, i.severity = $severity
            """, id=issue["issue_id"], description=issue["description"],
                 keywords=issue["keywords"], embedding=embedding,
                 frequency=issue["frequency"], severity=issue["severity"])

        for cause in data["causes"]:
            tx.run("""
            MERGE (c:Cause {id: $id})
            SET c.description = $description, c.conditions = $conditions
            """, id=cause["cause_id"], description=cause["description"], conditions=cause["conditions"])

        for solution in data["solutions"]:
            tx.run("""
            MERGE (s:Solution {id: $id})
            SET s.description = $description, s.related_article_ids = $related_article_ids
            """, id=solution["solution_id"], description=solution["description"],
                 related_article_ids=solution["related_article_ids"])

        for article in data["articles"]:
            embedding = self.generate_embedding(article["title"] + ' ' + article["summary"])
            tx.run("""
            MERGE (a:Article {id: $id})
            SET a.title = $title, a.summary = $summary, a.url = $url, a.tags = $tags, a.embedding = $embedding
            """, id=article["article_id"], title=article["title"], summary=article["summary"],
                 url=article["url"], tags=article["tags"], embedding=embedding)

    def _create_users_devices_outages(self, tx, data):
        user = data["user"]
        tx.run("""
        MERGE (u:User {id: $id})
        SET u.location = $location, u.device_type = $device_type
        """, id=user["user_id"], location=user["location"], device_type=user["device_type"])

        for device in data["devices"]:
            tx.run("""
            MERGE (d:Device {id: $id})
            SET d.type = $type, d.os_version = $os_version
            """, id=device["device_id"], type=device["type"], os_version=device["os_version"])

        outage = data["outage"]
        tx.run("""
        MERGE (o:Outage {id: $id})
        SET o.description = $description, o.timestamp = $timestamp, o.affected_users = $affected_users
        """, id=outage["incident_id"], description=outage["description"],
             timestamp=outage["timestamp"], affected_users=outage["affected_users"])

    def _create_relationships(self, tx, data):
        # Product - Subtopic
        for product in data["products"]:
            for sub in data["subtopics"]:
                tx.run("""
                MATCH (p:Product {id: $pid}), (s:Subtopic {id: $sid})
                MERGE (p)-[:HAS_SUBTOPIC]->(s)
                """, pid=product["product_id"], sid=sub["subtopic_id"])

        # Subtopic - Issue
        subtopic_map = {
            "s1": ["i1", "i2"],
            "s2": ["i3", "i4"]
        }
        for sid, issues in subtopic_map.items():
            for iid in issues:
                tx.run("""
                MATCH (s:Subtopic {id: $sid}), (i:Issue {id: $iid})
                MERGE (s)-[:HAS_ISSUE]->(i)
                """, sid=sid, iid=iid)

        # Issue - Cause - Solution - Article
        cause_map = {
            "i1": ["c1"], "i2": ["c2"], "i3": ["c3"], "i4": ["c4"]
        }
        solution_map = {
            "c1": "sol1", "c2": "sol2", "c3": "sol3", "c4": "sol4"
        }
        article_map = {
            "sol1": "a1", "sol2": "a2", "sol3": "a3", "sol4": "a3"
        }

        for issue_id, cause_ids in cause_map.items():
            for cid in cause_ids:
                tx.run("""
                MATCH (i:Issue {id: $iid}), (c:Cause {id: $cid})
                MERGE (i)-[:HAS_CAUSE]->(c)
                """, iid=issue_id, cid=cid)

        for cid, sid in solution_map.items():
            tx.run("""
            MATCH (c:Cause {id: $cid}), (s:Solution {id: $sid})
            MERGE (c)-[:HAS_SOLUTION]->(s)
            """, cid=cid, sid=sid)

        for sid, aid in article_map.items():
            tx.run("""
            MATCH (s:Solution {id: $sid}), (a:Article {id: $aid})
            MERGE (s)-[:MENTIONED_IN]->(a)
            """, sid=sid, aid=aid)

        # User relationships
        user = data["user"]
        for issue_id in user["issue_history"]:
            tx.run("""
            MATCH (u:User {id: $uid}), (i:Issue {id: $iid})
            MERGE (u)-[:HAS_ISSUE]->(i)
            """, uid=user["user_id"], iid=issue_id)

        for device in data["devices"]:
            tx.run("""
            MATCH (u:User {id: $uid}), (d:Device {id: $did})
            MERGE (u)-[:HAS_DEVICE]->(d)
            """, uid=user["user_id"], did=device["device_id"])

        for product in data["products"]:
            tx.run("""
            MATCH (u:User {id: $uid}), (p:Product {id: $pid})
            MERGE (u)-[:USES_PRODUCT]->(p)
            """, uid=user["user_id"], pid=product["product_id"])

        tx.run("""
        MATCH (o:Outage {id: $oid}), (p:Product {id: $pid})
        MERGE (o)-[:RELATED_TO_PRODUCT]->(p)
        """, oid=data["outage"]["incident_id"], pid=data["products"][0]["product_id"])

        for user_id in data["outage"]["affected_users"]:
            tx.run("""
            MATCH (u:User {id: $uid}), (o:Outage {id: $oid})
            MERGE (u)-[:AFFECTED_BY]->(o)
            """, uid=user_id, oid=data["outage"]["incident_id"])

    def setup_vector_indexes(self):
        with self.driver.session() as session:
            session.run("""
            CREATE VECTOR INDEX `issue-embeddings`
            FOR (i:Issue) ON (i.embedding)
            OPTIONS {
              indexConfig: {
                `vector.dimensions`: 384,
                `vector.similarity_function`: 'cosine'
              }
            }
            """)
            session.run("""
            CREATE VECTOR INDEX `article-embeddings`
            FOR (a:Article) ON (a.embedding)
            OPTIONS {
              indexConfig: {
                `vector.dimensions`: 384,
                `vector.similarity_function`: 'cosine'
              }
            }
            """)


In [154]:


uri = "bolt://localhost:7687"
user = "neo4j"
password =kg_password

ingestor = KGIngestor(uri, user, password, generate_embedding)
ingestor.setup_vector_indexes()
ingestor.ingest(data)
ingestor.close()



  session.write_transaction(self._create_products_subtopics, data)
  session.write_transaction(self._create_issues_causes_solutions_articles, data)
  session.write_transaction(self._create_users_devices_outages, data)
  session.write_transaction(self._create_relationships, data)


In [166]:
from neo4j import GraphDatabase
from typing import List, Dict

class KGSearcher:
    def __init__(self, uri, user, password, embedding_function):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        self.embedding_function = embedding_function

    def close(self):
        self.driver.close()

    def search_knowledge(self, query_text: str, top_k: int = 5) -> List[Dict]:
        embedding = self.embedding_function(query_text)

        query_str = """WITH split(toLower($query_text), ' ') AS words, $embedding AS vector

CALL {
    WITH vector
    CALL db.index.vector.queryNodes('issue-embeddings', $top_k * 5, vector)
    YIELD node, score
    RETURN node, score, labels(node)[0] AS type

    UNION

    WITH vector
    CALL db.index.vector.queryNodes('article-embeddings', $top_k * 5, vector)
    YIELD node, score
    RETURN node, score, labels(node)[0] AS type
}

WITH node, score, type, words
WHERE ANY(word IN words WHERE
    ANY(k IN coalesce(node.keywords, []) WHERE toLower(k) CONTAINS word)
) OR
(
    type = 'Issue' AND (
        ANY(word IN words WHERE toLower(node.description) CONTAINS word)
    )
) OR
(
    type = 'Article' AND (
        ANY(word IN words WHERE toLower(node.summary) CONTAINS word)
    )
)

OPTIONAL MATCH (s:Subtopic)-[:HAS_ISSUE]->(node)
OPTIONAL MATCH (p:Product)-[:HAS_SUBTOPIC]->(s)
OPTIONAL MATCH (node)-[:HAS_CAUSE]->(c:Cause)
OPTIONAL MATCH (c)-[:HAS_SOLUTION]->(sol:Solution)
OPTIONAL MATCH (sol)-[:MENTIONED_IN]->(a:Article)

RETURN node, score, type,
       collect(DISTINCT p) AS products,
       collect(DISTINCT s) AS subtopics,
       collect(DISTINCT c) AS causes,
       collect(DISTINCT sol) AS solutions,
       collect(DISTINCT a) AS articles
ORDER BY score DESC
LIMIT $top_k
"""
        with self.driver.session() as session:
            result = session.run(
                query_str,
                query_text=query_text,
                embedding=embedding,
                top_k=top_k
            )
            return [
                {
                    "id": r["node"].get("id"),
                    "type": r["type"],
                    "score": round(r["score"], 3),
                    "description": r["node"].get("description") or r["node"].get("summary", ""),
                    "keywords": r["node"].get("keywords", []),
                    "products": [p.get("name") for p in r["products"] if p],
                    "subtopics": [s.get("name") for s in r["subtopics"] if s],
                    "causes": [c.get("description") for c in r["causes"] if c],
                    "solutions": [s.get("description") for s in r["solutions"] if s],
                    "articles": [
                        {"title": a.get("title"), "url": a.get("url")}
                        for a in r["articles"] if a
                    ]
                }
                for r in result
            ]


In [167]:

uri = "bolt://localhost:7687"
user = "neo4j"
password = kg_password

searcher = KGSearcher(uri, user, password, generate_embedding)
results = searcher.search_knowledge("Slack channels not loading", top_k=5)
searcher.close()

for res in results:
    print(res)


{'id': 'i2', 'type': 'Issue', 'score': 0.862, 'description': 'Channels not loading', 'keywords': ['load', 'channel', 'error'], 'products': ['Slack'], 'subtopics': ['Channels'], 'causes': ['Slack cache issue'], 'solutions': ['Clear Slack cache'], 'articles': [{'title': 'Fix Slack Cache Issues', 'url': 'https://kb.example.com/slack-cache'}]}
{'id': 'i1', 'type': 'Issue', 'score': 0.855, 'description': 'Unable to join Slack channels', 'keywords': ['join', 'channel', 'access'], 'products': ['Slack'], 'subtopics': ['Channels'], 'causes': ['User lacks permission'], 'solutions': ['Request access from admin'], 'articles': [{'title': 'Getting Slack Channel Access', 'url': 'https://kb.example.com/slack-access'}]}
{'id': 'a1', 'type': 'Article', 'score': 0.847, 'description': 'Guide to requesting access to channels', 'keywords': [], 'products': [], 'subtopics': [], 'causes': [], 'solutions': [], 'articles': []}
{'id': 'i3', 'type': 'Issue', 'score': 0.783, 'description': 'Slack notifications dela