### KG Ingestion

In [362]:
import json
from neo4j import GraphDatabase
from typing import List

In [363]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

# (tokenizer, model) pairs keyed by model path, so the weights are read from
# disk once per session instead of on every embed() call.
_EMBED_MODEL_CACHE = {}


def embed(text):
    """Return a mean-pooled embedding vector for ``text``.

    The tokenizer output is fed to the transformer and ``last_hidden_state``
    is averaged over dim 1 (presumably the token axis of a
    (batch, tokens, hidden) tensor — TODO confirm).

    Args:
        text: Text to embed. Only the first row of the pooled output is
            returned, so a list of texts yields the vector of its first item.

    Returns:
        numpy array holding the pooled vector of the first input row.
    """
    model_path = "/Users/abhishekbairagi/Desktop/experiments/devcon/sent-transformer/all-MiniLM-L6-v2"
    if model_path not in _EMBED_MODEL_CACHE:
        _EMBED_MODEL_CACHE[model_path] = (
            AutoTokenizer.from_pretrained(model_path),
            AutoModel.from_pretrained(model_path),
        )
    tokenizer, model = _EMBED_MODEL_CACHE[model_path]

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    # detach() drops the autograd graph so the tensor can be converted to numpy.
    embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embedding[0]

In [None]:
from google import genai
# Shared Gemini client; generate_text() below calls it through this global.
# NOTE(review): api_key is an empty string here — presumably blanked before
# sharing the notebook; a real key has to be supplied for calls to succeed.
client = genai.Client(api_key="")
import json

def generate_text(prompt):
    """Send ``prompt`` to the Gemini model and return the reply text.

    When the reply contains a ```json fenced block, only the content of that
    block is returned (surrounding whitespace stripped), wherever the fence
    sits in the reply. A reply without such a fence is returned unchanged.

    Args:
        prompt: Content passed to ``client.models.generate_content``.

    Returns:
        str: The fenced JSON payload, or the raw reply text. An empty string
        is returned when the response carries no text.
    """
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
    )
    text = response.text or ""

    fence_open = "```json"
    start = text.find(fence_open)
    if start == -1:
        return text

    # Locate the fence instead of slicing fixed offsets: a preamble before the
    # fence or a trailing newline after it would otherwise corrupt the payload.
    payload = text[start + len(fence_open):]
    end = payload.find("```")
    if end != -1:
        payload = payload[:end]
    return payload.strip()

# generate_text('hi')

In [365]:

class KnowledgeGraphIngestor:
    """Load the support dataset into Neo4j and maintain its vector index.

    The call order used by the notebook is ``delete_all_data`` ->
    ``setup_vector_index`` -> ``create_nodes`` -> ``create_relationships`` ->
    ``set_embeddings``. The instance owns one Neo4j driver; release it with
    ``close()`` or by using the instance as a context manager.
    """

    # (key in the input data, node label, record keys copied onto the node).
    # "id" is always the merge key, so it is not listed among the properties.
    _NODE_SPECS = (
        ("products", "Product", ("name", "alias")),
        ("subtopics", "SubTopic",
         ("name", "alias", "Constraints", "Access", "Common_Issues")),
        ("issues", "Issue",
         ("description", "keywords", "frequency", "severity")),
        ("users", "User", ("name", "email", "location", "band", "team")),
        ("devices", "Device",
         ("os", "lastupdate", "os_version", "ram", "storage", "model",
          "issued_on", "pending_updates")),
        ("tickets", "Ticket",
         ("query", "category", "solved_by", "timestamp")),
        ("articles", "Article",
         ("title", "content", "keywords", "created_at", "updated_at")),
        ("outages", "Outage",
         ("title", "description", "start_time", "end_time",
          "affected_services", "status", "impact", "location",
          "expected_resolution")),
    )

    # Record keys whose stored property name differs from the key itself.
    _PROPERTY_RENAMES = {"Common_Issues": "Common Issues"}

    # (key in the input data, start-node label, end-node label).
    _RELATIONSHIP_SPECS = (
        ("product-subtopic-relationships", "Product", "SubTopic"),
        ("user-device-relationships", "User", "Device"),
        ("user-ticket-relationships", "User", "Ticket"),
        ("ticket-issue-relationships", "Ticket", "Issue"),
        # Despite the key name, these records are linked Issue -> Article.
        ("article-issue-relationships", "Issue", "Article"),
        ("product-outage-relationships", "Product", "Outage"),
        ("subtopic-issue-relationships", "SubTopic", "Issue"),
    )

    def __init__(self, uri, user, password):
        """Open a Neo4j driver for ``uri`` using basic authentication."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return the ingestor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver when the ``with`` block ends."""
        self.close()

    def close(self):
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def setup_vector_index(self):
        """Create the 384-dimension cosine vector index on ``Issue.embedding``.

        ``IF NOT EXISTS`` makes the call safe to repeat without first
        dropping the index.
        """
        query = """
        CREATE VECTOR INDEX `issue-embeddings` IF NOT EXISTS
        FOR (i:Issue) ON (i.embedding)
        OPTIONS {
            indexConfig: {
                `vector.dimensions`: 384,
                `vector.similarity_function`: 'cosine'
            }
        }
        """
        with self.driver.session() as session:
            session.run(query)

    def create_nodes(self, data):
        """Create or update one node per record in ``data``.

        Nodes are merged on ``id`` only and the remaining properties are then
        set, so re-ingesting a record whose properties changed updates the
        node instead of creating a second node with the same id. The input
        records are not modified.

        Args:
            data (dict): Must contain the lists "products", "subtopics",
                "issues", "users", "devices", "tickets", "articles" and
                "outages"; each record needs "id" plus the keys listed for
                its type in ``_NODE_SPECS``.

        Raises:
            KeyError: If a required list or record key is missing.
        """
        with self.driver.session() as session:
            for data_key, label, fields in self._NODE_SPECS:
                # The label comes from the class constant above, never from data.
                query = f"MERGE (n:{label} {{id: $id}}) SET n += $props"
                for item in data[data_key]:
                    props = {
                        self._PROPERTY_RENAMES.get(field, field): item[field]
                        for field in fields
                    }
                    # Parameters go in as a dict: record keys such as "query"
                    # would otherwise clash with session.run's own arguments.
                    session.run(query, {"id": item["id"], "props": props})

    def create_relationships(self, data):
        """Create the relationships listed in ``data`` between existing nodes.

        Every relationship list is optional. Each record needs "from", "to"
        (node ids) and "relation" (the relationship type). Records whose end
        nodes do not exist match nothing and create nothing.

        Raises:
            ValueError: If a relationship type is not a plain identifier; it
                is interpolated into the Cypher text and must not be able to
                alter the statement.
        """
        with self.driver.session() as session:
            for data_key, start_label, end_label in self._RELATIONSHIP_SPECS:
                for rel in data.get(data_key, []):
                    rel_type = rel["relation"]
                    if not (isinstance(rel_type, str) and rel_type.isidentifier()):
                        raise ValueError(
                            f"Invalid relationship type in {data_key}: {rel_type!r}"
                        )
                    query = (
                        f"MATCH (a:{start_label} {{id: $from_id}}), "
                        f"(b:{end_label} {{id: $to_id}}) "
                        f"MERGE (a)-[:`{rel_type}`]->(b)"
                    )
                    session.run(query, {"from_id": rel["from"], "to_id": rel["to"]})

    def set_embeddings(self, embedding_fn):
        """Compute and store an embedding for every ``Issue`` node.

        Args:
            embedding_fn: Callable taking an issue description and returning
                an iterable of numbers (a list or a numpy vector).
        """
        with self.driver.session() as session:
            # Read everything first so the writes below never interleave with
            # a partially consumed read result in the same session.
            issues = list(session.run(
                "MATCH (i:Issue) RETURN i.id AS id, i.description AS description"
            ))
            for record in issues:
                # Plain floats keep the parameter a regular list even when the
                # embedding function returns numpy values.
                embedding = [float(v) for v in embedding_fn(record["description"])]
                session.run("""
                    MATCH (i:Issue {id: $id})
                    CALL db.create.setNodeVectorProperty(i, 'embedding', $embedding)
                """, {"id": record["id"], "embedding": embedding})

    def delete_all_data(self):
        """Delete every node and relationship, then drop the user indexes."""
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
        self.delete_indexes()
        print("All data deleted.")

    def delete_indexes(self):
        """Drop all user-created indexes, reporting failures without raising.

        Token LOOKUP indexes and indexes owned by a constraint are left
        alone: the former back label/type scans, and the latter cannot be
        removed with ``DROP INDEX``.
        """
        with self.driver.session() as session:
            records = list(session.run("SHOW INDEXES"))
            indexes_to_drop = [
                record["name"]
                for record in records
                if record.get("type") != "LOOKUP"
                and record.get("owningConstraint") is None
            ]
            print("Indexes to drop:", indexes_to_drop)
            for index in indexes_to_drop:
                try:
                    session.run(f"DROP INDEX `{index}` IF EXISTS")
                except Exception as e:
                    # Best effort: one failing index must not stop the cleanup.
                    print(f"Failed to drop index {index}: {e}")

In [366]:
# Load the synthetic dataset that feeds both the Neo4j and Elasticsearch ingestion.
import json
# JSON is UTF-8 by specification; stating the encoding keeps non-ASCII text in
# the articles intact on platforms whose default encoding differs.
with open("/Users/abhishekbairagi/Desktop/experiments/devcon/data generation/synthetic_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)


In [367]:
# Sanity check: number of article records loaded from the JSON file.
len(data['articles'])

37

In [368]:
# Build the knowledge graph from scratch.
# uri, user, kg_password and kg are module-level names that later cells reuse.
# NOTE(review): the password is hard-coded in the notebook — consider reading
# it from the environment instead.
kg_password = "strongpass123"
uri = "bolt://localhost:7687"
user = "neo4j"
kg = KnowledgeGraphIngestor(uri,user ,kg_password)
kg.delete_all_data()  # Clear existing data if needed
kg.setup_vector_index()
kg.create_nodes(data)
kg.create_relationships(data)


# Embeddings are written last, after the Issue nodes exist.
kg.set_embeddings(embed)
# The driver is left open on purpose: later cells query through kg.driver.
# kg.close()



Indexes to drop: ['issue-embeddings']
All data deleted.


In [279]:
def count_article_nodes():
    """Print and return the number of ``Article`` nodes in the graph.

    Uses the module-level ``kg`` ingestor's driver.
    """
    cypher = "MATCH (a:Article) RETURN count(a) AS article_count"
    with kg.driver.session() as session:
        total = session.run(cypher).single()["article_count"]
    print(f"Total Article Nodes: {total}")
    return total

# Compare against len(data['articles']) to confirm every article was ingested.
count_article_nodes()

Total Article Nodes: 37


37

In [284]:
def get_articles_for_issue(issue_id):
    """
    Return the articles linked to an issue through the 'solved_by' relationship.

    Args:
        issue_id (str): The ID of the issue.

    Returns:
        list: One dict per linked article with the keys issue_id, article_id,
        title and content. Empty when nothing matches.
    """
    cypher = """
        MATCH (i:Issue {id: $issue_id})-[:solved_by]->(a:Article)
        RETURN i.id AS issue_id, a.id AS article_id, a.title AS title, a.content AS content
        """
    with kg.driver.session() as session:
        rows = session.run(cypher, issue_id=issue_id)
        return [row.data() for row in rows]

# Example usage: list the articles that solve issue29.
# NOTE: the variable name says "issue3" but the lookup is for "issue29".
articles_for_issue3 = get_articles_for_issue("issue29")
for article in articles_for_issue3:
    print(f"Article ID: {article['article_id']}")
    print(f"Title: {article['title']}")
    print(f"Content: {article['content']}")
    print("---")

Article ID: article36
Title: Eliminating Echo in Webex Calls
Content: Background echo during Webex calls can be caused by improper audio settings or hardware issues. Ensure that your microphone and speakers are not placed too close to each other, as this can create feedback loops. Use headphones to minimize echo. Check Webex audio settings to ensure the correct input and output devices are selected. If the issue persists, try muting participants who are not speaking, as their devices may be causing the echo. Additionally, ensure that your audio drivers are up-to-date and that your operating system is configured to suppress background noise.
---


In [295]:
def get_relevant_issues_with_articles(product_name, subtopic_name, query_text, threshold=0.5, top_k=5):
    """
    Fetch the issues of a product/subtopic that best match a free-text query,
    together with the articles linked to them via 'solved_by'.

    Args:
        product_name (str): Product name or alias, matched case-insensitively.
        subtopic_name (str | None): Subtopic filter. Any word of it that occurs
            in the subtopic name, or equals a subtopic alias, matches. None
            disables the filter.
        query_text (str): Text whose embedding is compared with the stored
            issue embeddings.
        threshold (float): Only issues with a cosine score above this are kept.
        top_k (int): Maximum number of issues returned.

    Returns:
        list: Dicts with the keys issue_id, description, score, product,
        subtopic and related_articles, ordered by descending score.
        related_articles is an empty list when the issue has no article.
    """
    # Plain Python floats, as in the Elasticsearch cells, so the query
    # parameter is a regular list rather than a numpy array.
    query_embedding = [float(value) for value in embed(query_text)]

    cypher = """
    MATCH (p:Product)
    WHERE toLower(p.name) = toLower($product_name) OR any(alias IN p.alias WHERE toLower(alias) = toLower($product_name))

    OPTIONAL MATCH (p)-[:has_subtopic]->(s:SubTopic)
    WHERE $subtopic_name IS NULL OR any(word IN split(toLower($subtopic_name), " ") WHERE word IN split(toLower(s.name), " ") OR any(alias IN s.alias WHERE toLower(alias) = word))

    OPTIONAL MATCH (s)-[:has_issue]->(i:Issue)
    WHERE i.embedding IS NOT NULL

    OPTIONAL MATCH (i)-[:solved_by]->(a:Article)

    WITH p, s, i, a, vector.similarity.cosine(i.embedding, $query_embedding) AS score
    WHERE score > $threshold

    RETURN
        i.id AS issue_id,
        i.description AS description,
        score,
        p.name AS product,
        s.name AS subtopic,
        collect(DISTINCT CASE WHEN a IS NOT NULL
                THEN {id: a.id, title: a.title, content: a.content} END) AS related_articles
    ORDER BY score DESC
    LIMIT $top_k
    """
    # The CASE yields null for issues without an article; collect() skips
    # nulls, so such issues get [] instead of a placeholder map of nulls.

    # The driver is a context manager too: closing it here stops every call
    # from leaking its own connection pool.
    with GraphDatabase.driver(uri, auth=(user, kg_password)) as driver, driver.session() as session:
        result = session.run(
            cypher,
            product_name=product_name,
            subtopic_name=subtopic_name,
            query_embedding=query_embedding,
            threshold=threshold,
            top_k=top_k,
        )
        return [record.data() for record in result]

# Example usage: audio-related Webex issues, with a looser threshold (0.3)
# and a larger result cap (10) than the function defaults.
relevant_issues_with_articles = get_relevant_issues_with_articles(
    product_name="webex",
    subtopic_name="Audio",
    query_text="I am facing audio issues in webex",
    threshold=0.3,
    top_k=10
)

def format_issues_with_articles(issues):
    """
    Render issues and their related articles as a plain-text report.

    Args:
        issues (list): Issue dicts with the keys issue_id, description,
            product, subtopic, score and related_articles (a list of dicts
            with title and id).

    Returns:
        str: One section per issue, each terminated by a "---" line. Empty
        string for an empty list.
    """
    lines = []
    for entry in issues:
        lines.append(f"Issue ID: {entry['issue_id']}\n")
        lines.append(f"Description: {entry['description']}\n")
        lines.append(f"Product: {entry['product']}\n")
        lines.append(f"Subtopic: {entry['subtopic']}\n")
        lines.append(f"Score: {entry['score']:.4f}\n")
        lines.append("Related Articles:\n")
        lines.extend(
            f" - {doc['title']} (ID: {doc['id']})\n"
            for doc in entry['related_articles']
        )
        lines.append("---\n")
    return "".join(lines)

# Example usage: print the issues fetched above as a readable report.
formatted_issues = format_issues_with_articles(relevant_issues_with_articles)
print(formatted_issues)



Issue ID: issue3
Description: No audio during call
Product: Webex
Subtopic: Audio Issues
Score: 0.7444
Related Articles:
 - Resolving Audio Failures in Webex Calls Due to App Settings (ID: article28)
 - Fixing Audio Issues During Webex Calls on macOS (ID: article27)
 - Troubleshooting No Sound During Meetings in Windows (ID: article3)
---
Issue ID: issue4
Description: Microphone not detected
Product: Webex
Subtopic: Audio Issues
Score: 0.6814
Related Articles:
 - Fixing Microphone Detection Problems (ID: article4)
---
Issue ID: issue30
Description: Noise interference during calls
Product: Webex
Subtopic: Audio Issues
Score: 0.6284
Related Articles:
 - Dealing with Random Noise in Webex Calls (ID: article37)
---
Issue ID: issue29
Description: Background echo during calls
Product: Webex
Subtopic: Audio Issues
Score: 0.5962
Related Articles:
 - Eliminating Echo in Webex Calls (ID: article36)
---



In [294]:
def get_active_outages(product_name):
    """
    Look up the outages with status 'Active' that are attached to a product.

    The product is matched on its exact ``name`` property.

    Args:
        product_name (str): Name of the product to check for outages.

    Returns:
        list: One dict per active outage with the keys outage_id, title,
        description, start_time, expected_resolution, impact and location.
    """
    cypher = """
        MATCH (p:Product {name: $product_name})-[:has_outage]->(o:Outage)
        WHERE o.status = 'Active'
        RETURN o.id AS outage_id, o.title AS title, o.description AS description, 
               o.start_time AS start_time, o.expected_resolution AS expected_resolution, 
               o.impact AS impact, o.location AS location
        """
    with kg.driver.session() as session:
        rows = session.run(cypher, product_name=product_name)
        return [row.data() for row in rows]

def format_outage_output(outages):
    """
    Render outage records as a plain-text report.

    Args:
        outages (list): Outage dicts with the keys outage_id, title,
            description, start_time, expected_resolution, impact and location.

    Returns:
        str: One "Label: value" line per field, with a "---" line closing
        each outage. Empty string for an empty list.
    """
    # (printed label, key in the outage dict), in output order.
    fields = (
        ("Outage ID", "outage_id"),
        ("Title", "title"),
        ("Description", "description"),
        ("Start Time", "start_time"),
        ("Expected Resolution", "expected_resolution"),
        ("Impact", "impact"),
        ("Location", "location"),
    )
    chunks = []
    for record in outages:
        chunks.extend(f"{label}: {record[key]}\n" for label, key in fields)
        chunks.append("---\n")
    return "".join(chunks)


# Example usage: fetch the active outages for Outlook ...
active_outages = get_active_outages("Outlook")
# ... and print them as a readable report.
formatted_output = format_outage_output(active_outages)
print(formatted_output)


Outage ID: out3
Title: Mail Delivery Delays and Failures
Description: Active outage affecting mail delivery in the India Office. Users are experiencing delays and failures in email delivery.
Start Time: 2025-06-10T09:00:00
Expected Resolution: 2025-06-10T14:00:00
Impact: Critical
Location: India Office
---



In [315]:
def get_user_device_relationship(user_id):
    """
    Fetch a user and the device attached to them via the 'has_device' relationship.

    Args:
        user_id (str): The ID of the user.

    Returns:
        dict | None: User fields (name, email, location, band, team) plus a
        nested "device" dict, or None when the user has no linked device.
        Only one record is read, so a user with several devices yields just
        one of them.
    """
    query = """
        MATCH (u:User {id: $user_id})-[:has_device]->(d:Device)
        RETURN u.name AS name, u.email AS email, u.location AS location, u.band AS band, u.team AS team,
               d.id AS device_id, d.model AS device_model, d.os AS device_os, d.os_version AS os_version,
               d.ram AS ram, d.storage AS storage, d.lastupdate AS last_update, d.issued_on AS issued_on,
               d.pending_updates AS pending_updates
        """
    # Closing the driver with the block stops every call from leaking its own
    # connection pool.
    with GraphDatabase.driver(uri, auth=(user, kg_password)) as driver, driver.session() as session:
        record = session.run(query, user_id=user_id).single()

    if record is None:
        return None

    return {
        "name": record["name"],
        "email": record["email"],
        "location": record["location"],
        "band": record["band"],
        "team": record["team"],
        "device": {
            "model": record["device_model"],
            "os": record["device_os"],
            "os_version": record["os_version"],
            "ram": record["ram"],
            "storage": record["storage"],
            "last_update": record["last_update"],
            "issued_on": record["issued_on"],
            "pending_updates": record["pending_updates"],
            "device_id": record["device_id"],
        },
    }

def format_user_details(user_details, user_id):
    """
    Render a user record and its device as a plain-text report.

    Args:
        user_details (dict): User fields (name, email, location, band, team)
            plus a nested "device" dict.
        user_id (str): The ID of the user, printed in the report.

    Returns:
        str: The report, or "User details not found." when ``user_details``
        is empty or None.
    """
    if not user_details:
        return "User details not found."

    parts = [
        "User Info:\n",
        f"User ID: {user_id}\n",
        f"Name: {user_details['name']}\n",
        f"Email: {user_details['email']}\n",
        f"Location: {user_details['location']}\n",
        f"Band: {user_details['band']}\n",
        f"Team: {user_details['team']}\n",
        "\nDevice Info:\n",
    ]

    device = user_details['device']
    parts += [
        f"Device ID: {device['device_id']}\n",
        f"Model: {device['model']}\n",
        f"OS: {device['os']}\n",
        f"OS Version: {device['os_version']}\n",
        f"RAM: {device['ram']} GB\n",
        f"Storage: {device['storage']} GB\n",
        f"Last Update: {device['last_update']}\n",
        f"Issued On: {device['issued_on']}\n",
        f"Pending Updates: {device['pending_updates']}\n",
    ]
    return "".join(parts)

# Example usage: look up user10 with their device and print the report.
user_id = "user10"
user_details = get_user_device_relationship(user_id)
formatted_details = format_user_details(user_details, user_id)
print(formatted_details)


User Info:
User ID: user10
Name: Julia
Email: julia@example.com
Location: UK
Band: 50
Team: GSG

Device Info:
Device ID: dev10
Model: iPad Pro
OS: ios
OS Version: 16.4
RAM: 16 GB
Storage: 256 GB
Last Update: 2025-06-02T21:29:14.482241
Issued On: 2024-11-03T21:29:14.482246
Pending Updates: 



In [None]:
def get_product_details_with_outages(product_id):
    """
    Fetch a product together with the active outages attached to it.

    Args:
        product_id (str): The ID of the product.

    Returns:
        dict: ``{"product": {id, name, alias}, "active_outages": [...]}``, or
        ``{"error": "Product not found"}`` when no product has that ID.
    """
    product_query = """
        MATCH (p:Product {id: $product_id})
        RETURN p.id AS product_id, p.name AS name, p.alias AS alias
        """
    outage_query = """
        MATCH (p:Product {id: $product_id})-[:has_outage]->(o:Outage)
        WHERE o.status = 'Active'
        RETURN o.id AS outage_id, o.title AS title, o.description AS description,
               o.start_time AS start_time, o.expected_resolution AS expected_resolution,
               o.impact AS impact, o.location AS location
        """
    # Closing the driver with the block stops every call from leaking its own
    # connection pool.
    with GraphDatabase.driver(uri, auth=(user, kg_password)) as driver, driver.session() as session:
        product_details = session.run(product_query, product_id=product_id).single()
        if product_details is None:
            return {"error": "Product not found"}

        outage_result = session.run(outage_query, product_id=product_id)
        outages = [record.data() for record in outage_result]

    return {
        "product": {
            "id": product_details["product_id"],
            "name": product_details["name"],
            "alias": product_details["alias"],
        },
        "active_outages": outages,
    }

def format_product_details_with_outages(details):
    """
    Render product details and its active outages as a plain-text report.

    Args:
        details (dict): Either ``{"product": {id, name, alias},
            "active_outages": [...]}`` or an error dict such as
            ``{"error": "Product not found"}``.

    Returns:
        str: The report. For an error dict (or an empty/None input) the
        error message followed by a newline is returned instead of raising.
    """
    # The lookup function signals a missing product with an {"error": ...}
    # dict, which has no "product" key to format.
    if not details or "product" not in details:
        reason = (details or {}).get("error", "Product details not found")
        return f"{reason}\n"

    product = details['product']
    # A product stored without aliases comes back with alias = None.
    aliases = product['alias'] or []

    parts = [
        "Product Info:\n",
        f"ID: {product['id']}\n",
        f"Name: {product['name']}\n",
        f"Aliases: {', '.join(aliases)}\n\n",
    ]

    outages = details['active_outages']
    if not outages:
        parts.append("No active outages.\n")
        return "".join(parts)

    parts.append("Active Outages:\n")
    for outage in outages:
        parts.append(f"Outage ID: {outage['outage_id']}\n")
        parts.append(f"Title: {outage['title']}\n")
        parts.append(f"Description: {outage['description']}\n")
        parts.append(f"Start Time: {outage['start_time']}\n")
        parts.append(f"Expected Resolution: {outage['expected_resolution']}\n")
        parts.append(f"Impact: {outage['impact']}\n")
        parts.append(f"Location: {outage['location']}\n")
        parts.append("---\n")
    return "".join(parts)


# Example usage: fetch prod2 with its active outages and print the report.
product_id = "prod2"
product_info = get_product_details_with_outages(product_id)
formatted_product_info = format_product_details_with_outages(product_info)
print(formatted_product_info)

Product Info:
ID: prod2
Name: Outlook
Aliases: Microsoft Outlook, Outlook Email, MS Outlook

Active Outages:
Outage ID: out3
Title: Mail Delivery Delays and Failures
Description: Active outage affecting mail delivery in the India Office. Users are experiencing delays and failures in email delivery.
Start Time: 2025-06-10T09:00:00
Expected Resolution: 2025-06-10T14:00:00
Impact: Critical
Location: India Office
---



In [334]:
def get_subtopic_details(product_name, subtopic_name):
    """
    Fetch the details of one subtopic of a product.

    The product name is matched case-insensitively. A subtopic matches when
    any word of ``subtopic_name`` occurs in its name or equals one of its
    aliases (also case-insensitive).

    Args:
        product_name (str): Name of the product.
        subtopic_name (str): Name of the subtopic.

    Returns:
        dict | None: The keys id, name, alias, Constraints, Access and
        Common_Issues, or None when nothing matches. Only one record is read,
        so when several subtopics match just one of them is returned.
    """
    query = """
        MATCH (p:Product)-[:has_subtopic]->(s:SubTopic)
        WHERE toLower(p.name) = toLower($product_name) AND  any(word IN split(toLower($subtopic_name), " ") WHERE word IN split(toLower(s.name), " ") OR any(alias IN s.alias WHERE toLower(alias) = word))

        RETURN s.id AS id, s.name AS name, s.alias AS alias, s.Constraints AS constraints,
               s.Access AS access, s.`Common Issues` AS common_issues
        """
    # Closing the driver with the block stops every call from leaking its own
    # connection pool.
    with GraphDatabase.driver(uri, auth=(user, kg_password)) as driver, driver.session() as session:
        record = session.run(
            query, product_name=product_name, subtopic_name=subtopic_name
        ).single()

    if record is None:
        return None

    return {
        "id": record["id"],
        "name": record["name"],
        "alias": record["alias"],
        "Constraints": record["constraints"],
        "Access": record["access"],
        "Common_Issues": record["common_issues"],
    }

def universal_formatting_function(input: dict, heading=""):
    """Render a flat dict as "key: value" lines, optionally under a heading.

    Args:
        input (dict | None): Mapping to render. List values are joined with
            ", ". None is treated as an empty mapping, because the lookup
            helpers return None when nothing was found.
        heading (str): When non-empty, emitted first as "<heading>:".

    Returns:
        str: One newline-terminated line per key, after the heading line.
    """
    lines = [f"{heading}:\n"] if heading else []
    for key, value in (input or {}).items():
        # str() each item so lists of numbers do not break the join.
        rendered = ", ".join(map(str, value)) if isinstance(value, list) else value
        lines.append(f"{key}: {rendered}\n")
    return "".join(lines)
# Example usage: print the details of the Webex "Webinar" subtopic.
# NOTE: get_subtopic_details returns None when nothing matches.
product_name = "Webex"
subtopic_name = "Webinar"
subtopic_info = get_subtopic_details(product_name, subtopic_name)
print(universal_formatting_function(subtopic_info, heading = "Subtopic Info"))
# Earlier hand-written formatting, superseded by universal_formatting_function:
# if subtopic_info:
#     print(f"Subtopic ID: {subtopic_info['id']}")
#     print(f"Name: {subtopic_info['name']}")
#     print(f"Alias: {', '.join(subtopic_info['alias'])}")
#     print(f"Constraints: {subtopic_info['Constraints']}")
#     print(f"Access: {subtopic_info['Access']}")
#     print(f"Common Issues: {', '.join(subtopic_info['Common_Issues'])}")
# else:
#     print("Subtopic details not found.")

Subtopic Info:
id: sub20
name: Webinar - Webex
alias: Webex Webinar, Hosting Webex Webinar, Webex Events
Constraints: Band 45 or above
Access: Full-time employees
Common_Issues: Audio dropouts, sInvite errors, Permissions problems



### Elasticsearch Ingestion

In [296]:
import requests
import json

# Elasticsearch URL. It already includes the index name ("articles_1"); the
# later cells append "/_doc/<id>" and "/_search" to it.
es_url = "http://localhost:9200/articles_1"

# Define the index mapping: article fields plus a 384-dimension dense vector,
# the same dimension as the Neo4j issue-embeddings index.
index_mapping = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "title": {"type": "text"},
            "content": {"type": "text"},
            "keywords": {"type": "keyword"},
            "created_at": {"type": "date"},
            "updated_at": {"type": "date"},
            "embedding": {
                "type": "dense_vector",
                "dims": 384
            }
        }
    }
}

# Create the index. Any non-200 reply is reported with the response body.
response = requests.put(es_url, headers={"Content-Type": "application/json"}, data=json.dumps(index_mapping))
if response.status_code == 200:
    print("Index created successfully.")
else:
    print(f"Failed to create index: {response.text}")

Index created successfully.


In [297]:
# Read articles from the file. JSON is UTF-8 by specification; stating the
# encoding keeps non-ASCII article text intact on any platform.
with open("/Users/abhishekbairagi/Desktop/experiments/devcon/data generation/synthetic_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Ingest articles into Elasticsearch
for article in data["articles"]:
    # embed() yields numpy.float32 values, which json.dumps cannot serialise;
    # convert them to standard Python floats.
    article["embedding"] = [float(value) for value in embed(article["content"])]

    # Ingest the article under its own id
    print(article)
    ingest_response = requests.post(f"{es_url}/_doc/{article['id']}", headers={"Content-Type": "application/json"}, data=json.dumps(article))
    # Elasticsearch answers 201 when the document is created and 200 when an
    # existing id is overwritten, so re-running this cell is also a success.
    if ingest_response.status_code in (200, 201):
        print(f"Article {article['id']} ingested successfully.")
    else:
        print(f"Failed to ingest article {article['id']}: {ingest_response.text}")

{'id': 'article1', 'title': 'Troubleshooting Issues with Meeting Scheduling', 'content': "If you're experiencing problems when trying to schedule a meeting, there are several possible causes and corresponding solutions. First, check your calendar permissions. If your calendar is shared or connected to another service (like Outlook or Google Calendar), ensure that you have the proper permissions to create events. In some cases, meeting scheduling may fail due to sync delays or outdated calendar tokens.\n\nAnother common issue involves time zone mismatches between participants. Always confirm that all participants' calendars are set to the correct time zone. Additionally, browser extensions or security software can sometimes block the scripts required to render the scheduling interface.\n\nClear your browser's cache and cookies, or try scheduling the meeting in a different browser or in incognito/private mode. If you're using a third-party scheduling tool, ensure it’s properly integrated

In [298]:
import requests
import json

def semantic_search(query, es_url, index_name, embed_fn, top_k=5):
    """
    Run a hybrid keyword + vector search against an Elasticsearch index.

    Documents are first selected by a ``match`` on ``content``; each hit is
    then scored as ``_score*2 + cosineSimilarity(query, embedding) + 1.0``.

    Args:
        query (str): The search query.
        es_url (str): URL of the index to search, i.e. the base URL with the
            index name already appended; "/_search" is added to it.
        index_name (str): Currently unused — the index is taken from
            ``es_url``. Kept so existing callers keep working.
        embed_fn (function): Function that returns the embedding of the query
            as an iterable of numbers.
        top_k (int): Number of top results to return.

    Returns:
        list: Dicts with the keys id, score and source for the top matches.

    Raises:
        Exception: If Elasticsearch answers with a non-200 status.
    """
    # Use the embedding function that was passed in, not the module global,
    # and convert to plain floats so the vector is JSON-serialisable.
    query_embedding = [float(value) for value in embed_fn(query)]

    search_query = {
        "size": top_k,
        "query": {
            "script_score": {
                "query": {"match": {"content": query}},
                "script": {
                    "source": "_score*2 + cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {"query_vector": query_embedding},
                },
            }
        },
    }

    response = requests.post(
        f"{es_url}/_search",
        headers={"Content-Type": "application/json"},
        data=json.dumps(search_query),
    )

    if response.status_code != 200:
        raise Exception(f"Failed to perform search: {response.text}")

    hits = response.json()["hits"]["hits"]
    return [{"id": hit["_id"], "score": hit["_score"], "source": hit["_source"]} for hit in hits]

# Example usage: hybrid search for a Webex audio problem.
# NOTE: the "articles" index_name argument has no effect; the index that is
# searched is the one contained in es_url ("articles_1").
query = "I am facing audio issue in webex"
top_results = semantic_search(query, es_url, "articles", embed, top_k=5)
for result in top_results:
    print(f"ID: {result['id']} Score: {result['score']:.4f}")
    print(f"Title: {result['source']['title']}")
    print(f"Content: {result['source']['content']}")
    print("---")

ID: article31 Score: 14.2993
Title: Hosting Webinars in Zoom
Content: Zoom's webinar features are similar to Webex, including support for registration, Q&A, and panelists. Some users find Zoom easier to use for external-facing events. Remember that hosting rights in Zoom, like Webex, might be tied to your license or access level, which can vary depending on your team or region.
---
ID: article36 Score: 14.2790
Title: Eliminating Echo in Webex Calls
Content: Background echo during Webex calls can be caused by improper audio settings or hardware issues. Ensure that your microphone and speakers are not placed too close to each other, as this can create feedback loops. Use headphones to minimize echo. Check Webex audio settings to ensure the correct input and output devices are selected. If the issue persists, try muting participants who are not speaking, as their devices may be causing the echo. Additionally, ensure that your audio drivers are up-to-date and that your operating system is 