# Import Libraries

In [34]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
import hashlib
from pinecone import Pinecone
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import json
import ast
from rapidfuzz import fuzz
from datetime import datetime
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Initialization

In [53]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

# Pinecone Initialization
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("scs")
namespaces = ["Kickoff Meeting", "Project Meeting"]

# OpenAI Initialization
client = OpenAI(api_key=OPENAI_API_KEY)
EMBEDDINGS = OpenAIEmbeddings(model='text-embedding-3-small', openai_api_key=OPENAI_API_KEY)
LLM = ChatOpenAI(temperature=0, model_name="gpt-4-turbo", openai_api_key=OPENAI_API_KEY)

# Query

In [64]:
query = "Give me 2 identical keypoints from all meetings recorded?"

# Generate Embeddings

In [65]:
def get_embeddings(text):
    """
    This function returns a list of the embeddings for a given query
    """
    text_embeddings = EMBEDDINGS.embed_query(text)
    print("Generating Embeddings: Done!")
    return text_embeddings

query_embeddings = get_embeddings(text=query)
print(query_embeddings)

Generating Embeddings: Done!
[-0.05483885481953621, 0.011230586096644402, 0.04175231233239174, -0.03925963491201401, -0.06952396780252457, -0.0060657355934381485, 0.032133836299180984, 0.030047575011849403, -0.010634511709213257, -0.02000913769006729, 0.035005830228328705, -0.054080214351415634, 0.010668379254639149, -0.015768880024552345, -0.009090136736631393, -0.028638672083616257, 0.00517839752137661, 0.011616679839789867, -0.011230586096644402, 0.03687533736228943, -0.02128257043659687, 0.013398129492998123, 0.018112536519765854, 0.013757129199802876, 0.003065042197704315, -0.01713714189827442, -0.014332883059978485, -0.04112914204597473, 0.07033679634332657, 0.03018304705619812, 0.002093034330755472, -0.02034781686961651, -0.01566050387918949, -0.00945590902119875, -0.03137519583106041, 0.05662707984447479, -0.025766676291823387, 0.016988124698400497, -0.051858484745025635, -0.00836536381393671, 0.02858448214828968, 0.0109731899574399, 0.02067294903099537, -0.007776062935590744, 

# Get Namespace

### Semantic Similarity

In [66]:
def get_most_similar_namespace(query, namespaces, threshold=0.05):
    """
    Rank namespaces by semantic similarity to the query.
    """
    # Get query embeddings
    query_embeddings = get_embeddings(query)
    print(query_embeddings)

    # Get embeddings for each namespace in list
    namespace_embeddings = {ns: get_embeddings(ns) for ns in namespaces}
    print(namespace_embeddings)

    # Compute similarities
    similarities = {
        ns: cosine_similarity([query_embeddings], [embedding])[0][0] for ns, embedding in namespace_embeddings.items()
    }
    print(similarities.items)

    # Rank namespaes by similarity score
    ranked_namespaces = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    print(ranked_namespaces)

    # Check if the top two are close in similarity
    top_two = ranked_namespaces[:2]
    print(top_two)
    if len(top_two) > 1 and abs(top_two[0][1] - top_two[1][1]) < threshold:
        return None, top_two # Ambiguous case, return for user clarification
    
    return ranked_namespaces[0][0], ranked_namespaces # Returns most similar namespace and the ranked_namespaces with its embeddings

namespace, ranked = get_most_similar_namespace(query, namespaces)

Generating Embeddings: Done!
[-0.05483885481953621, 0.011230586096644402, 0.04175231233239174, -0.03925963491201401, -0.06952396780252457, -0.0060657355934381485, 0.032133836299180984, 0.030047575011849403, -0.010634511709213257, -0.02000913769006729, 0.035005830228328705, -0.054080214351415634, 0.010668379254639149, -0.015768880024552345, -0.009090136736631393, -0.028638672083616257, 0.00517839752137661, 0.011616679839789867, -0.011230586096644402, 0.03687533736228943, -0.02128257043659687, 0.013398129492998123, 0.018112536519765854, 0.013757129199802876, 0.003065042197704315, -0.01713714189827442, -0.014332883059978485, -0.04112914204597473, 0.07033679634332657, 0.03018304705619812, 0.002093034330755472, -0.02034781686961651, -0.01566050387918949, -0.00945590902119875, -0.03137519583106041, 0.05662707984447479, -0.025766676291823387, 0.016988124698400497, -0.051858484745025635, -0.00836536381393671, 0.02858448214828968, 0.0109731899574399, 0.02067294903099537, -0.007776062935590744, 

### User Clarification Loop

In [67]:
def clarify_with_user(ambiguous_namespaces):
    """
    Ask the user to clarify when multiple namespaces are similar.
    """
    options = [ns[0] for ns in ambiguous_namespaces]
    print(options)
    print(f"Did you mean:\n1. {options[0]}\n2. {options[1]}")

    # Simulate user input for demonstration
    user_choice = int(input("Please choose 1 or 2: "))-1
    return options[user_choice]

### Integration

In [68]:
def resolve_namespace(query, namespaces):
    """
    Resolves the namespace by either selecting the most similar one or prompting the user for clarification.
    """
    namespace, ranked = get_most_similar_namespace(query, namespaces)
    print(namespaces, ranked)

    if namespace:
        print(f"Selected namespace: {namespace}")
        return namespace
    else:
        print("Ambiguity detected!")
        return clarify_with_user(ranked)
    
resolved_namespace = resolve_namespace(query, namespaces)
print(f"Namespace: {resolved_namespace}")

Generating Embeddings: Done!
[-0.05483885481953621, 0.011230586096644402, 0.04175231233239174, -0.03925963491201401, -0.06952396780252457, -0.0060657355934381485, 0.032133836299180984, 0.030047575011849403, -0.010634511709213257, -0.02000913769006729, 0.035005830228328705, -0.054080214351415634, 0.010668379254639149, -0.015768880024552345, -0.009090136736631393, -0.028638672083616257, 0.00517839752137661, 0.011616679839789867, -0.011230586096644402, 0.03687533736228943, -0.02128257043659687, 0.013398129492998123, 0.018112536519765854, 0.013757129199802876, 0.003065042197704315, -0.01713714189827442, -0.014332883059978485, -0.04112914204597473, 0.07033679634332657, 0.03018304705619812, 0.002093034330755472, -0.02034781686961651, -0.01566050387918949, -0.00945590902119875, -0.03137519583106041, 0.05662707984447479, -0.025766676291823387, 0.016988124698400497, -0.051858484745025635, -0.00836536381393671, 0.02858448214828968, 0.0109731899574399, 0.02067294903099537, -0.007776062935590744, 

# Metadata Filtering

## Fuzzy Match

In [24]:
def fuzzy_match(title1, title2, threshold=80):
    """
    Perform a fuzzy match between two titles using RapidFuzz.
    Returns True if the similarity score is above the threshold.
    """
    similarity_score = fuzz.partial_ratio(title1.lower(), title2.lower())
    return similarity_score >= threshold

## Extract Metadata

In [27]:
def extract_metadata_from_query(query):
  prompt = f"""
  You are a helpful assistant. Extract the meeting title and the meeting date from the following query.
  If the meeting title or date is not explicitly mentioned, return 'unknown'.
  If the date is mentioned as word, it should be formatted as 'YYYY-MM-DD'

  Query: {query}

  Provide the meeting title and date as a Python dictionary in this format:
  {{"meeting_title": "title_here", "date": "date_here"}}
  """

  response = LLM.invoke(prompt)
  metadata_str = response.content.strip()
  metadata_dict = ast.literal_eval(metadata_str)
  return metadata_dict

metadata = extract_metadata_from_query(query)
print(metadata)

{'meeting_title': 'project meeting', 'date': 'unknown'}


# Query Pinecone Index

In [69]:
def query_pinecone_index(query_embeddings, meeting_title, date, top_k=3, include_metadata=True):
    """
    Query a Pinecone index.
    """
    # Build filter conditions directly for Pinecone
    filter_conditions = {}

    # Include date and meeting title if specified
    if date.lower() != 'unknown':
        filter_conditions['date'] = date
    if meeting_title.lower() != 'unknown':
        filter_conditions['title'] = meeting_title

    # Query Pinecone using the build filter conditions
    query_response = index.query(
        vector=query_embeddings,
        filter=filter_conditions,
        top_k=top_k,
        include_metadata=include_metadata,
        namespace=meeting_title) # Filter based on metadata

    print("Querying Pinecone Index: Done!")
    return query_response

answers = query_pinecone_index(query_embeddings=query_embeddings, meeting_title=resolved_namespace, date="unknown")
print(answers)

Querying Pinecone Index: Done!
{'matches': [{'id': 'e773312289360121c6069407784bbc732f6f8d4505330eb28deaaa0add6a0cd7',
              'metadata': {'date': '2024-12-01',
                           'text': 'Czech: Hello my name is Czech.\n'
                                   'Gian: Hello my name is Gian.\n'
                                   'Shaundyl: Hello my name is Shaundyl.\n'
                                   'Czech (Team Lead): Alright, everyone, '
                                   'thanks for joining today’s meeting. We '
                                   'have about 10 minutes to go over the final '
                                   "details before the product launch. Let's "
                                   'start with the progress update. Bob, how '
                                   'are we doing on the development front?',
                           'title': 'Project Meeting'},
              'score': 0.199812382,
              'values': []},
             {'id': 'ae52e04

# Combining Text from Multiple Document Matches

In [61]:
text_answer = " ".join([doc['metadata']['text'] for doc in answers['matches']])
print(text_answer)

[00:00:00] John: Good morning, everyone. Thank you for joining today's kickoff meeting for our new     software development project. We'll be discussing the project scope, timelines, and
responsibilities. Let's get started with a quick round of introductions. I'll go first. I'm John, the     project manager. I'll be overseeing the project and ensuring we stay on track. Alice, would you     like to go next?
[00:00:20] research, collaborate with developers. Sara develop and execute the testing plan, ensure
software quality. John oversee the project, coordinate between teams, ensure timely delivery.
If there are no further questions, we'll conclude the meeting. Thank you all for your time and let's
make this project a success.
[00:03:50]
All: Thank you, John. to streamline the process.
[00:02:20]
John: Excellent. Now, let's talk about the project timeline. We have a six-month timeframe to
complete this project. Here's a high-level breakdown of the phases:
Planning and Design: 1 month
Deve

## Prompt

In [62]:
prompt = f"""You are a meeting facilitator.
        This user will ask you a questions about the conversation of the meeting.
        Use following piece of context to answer the question.
        If you don't know the answer, just say you don't know.
        Keep the answer within 2 sentences and concise.
        Context: {text_answer}
        Question: {query}"""

print(prompt)

You are a meeting facilitator.
        This user will ask you a questions about the conversation of the meeting.
        Use following piece of context to answer the question.
        If you don't know the answer, just say you don't know.
        Keep the answer within 2 sentences and concise.
        Context: [00:00:00] John: Good morning, everyone. Thank you for joining today's kickoff meeting for our new     software development project. We'll be discussing the project scope, timelines, and
responsibilities. Let's get started with a quick round of introductions. I'll go first. I'm John, the     project manager. I'll be overseeing the project and ensuring we stay on track. Alice, would you     like to go next?
[00:00:20] research, collaborate with developers. Sara develop and execute the testing plan, ensure
software quality. John oversee the project, coordinate between teams, ensure timely delivery.
If there are no further questions, we'll conclude the meeting. Thank you all for you

# LLM

In [63]:
def better_query_response(prompt):
    """
    This function returns a better response using LLM
    """
    better_answer = LLM.invoke(prompt)
    print("Generating Better Response: Done!")
    return better_answer

final_answer = better_query_response(prompt=prompt)
print(final_answer.content)

Generating Better Response: Done!
The key points of the meeting include discussing the project scope, timelines, and responsibilities for a new software development project. The project timeline is set for six months, divided into planning and design, development, testing, and deployment and review phases, with bi-weekly check-ins planned to monitor progress.


# Chatbot Response

In [12]:
def Chatbot(query, meeting_title=None, date=None):
    print(query)
    metadata = extract_metadata_from_query(query)
    print(metadata)
    meeting_title = metadata.get('meeting_title', 'unknown')
    date = metadata.get('date', 'unknown')

    query_embeddings = get_query_embeddings(query=query)

    answers = query_pinecone_index(
        query_embeddings=query_embeddings,
        meeting_title=meeting_title,
        date=date
        )
    print(answers)

    text_answers = " ".join([doc['metadata']['text'] for doc in answers['matches']])
    print(text_answers)
    final_answer = better_query_response(prompt=prompt)
    return final_answer.content

response = Chatbot(query)
print(response)

What are the keypoints from the project meeting?


SyntaxError: invalid syntax (<unknown>, line 1)