# Import Libraries

In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
import hashlib
from pinecone import Pinecone
from langchain.llms import OpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import json
import ast
from rapidfuzz import fuzz
from datetime import datetime
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.autonotebook import tqdm

## Initialization

In [3]:
# Credentials are read from the environment; os.getenv returns None when a
# variable is unset, so a missing key only surfaces later, when a client is used.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

# Pinecone initialization: handle to the "scs" index plus the namespace names
# that the namespace-resolution helpers rank against.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("scs")
namespaces = ["Kickoff Meeting", "Project Meeting"]

# OpenAI initialization.
# NOTE(review): `client` is built from `langchain.llms.OpenAI` and is not
# referenced anywhere else in the visible notebook -- confirm whether it is needed.
client = OpenAI(api_key=OPENAI_API_KEY)
# Embedding model used by get_embeddings and the chat model used for all LLM calls.
EMBEDDINGS = OpenAIEmbeddings(model='text-embedding-3-small', openai_api_key=OPENAI_API_KEY)
LLM = ChatOpenAI(temperature=0, model_name="gpt-4-turbo", openai_api_key=OPENAI_API_KEY)

  client = OpenAI(api_key=OPENAI_API_KEY)


# Query

In [4]:
# Natural-language question used by the walkthrough cells that reference `query`.
query = "Who are present in kickoff meeting and what did they contribute?"

# Generate Embeddings

In [5]:
def get_embeddings(text):
    """
    Embed ``text`` with the module-level ``EMBEDDINGS`` model.

    Prints a progress message once the embedding has been computed and
    returns whatever ``EMBEDDINGS.embed_query`` produced for the text.
    """
    vector = EMBEDDINGS.embed_query(text)
    print("Generating Embeddings: Done!")
    return vector

# Embed the demo question once. Note that the print below writes the entire
# embedding vector to the cell output.
query_embeddings = get_embeddings(text=query)
print(query_embeddings)

Generating Embeddings: Done!
[-0.02175448089838028, 0.04156634584069252, 0.028882181271910667, -0.002205087337642908, -0.019954701885581017, -0.015498104505240917, 0.03288169205188751, 0.0408807136118412, 0.02429702877998352, -0.009270294569432735, 0.0397094301879406, -0.03930947557091713, -0.0006802739226259291, -0.04450884088873863, -0.0333673469722271, -0.0010596917709335685, -0.03171040862798691, 0.022925766184926033, -0.01635514199733734, 0.042680494487285614, 0.01634085923433304, 0.033938705921173096, -0.006431356072425842, 0.006534914951771498, -0.04059503600001335, 0.035195693373680115, -0.06427785009145737, 0.013584052212536335, 0.012955558486282825, -0.010698691010475159, 0.007613354362547398, -0.02716810442507267, -0.0028675063513219357, 0.0008905160939320922, -0.03793821483850479, 0.0086346585303545, -0.011727136559784412, 0.019311923533678055, -0.007166980300098658, 0.018040649592876434, 0.0253254733979702, -0.03476717695593834, 0.005210076924413443, -0.0016176592325791717

# Get Namespace

### Semantic Similarity

In [6]:
def get_most_similar_namespace(query, namespaces, threshold=0.05):
    """
    Rank namespaces by semantic similarity to the query.

    The query and every namespace name are embedded with ``get_embeddings``
    and compared by cosine similarity.

    Args:
        query: Question text to match against the namespace names.
        namespaces: Non-empty iterable of namespace names.
        threshold: If the two best scores differ by less than this value the
            match is treated as ambiguous.

    Returns:
        ``(None, top_two)`` when the match is ambiguous, where ``top_two`` is
        the two best ``(namespace, score)`` pairs; otherwise
        ``(best_namespace, ranked_namespaces)`` where ``ranked_namespaces`` is
        the full list of ``(namespace, score)`` pairs, best first.

    Raises:
        ValueError: If ``namespaces`` is empty.
    """
    # Fail early with a clear message, before any embedding call is made.
    namespaces = list(namespaces)
    if not namespaces:
        raise ValueError("get_most_similar_namespace requires at least one namespace.")

    # Get query embeddings
    query_embeddings = get_embeddings(query)
    print(query_embeddings)

    # Get embeddings for each namespace in list
    namespace_embeddings = {ns: get_embeddings(ns) for ns in namespaces}
    print(namespace_embeddings)

    # Compute similarities
    similarities = {
        ns: cosine_similarity([query_embeddings], [embedding])[0][0]
        for ns, embedding in namespace_embeddings.items()
    }
    # Call .items() so the pairs are printed rather than the bound method.
    print(similarities.items())

    # Rank namespaces by similarity score, best first
    ranked_namespaces = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    print(ranked_namespaces)

    # Check if the top two are close in similarity
    top_two = ranked_namespaces[:2]
    print(top_two)
    if len(top_two) > 1 and abs(top_two[0][1] - top_two[1][1]) < threshold:
        return None, top_two  # Ambiguous case, return for user clarification

    # Most similar namespace plus the full ranking with similarity scores
    return ranked_namespaces[0][0], ranked_namespaces

# `namespace` is None when the two best scores differ by less than the
# threshold; in that case `ranked` holds only the two top candidates.
namespace, ranked = get_most_similar_namespace(query, namespaces)

Generating Embeddings: Done!
[-0.02175448089838028, 0.04156634584069252, 0.028882181271910667, -0.002205087337642908, -0.019954701885581017, -0.015498104505240917, 0.03288169205188751, 0.0408807136118412, 0.02429702877998352, -0.009270294569432735, 0.0397094301879406, -0.03930947557091713, -0.0006802739226259291, -0.04450884088873863, -0.0333673469722271, -0.0010596917709335685, -0.03171040862798691, 0.022925766184926033, -0.01635514199733734, 0.042680494487285614, 0.01634085923433304, 0.033938705921173096, -0.006431356072425842, 0.006534914951771498, -0.04059503600001335, 0.035195693373680115, -0.06427785009145737, 0.013584052212536335, 0.012955558486282825, -0.010698691010475159, 0.007613354362547398, -0.02716810442507267, -0.0028675063513219357, 0.0008905160939320922, -0.03793821483850479, 0.0086346585303545, -0.011727136559784412, 0.019311923533678055, -0.007166980300098658, 0.018040649592876434, 0.0253254733979702, -0.03476717695593834, 0.005210076924413443, -0.0016176592325791717

### User Clarification Loop

In [7]:
def clarify_with_user(ambiguous_namespaces):
    """
    Ask the user to clarify when multiple namespaces are similar.

    Args:
        ambiguous_namespaces: Sequence of items whose first element is the
            namespace name (e.g. ``(namespace, score)`` pairs). Any number of
            candidates is supported.

    Returns:
        The namespace name chosen by the user.

    Raises:
        ValueError: If no candidates are supplied.
    """
    options = [ns[0] for ns in ambiguous_namespaces]
    if not options:
        raise ValueError("clarify_with_user requires at least one candidate namespace.")
    print(options)

    menu = "\n".join(f"{number}. {option}" for number, option in enumerate(options, start=1))
    print(f"Did you mean:\n{menu}")

    choices = " or ".join(str(number) for number in range(1, len(options) + 1))
    while True:
        raw_choice = input(f"Please choose {choices}: ")
        try:
            user_choice = int(raw_choice) - 1
        except ValueError:
            # Non-numeric input: treat as out of range and ask again.
            user_choice = -1
        # Reject 0 and negatives explicitly; a negative index would otherwise
        # silently select an option from the end of the list.
        if 0 <= user_choice < len(options):
            return options[user_choice]
        print(f"Invalid choice: {raw_choice!r}. Please enter {choices}.")

### Integration

In [8]:
def resolve_namespace(query, namespaces):
    """
    Pick the namespace to search for ``query``.

    Uses the similarity ranking when it yields a clear winner; otherwise the
    ranked candidates are handed to ``clarify_with_user`` and the user's
    choice is returned.
    """
    best_match, ranking = get_most_similar_namespace(query, namespaces)
    print(namespaces, ranking)

    # Guard clause: a falsy best match means the ranking was ambiguous.
    if not best_match:
        print("Ambiguity detected!")
        return clarify_with_user(ranking)

    print(f"Selected namespace: {best_match}")
    return best_match
    
# Resolve the namespace for the demo question; this asks for keyboard input
# when the similarity ranking is ambiguous.
resolved_namespace = resolve_namespace(query, namespaces)
print(f"Namespace: {resolved_namespace}")

Generating Embeddings: Done!
[-0.02175448089838028, 0.04156634584069252, 0.028882181271910667, -0.002205087337642908, -0.019954701885581017, -0.015498104505240917, 0.03288169205188751, 0.0408807136118412, 0.02429702877998352, -0.009270294569432735, 0.0397094301879406, -0.03930947557091713, -0.0006802739226259291, -0.04450884088873863, -0.0333673469722271, -0.0010596917709335685, -0.03171040862798691, 0.022925766184926033, -0.01635514199733734, 0.042680494487285614, 0.01634085923433304, 0.033938705921173096, -0.006431356072425842, 0.006534914951771498, -0.04059503600001335, 0.035195693373680115, -0.06427785009145737, 0.013584052212536335, 0.012955558486282825, -0.010698691010475159, 0.007613354362547398, -0.02716810442507267, -0.0028675063513219357, 0.0008905160939320922, -0.03793821483850479, 0.0086346585303545, -0.011727136559784412, 0.019311923533678055, -0.007166980300098658, 0.018040649592876434, 0.0253254733979702, -0.03476717695593834, 0.005210076924413443, -0.0016176592325791717

# Query Pinecone Index

In [12]:
def query_pinecone_index(query_embeddings, meeting_title, top_k=5, include_metadata=True):
    """
    Run a vector query against the module-level Pinecone ``index``.

    ``meeting_title`` is used as the namespace to search and, unless it is
    'unknown' (case-insensitive), also as a ``title`` metadata filter.
    The raw query response is returned.
    """
    title_is_known = meeting_title.lower() != 'unknown'
    # Only restrict on the title metadata when a real title was supplied.
    metadata_filter = {'title': meeting_title} if title_is_known else {}

    response = index.query(
        vector=query_embeddings,
        filter=metadata_filter,
        top_k=top_k,
        include_metadata=include_metadata,
        namespace=meeting_title,
    )

    print("Querying Pinecone Index: Done!")
    return response

# Retrieve the top matches for the demo question from the resolved namespace.
answers = query_pinecone_index(query_embeddings=query_embeddings, meeting_title=resolved_namespace)
print(answers)

Querying Pinecone Index: Done!
{'matches': [{'id': 'f827b99cc18de4e985ae02c506b3142be54603b909216cd22c26d130adaa4ad0',
              'metadata': {'date': '2024-12-09',
                           'text': '[00:00:00] John: Good morning, everyone. '
                                   "Thank you for joining today's kickoff "
                                   'meeting for our new\n'
                                   "software development project. We'll be "
                                   'discussing the project scope, timelines, '
                                   'and\n'
                                   "responsibilities. Let's get started with a "
                                   "quick round of introductions. I'll go "
                                   "first. I'm John, the\n"
                                   "project manager. I'll be overseeing the "
                                   'project and ensuring we stay on track. '
                                   'Alice, woul

# Combining Text from Multiple Document Matches

In [13]:
# Concatenate the `text` metadata of every match, in the order returned,
# separated by single spaces.
text_answer = " ".join([doc['metadata']['text'] for doc in answers['matches']])
print(text_answer)

[00:00:00] John: Good morning, everyone. Thank you for joining today's kickoff meeting for our new
software development project. We'll be discussing the project scope, timelines, and
responsibilities. Let's get started with a quick round of introductions. I'll go first. I'm John, the
project manager. I'll be overseeing the project and ensuring we stay on track. Alice, would you
like to go next?
[00:00:20] software quality. John oversee the project, coordinate between teams, ensure timely delivery.
If there are no further questions, we'll conclude the meeting. Thank you all for your time and let's
make this project a success.
[00:03:50]
All: Thank you, John. Sara: I'll make sure to start testing as soon as we have the first build ready. This way, we can
catch any issues early and avoid delays.
[00:03:20]
John: Sounds good. Before we wrap up, let's quickly go over the responsibilities. Alice lead the
development team, ensure code quality and performance. Bob design the UI/UX, conduct use

# Helper Functions

In [14]:
def output_parser(output):
    """
    Print the text of an LLM response framed by newlines and return that text.
    """
    content = output.content
    framed = "\n" + content + "\n"
    print(framed)
    return content

## Prompt

In [15]:
# The f-string is evaluated right here, so `prompt` captures the values that
# `text_answer` and `query` have at this point; reassigning either name later
# does not change this string.
prompt = f"""You are a meeting facilitator.
        This user will ask you a questions about the conversation of the meeting.
        Use following piece of context to answer the question.
        If you don't know the answer, just say you don't know.
        Keep the answer complete and concise.
        Context: {text_answer}
        Question: {query}"""

print(prompt)

You are a meeting facilitator.
        This user will ask you a questions about the conversation of the meeting.
        Use following piece of context to answer the question.
        If you don't know the answer, just say you don't know.
        Keep the answer complete and concise.
        Context: [00:00:00] John: Good morning, everyone. Thank you for joining today's kickoff meeting for our new
software development project. We'll be discussing the project scope, timelines, and
responsibilities. Let's get started with a quick round of introductions. I'll go first. I'm John, the
project manager. I'll be overseeing the project and ensuring we stay on track. Alice, would you
like to go next?
[00:00:20] software quality. John oversee the project, coordinate between teams, ensure timely delivery.
If there are no further questions, we'll conclude the meeting. Thank you all for your time and let's
make this project a success.
[00:03:50]
All: Thank you, John. Sara: I'll make sure to start te

# LLM

In [16]:
def better_query_response(prompt):
    """
    Send ``prompt`` to the module-level ``LLM`` and return its response object.

    A progress message is printed once the call has returned.
    """
    llm_reply = LLM.invoke(prompt)
    print("Generating Better Response: Done!")
    return llm_reply

# Send the prepared prompt to the LLM and show only the text of the reply.
final_answer = better_query_response(prompt=prompt)
print(final_answer.content)

Generating Better Response: Done!
The kickoff meeting was attended by John, Alice, Bob, and Sara. 

- **John**, the project manager, facilitated the meeting, introduced the agenda, and discussed the project scope, timelines, and responsibilities. He also emphasized his role in overseeing the project and ensuring timely delivery.
  
- **Alice**, the lead developer, introduced herself and mentioned her responsibility for the overall architecture and development of the software. She also commented on the project timeline, expressing the need to adhere to the schedule and avoid scope creep.

- **Bob**, the UI/UX designer, introduced himself and described his role in handling the design aspects of the software, ensuring it is user-friendly and visually appealing.

- **Sara**, the QA analyst, introduced herself and outlined her role in testing the software to maintain quality standards and ensure it is free of bugs. She also committed to starting testing as soon as the first build is ready t

# Chatbot Response

In [46]:
def Chatbot(query, meeting_title=None, date=None):
    """
    Answer ``query`` from the meeting transcripts stored in Pinecone.

    Steps: resolve the namespace, embed the query, retrieve the matching
    transcript chunks, build a prompt from this call's question and context,
    and ask the LLM.

    Args:
        query: The user's question.
        meeting_title: Namespace/title to search. When None it is resolved
            from the query with ``resolve_namespace``.
        date: Currently unused; kept for interface compatibility.

    Returns:
        The text content of the LLM response.
    """
    print(query)
    # Honour an explicitly supplied title; only resolve one when none is given.
    if meeting_title is None:
        meeting_title = resolve_namespace(query, namespaces)

    query_embeddings = get_embeddings(text=query)

    answers = query_pinecone_index(
        query_embeddings=query_embeddings,
        meeting_title=meeting_title,
    )
    print(answers)

    text_answers = " ".join([doc['metadata']['text'] for doc in answers['matches']])
    print(text_answers)

    # Build the prompt from THIS call's question and retrieved context. Using
    # the module-level `prompt` would answer whichever question and context
    # were baked into it when that cell was executed.
    chat_prompt = f"""You are a meeting facilitator.
        This user will ask you a questions about the conversation of the meeting.
        Use following piece of context to answer the question.
        If you don't know the answer, just say you don't know.
        Keep the answer complete and concise.
        Context: {text_answers}
        Question: {query}"""

    final_answer = better_query_response(prompt=chat_prompt)
    return final_answer.content

# Run the end-to-end pipeline for the current value of `query`.
response = Chatbot(query)
print(response)

Who are present in kickoff meeting?
Generating Embeddings: Done!
[-0.032539449632167816, 0.040285613387823105, 0.05508384481072426, -0.011771950870752335, -0.03995244577527046, 0.002146506914868951, 0.0392305813729763, 0.016366899013519287, 0.002141301054507494, -0.013541908003389835, 0.036121007055044174, -0.05133569985628128, -0.011771950870752335, -0.029735280200839043, -0.020947962999343872, -0.021961351856589317, -0.04761531949043274, 0.005396634340286255, 0.006611310876905918, 0.012209233827888966, 0.03217851743102074, 0.047643084079027176, -0.013180974870920181, 0.009536946192383766, -0.02470999024808407, 0.03387212008237839, -0.05105805769562721, -0.008232036605477333, 0.05078041926026344, -0.0002533468068577349, 0.012813101522624493, -0.03145664930343628, -0.0007019094773568213, 0.012056532315909863, -0.024501759558916092, -0.0006134116556495428, -0.009828467853367329, 0.01979575678706169, -0.017116528004407883, 0.004081313032656908, 0.03276155889034271, -0.04647699370980263, 

## RAG Chain

In [17]:
class chat_templates:
    """
    Namespace for the raw prompt template strings used by the RAG chain.

    Every method is a ``staticmethod`` so it can be called on the class
    (``chat_templates.qa_template()``) or on an instance without raising a
    TypeError for an unexpected ``self`` argument.
    """

    @staticmethod
    def final_rag_template():
        """Return the final-answer template; placeholders: context, qa_pairs, question."""
        prompt = """
            You are a meeting facilitator.
            This user will ask you a questions about the conversation of the meeting.
            Use following piece of context to answer the question.
            If you don't know the answer, just say you don't know.
            Keep the answer complete and concise.
            Context: {context}
            Here are some background questions and answers that will help you answer the question: {qa_pairs}
            Question: {question}
        """

        return prompt

    @staticmethod
    def decomposition_template():
        """Return the template that asks for newline-separated subquestions; placeholder: question."""
        prompt = """
            Break the following user question into smaller, more specific questions.
            Provide these subquestions separated by newlines. 
            Do not rephrase if you see unknown terms.
            Question: {question}
            subquestions:
        """

        return prompt

    @staticmethod
    def qa_template():
        """Return the per-subquestion template; placeholders: context, subquestion."""
        prompt = """
            Answer the question in the following context:\n{context}\n\nQuestion: {subquestion}
        """

        return prompt
        

# Wrap each raw template string in a ChatPromptTemplate. The module-level names
# `decomposition_template` and `qa_template` are separate objects from the
# same-named functions defined on `chat_templates`.
prompt_template = ChatPromptTemplate.from_template(chat_templates.final_rag_template())
decomposition_template = ChatPromptTemplate.from_template(chat_templates.decomposition_template())
qa_template = ChatPromptTemplate.from_template(chat_templates.qa_template())

def decompose_question(question):
    """
    Decomposes a complex question into smaller questions.

    The LLM is asked (via ``decomposition_template``) to list subquestions
    one per line. Each line is stripped of surrounding whitespace and blank
    lines are dropped, so no empty or padded subquestion is passed on.

    Args:
        question: The user's original question.

    Returns:
        List of non-empty subquestion strings, in the order produced.
    """
    prompt = decomposition_template.format(question=question)
    response = LLM.invoke(prompt)
    stripped_lines = (line.strip() for line in response.content.splitlines())
    subquestions = [line for line in stripped_lines if line]

    return subquestions

# Retrieve the context for the decomposition pipeline. `resolved_namespace` is
# used rather than `namespace`, because `namespace` is None whenever the
# similarity ranking was ambiguous and the user had to choose.
query_embeddings = get_embeddings(text=query)
answers = query_pinecone_index(
    query_embeddings=query_embeddings,
    meeting_title=resolved_namespace,
)
text_answers = " ".join([doc['metadata']['text'] for doc in answers['matches']])

def generate_qa_pairs(subquestions, context):
    """
    Answer every subquestion against the same context.

    Returns a list of ``(subquestion, answer)`` tuples in input order, where
    ``answer`` is the object returned by ``LLM.invoke`` for that subquestion.
    """
    def _answer(subquestion):
        # One LLM call per subquestion, each using the shared context.
        rag_prompt = qa_template.format(context=context, subquestion=subquestion)
        return subquestion, LLM.invoke(rag_prompt)

    return [_answer(subquestion) for subquestion in subquestions]

def build_final_answer(question, context, qa_pairs):
    """
    Builds a final answer by integrating the context and QA pairs.

    Args:
        question: The user's original question.
        context: Retrieved transcript text.
        qa_pairs: Iterable of ``(subquestion, answer)`` pairs. ``answer`` may
            be a plain string or an object with a ``content`` attribute (as
            returned by ``LLM.invoke``).

    Returns:
        The LLM response object for the final prompt.
    """
    # Use the answer text only: formatting the response object itself would
    # put its full string form (metadata included) into the prompt.
    qa_pairs_str = "\n".join(
        f"Q: {q}\nA: {getattr(a, 'content', a)}" for q, a in qa_pairs
    )
    final_prompt = prompt_template.format(context=context, qa_pairs=qa_pairs_str, question=question)
    final_response = LLM.invoke(final_prompt)
    return final_response

def decomposition_query_process(question, text_answers):
    """
    Run the decomposition pipeline end to end.

    Splits the question into subquestions, answers each against
    ``text_answers``, prints the resulting pairs, then builds the final
    answer and returns what ``output_parser`` yields for it.
    """
    qa_pairs = generate_qa_pairs(decompose_question(question), text_answers)
    print(qa_pairs)

    return output_parser(build_final_answer(question, text_answers, qa_pairs))

# Run the decomposition pipeline on the demo question; the final answer text is
# printed by output_parser and also stored in `result`.
result = decomposition_query_process(question=query, text_answers=text_answers)

Generating Embeddings: Done!
Querying Pinecone Index: Done!
[('Who are the participants in a kickoff meeting?  ', AIMessage(content='The participants in the kickoff meeting are John, Alice, Bob, and Sara.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 16, 'prompt_tokens': 519, 'total_tokens': 535, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4-turbo-2024-04-09', 'system_fingerprint': 'fp_0f602ebbda', 'finish_reason': 'stop', 'logprobs': None}, id='run-7490c368-aced-4c03-b4dc-c905ab8596f9-0', usage_metadata={'input_tokens': 519, 'output_tokens': 16, 'total_tokens': 535})), ('What are the roles of each participant in the kickoff meeting?  ', AIMessage(content='In the kickoff meeting, the roles of each participant are as follows:\n\n- **John**: He is the project ma