# Import Libraries

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
import hashlib
from pinecone import Pinecone
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import json
import ast
from rapidfuzz import fuzz
from datetime import datetime
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

  from tqdm.autonotebook import tqdm


## Initialization

In [2]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

# Pinecone Initialization
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("echo-openai")

# OpenAI Initialization
client = OpenAI(api_key=OPENAI_API_KEY)
EMBEDDINGS = OpenAIEmbeddings(model='text-embedding-3-small', openai_api_key=OPENAI_API_KEY)
LLM = ChatOpenAI(temperature=0, model_name="gpt-4-turbo", openai_api_key=OPENAI_API_KEY)

  client = OpenAI(api_key=OPENAI_API_KEY)


# Query

In [3]:
query = "What was the QA perspective on the kickoff meeting?"

# Generate Embeddings

In [4]:
def get_query_embeddings(query):
    """
    This function returns a list of the embeddings for a given query
    """
    query_embeddings = EMBEDDINGS.embed_query(query)
    print("Generating Embeddings: Done!")
    return query_embeddings

query_embeddings = get_query_embeddings(query=query)
print(query_embeddings)

Generating Embeddings: Done!
[-0.01954549551010132, 0.03269077092409134, 0.02002827078104019, 0.008572706952691078, -0.04405667632818222, -0.014152207411825657, -0.011662467382848263, -0.011531428433954716, 0.02058001421391964, -0.0013897026656195521, 0.06582293659448624, -0.023518044501543045, -0.012365940026938915, -0.008876165375113487, 0.009503772482275963, -0.021517977118492126, -0.01743507757782936, -0.008917545899748802, -0.011262454092502594, -0.012441804632544518, 0.04783611744642258, 0.033683910965919495, -0.04411185160279274, -0.005169142037630081, -0.060084812343120575, -0.02918720431625843, -0.020786916837096214, 0.0005030344473198056, 0.006138140801340342, -0.03928409889340401, 0.011034859344363213, -0.0328562930226326, 0.001862132572568953, 0.009172727353870869, -0.008400286547839642, 0.016014339402318, 0.01015207078307867, -0.009138243272900581, 0.021352453157305717, 0.0018966165371239185, -0.002672505099326372, -0.04728437215089798, 0.009131346829235554, 0.006189866457

# Metadata Filtering

# Self-Query Retriever

In [None]:
metadata_field_info = [
    AttributeInfo(
        name="date",
        description="The date the meeting was recorded",
        type="date",
    ),
    AttributeInfo(
        name="title",
        description="The title of the meeting",
        type="string",
    )
]

document_content_description = "Meeting transcripts"
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm, Pinecone.from_documents, 
)

## Fuzzy Match

In [5]:
def fuzzy_match(title1, title2, threshold=80):
    """
    Perform a fuzzy match between two titles using RapidFuzz.
    Returns True if the similarity score is above the threshold.
    """
    similarity_score = fuzz.partial_ratio(title1.lower(), title2.lower())
    return similarity_score >= threshold

## Extract Metadata

In [6]:
def extract_metadata_from_query(query):
  prompt = f"""
  You are a helpful assistant. Extract the meeting title and the meeting date from the following query.
  If the meeting title or date is not explicitly mentioned, return 'unknown'.
  If the date is mentioned as word, it should be formatted as 'YYYY-MM-DD'

  Query: {query}

  Provide the meeting title and date as a Python dictionary in this format:
  {{"meeting_title": "title_here", "date": "date_here"}}
  """

  response = LLM.invoke(prompt)
  metadata_str = response.content.strip()
  metadata_dict = ast.literal_eval(metadata_str)
  return metadata_dict

metadata = extract_metadata_from_query(query)
print(metadata)

{'meeting_title': 'kickoff meeting', 'date': 'unknown'}


# Query Pinecone Index

In [7]:
def query_pinecone_index(query_embeddings, meeting_title, date, top_k=2, include_metadata=True):
    """
    Query a Pinecone index.
    """
    filter_conditions = {}
    if date.lower() != 'unknown':
      filter_conditions['date'] = date
    # if meeting_title.lower() != 'unknown':
    #   filter_conditions['title'] = meeting_title
    # if date.lower() != 'unknown':
    #   filter_conditions['date'] = date

    query_response = index.query(
        vector=query_embeddings,
        filter=filter_conditions,
        top_k=top_k,
        include_metadata=include_metadata,
        namespace="USJ-R") # Filter based on metadata
    print(query_response)

    filtered_matches = []
    for match in query_response['matches']:
      if 'metadata' in match and 'title' in match['metadata']:
        metadata_title = match['metadata']['title']
        if fuzzy_match(meeting_title, metadata_title):
          filtered_matches.append(match)

    if not filtered_matches:
      return query_response

    query_response['matches'] = filtered_matches

    print("Querying Pinecone Index: Done!")
    return query_response

answers = query_pinecone_index(query_embeddings=query_embeddings, meeting_title="kickoff meeting", date="unknown")
print(answers)

{'matches': [{'id': 'd265b793eff264ea2405a928724631fb86b50182c2d36fba3a07c41d98ff34db',
              'metadata': {'date': '2024-09-13',
                           'text': '[00:00:00] John: Good morning, everyone. '
                                   "Thank you for joining today's kickoff "
                                   'meeting for our new     software '
                                   "development project. We'll be discussing "
                                   'the project scope, timelines, and\n'
                                   "responsibilities. Let's get started with a "
                                   "quick round of introductions. I'll go "
                                   "first. I'm John, the     project manager. "
                                   "I'll be overseeing the project and "
                                   'ensuring we stay on track. Alice, would '
                                   'you     like to go next?\n'
                                 

# Combining Text from Multiple Document Matches

In [8]:
text_answer = " ".join([doc['metadata']['text'] for doc in answers['matches']])
print(text_answer)

[00:00:00] John: Good morning, everyone. Thank you for joining today's kickoff meeting for our new     software development project. We'll be discussing the project scope, timelines, and
responsibilities. Let's get started with a quick round of introductions. I'll go first. I'm John, the     project manager. I'll be overseeing the project and ensuring we stay on track. Alice, would you     like to go next?
[00:00:20]
Alice: Sure, thanks John. Hi, everyone. I'm Alice, the lead developer. I'll be responsible for the overall architecture and development of the software. Looking forward to working with all of you.
[00:00:35]
Bob: Hi, I'm Bob, the UI/UX designer. I'll be handling the design aspects of the software, making
sure it's user-friendly and visually appealing.
[00:00:45]
Sara: Hello, I'm Sara, the QA analyst. I'll be testing the software to ensure it meets our quality
standards and is free of bugs.
[00:00:55]
John: Great, thank you. Now that we've introduced ourselves, let's dive i

## Prompt

In [10]:
prompt = f"""You are a meeting facilitator.
        This user will ask you a questions about the conversation of the meeting.
        Use following piece of context to answer the question.
        If you don't know the answer, just say you don't know.
        Keep the answer within 2 sentences and concise.
        Context: {text_answer}
        Question: {query}"""

print(prompt)

You are a meeting facilitator.
        This user will ask you a questions about the conversation of the meeting.
        Use following piece of context to answer the question.
        If you don't know the answer, just say you don't know.
        Keep the answer within 2 sentences and concise.
        Context: [00:00:00] John: Good morning, everyone. Thank you for joining today's kickoff meeting for our new     software development project. We'll be discussing the project scope, timelines, and
responsibilities. Let's get started with a quick round of introductions. I'll go first. I'm John, the     project manager. I'll be overseeing the project and ensuring we stay on track. Alice, would you     like to go next?
[00:00:20]
Alice: Sure, thanks John. Hi, everyone. I'm Alice, the lead developer. I'll be responsible for the overall architecture and development of the software. Looking forward to working with all of you.
[00:00:35]
Bob: Hi, I'm Bob, the UI/UX designer. I'll be handling the 

# LLM

In [11]:
def better_query_response(prompt):
    """
    This function returns a better response using LLM
    """
    better_answer = LLM.invoke(prompt)
    print("Generating Better Response: Done!")
    return better_answer

final_answer = better_query_response(prompt=prompt)
print(final_answer)

Generating Better Response: Done!
content='Sara, the QA analyst, mentioned that she will be testing the software to ensure it meets quality standards and is free of bugs.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 27, 'prompt_tokens': 339, 'total_tokens': 366, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4-turbo-2024-04-09', 'system_fingerprint': 'fp_cfb5f9efa7', 'finish_reason': 'stop', 'logprobs': None} id='run-9086f9df-4253-4bb3-bcf3-c03517ca830d-0' usage_metadata={'input_tokens': 339, 'output_tokens': 27, 'total_tokens': 366}


# Chatbot Response

In [12]:
def Chatbot(query, meeting_title=None, date=None):
    print(query)
    metadata = extract_metadata_from_query(query)
    print(metadata)
    meeting_title = metadata.get('meeting_title', 'unknown')
    date = metadata.get('date', 'unknown')

    query_embeddings = get_query_embeddings(query=query)

    answers = query_pinecone_index(
        query_embeddings=query_embeddings,
        meeting_title=meeting_title,
        date=date
        )
    print(answers)

    text_answers = " ".join([doc['metadata']['text'] for doc in answers['matches']])
    print(text_answers)
    final_answer = better_query_response(prompt=prompt)
    return final_answer.content

response = Chatbot(query)
print(response)

What was the QA perspective on the kickoff meeting?
{'meeting_title': 'kickoff meeting', 'date': 'unknown'}
Generating Embeddings: Done!
{'matches': [{'id': 'd265b793eff264ea2405a928724631fb86b50182c2d36fba3a07c41d98ff34db',
              'metadata': {'date': '2024-09-13',
                           'text': '[00:00:00] John: Good morning, everyone. '
                                   "Thank you for joining today's kickoff "
                                   'meeting for our new     software '
                                   "development project. We'll be discussing "
                                   'the project scope, timelines, and\n'
                                   "responsibilities. Let's get started with a "
                                   "quick round of introductions. I'll go "
                                   "first. I'm John, the     project manager. "
                                   "I'll be overseeing the project and "
                                   'en