In [1]:
import pandas as pd
import openai
import os


import os
from dotenv import load_dotenv
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score
import plotly.express as px

import chromadb
from langchain_core.output_parsers import JsonOutputParser
from chromadb.utils import embedding_functions
tqdm.pandas()

In [4]:
#pip install chromadb==0.6.3

In [None]:
pip install langchain

In [6]:
os.chdir("..")

In [12]:
pwd

'c:\\git\\wwsi-2026-genai\\notebooks'

In [8]:
##Attention: You need to be in WSI-GenAI dir

In [13]:
load_dotenv()


True

## Setup keys and parsers for later

In [2]:
openai.api_key = OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

client = openai.Client()

output_parser = JsonOutputParser()

## Load data of 100 FAQ items from a travel company

In [7]:
df = pd.read_json("../data/travel_company_faq.json")
i=10
questions = df.loc[i].question


# Classify FAQ items with LLM

In [8]:
##TODO: Prepare system prompt to split FAQ items into 4 categories: [air-travel, hotels-and-booking, food, insurance, extra-activities]
# System prompt with instructions and the JSON format we expect
system_prompt = f"""You are a helpful assistant.
Your task: Classify the given question into one of the following categories:
[air-travel, hotels-and-booking, food, insurance, extra-activities].

Return the result ONLY in valid JSON, in the format:
{{"category":"<category>"}}

"""

In [9]:


# --- Step 2: Define a function to call OpenAI’s chat completion directly ---
def classify_question(question: str, system_prompt: str) -> str:
    """
    Calls the OpenAI ChatCompletion endpoint to classify the question.
    Returns the predicted category.
    """

    # Make a direct OpenAI chat call (using the fictional "gpt-4o-mini" as specified)

    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Updated to match available models
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question}
        ],
        temperature=0,
    )


    # Extract the content of the model’s answer
    content = response.choices[0].message.content

    ##TODO: Use the LangChain parser to convert the JSON string into a Python dict and return only category as text
    parsed_response = output_parser.parse(content)

    return parsed_response["category"]



In [10]:
output = classify_question(questions, system_prompt)
print(output)

air-travel


## Get predicted category and evaluate metrics

In [26]:
df["predicted_category"] = df["question"].apply(
    lambda q: classify_question(q, system_prompt)
)

In [27]:
##TODO: calculate accuracy score using sklean
accuracy = accuracy_score(df["category"], df["predicted_category"])
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.96


In [28]:
def plot_confusion_matrix(df, pred_col = "predicted_category"):
    categories = sorted(df["category"].unique())
    cm = confusion_matrix(
        df["category"],
        df[pred_col],
        labels=categories
    )

    # 4. Plot the confusion matrix with Plotly
    fig = px.imshow(
        cm,
        x=categories,  # predicted labels across the top
        y=categories,  # actual labels along the side
        text_auto=True,  # show values in each cell
        labels=dict(x="Predicted", y="Actual", color="Count")
    )
    fig.update_layout(
        title="Confusion Matrix"
    )
    fig.show()

In [29]:
plot_confusion_matrix(df)

  0%|          | 0/100 [23:48<?, ?it/s]
  0%|          | 0/100 [04:10<?, ?it/s]
  0%|          | 0/100 [02:45<?, ?it/s]


## Few shot learning

In [16]:

def add_few_shot_examples(system_prompt: str, df: pd.DataFrame, n_per_category: int = 2) -> str:
    """
    Takes a base system prompt and appends n few-shot examples per category,
    drawn from the DataFrame (which must have columns: 'question' and 'category').

    :param system_prompt: The base system prompt string.
    :param df: DataFrame with columns ['question', 'answer', 'category'].
    :param n_per_category: How many examples per category to append.
    :return: A new system prompt including few-shot examples.
    """

    # Ensure categories are as expected (or adapt to your exact categories)
    # If your categories are known, you could explicitly list them:
    # categories = ["air-travel", "hotels-and-booking", "food", "insurance", "extra-activities"]
    categories = df["category"].unique()

    few_shot_examples_section = "\n\nHere are some examples:\n"

    ##TODO: Add n examples from df per category to system_prompt as few shot learning. 
    ##Iterate through each category and n rows and keep appending example questions to few_shot_examples_section
    for cat in categories:
        # Subset data for this category
        cat_df = df[df["category"] == cat]

        # If not enough examples in that category, sample up to what's available
        n_samples = min(n_per_category, len(cat_df))

        # Randomly sample n_per_category questions for that category
        # (If your dataset is small, you might avoid random and just take .head(n_samples))
        sampled_rows = cat_df.sample(n=n_samples, random_state=42)

        # Build the example text
        for _, row in sampled_rows.iterrows():
            # Feel free to refine or format these examples however you prefer
            example_text = f"""
            Q: {row['question']}
            Category: {row['category']}
                        """
            few_shot_examples_section += example_text

    # Append the few-shot examples to the base system prompt
    new_system_prompt = system_prompt + few_shot_examples_section
    return new_system_prompt

In [17]:
system_prompt_with_few_shot =  add_few_shot_examples(system_prompt, df, n_per_category = 2)

In [18]:
print(system_prompt_with_few_shot)

You are a helpful assistant.
Your task: Classify the given question into one of the following categories:
[air-travel, hotels-and-booking, food, insurance, extra-activities].

Return the result ONLY in valid JSON, in the format:
{"category":"<category>"}



Here are some examples:

            Q: What is the maximum baggage allowance for flights to Greece?
            Category: air-travel
                        
            Q: How do I change my flight date once I’ve booked?
            Category: air-travel
                        
            Q: How can I modify my hotel reservation in Italy after it's confirmed?
            Category: hotels-and-booking
                        
            Q: Can I book a hotel room without a credit card?
            Category: hotels-and-booking
                        
            Q: Do you cater to vegetarian or vegan diets at your partner hotels in Italy?
            Category: food
                        
            Q: What types of restaurants 

In [23]:
df["predicted_category_few_shot"] = df["question"].apply(
    lambda q: classify_question(q, system_prompt_with_few_shot )
)

  0%|          | 0/100 [02:07<?, ?it/s]


In [24]:
accuracy = accuracy_score(df["category"], df["predicted_category_few_shot"])
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.97


In [30]:
plot_confusion_matrix(df, pred_col ="predicted_category_few_shot")

# Introduction to vector search

In [31]:
chroma_db_path = "chroma_db"
chroma_client = chromadb.PersistentClient(path=chroma_db_path)

SELECTED_COLLECTION = "travel-company-faq"

In [32]:
embedding_model = "text-embedding-ada-002"
openai_ef = embedding_functions.OpenAIEmbeddingFunction(model_name=embedding_model, api_key = OPENAI_API_KEY)


collection = chroma_client.get_or_create_collection(name=SELECTED_COLLECTION , embedding_function=openai_ef)

In [33]:
def ingest_faq_data(df: pd.DataFrame, collection):
    """
    
    Ingest combined question and answer as vectorized documents. Store question, answer and category as metadata. 
    """
    all_ids = []
    all_documents = []
    all_metadatas = []

    for i, row in df.iterrows():
        # Combine Q + A as text
        doc_text = f"Question: {row['question']}\nAnswer: {row['answer']}"

        doc_id = f"faq_{i}"
        meta = {
            "question": row["question"],
            "answer": row["answer"],
            "category": row["category"],
        }

        all_ids.append(doc_id)
        all_documents.append(doc_text)
        all_metadatas.append(meta)

    collection.add(documents=all_documents, metadatas=all_metadatas, ids=all_ids)
    print(f"Ingested {len(df)} records into {SELECTED_COLLECTION}.")


In [34]:
ingest_faq_data(df, collection)

Ingested 100 records into travel-company-faq.


In [35]:
sample_items = collection.get(include=["documents", "embeddings", "metadatas"], limit =10)


In [36]:
sample_items["documents"][0]

'Question: What is the maximum baggage allowance for flights to Greece?\nAnswer: Our standard flights to Greece allow one checked bag up to 23kg plus a carry-on. Overweight fees will apply if you exceed these limits, so we recommend checking with your airline for the most up-to-date details. Additional baggage can be purchased if needed.'

In [37]:
sample_items["embeddings"][0]

array([ 3.16370204e-02,  2.02722661e-02,  1.47562651e-02, ...,
        1.02865114e-03, -2.94457386e-05, -3.51949073e-02], shape=(1536,))

In [39]:
def retrieve_similar_qas(question: str, collection ,n: int = 3,):
    """
    Query the Chroma collection for the n most similar FAQs
    to the given user question. Print them out.
    """

    ##TODO: Investidate ChromaDB documentation and evaluate how to extract n_results most similar to query from out selection
    ##Then print those texts and distances

    results = collection.query(query_texts=[question], n_results=n)

    # 'results' is a dictionary with keys: 'ids', 'embeddings', 'documents', 'metadatas', 'distances'
    # Each key returns a list (of length equal to number of queries); here it's 1 for the single query
    # So we access results["metadatas"][0] to get the list of top-n metadata items


    print(f"\nTop {n} similar questions & answers to:\n\"{question}\"\n")

    for i in range(n):
        # Retrieve metadata for the ith result
        meta = results["metadatas"][0][i]
        dist = results["distances"][0][i]  # similarity distance

        # Print out relevant fields
        print(f"--- Result #{i+1} ---")
        print(f"Question: {meta['question']}")
        print(f"Answer:   {meta['answer']}")
        print(f"Category: {meta['category']}")
        print(f"Distance: {dist:.4f}\n")


In [40]:
question = "What is the air travel lost baggage policy??"
retrieve_similar_qas(question, collection)


Top 3 similar questions & answers to:
"What is the air travel lost baggage policy??"

--- Result #1 ---
Question: How do I file a claim if I lose my luggage?
Answer:   First, report the loss to the airline and obtain a Property Irregularity Report (PIR). Then, contact our insurance provider and submit the required documentation, including the PIR and proof of ownership. They will guide you through the claim process.
Category: insurance
Distance: 0.1511

--- Result #2 ---
Question: Do airlines charge for checked baggage on trips to Italy?
Answer:   Many airlines include at least one free checked bag, but some budget carriers may charge extra. The cost will depend on your specific flight and fare type. Always review baggage details during the booking process to avoid any surprises at the airport.
Category: air-travel
Distance: 0.1728

--- Result #3 ---
Question: What is the maximum baggage allowance for flights to Greece?
Answer:   Our standard flights to Greece allow one checked bag up

## Basic RAG

In [41]:
results = collection.query(query_texts=[question], n_results=3)
metadatas = results["metadatas"][0]

In [42]:
results["documents"][0]

['Question: How do I file a claim if I lose my luggage?\nAnswer: First, report the loss to the airline and obtain a Property Irregularity Report (PIR). Then, contact our insurance provider and submit the required documentation, including the PIR and proof of ownership. They will guide you through the claim process.',
 'Question: Do airlines charge for checked baggage on trips to Italy?\nAnswer: Many airlines include at least one free checked bag, but some budget carriers may charge extra. The cost will depend on your specific flight and fare type. Always review baggage details during the booking process to avoid any surprises at the airport.',
 'Question: What is the maximum baggage allowance for flights to Greece?\nAnswer: Our standard flights to Greece allow one checked bag up to 23kg plus a carry-on. Overweight fees will apply if you exceed these limits, so we recommend checking with your airline for the most up-to-date details. Additional baggage can be purchased if needed.']

In [49]:
query = "What is the lost luggage policy?"
n_results = 3

results = collection.query(query_texts=[query], n_results=n_results)



In [50]:
def format_context(documents):
     ##TODO convert listed documents to context that can be fed to LLM -e.g. text. Recommended splitting docs with html tags
    context = ""
    for i, meta in enumerate(documents):
        context += f"<Relevant Document #{i+1}>\n{documents[i]}\n</Relevant Document #{i+1}>\n"
    return context

In [51]:

def basic_rag_pipeline(query: str, n: int = 5) -> str:
    """
    A minimal RAG-like function.
    1) Retrieves the top-n similar Q&As from Chroma.
    2) Builds a prompt including the retrieved context.
    3) Sends the augmented query to the LLM.
    4) Returns the final answer.
    """

    # 1. Retrieve top-n similar Q&As
    results = collection.query(query_texts=[query], n_results=n)
    documents = results["documents"][0]
   
    context = format_context(documents)

    ##TODO: Create system prompt with context
    # 2. Create the system prompt that instructs the model to use the context
    system_prompt = f"""You are a helpful assistant. 
    Use the following context to answer the user's question. 
    If the context does not provide enough information, say so.

    Context:
    {context}
    """

    # 3. Now make the final call to OpenAI with the user query

    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Updated to match available models
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question}
        ],
        temperature=0,
    )

    # 4. Extract and return the answer text
    answer = response.choices[0].message.content
    return answer, context


In [52]:
answer, context = basic_rag_pipeline(question)

In [53]:
print(context)

<Relevant Document #1>
Question: How do I file a claim if I lose my luggage?
Answer: First, report the loss to the airline and obtain a Property Irregularity Report (PIR). Then, contact our insurance provider and submit the required documentation, including the PIR and proof of ownership. They will guide you through the claim process.
</Relevant Document #1>
<Relevant Document #2>
Question: Do airlines charge for checked baggage on trips to Italy?
Answer: Many airlines include at least one free checked bag, but some budget carriers may charge extra. The cost will depend on your specific flight and fare type. Always review baggage details during the booking process to avoid any surprises at the airport.
</Relevant Document #2>
<Relevant Document #3>
Question: What is the maximum baggage allowance for flights to Greece?
Answer: Our standard flights to Greece allow one checked bag up to 23kg plus a carry-on. Overweight fees will apply if you exceed these limits, so we recommend checking w

In [54]:
print(answer)

The air travel lost baggage policy typically involves the following steps: 

1. Report the loss to the airline immediately and obtain a Property Irregularity Report (PIR).
2. Contact your insurance provider (if you have travel insurance) and submit the required documentation, which includes the PIR and proof of ownership of the lost items.
3. The insurance provider will guide you through the claim process.

It's important to check with your specific airline for their detailed lost baggage policy, as procedures may vary.


# Rerank answers

### How do rerankers work??

https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1

In [None]:
from sentence_transformers import CrossEncoder

# Load the model, here we use our base sized model
model = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")

# Example query and documents
query = "Who wrote 'To Kill a Mockingbird'?"
documents = [
    "'To Kill a Mockingbird' is a novel by Harper Lee published in 1960. It was immediately successful, winning the Pulitzer Prize, and has become a classic of modern American literature.",
    "The novel 'Moby-Dick' was written by Herman Melville and first published in 1851. It is considered a masterpiece of American literature and deals with complex themes of obsession, revenge, and the conflict between good and evil.",
    "Harper Lee, an American novelist widely known for her novel 'To Kill a Mockingbird', was born in 1926 in Monroeville, Alabama. She received the Pulitzer Prize for Fiction in 1961.",
    "Jane Austen was an English novelist known primarily for her six major novels, which interpret, critique and comment upon the British landed gentry at the end of the 18th century.",
    "The 'Harry Potter' series, which consists of seven fantasy novels written by British author J.K. Rowling, is among the most popular and critically acclaimed books of the modern era.",
    "'The Great Gatsby', a novel written by American author F. Scott Fitzgerald, was published in 1925. The story is set in the Jazz Age and follows the life of millionaire Jay Gatsby and his pursuit of Daisy Buchanan."
]

results = model.rank(query, documents, return_documents=True, top_k=3)

In [82]:
query = "What is the lost luggage policy?"

In [83]:
results = collection.query(query_texts=[query], n_results=10)
documents = results["documents"][0]

In [84]:
##TODO: Get scores for results of initial RAG
results_with_scores = model.rank(query, documents, return_documents=True, top_k=3)

In [85]:
results_with_scores

[{'corpus_id': 0,
  'score': np.float32(0.59261125),
  'text': 'Question: How do I file a claim if I lose my luggage?\nAnswer: First, report the loss to the airline and obtain a Property Irregularity Report (PIR). Then, contact our insurance provider and submit the required documentation, including the PIR and proof of ownership. They will guide you through the claim process.'},
 {'corpus_id': 2,
  'score': np.float32(0.17916648),
  'text': 'Question: Does the insurance cover stolen personal belongings?\nAnswer: Most plans include coverage for theft or loss of personal items up to a certain limit. You will need a police report and proof of ownership for claims. High-value items like electronics may require additional coverage or have specific sub-limits.'},
 {'corpus_id': 3,
  'score': np.float32(0.07032193),
  'text': 'Question: Does travel insurance cover delayed flights or missed connections?\nAnswer: Most comprehensive plans offer coverage for trip delays, missed connections, or ad

In [60]:
def rerank_and_limit_context(query, documents, n_items=3, min_score_threshold = 0.5,):
    ##TODO: rerank documents and return only these with score above threshold
    documents_reranked_with_scores = model.rank(query, documents, return_documents=True, top_k=n_items)

    documents_reranked = [item["text"] for item in documents_reranked_with_scores if item["score"]>=min_score_threshold]

    return documents_reranked


In [86]:
rerank_and_limit_context(query, documents, n_items=3, min_score_threshold = 0.005,)

['Question: How do I file a claim if I lose my luggage?\nAnswer: First, report the loss to the airline and obtain a Property Irregularity Report (PIR). Then, contact our insurance provider and submit the required documentation, including the PIR and proof of ownership. They will guide you through the claim process.',
 'Question: Does the insurance cover stolen personal belongings?\nAnswer: Most plans include coverage for theft or loss of personal items up to a certain limit. You will need a police report and proof of ownership for claims. High-value items like electronics may require additional coverage or have specific sub-limits.',
 'Question: Does travel insurance cover delayed flights or missed connections?\nAnswer: Most comprehensive plans offer coverage for trip delays, missed connections, or additional transportation costs due to unforeseen events. You typically need to provide receipts and proof of the delay. Check your policy for the exact criteria and benefit limits.']

In [None]:

def rag_pipeline_with_reranking(query: str, n: int = 5) -> str:
    """
    A minimal RAG-like function.
    1) Retrieves the top-n similar Q&As from Chroma.
    2) Builds a prompt including the retrieved context.
    3) Sends the augmented query to the LLM.
    4) Returns the final answer.
    """

    # 1. Retrieve top-n similar Q&As
    results = collection.query(query_texts=[query], n_results=n*2)
    documents = results["documents"][0]

    documents = rerank_and_limit_context(query, documents, n_items=n, min_score_threshold = 0.5,)
    
    if documents:
        context = format_context(documents)
    else:
        context = "No relevant documents found for context"


    # 2. Create the system prompt that instructs the model to use the context
    system_prompt = f"""You are a helpful assistant. 
    Use the following context to answer the user's question. 
    If the context does not provide enough information, say so.

    Context:
    {context}
    """

    # 3. Now make the final call to OpenAI with the user query

    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Updated to match available models
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question}
        ],
        temperature=0,
    )

    # 4. Extract and return the answer text
    answer = response.choices[0].message.content
    return answer, context

In [87]:
rag_pipeline_with_reranking(query)

('The context provided does not include specific details about the air travel lost baggage policy. However, it does mention that if you lose your luggage, you should report the loss to the airline and obtain a Property Irregularity Report (PIR), and then contact your insurance provider to file a claim. For more detailed information about the lost baggage policy, you may need to check with the specific airline or their official website.',
 '<Relevant Document #1>\nQuestion: How do I file a claim if I lose my luggage?\nAnswer: First, report the loss to the airline and obtain a Property Irregularity Report (PIR). Then, contact our insurance provider and submit the required documentation, including the PIR and proof of ownership. They will guide you through the claim process.\n</Relevant Document #1>\n')