##### Quick Intro
The dataset is scraped from https://www.appliancerepair.net/dishwasher-repair-1.html and follow-on pages.

Update (Oct 23): the page is no longer available, but the chunking is already done, so we're OK.

The scraper code is in `scraper.py`.

The embeddings are generated using the [Sentence Transformers](https://www.sbert.net/) package (no fine-tuning).

The overall flow follows this OpenAI Cookbook example:
https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

The difference is that, instead of using OpenAI Curie to create the embeddings, I create them with SBERT and use cosine similarity to retrieve the relevant pieces of text.

You'll need an OpenAI API key.

In [None]:
# Install the third-party packages this notebook imports (IPython `%pip` magics).
%pip install pandas -q
%pip install openai -q
# NOTE(review): the key-entry cell further down imports the standard-library
# `getpass` module; `getpass4` is a separate PyPI package and looks
# unnecessary here -- confirm.
%pip install getpass4
# NOTE(review): presumably this also pulls in `transformers`, which the
# tokenizer cell imports -- confirm.
%pip install sentence-transformers

In [None]:
import pandas as pd
import openai
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer, util
# Show the installed `openai` package version.
# NOTE(review): later cells use the `OpenAI()` client object and
# `client.chat.completions.create`; presumably that needs a 1.x release -- confirm.
print(f'openai api version: {openai.__version__}')

In [None]:
def get_embedding(text: str, model: SentenceTransformer) -> list[float]:
    """
    Encode a single piece of text with the given SentenceTransformer model.

    The text is wrapped in a one-element list before being handed to
    ``model.encode``, and the result of that call is returned unchanged.

    NOTE(review): for a list input ``encode`` presumably returns one row per
    input (a batch of size 1) rather than a flat ``list[float]`` -- confirm.
    """
    return model.encode([text])

def get_doc_embedding(text: str, model: SentenceTransformer) -> list[float]:
    """
    Embed a document section.

    Delegates to ``get_embedding``; documents and queries currently share the
    same encoding path.
    """
    doc_vector = get_embedding(text, model)
    return doc_vector

def get_query_embedding(text: str, model: SentenceTransformer) -> list[float]:
    """
    Embed a user query.

    Delegates to ``get_embedding``; queries and documents currently share the
    same encoding path.
    """
    query_vector = get_embedding(text, model)
    return query_vector

def compute_doc_embeddings(df: pd.DataFrame, model: SentenceTransformer) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for the ``content`` of each row in the dataframe.

    Newlines in a row's content are replaced by spaces before it is embedded.

    Return a dictionary that maps the index label of each row to the embedding
    produced for that row by ``get_doc_embedding``.
    """
    embeddings_by_index = {}
    for row_label, row in df.iterrows():
        flattened_content = row.content.replace("\n", " ")
        embeddings_by_index[row_label] = get_doc_embedding(flattened_content, model)
    return embeddings_by_index

def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Load pre-computed document embeddings and their keys from a CSV file.

    fname is the path to a CSV with exactly these named columns:
        "title", "heading", "0", "1", ... up to the length of the embedding vectors.

    Return a dictionary that maps each row's (title, heading) pair to the list
    of values read from that row's numbered columns, in column-number order.
    """
    key_columns = ("title", "heading")
    table = pd.read_csv(fname, header=0)

    # Every non-key column name is a dimension number; the largest one is the
    # last dimension of the embedding vectors.
    last_dim = max(int(name) for name in table.columns if name not in key_columns)
    dim_columns = [str(dim) for dim in range(last_dim + 1)]

    embeddings = {}
    for _, row in table.iterrows():
        embeddings[(row.title, row.heading)] = [row[column] for column in dim_columns]
    return embeddings


In [None]:
# The scraper generated the csv file below.
# Columns referenced by this notebook: "title", "heading", "content", "tokens".
df = pd.read_csv('dish-washer-data.csv')
df["tokens"] = pd.to_numeric(df["tokens"])  # make the token counts numeric; construct_prompt adds them up
# (title, heading) becomes the row label; the same labels key the embedding dictionary.
df = df.set_index(["title", "heading"])
print(f"{len(df)} rows in the data.")
# Show 10 randomly chosen rows.
df.sample(10)
# TODO get some stats on max/min content length


In [None]:

# Name of the pre-trained SBERT model; the same model object is used for both
# document and query embeddings in the cells below.
embeddings_model_path = 'msmarco-distilbert-base-v4'
# NOTE(review): presumably downloads the model on first use -- confirm.
embeddings_model = SentenceTransformer(embeddings_model_path)
# NOTE(review): inputs longer than max_seq_length are presumably truncated by
# the model -- confirm against the section lengths in the dataset.
print(f'Default sequence length:{embeddings_model.max_seq_length}')


In [None]:
# This could take a bit of time
document_embeddings = compute_doc_embeddings(df, embeddings_model)

# An example embedding: take the first (index, embedding) pair directly,
# without copying every item into a temporary list first.
example_entry = next(iter(document_embeddings.items()))
# print(example_entry)


In [None]:
# Number of entries in the embedding dictionary (keyed by dataframe row label).
print(f'Total documents, {len(document_embeddings)}')

In [None]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Return the cosine similarity between two embeddings as a plain float.

    ``util.cos_sim`` returns a tensor of pairwise scores. For a single pair of
    embeddings that tensor holds one element, which is converted here so the
    result matches the annotated return type and compares like an ordinary
    number when similarities are sorted.

    Raises an error if the inputs are batches that produce more than one score.
    """
    return float(util.cos_sim(x, y))

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array], model: SentenceTransformer) -> list[(float, (str, str))]:
    """
    Rank every pre-calculated document embedding against the supplied query.

    The query is embedded with ``get_query_embedding`` and scored against each
    entry of ``contexts`` with ``vector_similarity``.

    Return a list of (similarity, document index) pairs, sorted in descending order.
    """
    query_vector = get_query_embedding(query, model)

    scored_sections = []
    for section_key, section_vector in contexts.items():
        score = vector_similarity(query_vector, section_vector)
        scored_sections.append((score, section_key))

    scored_sections.sort(reverse=True)
    return scored_sections



In [None]:
# ask a question and get top_k relevant text
top_k = 5
by_semantic_relevance = order_document_sections_by_query_similarity("Why is my dishwasher leaking?", document_embeddings, embeddings_model)[:top_k]
# Walk the results that were actually returned: the slice can hold fewer than
# top_k entries, and indexing it with range(top_k) would then raise IndexError.
for i, (_similarity, index) in enumerate(by_semantic_relevance):
    print(f"index: {index} {[i]}:  {df.loc[index]['content']}")


In [None]:
# ask another question
top_k = 5
by_semantic_relevance = order_document_sections_by_query_similarity("Why is my dish washer not cleaning well?", document_embeddings, embeddings_model)[:top_k]
# Each entry is a (similarity, index) pair. Iterate over what was returned so
# that fewer than top_k results does not raise IndexError.
for i, (_similarity, index) in enumerate(by_semantic_relevance):
    print(f"{[i]} {df.loc[index]['content']}")

In [None]:
# Prompt-budget settings read by construct_prompt.
from transformers import GPT2TokenizerFast
# Upper bound on the running total of (section tokens + separator tokens)
# that construct_prompt will accept before it stops adding context.
MAX_SECTION_LEN = 500
# Prefix placed in front of every context section in the prompt.
SEPARATOR = "\n* "

# GPT-2 tokenizer, used here only to count the tokens in SEPARATOR.
# NOTE(review): the "tokens" column of the dataset was presumably counted with
# the same tokenizer by the scraper -- confirm.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))

# Bare expression as the last statement: a notebook displays it as the cell output.
f"Context separator contains {separator_len} tokens"

In [None]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame, embeddings_model: SentenceTransformer) -> str:
    """
    Build the question-answering prompt for a question.

    Document sections are ranked by similarity to the question and added, most
    relevant first, until the running token count (each section's ``tokens``
    value plus ``separator_len``) would exceed ``MAX_SECTION_LEN``.

    Return the instruction header, followed by the chosen sections (each
    prefixed with ``SEPARATOR`` and with newlines replaced by spaces), followed
    by the question in "Q: ... A:" form.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings, embeddings_model)

    chosen_sections = []
    chosen_sections_len = 0

    for _, section_index in ranked_sections:
        document_section = df.loc[section_index]

        # Add contexts until we run out of space. The check happens after the
        # increment, so the section that overflows the budget is not included.
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        chosen_sections.append(SEPARATOR + document_section['content'].replace("\n", " "))

    # The context
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [None]:
# Try out the prompt
prompt = construct_prompt(
    "Why is my dishwasher leaking?",
    document_embeddings,
    df,
    embeddings_model
)

print("===\n", prompt)

In [None]:
# NOTE(review): not referenced by the visible code -- answer_query_from_context
# passes the literal "gpt-4o-mini" to the API instead. Confirm which model is intended.
COMPLETIONS_MODEL = "gpt-4o"

### Answering the question from a context

In [None]:
import os
import getpass
from openai import OpenAI
# Prompt for the key without echoing it to the screen.
OPENAI_API_KEY = getpass.getpass("Enter OpenAI API key")
# Export the key through the environment.
# NOTE(review): OpenAI() is called with no arguments below, so it presumably
# picks the key up from OPENAI_API_KEY -- confirm.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
# Do not print the key for debugging: notebook output is saved with the file.
client = OpenAI()

In [None]:

def answer_query_from_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[(str, str), np.array],
    show_prompt: bool = False
) -> str:
    """
    Answer a query using the most relevant document sections as context.

    The prompt is built with ``construct_prompt`` (using the module-level
    ``embeddings_model``) and sent as the user message of a chat completion
    through the module-level ``client``. The answer text is printed and returned.

    If ``show_prompt`` is true, the full prompt is printed before the request
    is made.
    """
    prompt = construct_prompt(
        query,
        document_embeddings,
        df,
        embeddings_model
    )

    if show_prompt:
        print(prompt)

    # NOTE(review): the model name is a literal here, while the notebook also
    # defines COMPLETIONS_MODEL = "gpt-4o" -- confirm which one is intended.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    answer = completion.choices[0].message.content
    print(answer)
    return answer


In [None]:
# NOTE(review): `show` is not used anywhere in the visible code; this import
# looks like an editor auto-import -- confirm before removing it.
from scipy.__config__ import show


response = answer_query_from_context("What are the ways to prevent water leakage?", df, document_embeddings, show_prompt=True)

# Keep only the text after the last "A:" marker (the whole response if there is none).
answer = response.split('A:')[-1].strip()

print(f'====Answer\n {answer}')

In [None]:
response = answer_query_from_context("Explain the various cycles?", df, document_embeddings, show_prompt=True)

# Keep only the text after the last "A:" marker (the whole response if there is none).
answer = response.split('A:')[-1].strip()

print(f'====Answer\n {answer}')

In [None]:

response = answer_query_from_context("Why is my dishwasher leaking?", df, document_embeddings, show_prompt=True)

# Keep only the text after the last "A:" marker (the whole response if there is none).
answer = response.split('A:')[-1].strip()

print(f'====Answer\n {answer}')

In [None]:
response = answer_query_from_context("What's the biggest thing to worry about?", df, document_embeddings, show_prompt=True)

# Keep only the text after the last "A:" marker (the whole response if there is none).
answer = response.split('A:')[-1].strip()

print(f'====Answer\n {answer}')

In [None]:
response = answer_query_from_context("What are the sources of water leaks?", df, document_embeddings, show_prompt=True)

# Keep only the text after the last "A:" marker (the whole response if there is none).
answer = response.split('A:')[-1].strip()

print(f'====Answer\n {answer}')

In [None]:
response = answer_query_from_context("Tell me a bit about the warranty", df, document_embeddings, show_prompt=True)

# Keep only the text after the last "A:" marker (the whole response if there is none).
answer = response.split('A:')[-1].strip()

print(f'====Answer\n {answer}')