In [1]:
# Define shared function calls

import numpy as np
import os
from typing import List
from scipy import spatial
from openai import OpenAI
print(os.getenv("OPENAI_KEY"))
client = OpenAI(
    api_key=os.getenv("OPENAI_KEY")
)

from IPython.display import display

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def get_embedding(text, model="text-embedding-ada-002"): # model = "deployment_name"
    return client.embeddings.create(input = [text], model=model).data[0].embedding

def search_docs(df, user_query, top_n=4, to_print=True):
    embedding = get_embedding(
        user_query,
        model="text-embedding-ada-002" # model should be set to the deployment name you chose when you deployed the text-embedding-ada-002 (Version 2) model
    )
    df["similarities"] = df.ada_v2.apply(lambda x: cosine_similarity(x, embedding))

    res = (
        df.sort_values("similarities", ascending=False)
        .head(top_n)
    )
    if to_print:
        display(res)
    return res

def query(_pipeline, question):
    params = {}
    results = _pipeline.run(question, params=params)
    return results

# taken from old release: https://github.com/openai/openai-python/blob/release-v0.28.1/openai/embeddings_utils.py

def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[List]:
    """Return the distances between a query embedding and a list of embeddings."""
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    distances = [
        distance_metrics[distance_metric](query_embedding, embedding)
        for embedding in embeddings
    ]
    return distances


def indices_of_nearest_neighbors_from_distances(distances) -> np.ndarray:
    """Return a list of indices of nearest neighbors from a list of distances."""
    return np.argsort(distances)


None


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [6]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191


In [14]:
# load dataset, combine Summary and Content
import pandas as pd

input_datapath = "ingest/Reviews.csv"  # to save space, we provide a pre-filtered dataset
df = pd.read_csv(input_datapath, index_col=0)
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]

df = df.dropna()

df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)
df.columns

Index(['Time', 'ProductId', 'UserId', 'Score', 'Summary', 'Text', 'combined'], dtype='object')

In [26]:
# subsample to 1k most recent reviews and remove samples that are to long
import tiktoken

top_n = 1000
df = df.sort_values("Time").tail(top_n * 2)  # first cut to first 2k entries, assuming less than half will be filtered out
df.drop("Time", axis=1, inplace=True)

encoding = tiktoken.get_encoding(embedding_encoding)

# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)
len(df)

1000

In [27]:
#create embeddings
from openai import OpenAI
import os
client = OpenAI(
    api_key=os.getenv("OPENAI_KEY")
)

from embeddings import get_embedding
embedding_model = "text-embedding-ada-002"
# This may take a few minutes
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, embedding_model))
df.to_csv("processed/fine_food_reviews_with_embeddings_1k.csv")


In [39]:
# Question and distance embeddings to provide context

embedding_model = "text-embedding-ada-002"

def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """
    # Get the embeddings for the question
    q_embeddings = client.embeddings.create(input=question, model=embedding_model).data[0].embedding
    
    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embedding'].values, distance_metric='cosine')
    
    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():

        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["Text"])

    # Return the context
    return "\n\n###\n\n".join(returns)

In [51]:
# Get context, create answer
def answer_question(
    df,
    model="gpt-3.5-turbo",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )
    
    # If debug, print the raw model response
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a chat completion using the question and context
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\n"},
                {"role": "user", f"content": "Context: {context}\n\n---\n\nQuestion: {question}\nAnswer:"}
            ],
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
        )
        return response.choices[0].message.strip()
    except Exception as e:
        print(e)
        return ""

In [52]:
# print(df.columns)

answer_question(df, question="What is the best price for coffee?", debug=True)

# answer_question(df, question="What is the best coffee?")

# answer_question(df, question="Whoo sold the most coffee?")

Context:
Great coffee at a good price. I'm a subscription buyer and I buy this month after month. What more can I say?

###

Great coffee at a good price. I'm a subscription buyer and I buy this month after month. What more can I say?

###

Great coffee at a good price. I'm a subscription buyer and I buy this month after month. What more can I say?

###

Great coffee at a good price. I'm a subscription buyer and I buy this month after month. What more can I say?

###

I was very happy to find this deal.  Good price and excellent coffee. Gets my morning off to a good start.

###

San Francisco Bay Coffee Company makes the best coffee for the Kuerig by far.  It's also half the price.  It's always nice to have someone break a monopoly.

###

This is the best coffee ever! Wish I could order a box of 100 at a time as we go thru a box of 80 in about a month and a half. Buying it online is soooo much cheaper than buying at the grocery store.

###

In my opinion this is the best coffee ever!  

''