In [None]:
# IMPORTANT: DO NOT EXECUTE THIS CELL IF YOU'RE USING UDACITY WORKSPACE.

# Make sure we are using the required version of the OpenAI library, and install tiktoken if it is not already installed.
# Restart the kernel/session after installation completes
!pip install -q openai==0.27.7
!pip install -q tiktoken

## Dataset Choice

I have chosen the ... dataset. (Explain your choice)

In [None]:
# ===============================
# DO NOT MODIFY THIS CELL
# ===============================
import getpass
import openai
import pandas as pd
import numpy as np
import tiktoken
from openai.embeddings_utils import distances_from_embeddings

In [None]:
# ===============================
# API Key Configuration (DO NOT MODIFY)
# ===============================

# Point the OpenAI client at this base URL instead of the library default.
openai.api_base = "https://openai.vocareum.com/v1"

# This function is complete and should not be modified.
def get_openai_api_key():
    """Prompt for the OpenAI API key with hidden input and return it.

    The prompt is repeated until a non-empty value (after stripping
    whitespace) is entered. A confirmation line showing only the last four
    characters of the key is printed.

    Returns:
        str: The entered key with surrounding whitespace removed.
    """
    key = getpass.getpass("Enter OpenAI API key (input hidden): ").strip()
    # Keep asking until something other than whitespace is provided.
    while not key:
        print("API key cannot be empty!")
        key = getpass.getpass("Enter OpenAI API key (input hidden): ").strip()

    # NOTE: key[-4:] is the whole key when it has four characters or fewer.
    print(f"API key configured (last 4 chars): ****{key[-4:]}")
    return key

# Runs at cell execution time: prompts the user and stores the key on the client module.
openai.api_key = get_openai_api_key()

In [None]:
# ===============================
# Dataset & Embedding Functions
# ===============================

def load_dataset(file_path, text_column="Trends"):
    """Load a CSV file and expose its text column under the name ``text``.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.
        text_column: Name of the CSV column that holds the text to embed.
            Defaults to ``"Trends"``; pass the correct name for other datasets.

    Returns:
        pd.DataFrame: A new frame with a single ``text`` column. Rows whose
        text is missing are removed and the index is renumbered from 0.

    Raises:
        KeyError: If ``text_column`` is not a column of the CSV.
    """
    df = pd.read_csv(file_path)
    if text_column not in df.columns:
        raise KeyError(
            f"Column {text_column!r} not found in {file_path!r}; "
            f"available columns: {list(df.columns)}"
        )
    # Rows without text cannot be embedded, so drop them up front and
    # renumber so later positional batching stays aligned with the rows.
    text = df[text_column].dropna().astype(str)
    return pd.DataFrame({"text": text}).reset_index(drop=True)

def generate_embeddings(df, embedding_model_name="text-embedding-ada-002", batch_size=1):
    """Request an embedding for every row of ``df["text"]``.

    Args:
        df: DataFrame with a ``text`` column.
        embedding_model_name: Name of the embedding model to request.
        batch_size: Number of texts sent per API call; must be >= 1.

    Returns:
        pd.DataFrame: A copy of ``df`` with an added ``embeddings`` column
        holding one embedding (list of floats) per row. The input frame is
        left untouched.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()
    texts = df["text"].tolist()

    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = openai.Embedding.create(
            input=texts[start:start + batch_size],
            # `model=` matches how get_relevant_rows requests embeddings.
            model=embedding_model_name,
        )
        # One entry per input text; collect them all, not one per batch.
        embeddings.extend(item["embedding"] for item in response["data"])

    df["embeddings"] = embeddings
    return df

def save_embeddings(df, output_file):
    """Write ``df`` to ``output_file`` as CSV without the row index.

    Omitting the index keeps the file free of an extra unnamed column when it
    is read back with ``pd.read_csv``.

    Args:
        df: DataFrame to persist.
        output_file: Destination path (or buffer) passed to ``DataFrame.to_csv``.
    """
    df.to_csv(output_file, index=False)

def load_embeddings(file_path):
    """Load a CSV of saved embeddings and restore them as numpy arrays.

    Args:
        file_path: Path of a CSV that has an ``embeddings`` column whose cells
            are the string form of a list of numbers (e.g. ``"[0.1, 0.2]"``).

    Returns:
        pd.DataFrame: The CSV contents with every ``embeddings`` cell
        converted to a ``numpy.ndarray``.

    Raises:
        ValueError, SyntaxError: If a cell is not a plain Python literal.
    """
    # Local import: only this function needs it.
    import ast

    df = pd.read_csv(file_path)
    # SECURITY: the cells come from a file on disk, so parse them with
    # ast.literal_eval (literals only) instead of eval, which would execute
    # arbitrary expressions found in the CSV.
    df["embeddings"] = df["embeddings"].apply(
        lambda cell: np.array(ast.literal_eval(cell))
    )
    return df

def get_relevant_rows(question, df, embedding_model_name="text-embedding-ada-002", top_n=10):
    """Return the rows of ``df`` whose embeddings are closest to the question.

    Args:
        question: The question text to embed.
        df: DataFrame with an ``embeddings`` column (one vector per row).
        embedding_model_name: Name of the embedding model used for the question.
        top_n: Maximum number of rows to return.

    Returns:
        pd.DataFrame: A copy of up to ``top_n`` rows of ``df`` with an added
        ``distance`` column (cosine distance to the question), ordered from
        smallest to largest distance. ``df`` itself is not modified.
    """
    question_embedding = openai.Embedding.create(
        model=embedding_model_name,
        input=question
    )['data'][0]['embedding']

    df_copy = df.copy()
    distances = distances_from_embeddings(
        question_embedding,
        df_copy['embeddings'].values,
        distance_metric="cosine",
    )
    # Force a float dtype: nsmallest rejects object columns, which is what an
    # empty or mixed list of distances would otherwise produce.
    df_copy['distance'] = np.asarray(distances, dtype=float)

    return df_copy.nsmallest(top_n, 'distance')

In [None]:
# ===============================
# Prompt Creation & Answering
# ===============================

def create_prompt(question, df, max_token_count=1500):
    """Build a context-grounded prompt that stays within a token budget.

    Texts from ``df["text"]`` are added to the context in row order until the
    next one would push the running total past ``max_token_count``; at that
    point no further rows are considered.

    Args:
        question: The question to place in the prompt.
        df: DataFrame with a ``text`` column, ordered by relevance.
        max_token_count: Upper bound on the tokens of the template, the
            question and the selected context texts combined.

    Returns:
        str: The prompt template filled with the selected context and question.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    prompt_template = """
    Answer the question based on the context below. If the question can't be answered based on the context, say "I don't know."

    Context: {}

    ---

    Question: {}

    Answer:
    """
    # The template and the question are always part of the prompt, so they
    # consume budget before any context text is added.
    current_token_count = len(tokenizer.encode(prompt_template)) + len(tokenizer.encode(question))

    context = []
    for text in df["text"].values:
        tokens_in_text = len(tokenizer.encode(text))
        if current_token_count + tokens_in_text > max_token_count:
            break
        context.append(text)
        # Keep the running total up to date so the budget is cumulative.
        current_token_count += tokens_in_text

    return prompt_template.format("\n\n###\n\n".join(context), question)

def get_openai_answer(prompt, max_answer_tokens=150, model="gpt-3.5-turbo-instruct"):
    """Send ``prompt`` to the completion endpoint and return the answer text.

    Args:
        prompt: Full prompt text to complete.
        max_answer_tokens: Maximum number of tokens to generate.
        model: Completion model to use.

    Returns:
        str: The first choice's text with surrounding whitespace stripped, or
        the string ``"An error occurred."`` if the request or the extraction
        fails (the error message is printed).
    """
    try:
        response = openai.Completion.create(
            model=model,
            prompt=prompt,
            max_tokens=max_answer_tokens
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        # Best-effort by design: report the problem and return a sentinel
        # string so the notebook keeps running.
        print(f"Error: {str(e)}")
        return "An error occurred."

In [None]:
# ===============================
# Question Answering Functions
# ===============================

def answer_basic_question(question, max_answer_tokens=150, model="gpt-3.5-turbo-instruct"):
    """Answer ``question`` with the completion model alone, without added context.

    Args:
        question: The question, sent verbatim as the prompt.
        max_answer_tokens: Maximum number of tokens to generate.
        model: Completion model to use.

    Returns:
        str: The first choice's text with surrounding whitespace stripped, or
        the string ``"An error occurred."`` if the request or the extraction
        fails (the error message is printed).
    """
    try:
        response = openai.Completion.create(
            model=model,
            prompt=question,
            max_tokens=max_answer_tokens
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        # Best-effort by design: report the problem and return a sentinel
        # string so the notebook keeps running.
        print(f"Error: {str(e)}")
        return "An error occurred."

def answer_question_with_context(question, df, max_prompt_tokens=1500, max_answer_tokens=150, top_n=10):
    """Answer ``question`` using the most relevant rows of ``df`` as context.

    Args:
        question: The question to answer.
        df: DataFrame with ``text`` and ``embeddings`` columns.
        max_prompt_tokens: Token budget passed to ``create_prompt``.
        max_answer_tokens: Maximum number of tokens to generate.
        top_n: Number of closest rows to consider as context.

    Returns:
        Whatever ``get_openai_answer`` returns for the constructed prompt.
    """
    # Retrieve -> build prompt -> complete.
    relevant_rows = get_relevant_rows(question, df, top_n=top_n)
    prompt = create_prompt(question, relevant_rows, max_token_count=max_prompt_tokens)
    return get_openai_answer(prompt, max_answer_tokens=max_answer_tokens)

In [None]:
# ===============================
# Main Function
# ===============================

def main(dataset_path="./2023_fashion_trends.csv", embeddings_path="./embeddings_with_vectors.csv"):
    """Run the full pipeline and print basic vs. context-aware answers.

    Steps: load the dataset, generate embeddings, save them to CSV, load them
    back, then answer each example question twice (once without context,
    once with context retrieved from the dataset) and print both answers.

    Args:
        dataset_path: CSV file containing the source text.
        embeddings_path: CSV file the embeddings are written to and re-read from.
    """
    # Load the dataset from the CSV file.
    df = load_dataset(dataset_path)

    # Generate embeddings and save them to a CSV file.
    df = generate_embeddings(df)
    save_embeddings(df, embeddings_path)

    # Load the embeddings back from the CSV file.
    df = load_embeddings(embeddings_path)

    # Replace with your own questions depending on the chosen dataset.
    questions = [
        "What are the popular fashion trends in 2023?",
        "What is the trend for oversized bags?",
    ]

    for number, question in enumerate(questions, start=1):
        basic_answer = answer_basic_question(question)
        custom_answer = answer_question_with_context(question, df)
        print(
            f"Question {number}: {question}\n"
            f"Basic Answer: {basic_answer}\n\n"
            f"Custom Answer: {custom_answer}\n\n"
        )

In [None]:
# ===============================
# Execution
# ===============================
# Run the pipeline only when this code is executed directly (a notebook cell
# or a script run), not when it is imported as a module.
if __name__ == "__main__":
    main()