# Codealong Notebook

Use this notebook as your "scratch pad" as you go through the course contents. Feel free to copy any example code and tweak it to get a better understanding of how it works!

Use the **+** button or `Insert` menu to add additional code cells as needed.

## Step 1

### Loading the Data with `pandas`

In [None]:
import requests

In [None]:

# Get the Wikipedia page for "2022" since OpenAI's models stop in 2021
params = {
    "action": "query", 
    "prop": "extracts",
    "exlimit": 1,
    "titles": "2022",
    "explaintext": 1,
    "formatversion": 2,
    "format": "json"
}
# A timeout keeps the cell from hanging forever on a stalled connection
resp = requests.get(
    "https://en.wikipedia.org/w/api.php", params=params, timeout=30
)
# Fail here on an HTTP error status rather than later with an opaque
# KeyError while indexing into an error payload
resp.raise_for_status()
response_dict = resp.json()
# Preview the page extract as a list of lines
response_dict["query"]["pages"][0]["extract"].split("\n")

In [None]:
import pandas as pd

# Build a single-column dataframe holding one line of the page extract per row
df = pd.DataFrame(
    {"text": response_dict["query"]["pages"][0]["extract"].split("\n")}
)

In [None]:
df

In [None]:
from dateutil.parser import parse

# Clean up text to remove empty lines and headings.
# .copy() makes the filtered frame independent, so the row updates below
# are written to this frame and not to a slice of the previous one.
df = df[(df["text"].str.len() > 0) & (~df["text"].str.startswith("=="))].copy()

# In some cases dates are used as headings instead of being part of the
# text sample; adjust so dated text samples start with dates
prefix = ""
for i, text in df["text"].items():
    # If the row already has " – " (en dash), it already has the needed
    # date prefix
    if " – " in text:
        continue
    try:
        # If the row's text is a date, set it as the new prefix
        parse(text)
    except (ValueError, OverflowError):
        # If the row's text isn't a date, add the prefix. Write through
        # df.loc: pandas documents that assigning to the row yielded by
        # iterrows() may modify a copy and have no effect on the frame.
        df.loc[i, "text"] = prefix + " – " + text
    else:
        prefix = text
# Keep only the dated samples (this drops the bare date heading rows)
df = df[df["text"].str.contains(" – ")].reset_index(drop=True)

In [None]:
df

### Creating an Embeddings Index with `openai.Embedding`

In [None]:
import openai

In [None]:
openai.api_base = "https://openai.vocareum.com/v1"

In [None]:
import os

# Read the key from the OPENAI_API_KEY environment variable so it is never
# saved inside the notebook; falls back to an empty string when unset
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
EMBEDDING_MODEL_NAME ="text-embedding-ada-002"

In [None]:
# Request embeddings for every row of text in one API call: the rows are
# passed as a list via `input` and embedded with EMBEDDING_MODEL_NAME
response = openai.Embedding.create(
    input=df["text"].tolist(),
    model=EMBEDDING_MODEL_NAME
)

In [None]:
type(response)

In [None]:
response.keys()

In [None]:
type(response["data"])

In [None]:
response["data"][0]

In [None]:
len(response["data"][0]["embedding"])

In [None]:
embeddings = [data["embedding"] for data in response["data"]]
embeddings

In [None]:
df["embeddings"] = embeddings
df

In [None]:
df.to_csv("embeddings.csv")

## Step 2

### Finding Relevant Data with Cosine Similarity

In [None]:
question = "When did Russia invade Ukraine"

In [None]:
from openai.embeddings_utils import get_embedding

In [None]:
EMBEDDING_MODEL_NAME ="text-embedding-ada-002"
question_embedding = get_embedding(question,engine= EMBEDDING_MODEL_NAME)
question_embedding

In [None]:
from openai.embeddings_utils import distances_from_embeddings

In [None]:
distances = distances_from_embeddings(
    question_embedding,
    df["embeddings"].tolist(),
    distance_metric="cosine"
)

In [None]:
df["distances"] = distances
df

In [None]:
df.to_csv("distances.csv")

## One Way of Sorting

In [None]:
current_shortest = df.iloc[0]["distances"]
current_shortest_index=0
current_shortest,current_shortest_index

In [None]:
# Linear scan over the distances column: remember the smallest value seen
# so far together with the position at which it occurred
for index,distance in enumerate(df["distances"].values):
    if distance < current_shortest:
        current_shortest = distance
        current_shortest_index= index

# Display the smallest distance and its positional index
current_shortest,current_shortest_index

In [None]:
# Look up the row found by the scan above instead of a hard-coded position,
# which goes stale whenever the page content or the question changes
df.iloc[current_shortest_index]["text"]

## Another Way of Sorting

In [None]:
df.sort_values(by="distances")

In [None]:
# sort_values returns a new frame, so take the first row of the sorted
# result: that is the text with the smallest distance to the question
df.sort_values(by="distances").iloc[0]["text"]

In [None]:
df.sort_values(by="distances").to_csv("distances_sorted.csv")

## Step 3

### Tokenizing with `tiktoken`

In [None]:
import tiktoken

In [None]:
tokenizer = tiktoken.get_encoding("cl100k_base")
tokenizer

In [None]:
tokenizer.encode("This is a question")

In [None]:
question = "When did Russia invade Ukraine?"

In [None]:
tokenizer.encode(question)


In [None]:
len(tokenizer.encode(question))

### Composing a Custom Text Prompt

In [None]:
# Prompt with two positional slots: the context first, then the question.
# Opened with exactly three quotes; a fourth would put a stray '"' character
# at the very start of every prompt sent to the model.
prompt_template = """
Answer the question based on the context below and if the question can't be answered on the context , say "I don't know"

Context:
{}

--------
Question :{}
Answer:

"""

In [None]:
question = "When did Russia invade Ukraine"

In [None]:
print(prompt_template.format("context", question))

In [None]:
max_token_count = 1000

In [None]:
import tiktoken

In [None]:
tokenizer = tiktoken.get_encoding("cl100k_base")
tokenizer

In [None]:
tokenizer.encode(question)

In [None]:
len(tokenizer.encode(question))

In [None]:
current_token_count = len(tokenizer.encode(prompt_template)) + len(tokenizer.encode(question))
current_token_count

In [None]:
# Start from the fixed cost of the template and question on every run, so
# re-running this cell does not keep adding to a stale running total
current_token_count = len(tokenizer.encode(prompt_template)) + len(tokenizer.encode(question))

context=[]
# Walk the rows from most to least relevant (smallest cosine distance first)
# so the token budget is spent on the text closest to the question
for text in df.sort_values(by="distances")["text"].values:
    text_token_count = len(tokenizer.encode(text))
    current_token_count += text_token_count

    # Stop at the first row that would push the prompt over the budget
    if current_token_count <= max_token_count:
        context.append(text)
    else:
        break

In [None]:
context

In [None]:
print(prompt_template.format("\n\n###\n\n".join(context), question))

### Another Way: Wrapping the Steps in a Function

In [None]:
import tiktoken

def get_rows_sorted_by_relevance(question, df):
    """
    Given a question and a dataframe containing rows of text and their
    embeddings, return a copy of that dataframe sorted from most to least
    relevant (smallest cosine distance to the question first)

    The input dataframe is not modified.
    """
    # Embed the question with the same model used for the rows of text
    question_embedding = get_embedding(question, engine=EMBEDDING_MODEL_NAME)

    # Work on a copy so the caller's dataframe keeps its columns and order
    df_sorted = df.copy()
    df_sorted["distances"] = distances_from_embeddings(
        question_embedding,
        df_sorted["embeddings"].tolist(),
        distance_metric="cosine"
    )
    return df_sorted.sort_values(by="distances", ascending=True)


def create_prompt(question, df, max_token_count):
    """
    Given a question and a dataframe containing rows of text and their
    embeddings, return a text prompt to send to a Completion model

    Rows are added to the context in order of relevance until adding the
    next row would push the running token count past max_token_count.
    """
    # Create a tokenizer that is designed to align with our embeddings
    tokenizer = tiktoken.get_encoding("cl100k_base")

    # Count the number of tokens in the prompt template and question
    prompt_template = """
Answer the question based on the context below, and if the question
can't be answered based on the context, say "I don't know"

Context: 

{}

---

Question: {}
Answer:"""

    current_token_count = len(tokenizer.encode(prompt_template)) + \
                            len(tokenizer.encode(question))

    context = []
    for text in get_rows_sorted_by_relevance(question, df)["text"].values:

        # Increase the counter based on the number of tokens in this row
        text_token_count = len(tokenizer.encode(text))
        current_token_count += text_token_count

        # Add the row of text to the list if we haven't exceeded the max
        if current_token_count <= max_token_count:
            context.append(text)
        else:
            break

    return prompt_template.format("\n\n###\n\n".join(context), question)

In [None]:
response = create_prompt(question, df, max_token_count)
response

## Step 4

### Getting a Custom Q&A Response with `openai.Completion`

In [None]:
import openai

In [None]:
openai.api_base = "https://openai.vocareum.com/v1"

In [None]:
import os

# Read the key from the OPENAI_API_KEY environment variable so it is never
# saved inside the notebook; falls back to an empty string when unset
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt_template.format("\n\n###\n\n".join(context), question))

In [None]:
openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt_template.format("\n\n###\n\n".join(context), question))["choices"][0]["text"]

In [None]:
ukraine_prompt = """
Question: "When did Russia invade Ukraine?"
Answer:
"""
initial_ukraine_answer = openai.Completion.create(
    model="gpt-3.5-turbo-instruct",
    prompt=ukraine_prompt,
    max_tokens=150
)["choices"][0]["text"].strip()
print(initial_ukraine_answer)

In [None]:
twitter_prompt = """
Question: "Who owns Twitter?"
Answer:
"""
initial_twitter_answer = openai.Completion.create(
    model="gpt-3.5-turbo-instruct",
    prompt=twitter_prompt,
    max_tokens=150
)["choices"][0]["text"].strip()
print(initial_twitter_answer)

In [None]:
COMPLETION_MODEL_NAME = "gpt-3.5-turbo-instruct"

def answer_question(
    question, df, max_prompt_tokens=1800, max_answer_tokens=150
):
    """
    Ask an OpenAI Completion model a question, using a prompt built from
    the question and the dataframe's rows of text.

    max_prompt_tokens is passed to create_prompt as its token limit and
    max_answer_tokens caps the length of the completion. Returns the
    answer text with surrounding whitespace stripped.

    If the completion request or reading its result raises, the error is
    printed and an empty string is returned instead.
    """
    # The prompt is built outside the try block, so any error raised while
    # building it propagates to the caller
    request = {
        "model": COMPLETION_MODEL_NAME,
        "prompt": create_prompt(question, df, max_prompt_tokens),
        "max_tokens": max_answer_tokens,
    }

    try:
        completion = openai.Completion.create(**request)
        answer = completion["choices"][0]["text"].strip()
    except Exception as error:
        print(error)
        answer = ""
    return answer


In [None]:
response = answer_question(question, df, max_prompt_tokens=1800, max_answer_tokens=150)
response