In [85]:
from openai.error import RateLimitError
import openai
import backoff
import pandas as pd
import numpy as np
import pickle
from transformers import GPT2TokenizerFast
from typing import List
from ratelimit import limits,sleep_and_retry
from time import sleep
import pickle



COMPLETIONS_MODEL = "text-davinci-003"

In [86]:
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"
openai.api_key = "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"

In [87]:
@backoff.on_exception(backoff.expo, RateLimitError)
def get_embedding(text: str, model: str=EMBEDDING_MODEL, idx: int=0) -> list[float]:
    result = openai.Embedding.create(
    model=model,
    input=text
    )

    return result["data"][0]["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
    
    Return a dictionary that maps between each embedding vector and the index of the row that it corresponds to.
    """
    return {
        idx: get_embedding(r.content) for idx, r in df.iterrows()
    }
    
def compute_text_embeddings(text: str) -> dict[tuple[str, str], list[float]]:
    return {
        idx: get_embedding(line, EMBEDDING_MODEL ,idx) for idx, line in enumerate(text)
    }

In [88]:
def load_embeddings(fname: str):
    """
    Read the document embeddings and their keys from a CSV.
    
    fname is the path to a CSV with exactly these named columns: 
        "title", "heading", "0", "1", ... up to the length of the embedding vectors.
    """
    
    df = pd.read_csv(fname, header=0)
    max_dim = max([int(c) for c in df.columns if c != "title" and c != "heading"])
    return {
           (r.title, r.heading): [r[str(i)] for i in range(max_dim + 1)] for _, r in df.iterrows()
    }

In [89]:
# document_embeddings = load_embeddings("olympics_sections_document_embeddings.csv")

# context_embeddings = compute_doc_embeddings(df)

# context_embeddings = compute_text_embeddings(open ("nyush.txt", "r").read().splitlines())
context_embeddings = pickle.load(open('nyush_embeddings.obj', "rb"))

# print(context_embeddings)

In [90]:
def vector_similarity(x: List[float], y: List[float]) -> float:
    """
    We could use cosine similarity or dot product to calculate the similarity between vectors.
    In practice, we have found it makes little difference. 
    """
    return np.dot(np.array(x), np.array(y))

In [91]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Returns the similarity between two vectors.
    
    Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections. 
    
    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_embedding(query)
    
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    
    return document_similarities

In [92]:
order_document_sections_by_query_similarity("Who won the men's high jump?", context_embeddings)[:5]

[(0.7486145040198118, 206),
 (0.7477867066522546, 191),
 (0.746245176559252, 177),
 (0.744552465070722, 126),
 (0.7422233607528219, 144)]

In [93]:
MAX_SECTION_LEN = 500
SEPARATOR = "\n* "

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [97]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Fetch relevant 
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
     
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.        
        document_section = df.loc[section_index]
        
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
            
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
            
    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))
    
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    
    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

def construct_prompt_with_text(question: str, context_embeddings: dict, text: list) -> str:
    """
    Fetch relevant 
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
     
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.        
        document_section = text[section_index]
        
        chosen_sections_len += len(document_section.split()) + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
            
        chosen_sections.append(SEPARATOR + document_section.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
            
    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))
    
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    
    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [101]:
prompt = construct_prompt_with_text(
    "Where do I submit?",
    context_embeddings,
    open ("nyush.txt", "r").read().splitlines()
)

print("===\n", prompt)


Selected 40 document sections:
209
191
15
116
93
195
206
30
24
247
192
184
201
196
6
26
144
60
187
59
0
238
198
213
13
3
103
126
185
73
194
17
249
114
234
75
214
117
115
11
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* How to get started?
* Submission
* The prompt
* Input redirection
* Locating programs
* You need to upload the .zip archive to Gradescope. If you need to acknowledge any influences per our academic integrity policy, write them as comments in your source code.
* Tips
* The command
* For example, if you are in /home/abc123/cs202/lab2, then the prompt should be:
* Here are some additional hints:
* You must submit a .zip archive containing all files needed to compile nyush in the root of the archive. You can create the archive file with the following command in the Docker container:
* We will grade your submission in an x86_64 Rocky Linux 8 container on

In [102]:
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}

In [115]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings,
    show_prompt: bool = False
) -> str:
    prompt = construct_prompt_with_text(
        query,
        document_embeddings,
        df
    )
    print(prompt)
    
    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
                prompt=prompt,
                **COMPLETIONS_API_PARAMS
            )

    return response["choices"][0]["text"].strip(" \n")

In [117]:
answer_query_with_context("Who won the F1 championship last year?",open ("nyush.txt", "r").read().splitlines(), context_embeddings)

Selected 71 document sections:
191
216
250
248
75
11
187
13
206
177
0
126
19
30
60
144
15
23
22
21
196
73
3
209
183
251
17
92
74
18
247
76
139
107
153
249
35
169
163
147
103
116
93
27
238
91
172
150
108
25
207
166
198
174
141
90
87
181
159
202
245
99
234
197
212
100
190
106
59
2
176
Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* Submission
* Click to reveal spoiler
* Click to reveal spoiler
* Click to reveal spoiler
* cat |
* Overview
* Evaluation
* Specifications
* Tips
* exit
* Introduction
* Pipe
* A whitespace.
* The command
* A blank line.
* jobs
* The prompt
* Another whitespace.
* A dollar sign $.
* A closing bracket ].
* Rubric
* cat <
* Objectives
* How to get started?
* Compilation
* This lab has borrowed some ideas from Prof. Arpaci-Dusseau and Dr. T. Y. Wong.
* An opening bracket [.
* (Again, the final █ character represents your cursor.)
* cat >
* The word 

"I don't know."