In [1]:
import pandas as pd
import openai
import numpy as np
import pickle
import docx
from transformers import GPT2TokenizerFast
from tqdm import tqdm
from time import sleep
import unicodedata

In [2]:
openai.api_key = 'YOUR OPENAI API KEY'
COMPLETIONS_MODEL = "text-davinci-003"

## 1) Extract the text from the 10Q report

In [3]:
def get_text(filename, tokenizer):
    doc = docx.Document(filename)
    full_text = ['']
    for para in doc.paragraphs:  # Loop through each chunk of text
        text = unicodedata.normalize("NFKD", para.text.replace("\n", "")).strip() # Clean up the text  
        num_tokens = len(tokenizer.tokenize(text)) # Measure how long the chunk is
        is_header = text == text.title() # Check if this chunk is a section header
        if is_header: # Omit section headers
            continue
        if num_tokens > 28: # if the text is large enough, consider it a chunk
            full_text.append(text)
        else:
            full_text[-1] += f"\n{text}".strip() # Otherwise, append it to the previous chunk as it likely belongs with it
    return full_text

In [4]:
# You can download the 10Q report from here:
# https://ir.netflix.net/financials/sec-filings/default.aspx. I downloaded and saved it in my computer as netflix_10Q.docx
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
text = get_text("netflix_10K.docx", tokenizer)

In [5]:
df = pd.DataFrame(data=[text, pd.Series(text).apply(lambda p: len(tokenizer.tokenize(p)))]).T
df.columns = ['paragraph', 'token_count']
df.sort_values(by="token_count") # We broke up the report into 158 chunks

Unnamed: 0,paragraph,token_count
0,Table of ContentsPART I,5
40,other impediments to delivering our streaming ...,29
72,The long-term and largely fixed cost nature of...,29
358,(9)Registration Statement (Form S-8 No. 333-23...,29
356,(6)Registration Statement (Form S-8 No. 333-14...,29
...,...,...
221,The Company has entered into operating leases ...,380
31,"As a producer and distributor of content, we f...",389
1,This Annual Report on Form 10-K contains forwa...,444
76,We have a substantial amount of indebtedness a...,475


In [9]:
# Let's look at a sample chunk
print(df.loc[200]['paragraph']) # This is useful information!!

The Company acquires, licenses and produces content, including original programming, in order to offer members unlimited viewing of video entertainment. The content licenses are for a fixed fee and specific windows of availability. Payment terms for certain content licenses and the production of content require more upfront cash payments relative to the amortization expense. Payments for content, including additions to content assets and the changes in related liabilities, are classified within "Net cash provided by (used in) operating activities" on the Consolidated Statements of Cash Flows.


## 2) Create an embedding vector for each paragraph

In [10]:
# NOTE: this code is taken from https://github.com/openai/openai-cookbook and was slightly modified
MODEL_NAME = "curie" # Very good for the task
DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"

In [12]:
def get_embedding(text: str, model: str) -> list[float]:
    result = openai.Embedding.create(
      model=model,
      input=text
    )
    return result["data"][0]["embedding"]

def get_doc_embedding(text: str) -> list[float]:
    return get_embedding(text, DOC_EMBEDDINGS_MODEL)

def get_query_embedding(text: str) -> list[float]:
    return get_embedding(text, QUERY_EMBEDDINGS_MODEL)

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    Return a dictionary that maps between each embedding vector and the index of the row that it corresponds to.
    """
    embeddings_fn = "nflx_10k_embeddings.pkl"
    try:
        embeddings = pickle.load(open(embeddings_fn, "rb"))
    except (OSError, IOError) as e:
        embeddings = {}
        pickle.dump(embeddings, open(embeddings_fn, "wb"))
    
    for idx, r in tqdm(df.iterrows()):
        if idx not in embeddings:
            embeddings[idx] = get_doc_embedding(r['paragraph'].replace("\n", " "))
            pickle.dump(embeddings, open(embeddings_fn, 'wb'))
            sleep(0.5)
    return embeddings

def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Read the document embeddings and their keys from a CSV.
    
    fname is the path to a CSV with exactly these named columns: 
        "title", "heading", "0", "1", ... up to the length of the embedding vectors.
    """
    
    df = pd.read_csv(fname, header=0)
    max_dim = max([int(c) for c in df.columns if c != "title" and c != "heading"])
    return {
           (r.title, r.heading): [r[str(i)] for i in range(max_dim + 1)] for _, r in df.iterrows()
    }

In [13]:
document_embeddings = compute_doc_embeddings(df)

394it [04:53,  1.34it/s]


## 3) Construct the GPT-3 prompt

### Step 1: Find the paragraphs that are mostly related to the question

In [14]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    We could use cosine similarity or dot product to calculate the similarity between vectors.
    """
    return np.dot(np.array(x), np.array(y))

def order_document_sections_by_query_similarity(query, contexts):
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections. 
    
    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_query_embedding(query)
    
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    
    return document_similarities

### Step 2: Add the most relevant paragraphs (context) to the query, and make GPT-3 answer based on the context provided only 


In [15]:
MAX_SECTION_LEN = 1000
SEPARATOR = "\n* "

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))

In [24]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Fetch relevant 
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
     
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.        
        document_section = df.loc[section_index]
        
        chosen_sections_len += document_section['token_count'] + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
            
        chosen_sections.append(SEPARATOR + document_section['paragraph'].replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
    
    header = """Answer the question as truthfully as possible using the provided context on the company Netflix, and if the answer is not contained within the context below, say "Answer is not found."\n\nContext:\n"""
    
    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [25]:
prompt = construct_prompt(
    "What are they major risks for the business?",
    document_embeddings,
    df
)

print(prompt)

Answer the question as truthfully as possible using the provided context on the company Netflix, and if the answer is not contained within the context below, say "Answer is not found."

Context:

* From time to time, we acquire or invest in businesses, content, and technologies that support our business. The risks associated with such acquisitions or investments include the difficulty of integrating solutions, operations, and personnel; inheriting liabilities and exposure to litigation; failure to realize anticipated benefits and expected synergies; and diversion of management’s time and attention, among other acquisition-related risks.
* We face risks, such as unforeseen costs and potential liability in connection with content we acquire, produce, license and/or distribute through our service.
* from any liability or unforeseen production risks could harm our results of operations. We may not be indemnified against claims or costs of these types and we may not have insurance coverage 

## 3) Answer the user's question based on the context.


In [26]:
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 1200,
    "model": COMPLETIONS_MODEL,
}

In [27]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[(str, str), np.array],
    show_prompt: bool = False
) -> str:
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )
    
    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
                prompt=prompt,
                **COMPLETIONS_API_PARAMS
            )

    return response["choices"][0]["text"].strip(" \n")

In [30]:
answer_query_with_context("What are the major risks for Netflix?", df, document_embeddings)

'Major risks for Netflix include the expiration of agreements with partners, consumer dissatisfaction, payment processing risk, unforeseen costs and potential liability in connection with content, technological or business-related disruptions, cybersecurity threats, regulatory interference, failure to protect domain names, potential liability for negligence, copyright and trademark infringement, and risks associated with production.'

In [31]:
answer_query_with_context("Explain to me the risk of failure to protect domain names", df, document_embeddings)

'Failure to protect our domain names could adversely affect our reputation and brand and make it more difficult for users to find our website and our service. We may be unable, without significant cost or at all, to prevent third parties from acquiring domain names that are similar to, infringe upon or otherwise decrease the value of our trademarks and other proprietary rights.'