In [2]:
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
import tiktoken
import pickle 
from PyPDF2 import PdfReader

In [3]:
# the document to be analysed
PDF_DOC = "data/2023_Q1.pdf"

In [4]:
# OPEN AI models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
EMBEDDING_ENCODING = "cl100k_base"  # this the encoding for text-embedding-ada-002

In [5]:
# extract all the text from a pdf file
def extract_text_frfom_pdf(doc, npages=None):
    reader = PdfReader(doc)  
        
    n = len(reader.pages)
    if npages is not None:
        npages = min(npages, n)
    else:
        npages = n
        
    print (f'You have {n:,} page(s) in your file, loading {npages:,}')

    text = ''
    for i in range(npages):
        text += reader.pages[i].extract_text()
        
    return text

In [6]:
def partition_text(full_text, psize=500, delim='\n'):    
    text_chunks = full_text.split(delim)
    
    ptext = []
    next_item = ''
    next_item_size = 0
    
    for i in range(len(text_chunks)):
        txt = text_chunks[i]
        
        next_item_size += len(txt)
        next_item += txt + ' '
            
        if next_item_size>psize:
            next_item_size = 0
            ptext.append(next_item)
            next_item = ''
                        
    return ptext 

In [7]:
def create_overlapped_partitions(ptext, overlap_pct=0.5, delim=' '):
    assert len(ptext)>1, 'Insufficient text to partition'
    
    opart = []
    pos1 = int(float(len(ptext[0])*overlap_pct))

    for i in range(1, len(ptext)):
        pos2 = int(float(len(ptext[i])*overlap_pct))
        
        while ptext[i-1][pos1] != delim:
            pos1 += 1 
        while ptext[i][pos2] != delim:
            pos2 += 1
        
        new_part = ptext[i-1][pos1:]
        new_part += ptext[i][:pos2]
        opart.append(new_part)
        
        pos1 = pos2
    
    return opart

In [9]:
# split text into partitions, including overlapping partitions
def create_text_partitions(full_text, psize=500, overlap_pct=0.5):
    ptext = partition_text(full_text, psize=200)
    optext = create_overlapped_partitions(ptext, overlap_pct)
    lp = len(ptext)
    lop = len(optext)
    ptext.extend(optext)
    nc = len(ptext)
    print(f'Loaded {nc:,} chuncks : {lp:,} chuncks and {lop:,} overlap chunks')
    return ptext

In [10]:
# convert a list of texts into embedding vectors
def create_embeddings(ptext, save=True, fname='data/embeddings.pkl'):
    emb_info = []
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    
    for i in range(len(ptext)):
        txt = ptext[i]
        ntokens = len(encoding.encode(txt))
        emb = get_embedding(txt, engine=EMBEDDING_MODEL)
        
        emb_info.append([txt, ntokens, emb])
    
    if save:
        with open(fname, 'wb') as f:
            pickle.dump(emb_info, f)
    
    return emb_info

In [11]:
def load_embeddings(fname='data/embeddings.pkl'):
    with open(fname, 'rb') as f:
        emb_info = pickle.load(f)
    
    sl = [len(x[2]) for x in emb_info]
    assert max(sl)==min(sl), 'incompatible embedding sizes'
    print(f'Loaded {len(sl):,} embeddings, each of size {max(sl):,}')
    return emb_info

In [12]:
# compare the embedding vector of a query with the 
# embedding vectors corresponding to document chuncks
def find_similar_text(query, emb, sim_threshold=0.8):
    qe = get_embedding(query, engine=EMBEDDING_MODEL)
    sim = [cosine_similarity(e[2], qe) for e in emb]
    
    res_info = []
    sim_text = ''
    for i in range(len(emb)):
        if sim[i]>=sim_threshold:
            res_info.append([emb[i][0], emb[i][1], emb[i][2], sim[i]])
            sim_text += emb[i][0] + ' '
            
    return sim_text, res_info

In [13]:
# there is a whole "prompt engineering" field
# this is a super simple way to create a gpt prompt
def get_prompt(question, doc_specific=True, doc_text=''):
    if doc_specific:
        prompt = 'The document provided contains the following information: ' \
                + doc_text + ' ' + question
        role_descr = 'You answer questions about the document provided. If the information is not in the document say you do not know the answer.'
    else:
        prompt = question
        role_descr = 'You answer the question asked.'
        
    return prompt, role_descr

In [14]:
# this is the main function
def ask_question(question, # text of the question 
                 doc_embeddings, # the embeddings of the PDF
                 doc_specific=True, # makes the answer doc specific only
                 verbose=True, 
                 sim_threshold=0.8 # threshold to determine what embeddings to use in the prompt
                ):
    similar_text, sim_info = find_similar_text( question, 
                                                doc_embeddings,
                                                sim_threshold)
    
    prompt, role_descr = get_prompt(question, doc_specific, similar_text)

    messages = [
        {"role": "system", "content": role_descr},
        {"role": "user", "content": prompt},
        ]
    
    response_info = openai.ChatCompletion.create(
        model=GPT_MODEL,
        messages=messages,
        temperature=0
        )
    
    response = response_info['choices'][0]['message']['content']
    
    if verbose:
        print(f'Answer: {response}')
    
    return response, response_info

In [15]:
text = extract_text_frfom_pdf(PDF_DOC)
ptext = create_text_partitions(text, psize=200, overlap_pct=0.5)

You have 40 page(s) in your file, loading 40
Loaded 533 chuncks : 267 chuncks and 266 overlap chunks


In [16]:
# only need to do this once
# emb = create_embeddings(ptext)

In [17]:
emb = load_embeddings()

Loaded 533 embeddings, each of size 1,536


In [18]:
q = 'What was the first quarter 2023 revenue?'

In [19]:
answer, info = ask_question(q, emb, doc_specific=False)

Answer: I'm sorry, but as an AI language model, I do not have access to real-time financial data. Please provide more context or specify the company you are referring to.


In [20]:
answer, info = ask_question(q, emb, doc_specific=True)

Answer: The first quarter 2023 revenue was $21.4 billion.


In [21]:
q = 'What was the annual tax rate?'

In [22]:
answer, info = ask_question(q, emb, doc_specific=True, sim_threshold=0.78)

Answer: The document states that the full year effective tax rate for 2023 is expected to be around 23-24%, excluding discrete items and divestiture-related impacts. However, it does not provide information on the annual tax rate for any previous year.
