In [2]:
from transformers import GPT2TokenizerFast
import pandas as pd
import openai
import numpy as np
import pickle
import json
from dotenv import load_dotenv

# Model name passed as `model=` to the openai.Completion.create calls below.
COMPLETIONS_MODEL = "text-davinci-002"

In [9]:
import os
from dotenv import load_dotenv

# Run python-dotenv's loader first, then read the OpenAI key from the process
# environment. `api` is None when OPENAI_API_KEY is not set.
load_dotenv()
api = os.environ.get("OPENAI_API_KEY")

# Load the API key

In [12]:
# Hand the key read from the environment to the openai module.
openai.api_key = api
# Optional connectivity check, left disabled: openai.Model.list()

### Reading from config.json

In [33]:
# Read the shared configuration file. JSON text is UTF-8, so the encoding is
# given explicitly instead of relying on the platform's default encoding.
with open('../../config.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Show what was loaded: the container type, then the two model settings.
print("Type:", type(data))
print(data["MODEL_NAME"])
print(data["COMPLETIONS_MODEL"])

Type: <class 'dict'>
curie
text-davinci-002


# Demo openai.Completion

### Without an "I don't know" instruction

In [13]:
# Baseline: the bare question, with no instruction and no supporting context.
prompt = "What is the member's capital in 2021?"

# The cell's displayed value is the text of the first returned choice with
# surrounding spaces and newlines stripped.
openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=50,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

"The member's capital in 2021 is $1,000."

### With an "I don't know" instruction

In [11]:
prompt = """
Answer the question as truthfully as possible, and if you're unsure of the answer, say 
"Sorry, I don't know".

Q: What is the member's capital in 2021?
A:
"""

# Same request as the baseline demo, but `prompt` now carries the
# "say you don't know" instruction and max_tokens is raised to 300.
# The displayed value is the first choice's text, stripped of spaces/newlines.
openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

"Sorry, I don't know."

### With context

In [10]:
prompt = """
Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* Movements in members’ capital were as follows:£mBalance at 1 October 2019 86Capital introduced by members 5Transfer of amounts due to members allocated to capital 13Repayments of capital (17)Balance at 30 September 2020 87Capital introduced by members 5Allocation of profits arising in year 181Repayments of capital (15)Balance at 30 September 2021 258Non-current 181Current 77At 30 September 2021, £20 million of members’ capital owed to former partners was classified as a non-current liability (see note 13)
* Members’ capital is provided by each member on becoming a partner and is only repayable on retirement or resignation so generally remains stable from year to year. However, during the year, partners voted in favour of allocating £201 million of reserves to members’ capital, for repayment only after retirement, thereby retaining funds in the business for future investment. As a result, members’ capital increased to £258 million at 30 September 2021 (2020: £87 million)
* Group Partnership2021  £m2020  £m2021  £m2020  £mMembers’ capital 258 87 258 87Lease liabilities 487 534 462 502745 621 720 58945UK Members’ Report and Financial Statements 2021© 2022 KPMG LLP , a UK limited liability partnership and a member firm of the KPMG global organisation of independent member firms affiliated with KPMG International Limited, a private English company limited by guarantee. All rights reserved.
* Movements in members’ capital were as follows:£mBalance at 1 October 2018 73Capital introduced by members 8Transfer of amounts due to members allocated to capital 14Repayments of capital (9)Balance at 30 September 2019 86Capital introduced by members 5Transfer of amounts due to members, allocated to capital 13Repayments of capital (17)Balance at 30 September 2020 8722. Equity, members’ capital and other interests (continued)Amounts due from/(to) members In addition to other members’ interests classified as equity, members’ interests also comprise amounts due from/(to) members as follows:Group Partnership2020  £m2019  £m2020  £m2019  £mAmounts due from members 47 46 47 46Amounts due to members (178) (205) (178) (205) (131) (159) (131) (159)Amounts due from members relate to amounts advanced to members in their first year of appointment, to cover the liabilities arising for those individuals as a result of the change in tax basis to self-employed. These balances are repayable by the member upon retirement or earlier cessation of membership
* The availability of this revolving facility is dependent on certain conditions, including a minimum level of members’ capital, all of which were satisfied at 30 September 2020 and 2019

 Q: What is the member's capital in 2021?
 A:"""

# Send the prompt that embeds hand-picked context sections ahead of the question.
# The displayed value is the first choice's text, stripped of spaces/newlines.
openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

'258 million'

# Try with our own knowledge base

In [34]:
# Load the knowledge base in one step, keying every section by its
# (title, heading) pair, then report the size and preview five random rows.
df = pd.read_csv('knowledge_doc.csv').set_index(["title", "heading"])
print(f"{len(df)} rows in the data.")
df.sample(5)

1002 rows in the data.


Unnamed: 0_level_0,Unnamed: 1_level_0,content,tokens
title,heading,Unnamed: 2_level_1,Unnamed: 3_level_1
doc_title,heading755,Significant Estimate The net book value of the...,61
doc_title,heading888,NotesForming part of the consolidated financia...,89
doc_title,heading526,The group’s and partnership’s gross assets hav...,114
doc_title,heading830,GroupAssets £mLiabilities £mBalance at 1 Oct...,107
doc_title,heading300,Group Partnership2021 £m2020 £m2021 £m2020 ...,102


In [35]:
# Embedding model family; interpolated into both model identifiers below.
# NOTE(review): config.json also defines MODEL_NAME (printed as "curie" earlier in
# this notebook) but the value is hard-coded here — confirm which should be the source.
MODEL_NAME = "curie"

# Documents and queries are embedded with two different models of the same family.
DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"

In [36]:
def get_embedding(text: str, model: str) -> list:
    """
    Request an embedding for `text` from the OpenAI Embedding endpoint.

    `text` is sent as the request `input` to `model`; the value returned is the
    "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def get_doc_embedding(text: str) -> list:
    """Embed `text` with the document-side model, DOC_EMBEDDINGS_MODEL."""
    return get_embedding(text=text, model=DOC_EMBEDDINGS_MODEL)

def get_query_embedding(text: str) -> list:
    """Embed `text` with the query-side model, QUERY_EMBEDDINGS_MODEL."""
    return get_embedding(text=text, model=QUERY_EMBEDDINGS_MODEL)

In [37]:
def load_embeddings(fname: str) -> dict:
    """
    Read the document embeddings and their keys from a CSV.

    fname is the path to a CSV with exactly these named columns:
        "title", "heading", "0", "1", ... up to the length of the embedding vectors.

    Returns a dict mapping (title, heading) to the embedding as a list of numbers,
    ordered by dimension index regardless of the column order in the file. When the
    same (title, heading) pair occurs more than once, the last row wins.

    Raises:
        ValueError: if there are no dimension columns, or a column other than
            "title"/"heading" is not named with an integer.
        KeyError: if "title", "heading" or any dimension column between "0" and the
            highest dimension present is missing.
    """
    embeddings_df = pd.read_csv(fname, header=0)

    key_columns = ("title", "heading")
    dim_columns = [c for c in embeddings_df.columns if c not in key_columns]
    if not dim_columns:
        raise ValueError(f"{fname!r} contains no embedding dimension columns")

    max_dim = max(int(c) for c in dim_columns)
    # Select the dimensions explicitly in numeric order: "10" sorts before "2" as text,
    # and the file is not required to store the columns in order.
    ordered_columns = [str(i) for i in range(max_dim + 1)]

    # One vectorised extraction instead of iterrows() with a label lookup per
    # dimension per row, which is very slow for embeddings with thousands of dimensions.
    vectors = embeddings_df[ordered_columns].to_numpy().tolist()
    keys = zip(embeddings_df["title"], embeddings_df["heading"])
    return dict(zip(keys, vectors))

In [38]:
# Pre-computed document embeddings, keyed by (title, heading).
document_embeddings = load_embeddings("knowledge_embedding.csv")

In [39]:
# Peek at the first entry: its key, the first five values, and the vector length.
example_entry = list(document_embeddings.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

('doc_title', 'heading1') : [0.0098580177873373, -0.0044100452214479, 0.0047049629501998, 0.0183168910443782, -0.019094193354249]... (4096 entries)


In [40]:
def vector_similarity(x: list, y: list) -> float:
    """
    Score how similar two embedding vectors are by taking their dot product.

    Cosine similarity would be an alternative measure; in practice, we have found
    it makes little difference.
    """
    lhs = np.array(x)
    rhs = np.array(y)
    return lhs.dot(rhs)

def order_document_sections_by_query_similarity(query: str, contexts: dict) -> list:
    """
    Rank every pre-computed document embedding against the embedding of `query`.

    `contexts` maps a section key to that section's embedding. The query is embedded
    with get_query_embedding and scored against each section with vector_similarity.

    Returns a list of (similarity, section key) tuples sorted from most to least similar.
    """
    query_vector = get_query_embedding(query)

    scored_sections = []
    for section_key, section_vector in contexts.items():
        score = vector_similarity(query_vector, section_vector)
        scored_sections.append((score, section_key))

    # Tuples compare on the score first, so a descending sort puts the best match first.
    scored_sections.sort(reverse=True)
    return scored_sections

In [41]:
# Preview the five highest-scoring sections for a sample question.
order_document_sections_by_query_similarity("What is the member's capital in 2021?", document_embeddings)[:5]

[(0.45895831866573666, ('doc_title', 'heading392')),
 (0.457307189554787, ('doc_title', 'heading13')),
 (0.44966349273582573, ('doc_title', 'heading300')),
 (0.43523718628067554, ('doc_title', 'heading889')),
 (0.42902872291174965, ('doc_title', 'heading805'))]

In [55]:
# Token budget for the context sections placed in a prompt (enforced in construct_prompt).
MAX_SECTION_LEN = 700
# Text placed in front of every context section, so sections read as a bulleted list.
SEPARATOR = "\n* "

# Count the separator's tokens with the GPT-2 tokenizer so the budget can include them.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [56]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build the completion prompt for `question` from the most relevant document sections.

    Sections are ranked with order_document_sections_by_query_similarity and looked up
    in `df` by their key. They are added in relevance order, each preceded by SEPARATOR
    and with newlines in the content replaced by spaces. Selection stops at the first
    section whose `tokens` value (plus the separator length) would push the running
    total past MAX_SECTION_LEN.

    The number of selected sections and their keys are printed as diagnostics.

    Returns the instruction header, the selected sections, and the question in a
    "Q: ... A:" layout, as one string.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    context_parts = []
    picked_keys = []
    tokens_used = 0

    for _score, section_key in ranked_sections:
        row = df.loc[section_key]

        # The section that overflows the budget still counts towards the total,
        # but it is never appended because the loop ends here.
        tokens_used += row.tokens + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        context_parts.append(SEPARATOR + row.content.replace("\n", " "))
        picked_keys.append(str(section_key))

    # Useful diagnostic information
    print(f"Selected {len(context_parts)} document sections:")
    print("\n".join(picked_keys))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return "".join([header, *context_parts, "\n\n Q: ", question, "\n A:"])

In [57]:
# Build a prompt for a sample question and print it below a "===" marker for inspection.
prompt = construct_prompt(
    "What is the member's capital in 2021?",
    document_embeddings,
    df
)

print("===\n", prompt)

Selected 5 document sections:
('doc_title', 'heading392')
('doc_title', 'heading13')
('doc_title', 'heading300')
('doc_title', 'heading889')
('doc_title', 'heading805')
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* Movements in members’ capital were as follows:£mBalance at 1 October 2019 86Capital introduced by members 5Transfer of amounts due to members allocated to capital 13Repayments of capital (17)Balance at 30 September 2020 87Capital introduced by members 5Allocation of profits arising in year 181Repayments of capital (15)Balance at 30 September 2021 258Non-current 181Current 77At 30 September 2021, £20 million of members’ capital owed to former partners was classified as a non-current liability (see note 13)
* Members’ capital is provided by each member on becoming a partner and is only repayable on retirement or resignation so generally remains stable fro

In [58]:
# Reference
# https://beta.openai.com/docs/api-reference/completions/create

# Keyword arguments unpacked into openai.Completion.create by answer_query_with_context.
COMPLETIONS_API_PARAMS = {
    # Higher values mean the model will take more risks. Try 0.9 for more creative
    # applications, and 0 (argmax sampling) for ones with a well-defined answer.
    "temperature": 0.0,
    # The maximum number of tokens to generate in the completion/answer. The token count
    # of prompt + max_tokens must be <= the model's context length (usually 2048).
    "max_tokens": 200,
    "model": COMPLETIONS_MODEL,
}

In [59]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict,
    show_prompt: bool = False
) -> str:
    """
    Answer `query` using context sections retrieved from the knowledge base.

    The prompt is assembled by construct_prompt from `document_embeddings` and `df`,
    printed first when `show_prompt` is true, and sent to openai.Completion.create
    together with COMPLETIONS_API_PARAMS.

    Returns the text of the first returned choice with surrounding spaces and
    newlines stripped.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    completion = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)
    first_choice = completion["choices"][0]
    return first_choice["text"].strip(" \n")

In [60]:
# Question about the 2021 report; the answer is the cell's displayed value.
answer_query_with_context("What is the member's capital in 2021?", df, document_embeddings)

Selected 5 document sections:
('doc_title', 'heading392')
('doc_title', 'heading13')
('doc_title', 'heading300')
('doc_title', 'heading889')
('doc_title', 'heading805')


"The member's capital is 258 million in 2021."

In [62]:
# Question spanning two years; print the question and answer together.
query = "What is the revenue in 2020 and 2021?"
answer = answer_query_with_context(query, df, document_embeddings)

print(f"\nQ: {query}\nA: {answer}")

Selected 6 document sections:
('doc_title', 'heading723')
('doc_title', 'heading226')
('doc_title', 'heading224')
('doc_title', 'heading701')
('doc_title', 'heading203')
('doc_title', 'heading62')

Q: What is the revenue in 2020 and 2021?
A: The revenue in 2020 is £2,303 million and in 2021 is £2,433 million.


In [63]:
# Open-ended question; print the question and answer together.
query = "How does Brexit impact KPMG?"
answer = answer_query_with_context(query, df, document_embeddings)

print(f"\nQ: {query}\nA: {answer}")

Selected 8 document sections:
('doc_title', 'heading33')
('doc_title', 'heading991')
('doc_title', 'heading489')
('doc_title', 'heading35')
('doc_title', 'heading587')
('doc_title', 'heading254')
('doc_title', 'heading1000')
('doc_title', 'heading502')

Q: How does Brexit impact KPMG?
A: The Brexit risk has reduced considerably since last year as we have taken significant steps to prepare the firm for the ongoing consequences of the UK leaving the European Union. However, uncertainty remains around the impact of new regulatory and other restrictions that may be implemented by EU member states in future periods and we will continue to evaluate the impact of Brexit on both the group’s activities and those of our clients.


In [64]:
# Question about the 2020 report; print the question and answer together.
query = "How is Discretionary Profit Share allocated to member?"
answer = answer_query_with_context(query, df, document_embeddings)

print(f"\nQ: {query}\nA: {answer}")

Selected 8 document sections:
('doc_title', 'heading519')
('doc_title', 'heading17')
('doc_title', 'heading521')
('doc_title', 'heading18')
('doc_title', 'heading234')
('doc_title', 'heading731')
('doc_title', 'heading733')
('doc_title', 'heading14')

Q: How is Discretionary Profit Share allocated to member?
A: Discretionary Profit Share is allocated to members on the basis of their relative in-year performance against their balanced scorecard goals.


In [65]:
# Same revenue question shifted back one year.
# NOTE(review): the recorded answer gives £2,433 million for 2019, which is the figure
# the earlier 2020/2021 query reported for 2021 — verify against the source reports.
query = "What is the revenue in 2019 and 2020?"
answer = answer_query_with_context(query, df, document_embeddings)

print(f"\nQ: {query}\nA: {answer}")

Selected 8 document sections:
('doc_title', 'heading701')
('doc_title', 'heading62')
('doc_title', 'heading723')
('doc_title', 'heading226')
('doc_title', 'heading224')
('doc_title', 'heading725')
('doc_title', 'heading565')
('doc_title', 'heading203')

Q: What is the revenue in 2019 and 2020?
A: The revenue in 2019 is £2,433 million and in 2020 is £2,303 million.


In [66]:
# Definition-style question; print the question and answer together.
query = "What is Disclosure of information to the auditor?"
answer = answer_query_with_context(query, df, document_embeddings)

print(f"\nQ: {query}\nA: {answer}")

Selected 6 document sections:
('doc_title', 'heading545')
('doc_title', 'heading43')
('doc_title', 'heading606')
('doc_title', 'heading110')
('doc_title', 'heading210')
('doc_title', 'heading69')

Q: What is Disclosure of information to the auditor?
A: Disclosure of information to the auditor is the process of sharing relevant information with the auditor so that they can perform their duties accurately.


In [69]:
# Out-of-scope question: the recorded answer is the "I don't know." fallback
# that the prompt header asks for.
query = "Who are the clients of KPMG UK?"
answer = answer_query_with_context(query, df, document_embeddings)

print(f"\nQ: {query}\nA: {answer}")

Selected 12 document sections:
('doc_title', 'heading991')
('doc_title', 'heading489')
('doc_title', 'heading1000')
('doc_title', 'heading502')
('doc_title', 'heading493')
('doc_title', 'heading508')
('doc_title', 'heading6')
('doc_title', 'heading509')
('doc_title', 'heading4')
('doc_title', 'heading19')
('doc_title', 'heading486')
('doc_title', 'heading228')

Q: Who are the clients of KPMG UK?
A: I don't know.
