In [2]:
from transformers import GPT2TokenizerFast
import PyPDF2
import re 
import numpy as np
import pandas as pd
import openai
import time
from pathlib import Path

# Data preprocessing

In [8]:
# Collect the source PDFs. The listing is sorted because Path.glob() yields
# entries in a filesystem-dependent order, while every downstream row index
# (and the resume-by-index embedding checkpoints) depends on the file order.
pdf_search = sorted(Path("knowledge_base_doc/").glob("*.pdf"))
pdf_files = [str(file.absolute()) for file in pdf_search]
pdf_files

['/Users/nora/Downloads/question_and_answer/qa_model/experiment/knowledge_base_doc/uk-members-report-and-financial-statements-2021.pdf',
 '/Users/nora/Downloads/question_and_answer/qa_model/experiment/knowledge_base_doc/uk-members-report-financial-statements-2020.pdf']

In [9]:
def extract_text_from_pdf(files: list) -> list:
    """Extract the text of every page of the given PDF files.

    Args:
        files: Paths of the PDF files to read; they are processed in the
            order given.

    Returns:
        A flat list with one entry per page (the value returned by
        ``page.extract_text()``), ordered file by file and page by page.
        Note that this is a list of page texts, not a single string.
    """
    pdf_files_text = []
    for pdf_file in files:
        with open(pdf_file, 'rb') as pdf:
            reader = PyPDF2.PdfReader(pdf, strict=False)
            # One entry per page, appended in page order.
            pdf_files_text.extend(page.extract_text() for page in reader.pages)

    return pdf_files_text

In [10]:
# Run the extraction over every discovered PDF; the result holds one entry per page.
extracted_text = extract_text_from_pdf(pdf_files)

In [11]:
# Total number of page entries extracted across all PDF files.
len(extracted_text)

153

In [14]:
# Preview only the first few pages; echoing the whole list floods the output.
extracted_text[:3]

['UK Members’ Report \nand Financial  \nStatements 2021\nIn respect of the year ended 30 September 2021\nRegistered number OC301540\nJanuary 2022\nkpmg.com/uk',
 'Contents\nReport to the \nmembers\n3Independent auditor’s \nreport to the members \nof KPMG LLP\n6\nConsolidated \nincome \nstatement\n17Consolidated \nstatement of \ncomprehensive \nincome\n18\nStatements of  \nfinancial position\n19Statements of  \nchanges in equity\n20Statements of  \ncash flows\n21\nNotes\n22Appendix: Energy \nand Carbon report\n75\n2\nUK Members’ Report and Financial Statements 2021\n© 2022 KPMG LLP , a UK limited liability partnership and a member firm of the KPMG global organisation of independent \nmember firms affiliated with KPMG International Limited, a private English company limited by guarantee. All rights reserved.',
 'Report to the members\nThe Board submits its report together with the audited consolidated financial statements of \nKPMG LLP and its subsidiary undertakings (the group) for the 

In [15]:
# Split every page on a full stop that ends a line. The capturing group makes
# re.split keep each matched separator as its own list item, so separators
# appear between the text fragments and have to be filtered out later.
sentence_break = re.compile(r"(\.\s*\n)")
splitted_text = [sentence_break.split(page) for page in extracted_text]
# Preview the first pages only instead of echoing every page.
splitted_text[:3]

[['UK Members’ Report \nand Financial  \nStatements 2021\nIn respect of the year ended 30 September 2021\nRegistered number OC301540\nJanuary 2022\nkpmg.com/uk'],
 ['Contents\nReport to the \nmembers\n3Independent auditor’s \nreport to the members \nof KPMG LLP\n6\nConsolidated \nincome \nstatement\n17Consolidated \nstatement of \ncomprehensive \nincome\n18\nStatements of  \nfinancial position\n19Statements of  \nchanges in equity\n20Statements of  \ncash flows\n21\nNotes\n22Appendix: Energy \nand Carbon report\n75\n2\nUK Members’ Report and Financial Statements 2021\n© 2022 KPMG LLP , a UK limited liability partnership and a member firm of the KPMG global organisation of independent \nmember firms affiliated with KPMG International Limited, a private English company limited by guarantee. All rights reserved.'],
 ['Report to the members\nThe Board submits its report together with the audited consolidated financial statements of \nKPMG LLP and its subsidiary undertakings (the group) for

In [17]:
# Flatten the per-page fragment lists into one flat list, keeping their order.
splitted_string = [
    fragment
    for page_fragments in splitted_text
    for fragment in page_fragments
]

In [18]:
# Number of fragments (text pieces and captured separators) after flattening.
len(splitted_string)

1851

In [34]:
# Drop the line breaks left inside each fragment.
new_string_list = [item.replace('\n', '') for item in splitted_string]

# Discard fragments that are only a sentence separator or only whitespace.
# Comparing the stripped text covers a full stop followed by any amount of
# whitespace, not just the zero/one/two-space variants.
# NOTE(review): if this changes the number of rows, embedding checkpoints that
# are keyed by row index must be regenerated.
items_to_remove = ['.', '']
string_list_clean = [item for item in new_string_list if item.strip() not in items_to_remove]

# Preview a few cleaned fragments rather than the full list.
string_list_clean[:5]

['UK Members’ Report and Financial  Statements 2021In respect of the year ended 30 September 2021Registered number OC301540January 2022kpmg.com/uk',
 'ContentsReport to the members3Independent auditor’s report to the members of KPMG LLP6Consolidated income statement17Consolidated statement of comprehensive income18Statements of  financial position19Statements of  changes in equity20Statements of  cash flows21Notes22Appendix: Energy and Carbon report752UK Members’ Report and Financial Statements 2021© 2022 KPMG LLP , a UK limited liability partnership and a member firm of the KPMG global organisation of independent member firms affiliated with KPMG International Limited, a private English company limited by guarantee. All rights reserved.',
 'Report to the membersThe Board submits its report together with the audited consolidated financial statements of KPMG LLP and its subsidiary undertakings (the group) for the year ended 30 September 2021 ',
 'Legal structureKPMG LLP is the UK member

In [35]:
# Number of fragments remaining after the cleanup filter.
len(string_list_clean)

1002

# Tokenize

In [36]:
# GPT-2 tokenizer, used here only to count the tokens of each fragment.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Per the original note, this is the maximum length the tokenizer can process.
# Longer inputs are truncated to it, so a count equal to MAX_TOKENS marks a
# fragment that reached the limit.
MAX_TOKENS = 1024

tokens = []
for fragment in string_list_clean:
    pieces = tokenizer.tokenize(fragment, max_length=MAX_TOKENS, truncation=True)
    tokens.append(len(pieces))


In [37]:
# Placeholder metadata for the knowledge-base table: the same title on every
# row and a running "heading<N>" label that starts at 1.
columns = ['title', 'heading', 'content', 'tokens']
n_rows = len(string_list_clean)
title = ['doc_title'] * n_rows
heading = [f'heading{i+1}' for i in range(n_rows)]

In [93]:
# Assemble the knowledge-base table, one row per cleaned fragment; the column
# names come from the `columns` list defined with the metadata.
column_values = (title, heading, string_list_clean, tokens)
df = pd.DataFrame(dict(zip(columns, column_values)))

df

Unnamed: 0,title,heading,content,tokens
0,doc_title,heading1,UK Members’ Report and Financial Statements 2...,33
1,doc_title,heading2,ContentsReport to the members3Independent audi...,123
2,doc_title,heading3,Report to the membersThe Board submits its rep...,40
3,doc_title,heading4,Legal structureKPMG LLP is the UK member firm ...,44
4,doc_title,heading5,All member firms are committed to following co...,26
...,...,...,...,...
997,doc_title,heading998,"ECR KPI 2020 2019Scope 1Natural gas (kWh) 11,3...",303
998,doc_title,heading999,ECR KPI 2020 2019Scope 3Business-related car t...,367
999,doc_title,heading1000,"kpmg.com/uk© 2021 KPMG LLP , a UK limited liab...",53
1000,doc_title,heading1001,The KPMG name and logo are trademarks used und...,24


In [94]:
# Find the rows whose token count equals MAX_TOKENS. Tokenization truncates at
# MAX_TOKENS, so these are the fragments that reached (or exceeded) the limit.
df[df.tokens == MAX_TOKENS].index

Int64Index([], dtype='int64')

In [95]:
# Remove the fragments that hit the token limit, then renumber the rows.
hit_limit = df.tokens == MAX_TOKENS
rows_at_limit = df[hit_limit].index
if not rows_at_limit.empty:
    df.drop(rows_at_limit, inplace=True)
    df.reset_index(drop=True, inplace=True)

In [96]:
# (rows, columns) of the table after the token-limit filter.
df.shape

(1002, 4)

In [42]:
# Persist the knowledge-base table to CSV, leaving the row index out of the file.
df.to_csv('knowledge_doc.csv', index=False)

# Embedding

In [5]:
import os
from dotenv import load_dotenv

# load_dotenv() runs first so that OPENAI_API_KEY can be supplied through it
# (presumably from a .env file - confirm); the key is then read from the
# environment and handed to the openai module.
load_dotenv()
api = os.environ.get("OPENAI_API_KEY")
openai.api_key = api

In [6]:
# Model family name that is interpolated into both model identifiers below.
MODEL_NAME = "curie"

# DOC_EMBEDDINGS_MODEL is the identifier get_doc_embedding passes on;
# QUERY_EMBEDDINGS_MODEL is defined here but not used in the visible cells.
DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"

def get_embedding(text: str, model: str) -> list:
    """Request an embedding for ``text`` from the given model.

    Args:
        text: The input passed to the embedding request.
        model: The model identifier passed to the embedding request.

    Returns:
        The ``"embedding"`` value of the first item in the response's
        ``"data"`` list.
    """
    # NOTE(review): max_tokens is forwarded to the API as-is; confirm the
    # embeddings endpoint actually honours it.
    response = openai.Embedding.create(input=text, model=model, max_tokens=2046)
    first_item = response["data"][0]
    return first_item["embedding"]

def get_doc_embedding(text: str) -> list:
    """Embed ``text`` with the document model (``DOC_EMBEDDINGS_MODEL``)."""
    return get_embedding(text=text, model=DOC_EMBEDDINGS_MODEL)

In [79]:
# Embed every row of df from resume_idx onward, keyed by the row label.
# last_index is presumably the last row already covered by an earlier
# checkpoint file (confirm); set it to -1 to start from the first row.
last_index = 757
resume_idx = last_index + 1

# Vectors are collected in a dict and turned into a frame once. No extra API
# call is needed to learn the vector length, and the frame is not re-grown
# for each row.
embedding_rows = {}
try:
    for idx in df.loc[resume_idx:].index:
        embedding_rows[idx] = get_doc_embedding(df.loc[idx, 'content'])
        time.sleep(3)  # pause between requests (presumably for rate limiting)
finally:
    # Build df2 even if the loop is interrupted, so the rows fetched so far
    # can still be saved as a checkpoint. Columns are 0..dim-1, index is the
    # df row label.
    df2 = pd.DataFrame.from_dict(embedding_rows, orient='index')
    

In [80]:
# Embeddings computed in this run: one row per df row label, one column per dimension.
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
758,-0.013127,-0.021028,0.015811,0.005975,-0.013584,-0.000080,-0.013782,-0.011959,0.008723,-0.017041,...,-0.001192,0.014716,-0.000532,0.021387,0.010200,0.008166,-0.005252,0.026487,0.000112,0.019627
759,-0.022403,-0.003328,0.021790,0.016779,-0.002484,0.000422,-0.017607,-0.017086,-0.006712,-0.007497,...,0.005633,0.024523,-0.005987,0.010216,0.016445,0.009491,-0.013479,0.014818,0.010811,-0.004402
760,-0.010010,-0.011024,0.015025,0.013797,-0.011480,0.000703,-0.016318,-0.013992,0.001555,-0.001504,...,0.018988,0.005284,-0.003814,0.012643,0.006289,0.015034,-0.001094,0.002905,0.008201,0.011099
761,-0.008706,-0.010859,0.004401,-0.002217,0.011813,0.000435,0.008104,-0.017730,0.012257,-0.001518,...,-0.004827,0.020293,0.010349,-0.000717,0.002961,0.014280,-0.012950,0.010638,0.021777,-0.000344
762,-0.004453,-0.004556,0.007239,-0.005680,0.013464,-0.001216,0.002854,-0.008845,0.012957,-0.012422,...,0.008882,0.024975,0.010938,-0.004786,0.020731,0.015389,0.014553,0.019192,0.013342,-0.000343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,-0.022023,-0.012541,0.017417,0.024647,-0.010179,0.001896,-0.008292,-0.021480,-0.012296,-0.000766,...,0.007718,0.013174,-0.005135,0.019037,-0.003210,0.000527,-0.019146,-0.000727,0.018838,0.031541
998,-0.018099,-0.012854,0.013590,0.013381,-0.011863,0.011108,-0.006672,-0.018072,-0.014690,-0.008709,...,0.008554,0.002094,-0.002648,0.008090,-0.008859,-0.002973,-0.014481,0.010399,0.026271,0.033852
999,-0.003308,-0.008990,0.002737,0.013413,-0.015361,0.007884,-0.023802,-0.004067,0.009651,0.001637,...,0.009297,0.009675,0.012374,0.012260,0.020796,0.015937,-0.006787,0.001769,0.012411,0.020078
1000,-0.004768,-0.006661,-0.002423,0.008532,-0.006978,0.005500,-0.032082,-0.013919,0.004809,0.016003,...,-0.000947,0.000640,0.012469,0.004145,0.009428,0.014080,-0.007381,0.001932,0.013047,0.013284


In [81]:
# Earlier single-file save, kept for reference:
# df2.to_csv('knowledge_base_embedding_temp/knowledge_embedding(temp).csv', index=False)

# Checkpoint save: write this run's embeddings to their own part file. The row
# index is written as well (no index=False), so each vector keeps its row label.
df2.to_csv('knowledge_base_embedding_temp/knowledge_embedding(temp)_idx1001(part3).csv')

# Concatenate the checkpoint files into one embedding file

In [82]:
# Gather the absolute path of every checkpoint CSV in the temp folder.
csv_search = Path("knowledge_base_embedding_temp/").glob("*.csv")
csv_files = []
for part_path in csv_search:
    csv_files.append(str(part_path.absolute()))
csv_files


['/Users/nora/Downloads/question_and_answer/qa_model/experiment/knowledge_base_embedding_temp/knowledge_embedding(temp)_idx281(part1).csv',
 '/Users/nora/Downloads/question_and_answer/qa_model/experiment/knowledge_base_embedding_temp/knowledge_embedding(temp)_idx1001(part3).csv',
 '/Users/nora/Downloads/question_and_answer/qa_model/experiment/knowledge_base_embedding_temp/knowledge_embedding(temp)_idx757(part2).csv']

In [103]:
# Read every checkpoint file and stack them with a single concat call instead
# of re-copying a growing frame once per file.
# The checkpoints were written together with their row index, which read_csv
# exposes as the "Unnamed: 0" column; rename it so it can become the index again.
checkpoint_frames = [
    pd.read_csv(file).rename(columns={"Unnamed: 0": "index"})
    for file in csv_files
]
embeddings = pd.concat(checkpoint_frames).sort_values(by='index').set_index('index')

# Overlapping checkpoint files would silently duplicate rows in a later join.
assert embeddings.index.is_unique, "checkpoint files contain overlapping row indices"
embeddings

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.009858,-0.004410,0.004705,0.018317,-0.019094,0.007233,-0.029483,-0.014476,0.013891,0.001317,...,0.007695,0.001439,-0.000055,0.008802,0.005569,0.001749,-0.002647,0.011906,0.004657,0.023301
1,0.000246,-0.021595,0.002870,0.001398,-0.016845,-0.000114,-0.016727,-0.014957,-0.003483,0.005390,...,-0.000629,0.022212,0.004121,0.005889,-0.000065,0.008069,-0.007029,0.009054,0.004107,0.030858
2,-0.002593,-0.006904,-0.002627,-0.001049,-0.010546,0.001398,-0.022700,-0.002381,0.008357,0.000111,...,-0.014192,0.009468,0.002518,0.007343,0.013955,0.001652,-0.004958,0.009797,0.005876,0.015517
3,-0.007109,-0.001163,-0.005644,0.013460,-0.010816,0.002093,-0.031081,-0.004756,0.003608,0.002595,...,-0.003529,-0.000210,0.007978,0.000669,0.005233,0.008195,-0.006920,-0.002072,-0.001859,0.023352
4,-0.001805,-0.000603,-0.010486,0.018962,-0.022777,-0.000062,-0.020656,-0.007369,0.009611,0.000882,...,0.008486,0.001317,0.002078,0.002526,0.012793,0.012096,-0.006518,0.000187,0.014143,0.020414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,-0.022023,-0.012541,0.017417,0.024647,-0.010179,0.001896,-0.008292,-0.021480,-0.012296,-0.000766,...,0.007718,0.013174,-0.005135,0.019037,-0.003210,0.000527,-0.019146,-0.000727,0.018838,0.031541
998,-0.018099,-0.012854,0.013590,0.013381,-0.011863,0.011108,-0.006672,-0.018072,-0.014690,-0.008709,...,0.008554,0.002094,-0.002648,0.008090,-0.008859,-0.002973,-0.014481,0.010399,0.026271,0.033852
999,-0.003308,-0.008990,0.002737,0.013413,-0.015361,0.007884,-0.023802,-0.004067,0.009651,0.001637,...,0.009297,0.009675,0.012374,0.012260,0.020796,0.015937,-0.006787,0.001769,0.012411,0.020078
1000,-0.004768,-0.006661,-0.002423,0.008532,-0.006978,0.005500,-0.032082,-0.013919,0.004809,0.016003,...,-0.000947,0.000640,0.012469,0.004145,0.009428,0.014080,-0.007381,0.001932,0.013047,0.013284


In [106]:
# Attach the embedding columns to the knowledge-base rows by index label.
df_merged = df.join(embeddings)
# Keep only the identifying columns plus the embedding dimensions.
text_columns = ['content', 'tokens']
df_embedding_doc = df_merged.drop(columns=text_columns)
df_embedding_doc

Unnamed: 0,title,heading,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,doc_title,heading1,0.009858,-0.004410,0.004705,0.018317,-0.019094,0.007233,-0.029483,-0.014476,...,0.007695,0.001439,-0.000055,0.008802,0.005569,0.001749,-0.002647,0.011906,0.004657,0.023301
1,doc_title,heading2,0.000246,-0.021595,0.002870,0.001398,-0.016845,-0.000114,-0.016727,-0.014957,...,-0.000629,0.022212,0.004121,0.005889,-0.000065,0.008069,-0.007029,0.009054,0.004107,0.030858
2,doc_title,heading3,-0.002593,-0.006904,-0.002627,-0.001049,-0.010546,0.001398,-0.022700,-0.002381,...,-0.014192,0.009468,0.002518,0.007343,0.013955,0.001652,-0.004958,0.009797,0.005876,0.015517
3,doc_title,heading4,-0.007109,-0.001163,-0.005644,0.013460,-0.010816,0.002093,-0.031081,-0.004756,...,-0.003529,-0.000210,0.007978,0.000669,0.005233,0.008195,-0.006920,-0.002072,-0.001859,0.023352
4,doc_title,heading5,-0.001805,-0.000603,-0.010486,0.018962,-0.022777,-0.000062,-0.020656,-0.007369,...,0.008486,0.001317,0.002078,0.002526,0.012793,0.012096,-0.006518,0.000187,0.014143,0.020414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,doc_title,heading998,-0.022023,-0.012541,0.017417,0.024647,-0.010179,0.001896,-0.008292,-0.021480,...,0.007718,0.013174,-0.005135,0.019037,-0.003210,0.000527,-0.019146,-0.000727,0.018838,0.031541
998,doc_title,heading999,-0.018099,-0.012854,0.013590,0.013381,-0.011863,0.011108,-0.006672,-0.018072,...,0.008554,0.002094,-0.002648,0.008090,-0.008859,-0.002973,-0.014481,0.010399,0.026271,0.033852
999,doc_title,heading1000,-0.003308,-0.008990,0.002737,0.013413,-0.015361,0.007884,-0.023802,-0.004067,...,0.009297,0.009675,0.012374,0.012260,0.020796,0.015937,-0.006787,0.001769,0.012411,0.020078
1000,doc_title,heading1001,-0.004768,-0.006661,-0.002423,0.008532,-0.006978,0.005500,-0.032082,-0.013919,...,-0.000947,0.000640,0.012469,0.004145,0.009428,0.014080,-0.007381,0.001932,0.013047,0.013284


In [107]:
# Write the title/heading columns plus the embedding columns to the final CSV, without the row index.
df_embedding_doc.to_csv('knowledge_embedding.csv', index=False)