In [250]:
import functools
import os
import pickle
import uuid

import joblib
from langchain.docstore.document import Document
from langchain.output_parsers import NumberedListOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm
from transformers import AutoModelForCausalLM,AutoTokenizer

In [71]:
# One sub-directory of llamaParseDocs per parsed report.
# Jupyter's checkpoint folder is filtered out rather than removed with list.remove(),
# which raises ValueError on a checkout where the folder does not exist.
# The order returned by os.listdir is kept as-is because other cells index
# into this list by position (pdfs[3:4], pdfs[20]).
pdfs = [name for name in os.listdir("llamaParseDocs") if name != ".ipynb_checkpoints"]
pdfs

['Amazon-2021-Annual-Report',
 'NASDAQ_NVDA_2022',
 'NASDAQ_NVDA_2023',
 '2020_alphabet_annual_report',
 'Amazon-2020-Annual-Report',
 'Apple_10-K-Q4-2020',
 'Apple_10-K-Q4-2022',
 'NASDAQ_NVDA_2020',
 'Apple_10-K-Q4-2023',
 'FB_2022',
 'FB_2021-Annual-Report_FB',
 'FB_2023',
 'Amazon-com-Inc-2023-Annual-Report',
 '2022-alphabet-annual-report',
 '2023_alphabet',
 'Apple_10-K-2021',
 'NASDAQ_NVDA_2021',
 'Amazon-2022-Annual-Report',
 '2024_alphabet-10-q-q1-2024',
 'FB_2020-Annual-Report_FB',
 '2021_alphabet_annual_report']

In [42]:
# Load the parsed Amazon 2021 report from its pickle.
# joblib.load unpickles the file, so only use it on files produced by this project.
doc = joblib.load("llamaParseDocs/Amazon-2021-Annual-Report/Amazon-2021-Annual-Report.pkl")

In [66]:
# Character-based splitter: 1250-character chunks with 200 characters of overlap.
# Separators are tried in the listed order and are matched literally (not as regex).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n---\n", "\n\n", "\n", ".", " ", ""],
    is_separator_regex=False,
    length_function=len,
    chunk_size=1250,
    chunk_overlap=200,
)

In [67]:
# Wrap the loaded report in a LangChain Document so the splitter can consume it.
# NOTE(review): this cell reads doc.text, while the per-report loop reads doc[0].text —
# confirm whether the pickle holds a single object or a list.
langdoc = Document(page_content = doc.text,metadata = doc.metadata)

In [68]:
# Split the single wrapped report into chunks using the splitter configured above.
docs = splitter.split_documents([langdoc])

In [107]:
# Chunk the selected report(s) and store the chunks next to the source pickle
# as llamaParseDocs/<report>/docs.pkl. Only the slice pdfs[3:4] is processed here.
for pdf in pdfs[3:4]:
    doc = joblib.load(f"llamaParseDocs/{pdf}/{pdf}.pkl")
    # Only the first element of the loaded object is used.
    langdoc = Document(
        page_content=doc[0].text,
        metadata=doc[0].metadata,
    )
    docs = splitter.split_documents([langdoc])
    with open(f"llamaParseDocs/{pdf}/docs.pkl", "wb") as f:
        pickle.dump(docs, f)

In [109]:
def updateDoc(doc,string):
    """Return a new Document whose text is `string` followed by `doc.page_content`.

    The metadata of `doc` is passed through to the new Document; `doc` itself
    is not modified.
    """
    prefixed_text = string + doc.page_content
    return Document(page_content=prefixed_text, metadata=doc.metadata)

In [113]:
# Display the report folder names to look up the index of a specific report.
pdfs

['Amazon-2021-Annual-Report',
 'NASDAQ_NVDA_2022',
 'NASDAQ_NVDA_2023',
 '2020_alphabet_annual_report',
 'Amazon-2020-Annual-Report',
 'Apple_10-K-Q4-2020',
 'Apple_10-K-Q4-2022',
 'NASDAQ_NVDA_2020',
 'Apple_10-K-Q4-2023',
 'FB_2022',
 'FB_2021-Annual-Report_FB',
 'FB_2023',
 'Amazon-com-Inc-2023-Annual-Report',
 '2022-alphabet-annual-report',
 '2023_alphabet',
 'Apple_10-K-2021',
 'NASDAQ_NVDA_2021',
 'Amazon-2022-Annual-Report',
 '2024_alphabet-10-q-q1-2024',
 'FB_2020-Annual-Report_FB',
 '2021_alphabet_annual_report']

In [157]:
# Prepend a provenance header to every chunk of one report and overwrite its docs.pkl.
# The report is chosen by position in `pdfs`, so the header text must match pdfs[20].
# Re-running this cell prepends the header again, because the file is rewritten in place.
spdf = pdfs[20]
docs = joblib.load(f"llamaParseDocs/{spdf}/docs.pkl")
docs = [
    updateDoc(
        doc,
        string = "The following text is an Excerpt from Alphabet(A.K.A Google)'s 2021 Financial Report:\n",
    )
    for doc in docs
]
with open(f"llamaParseDocs/{spdf}/docs.pkl", "wb") as f:
    pickle.dump(docs, f)

In [127]:
def repairDoc(doc,headerLen = 69):
    """Return a new Document with the first `headerLen` characters of the text removed.

    Args:
        doc: Document whose `page_content` starts with text that should be dropped.
        headerLen: Number of leading characters to strip. Defaults to 69, the value
            that was previously hard-coded.
            NOTE(review): presumably the length of one specific prepended header;
            header lengths differ between reports, so confirm before reuse.

    Returns:
        A new Document with the truncated text and the metadata of `doc`.
    """
    newDoc = Document(page_content = doc.page_content[headerLen:],metadata = doc.metadata)
    return newDoc

In [129]:
# Strip the leading header from every chunk of the report named by `spdf`
# and overwrite its docs.pkl with the repaired chunks.
# NOTE(review): `spdf` must already be set by another cell before this one runs.
docs = joblib.load(f"llamaParseDocs/{spdf}/docs.pkl")
docs = list(map(repairDoc, docs))
with open(f"llamaParseDocs/{spdf}/docs.pkl", "wb") as f:
    pickle.dump(docs, f)

In [159]:
# Check which report sits at index 20 before relying on that position.
pdfs[20]

'2021_alphabet_annual_report'

In [160]:
# Display the full list of report folder names again for reference.
pdfs

['Amazon-2021-Annual-Report',
 'NASDAQ_NVDA_2022',
 'NASDAQ_NVDA_2023',
 '2020_alphabet_annual_report',
 'Amazon-2020-Annual-Report',
 'Apple_10-K-Q4-2020',
 'Apple_10-K-Q4-2022',
 'NASDAQ_NVDA_2020',
 'Apple_10-K-Q4-2023',
 'FB_2022',
 'FB_2021-Annual-Report_FB',
 'FB_2023',
 'Amazon-com-Inc-2023-Annual-Report',
 '2022-alphabet-annual-report',
 '2023_alphabet',
 'Apple_10-K-2021',
 'NASDAQ_NVDA_2021',
 'Amazon-2022-Annual-Report',
 '2024_alphabet-10-q-q1-2024',
 'FB_2020-Annual-Report_FB',
 '2021_alphabet_annual_report']

In [161]:
# Concatenate the chunk lists of every report into one flat list.
allPDFdocs = []
for pdf in pdfs:
    allPDFdocs += joblib.load(f"llamaParseDocs/{pdf}/docs.pkl")

In [163]:
# Persist the combined chunk list to allPDFDocs.pkl in the working directory.
with open("allPDFDocs.pkl", "wb") as f:
    pickle.dump(allPDFdocs, f)

In [184]:
# Report folder name -> "<organization>'s <year>" label used in summary headers.
# Keyed by folder name rather than by position in `pdfs`, so the labels do not
# depend on the order in which os.listdir happens to return the folders.
_REPORT_LABELS = {
    "Amazon-2021-Annual-Report": "Amazon's 2021",
    "NASDAQ_NVDA_2022": "NVIDIA's 2022",
    "NASDAQ_NVDA_2023": "NVIDIA's 2023",
    "2020_alphabet_annual_report": "Alphabet(A.K.A Google)'s 2020",
    "Amazon-2020-Annual-Report": "Amazon's 2020",
    "Apple_10-K-Q4-2020": "Apple's 2020",
    "Apple_10-K-Q4-2022": "Apple's 2022",
    "NASDAQ_NVDA_2020": "NVIDIA's 2020",
    "Apple_10-K-Q4-2023": "Apple's 2023",
    "FB_2022": "Facebook's 2022",
    "FB_2021-Annual-Report_FB": "Facebook's 2021",
    "FB_2023": "Facebook's 2023",
    "Amazon-com-Inc-2023-Annual-Report": "Amazon's 2023",
    "2022-alphabet-annual-report": "Alphabet(A.K.A Google)'s 2022",
    "2023_alphabet": "Alphabet(A.K.A Google)'s 2023",
    "Apple_10-K-2021": "Apple's 2021",
    "NASDAQ_NVDA_2021": "NVIDIA's 2021",
    "Amazon-2022-Annual-Report": "Amazon's 2022",
    "2024_alphabet-10-q-q1-2024": "Alphabet(A.K.A Google)'s 2024",
    "FB_2020-Annual-Report_FB": "Facebook's 2020",
    "2021_alphabet_annual_report": "Alphabet(A.K.A Google)'s 2021",
}

# File-name prefix of a summary -> word used for it in the header.
_SUMMARY_KINDS = (("table", "table"), ("figure", "picture"))


def applyHeader(doc):
    """Return a new Document with a provenance header prepended to a summary.

    The header is derived from `doc.metadata['source']`, which is split on "/":
    element 1 is looked up as the report folder name and element 2 is the file
    name, whose prefix ("table" or "figure") selects the wording.

    Changes from the earlier version: the header table is defined once instead
    of twice, the "Facebooks's" typo in two headers is corrected to "Facebook's",
    and an unrecognised file-name prefix now raises a clear error instead of
    failing with UnboundLocalError.

    Args:
        doc: Document with a 'source' entry in its metadata.

    Returns:
        A new Document with the header plus the original text and the metadata of `doc`.

    Raises:
        KeyError: if the report folder name is not in the label table.
        ValueError: if the file name starts with neither "table" nor "figure".
    """
    source = doc.metadata['source'].split("/")
    for prefix, flag in _SUMMARY_KINDS:
        if source[2].startswith(prefix):
            header = f"Here is a summary of a {flag} from {_REPORT_LABELS[source[1]]} Financial Report:\n"
            return Document(page_content = header+doc.page_content,metadata = doc.metadata)
    raise ValueError(f"Cannot build a header: unrecognised summary source {doc.metadata['source']!r}")

In [188]:
# Load every report's table/figure summaries, add the provenance header to each,
# and collect them all into one flat list.
allPDFSums = []
for pdf in pdfs:
    sums = joblib.load(f"llamaParseDocs/{pdf}/summaries.pkl")
    sums = list(map(applyHeader, sums))
    allPDFSums += sums

In [193]:
# Persist the headed summaries to allPDFSums.pkl in the working directory.
with open("allPDFSums.pkl", "wb") as f:
    pickle.dump(allPDFSums, f)

In [242]:
@functools.lru_cache(maxsize=1)
def _loadHypoQueryModel(modelId = "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit"):
    """Load the generation model and its tokenizer once and reuse them on later calls."""
    model = AutoModelForCausalLM.from_pretrained(modelId,device_map = "cuda:0")
    tokenizer = AutoTokenizer.from_pretrained(modelId)
    return model,tokenizer


def genHypoQuery(doc):
    """Generate hypothetical questions that `doc` could answer.

    Builds a Llama-3 chat prompt around `doc`, samples a completion, parses it as
    a numbered list and wraps every parsed item in a Document.

    Fixes over the earlier version:
      * the model and tokenizer are loaded once instead of on every call;
      * the prompt is removed by slicing the generated token ids, not by slicing
        the *list* returned by batch_decode (which always produced an empty list);
      * inputs are moved to the model's device before generation;
      * the tokenizer is told not to add special tokens, since the prompt already
        starts with <|begin_of_text|>.

    Args:
        doc: Document to ask questions about; it is interpolated into the prompt
            and its metadata is attached to every generated question.

    Returns:
        A list of Documents, one per parsed question, each carrying `doc.metadata`.
    """
    # NOTE(review): "{doc}" interpolates str(doc), not doc.page_content — confirm
    # that including the whole Document representation is intended.
    # NOTE(review): the wording "patient questions" looks like a leftover from
    # another prompt; left unchanged so generations stay comparable.
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
-You Are an Expert Financial Analyst.
-You will be given Excerpts,Tables and Pictures from yearly financial reports of Organizations.
-Your Task is to Generate a numbered list of exactly 3 hypothetical short and simple patient questions that the document provided by the user could be used to answer for a financial chatbot. Each question should directly mention the Organization, Year, Topic, Financial Terms and Statistics it refers to without using pronouns like "these" or "this."
- Use this format for output: 1. "question 1" 2. "question 2" 3. "question 3".
- Ensure questions are short, simple, and directly reference the document's content by explicitly naming the Organization, Year, Topic, Financial Terms and Statistics etc. they are inquiring about.
- Avoid general pronouns and ensure specificity in each question to make them clear and direct.<|eot_id|>
<|start_header_id|>user<|end_header_id|>
###Document:{doc}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""
    model,tokenizer = _loadHypoQueryModel()
    inputs = tokenizer(prompt,return_tensors = "pt",add_special_tokens = False).to(model.device)
    # Sampling is enabled and no seed is set here, so results vary between runs.
    outputIds = model.generate(**inputs,max_new_tokens = 200,repetition_penalty = 1.2,do_sample = True,temperature = 0.2)
    # generate() returns prompt + continuation; keep only the newly generated tokens.
    promptLen = inputs["input_ids"].shape[1]
    questions = tokenizer.decode(outputIds[0][promptLen:],skip_special_tokens = True)
    parser = NumberedListOutputParser()
    return [Document(page_content = question,metadata = doc.metadata) for question in parser.parse(questions)]

In [229]:
# Reload the combined chunk list from disk.
docs = joblib.load("allPDFDocs.pkl")

In [246]:
# Reload the combined chunk list from disk (same statement as the previous cell;
# one of the two cells is redundant).
docs = joblib.load("allPDFDocs.pkl")

In [216]:
# Append the summaries to the chunk list in place; re-running this cell appends them again.
# NOTE(review): which value `sums` holds here depends on the order cells were run — confirm.
docs.extend(sums)

In [221]:
def addID(doc):
    """Tag `doc` with a freshly generated random UUID and return it.

    The identifier is stored as a string under the 'id' key of `doc.metadata`.
    `doc` is modified in place; the same object is returned.
    """
    freshId = uuid.uuid4()
    doc.metadata['id'] = str(freshId)
    return doc

In [230]:
# Give every chunk a unique 'id' in its metadata.
docs = list(map(addID, docs))

In [232]:
# Sanity check: number of chunks after tagging.
len(docs)

10160

In [234]:
# Give every summary a unique 'id' in its metadata.
sums = list(map(addID, sums))

In [235]:
# Sanity check: number of summaries after tagging.
len(sums)

1970

In [239]:
# Overwrite allPDFSums.pkl with the summaries that now carry an 'id'.
with open("allPDFSums.pkl", "wb") as f:
    pickle.dump(sums, f)

In [240]:
# Overwrite allPDFDocs.pkl with the chunks that now carry an 'id'.
with open("allPDFDocs.pkl", "wb") as f:
    pickle.dump(docs, f)

In [248]:
# Reload both persisted lists and merge the summaries into the chunk list,
# so `docs` holds every item that question generation should cover.
sums = joblib.load("allPDFSums.pkl")
docs = joblib.load("allPDFDocs.pkl")
docs += sums

In [249]:
# Sanity check: chunks plus summaries.
len(docs)

12130

In [251]:
# Generate hypothetical questions for every chunk and summary, then persist them.
# The pickle is written only after the whole loop finishes, so an interruption
# part-way through loses all generated questions.
allHypoQuery = []
for doc in tqdm(docs):
    qs = genHypoQuery(doc)
    allHypoQuery += qs
with open("allPDFQueries.pkl", "wb") as f:
    pickle.dump(allHypoQuery, f)

100%|██████████| 12130/12130 [3:39:35<00:00,  1.09s/it] 


In [252]:
# Reload the generated questions from disk; this rebinds `qs`, which the
# generation loop used for the questions of a single document.
qs = joblib.load("allPDFQueries.pkl")