For this project, we will use Llama 2 because it is free and easy to run locally.

https://huggingface.co/blog/llama2

# Imports

In [1]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain

# Loading the pdf document

In [2]:
# Load documents
# Path to the PDF to summarize, relative to the notebook's working directory.
file_path = '../pdfs/1912.13318.pdf'
loader = PyPDFLoader(file_path)
# Load and split: read the PDF and return a list of Document objects whose
# text is exposed as `.page_content` (used by the later cells).
# NOTE(review): no text splitter is passed, so the loader's default is used — confirm it suits the model.
docs = loader.load_and_split()

In [3]:
# Number of Document objects returned by load_and_split().
len(docs)

17

In [4]:
# Display the loaded Document objects to inspect the extracted text.
docs

[Document(page_content='LayoutLM: Pre-training of Text and Layout for\nDocument Image Understanding\nYiheng Xu∗\ncharlesyihengxu@gmail.com\nHarbin Institute of TechnologyMinghao Li∗\nliminghao1630@buaa.edu.cn\nBeihang UniversityLei Cui\nlecu@microsoft.com\nMicrosoft Research Asia\nShaohan Huang\nshaohanh@microsoft.com\nMicrosoft Research AsiaFuru Wei\nfuwei@microsoft.com\nMicrosoft Research AsiaMing Zhou\nmingzhou@microsoft.com\nMicrosoft Research Asia\nABSTRACT\nPre-training techniques have been verified successfully in a vari-\nety of NLP tasks in recent years. Despite the widespread use of\npre-training models for NLP applications, they almost exclusively\nfocus on text-level manipulation, while neglecting layout and style\ninformation that is vital for document image understanding. In\nthis paper, we propose the LayoutLM to jointly model interactions\nbetween text and layout information across scanned document\nimages, which is beneficial for a great number of real-world doc-\numen

In [5]:
# Look at the raw text of a single document (index 3).
docs[3].page_content

'2 LAYOUTLM\nIn this section, we briefly review the BERT model, and introduce\nhow we extend to jointly model text and layout information in the\nLayoutLM framework.\n3https://guillaumejaume.github.io/FUNSD/\n4https://rrc.cvc.uab.es/?ch=13\n5https://www.cs.cmu.edu/~aharley/rvl-cdip/'

### Splitting text
Split the text retrieved from the docs into chunks that can fit into the model's context window.

For simplicity, we will use Character text splitter.

https://python.langchain.com/docs/modules/data_connection/document_transformers/

In [6]:
# TODO: Use the tokenizer from the original meta repo, not working properly
# from transformers import LlamaTokenizerFast
# from langchain.docstore.document import Document

# # We use the llama tokenizer to count tokens and split text into chunks
# # TODO: Use the tokenizer from the original meta repo
# tokenizer = LlamaTokenizerFast.from_pretrained("TheBloke/Llama-2-7B-fp16")

# text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
#     tokenizer=tokenizer,
#     chunk_size=1000, # 1000 tokens per chunk
#     chunk_overlap=0  # No overlap
# )
# split_docs = text_splitter.split_documents(docs)
# text = ''
# for doc in docs:
#     text += doc.page_content

# texts = text_splitter.split_text(text)
# split_docs = [Document(page_content=t) for t in texts]


In [7]:
# len(split_docs)

# Define summary prompt and load the llm

In [8]:
# Define the prompt
# `{text}` is the single template variable; the summarization chains built
# from this prompt substitute the document text for it.
prompt_template = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

In [9]:
from langchain_community.llms import Ollama

# Define the LLM chain
# The model is addressed through Ollama under the name "llama2".
# NOTE(review): presumably requires a running local Ollama server with that model pulled — confirm.
llm = Ollama(model="llama2")
# NOTE(review): the cells visible in this notebook build their chains with
# load_summarize_chain(llm, ...) and never reference llm_chain — confirm it is still needed.
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [10]:
# Total token count across all loaded documents, as reported by the LLM wrapper.
# NOTE(review): the warning recorded for this cell mentions a 1024-token model
# limit, which suggests a fallback tokenizer rather than Llama 2's own —
# treat the number as an approximation.
num_tokens = sum(llm.get_num_tokens(chunk.page_content) for chunk in docs)
print(num_tokens)

Token indices sequence length is longer than the specified maximum sequence length for this model (1101 > 1024). Running this sequence through the model will result in indexing errors


12958


In [11]:
# Alternatively:
# from tokenizers import Tokenizer
# tokenizer = Tokenizer.from_pretrained("TheBloke/Llama-2-7B-fp16")
# num_tokens = 0
# for doc in docs:
#     num_tokens += len(tokenizer.encode(doc.page_content))

# print(num_tokens)

In [12]:
# Llama 2 has a context length of 4096 tokens.
max_tokens = 4096

# Stuff

Stuffing is the simplest method to pass data to a language model. It "stuffs" text into the prompt as context in a way that all of the relevant information can be processed by the model to get what you want.

In [13]:
# "Stuff" summarization chain driven by the custom summary prompt.
stuff_chain = load_summarize_chain(
    llm=llm,
    chain_type="stuff",
    prompt=prompt,
)

In [14]:
# Summarize only the first two documents with the stuff chain.
# NOTE(review): the except branch labels every failure as a context-length
# problem, whatever the actual exception is; the real error text is appended via `e`.
try:
    print(stuff_chain.invoke(docs[:2]))
except Exception as e:
    print("The code failed since it won't be able to fit the documents into the LLM context length: ", e)

{'input_documents': [Document(page_content='LayoutLM: Pre-training of Text and Layout for\nDocument Image Understanding\nYiheng Xu∗\ncharlesyihengxu@gmail.com\nHarbin Institute of TechnologyMinghao Li∗\nliminghao1630@buaa.edu.cn\nBeihang UniversityLei Cui\nlecu@microsoft.com\nMicrosoft Research Asia\nShaohan Huang\nshaohanh@microsoft.com\nMicrosoft Research AsiaFuru Wei\nfuwei@microsoft.com\nMicrosoft Research AsiaMing Zhou\nmingzhou@microsoft.com\nMicrosoft Research Asia\nABSTRACT\nPre-training techniques have been verified successfully in a vari-\nety of NLP tasks in recent years. Despite the widespread use of\npre-training models for NLP applications, they almost exclusively\nfocus on text-level manipulation, while neglecting layout and style\ninformation that is vital for document image understanding. In\nthis paper, we propose the LayoutLM to jointly model interactions\nbetween text and layout information across scanned document\nimages, which is beneficial for a great number of r

In [15]:
# Stuff every document into a single prompt.
# NOTE(review): this cell was written expecting an error when the prompt is too
# long, but the output recorded for it is a result dict, not the except message —
# the overlong prompt may be truncated silently instead of raising; confirm
# before relying on the exception.
try:
    print(stuff_chain.invoke(docs))
except Exception as e:
    print("The code failed since it won't be able to fit the documents into the LLM context length: ", e)

{'input_documents': [Document(page_content='LayoutLM: Pre-training of Text and Layout for\nDocument Image Understanding\nYiheng Xu∗\ncharlesyihengxu@gmail.com\nHarbin Institute of TechnologyMinghao Li∗\nliminghao1630@buaa.edu.cn\nBeihang UniversityLei Cui\nlecu@microsoft.com\nMicrosoft Research Asia\nShaohan Huang\nshaohanh@microsoft.com\nMicrosoft Research AsiaFuru Wei\nfuwei@microsoft.com\nMicrosoft Research AsiaMing Zhou\nmingzhou@microsoft.com\nMicrosoft Research Asia\nABSTRACT\nPre-training techniques have been verified successfully in a vari-\nety of NLP tasks in recent years. Despite the widespread use of\npre-training models for NLP applications, they almost exclusively\nfocus on text-level manipulation, while neglecting layout and style\ninformation that is vital for document image understanding. In\nthis paper, we propose the LayoutLM to jointly model interactions\nbetween text and layout information across scanned document\nimages, which is beneficial for a great number of r

Pros:
Only makes a single call to the LLM. When generating text, the LLM has access to all the data at once.

Cons:
Most LLMs have a limited context length, and for large documents (or many documents) this will not work, as it will result in a prompt larger than the context length.

## Map-Reduce

The MapReduce method implements a multi-stage summarization. It is a technique for summarizing large pieces of text by first summarizing smaller chunks of text and then combining those summaries into a single summary.

In [None]:
# Map chain: map each document to an individual summary
# NOTE: the leading indentation inside the triple-quoted string is part of the
# prompt text; `{text}` is the only template variable.
map_prompt_template = """
                      Write a summary of this chunk of text that includes the main points and any important details.
                      {text}
                      """

map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

In [None]:
# Reduce chain: prompt that asks for one bullet-point summary of the text it
# receives (passed to the map-reduce chain as its combine prompt).
# NOTE: the leading indentation inside the triple-quoted string is part of the
# prompt text; `{text}` is the only template variable.
reduce_template = """
                      Write a concise summary of the following text delimited by triple backquotes.
                      Return your response in bullet points which covers the key points of the text.
                      ```{text}```
                      BULLET POINT SUMMARY:
                      """

reduce_prompt = PromptTemplate(
    template=reduce_template, input_variables=["text"]
)

In [None]:
# Map-reduce summarization chain: map_prompt is used for the per-document step,
# reduce_prompt for the final combine step.
map_reduce_chain = load_summarize_chain(
    llm=llm,
    chain_type="map_reduce",
    map_prompt=map_prompt,
    combine_prompt=reduce_prompt,
)

In [None]:
# Run the map-reduce chain over every loaded document.
map_reduce_outputs = map_reduce_chain.invoke(docs)

  warn_deprecated(


Pros:
This can scale to larger documents (and more documents) than StuffDocumentsChain. The calls to the LLM on individual documents are independent and can therefore be parallelized.

Cons:
Requires many more calls to the LLM than StuffDocumentsChain. Loses some information during the final combining call.

# Putting it together: Stuff or Map-Reduce

Based on the number of tokens, use stuff or map-reduce

In [None]:
import textwrap
from time import monotonic

# Pick the cheapest strategy that fits: one "stuff" call when the documents fit
# in the context window, map-reduce otherwise.
# NOTE: num_tokens counts only the document text; the prompt wrapper and the
# generated summary also consume context, so this check is optimistic near the limit.
if num_tokens < max_tokens:
    chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
else:
    chain = load_summarize_chain(
        llm,
        chain_type="map_reduce",
        map_prompt=map_prompt,
        combine_prompt=reduce_prompt,
    )


start_time = monotonic()
# invoke() returns a dict (the input documents plus the generated text), not a
# plain string.
summary = chain.invoke(docs)


print(f"Run time: {monotonic() - start_time}")
# The generated summary lives under "output_text"; textwrap.fill() needs a str,
# so passing the whole dict would fail.
print(f"Summary: {textwrap.fill(summary['output_text'], width=100)}")

ValidationError: 1 validation error for StuffDocumentsChain
__root__
  document_variable_name text was not found in llm_chain input_variables: ['docs'] (type=value_error)

# Refine

This method involves an initial prompt on the first chunk of data, generating some output. For the remaining documents, that output is passed in, along with the next document, asking the LLM to refine the output based on the new document.

In [None]:
# Build a "refine" summarization chain; no custom prompt is passed here.
# NOTE: the chain is only constructed in this cell, not invoked.
summarize_refine = load_summarize_chain(llm=llm, chain_type="refine")

Pros:
Can pull in more relevant context, and may be less lossy than MapReduceDocumentsChain.

Cons:
Requires many more calls to the LLM than StuffDocumentsChain. The calls are also NOT independent, meaning they cannot be parallelized like MapReduceDocumentsChain. There are also some potential dependencies on the ordering of the documents.

However, these approaches are still costly and can take a lot of time for long documents because of how they work.



# Extractive then abstractive summarization approach

# Embeddings + KMeans over chunks