# LDA and LLMs for Document Summarization

Installing necessary libraries not included in colab.

In [24]:
!pip install pypdf==3.14.0
!pip install tiktoken==0.4.0
!pip install langchain==0.0.353
!pip install openai==0.27.8
!pip install gdown==4.7.3
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [31]:
import gensim
import nltk
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from pypdf import PdfReader
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.llms import OpenAI
from langchain.embeddings import HuggingFaceBgeEmbeddings



## Functions

The following are the functions that implement the prepocessing, topic extraction, and LLM call.

In [3]:
def preprocess(text, stop_words):
    """
    Tokenizes and preprocesses the input text, removing stopwords and short
    tokens.

    Parameters:
        text (str): The input text to preprocess.
        stop_words (set): A set of stopwords to be removed from the text.
    Returns:
        list: A list of preprocessed tokens.
    """
    result = []
    for token in simple_preprocess(text, deacc=True):
        if token not in stop_words and len(token) > 3:
            result.append(token)
    return result


In [4]:

def get_topic_lists_from_pdf(file, num_topics, words_per_topic):
    """
    Extracts topics and their associated words from a PDF document using the
    Latent Dirichlet Allocation (LDA) algorithm.

    Parameters:
        file (str): The path to the PDF file for topic extraction.
        num_topics (int): The number of topics to discover.
        words_per_topic (int): The number of words to include per topic.

    Returns:
        list: A list of num_topics sublists, each containing relevant words
        for a topic.
    """
    # Load the pdf file
    loader = PdfReader(file)

    # Extract the text from each page into a list. Each page is considered a document
    documents= []
    for page in loader.pages:
        documents.append(page.extract_text())

    # Preprocess the documents
    nltk.download('stopwords')
    stop_words = set(stopwords.words(['english','spanish']))
    processed_documents = [preprocess(doc, stop_words) for doc in documents]

    # Create a dictionary and a corpus
    dictionary = corpora.Dictionary(processed_documents)
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

    # Build the LDA model
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15
        )

    # Retrieve the topics and their corresponding words
    topics = lda_model.print_topics(num_words=words_per_topic)

    # Store each list of words from each topic into a list
    topics_ls = []
    for topic in topics:
        words = topic[1].split("+")
        topic_words = [word.split("*")[1].replace('"', '').strip() for word in words]
        topics_ls.append(topic_words)

    return topics_ls


In [11]:
def topics_from_pdf(llm, file, num_topics, words_per_topic):
    """
    Generates descriptive prompts for LLM based on topic words extracted from a
    PDF document.

    This function takes the output of `get_topic_lists_from_pdf` function,
    which consists of a list of topic-related words for each topic, and
    generates an output string in bulleted nested list format.

    Parameters:
        llm (LLM): An instance of the Large Language Model (LLM) for generating
        responses.
        file (str): The path to the PDF file for extracting topic-related words.
        num_topics (int): The number of topics to consider.
        words_per_topic (int): The number of words per topic to include.

    Returns:
        str: A response generated by the language model based on the provided
        topic words.
    """

    # Extract topics and convert them to string
    list_of_topicwords = get_topic_lists_from_pdf(file, num_topics, words_per_topic)
    string_lda = ""
    for list in list_of_topicwords:
        string_lda += str(list) + "\n"

    # Create the template
    template_string = '''Describe the topic of each of the {num_topics}
        double-quote delimited lists in a simple sentence and also write down
        three possible different subthemes. The lists are the result of an
        algorithm for topic discovery.
        Do not provide an introduction or a conclusion, only describe the
        topics. Do not mention the word "topic" when describing the topics.
        Use the following template for the response.

        1: <<<(sentence describing the topic)>>>
        - <<<(Phrase describing the first subtheme)>>>
        - <<<(Phrase describing the second subtheme)>>>
        - <<<(Phrase describing the third subtheme)>>>

        2: <<<(sentence describing the topic)>>>
        - <<<(Phrase describing the first subtheme)>>>
        - <<<(Phrase describing the second subtheme)>>>
        - <<<(Phrase describing the third subtheme)>>>

        ...

        n: <<<(sentence describing the topic)>>>
        - <<<(Phrase describing the first subtheme)>>>
        - <<<(Phrase describing the second subtheme)>>>
        - <<<(Phrase describing the third subtheme)>>>

        Lists: """{string_lda}""" '''

    # LLM call
    prompt_template = ChatPromptTemplate.from_template(template_string)
    chain = LLMChain(llm=llm, prompt=prompt_template)
    response = chain.run({
        "string_lda" : string_lda,
        "num_topics" : num_topics
        })

    return response

SyntaxError: positional argument follows keyword argument (<ipython-input-11-ffbf788611a9>, line 23)

## OpenAI API key

For this demo, we are going to use chatgpt-3.5 Turbo. For that, it is necessary to introduce the API key. Check [How to get an OPEN API key for ChatGPT](https://www.maisieai.com/help/how-to-get-an-openai-api-key-for-chatgpt) for instructions on how to get one.

In [6]:
openai_key = "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"
llm = OpenAI(openai_api_key=openai_key, max_tokens=-1)

  warn_deprecated(


## Testing with documents

Now, lets try with a public domain pdf document, The Metamorphosis By Franz Kafka (1915).

In [7]:
!gdown https://drive.google.com/uc?id=1mpXUmuLGzkVEqsTicQvBPcpPJW0aPqdL

Downloading...
From: https://drive.google.com/uc?id=1mpXUmuLGzkVEqsTicQvBPcpPJW0aPqdL
To: /content/the-metamorphosis.pdf
  0% 0.00/427k [00:00<?, ?B/s]100% 427k/427k [00:00<00:00, 47.4MB/s]


In [12]:
file = "/content/Task1_Input.pdf"

num_topics = 6
words_per_topic = 30

summary = topics_from_pdf(llm, file, num_topics, words_per_topic)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Also, let's try with a technical book: The Foundations of Geometry by David Hilbert (1899).


In [15]:
!gdown https://drive.google.com/uc?id=1T_FeuGsoC08U_6Xb8Awt50CJXBUqji4D

file = "/content/Task1_Input.pdf"
summary = topics_from_pdf(llm, file, num_topics, words_per_topic)
print(summary)

Downloading...
From: https://drive.google.com/uc?id=1T_FeuGsoC08U_6Xb8Awt50CJXBUqji4D
To: /content/Hilbert.pdf
  0% 0.00/878k [00:00<?, ?B/s]100% 878k/878k [00:00<00:00, 112MB/s]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




1: "Loss"
- "Property damage"
- "Financial loss"
- "Emotional loss"

2: "Coverage"
- "Liability coverage"
- "Property coverage"
- "Comprehensive coverage"

3: "Revised"
- "Updated laws and regulations"
- "Changes in policy"
- "Revised coverage options"

4: "Settlement"
- "Insurance claim settlement"
- "Settlement agreement"
- "Settlement negotiations"

5: "Cost"
- "Insurance cost"
- "Repair cost"
- "Replacement cost"

6: "Structures"
- "Home structures"
- "Business structures"
- "Coverage for structures"


Feel free experiment with the **number of topics** and the number of **words per topic** and find the combination that works for your document.

## Licence

GNU General Public License v2.0

## Author

[Antonio Jimenez](https://www.linkedin.com/in/antonio-jimnzc)

In [188]:
items = summary.split("\n\n")


In [189]:
print(items)

['', '1: "Loss"\n- "Property damage"\n- "Financial loss"\n- "Emotional loss"', '2: "Coverage"\n- "Liability coverage"\n- "Property coverage"\n- "Comprehensive coverage"', '3: "Revised"\n- "Updated laws and regulations"\n- "Changes in policy"\n- "Revised coverage options"', '4: "Settlement"\n- "Insurance claim settlement"\n- "Settlement agreement"\n- "Settlement negotiations"', '5: "Cost"\n- "Insurance cost"\n- "Repair cost"\n- "Replacement cost"', '6: "Structures"\n- "Home structures"\n- "Business structures"\n- "Coverage for structures"']


In [22]:
items[1]

'1: "Loss"\n- "Property damage"\n- "Financial loss"\n- "Emotional loss"'

In [25]:
from PyPDF2 import PdfReader

In [26]:
doc_reader = PdfReader('/content/Task1_Input.pdf')

In [27]:
#read data from the file and put them into a variable called raw_text

raw_text = ''
for i, page in enumerate (doc_reader.pages):
  text = page.extract_text()
  if text:
    raw_text +=text

In [28]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [104]:
#Splitting up the text into smaller chunks for indexing
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 1000,
    chunk_overlap  = 400,
    length_function = len,
    is_separator_regex = False,
)

texts = text_splitter.split_text(raw_text)

In [105]:
pip install sentence_transformers



In [106]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [107]:
import openai

In [108]:
openai_key ='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'

In [109]:
# Define the model to use
model_name_embedding = "text-embedding-ada-002"

# Define the encode kwargs
encode_kwargs = {
    "max_length": 512,
    "truncation": "only_first",
}

# Create the OpenAI embeddings instance
model_norm = OpenAIEmbeddings(
    model_name=model_name_embedding,
    encode_kwargs=encode_kwargs,
    openai_api_key = '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'
)

                    model_name was transferred to model_kwargs.
                    Please confirm that model_name is what you intended.
                    encode_kwargs was transferred to model_kwargs.
                    Please confirm that encode_kwargs is what you intended.


In [110]:
def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

In [111]:
def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    #print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])


In [112]:
pip install chromadb



In [113]:
from langchain.vectorstores import Chroma

In [114]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk

persist_directory = 'db'

## Here is the nmew embeddings being used
embedding = model_norm

vectordb = Chroma.from_texts(texts=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [115]:
# Define the model to use
model_name = "gpt-3.5-turbo"

# Define the encode kwargs
encode_kwargs = {
    "max_length": 2048,  # Increase the max length to 2048
    "truncation": "only_first",
}


In [116]:
from langchain.chat_models import ChatOpenAI

In [117]:
llm = ChatOpenAI(temperature=0,
                 model=model_name,
                 openai_api_key = "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",
                 )

In [118]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk

persist_directory = 'db'

## Here is the nmew embeddings being used
embedding = model_norm

vectordb = Chroma.from_texts(texts=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [139]:
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

In [149]:

DEFAULT_SYSTEM_PROMPT = """Provide a comprehensive summary of the following text, highlighting the key points and main ideas. Make sure to capture the essential details while leaving out any irrelevant or extraneous information. Your summary should be detailed and informative, while still being concise and easy to understand. Avoid adding any external information or making assumptions beyond what is provided in the text."
You can modify this prompt to fit your specific needs, but the key is to emphasize the importance of comprehensiveness, accuracy, and clarity in the summary."""

def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = new_system_prompt
    prompt_template =  SYSTEM_PROMPT + instruction
    return prompt_template

In [153]:
sys_prompt = """Provide a comprehensive summary of the following text, highlighting the key points and main ideas. Make sure to capture the essential details while leaving out any irrelevant or extraneous information. Your summary should be detailed and informative, while still being concise and easy to understand. Avoid adding any external information or making assumptions beyond what is provided in the text."
You can modify this prompt to fit your specific needs, but the key is to emphasize the importance of comprehensiveness, accuracy, and clarity in the summary."""
instruction = """CONTEXT:/n/n {context}/n

Question: {question}"""
get_prompt(instruction, sys_prompt)

'Provide a comprehensive summary of the following text, highlighting the key points and main ideas. Make sure to capture the essential details while leaving out any irrelevant or extraneous information. Your summary should be detailed and informative, while still being concise and easy to understand. Avoid adding any external information or making assumptions beyond what is provided in the text."\nYou can modify this prompt to fit your specific needs, but the key is to emphasize the importance of comprehensiveness, accuracy, and clarity in the summary.CONTEXT:/n/n {context}/n\n\nQuestion: {question}'

In [154]:
from langchain.prompts import PromptTemplate
prompt_template = get_prompt(instruction, sys_prompt)

prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [155]:
chain_type_kwargs = {"prompt": prompt}


In [156]:
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain


In [157]:
from langchain.schema import prompt
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       retriever=retriever,
                                       chain_type_kwargs=chain_type_kwargs,
                                       return_source_documents=True)

In [158]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    return wrap_text_preserve_newlines(llm_response['result'])
    #print('\n\nSources:')
    #for source in llm_response["source_documents"]:
        #print(source.metadata['source'])

In [190]:
topic = []
summarized_answer = []

In [192]:
# full example
for i in items:
  if i != "":
    query = i
    llm_response = qa_chain(query)
    #print(llm_response)
    topic.append(i)
    summarized_answer.append(llm_response['result'])

In [193]:
topic

['1: "Loss"\n- "Property damage"\n- "Financial loss"\n- "Emotional loss"',
 '2: "Coverage"\n- "Liability coverage"\n- "Property coverage"\n- "Comprehensive coverage"',
 '3: "Revised"\n- "Updated laws and regulations"\n- "Changes in policy"\n- "Revised coverage options"',
 '4: "Settlement"\n- "Insurance claim settlement"\n- "Settlement agreement"\n- "Settlement negotiations"',
 '5: "Cost"\n- "Insurance cost"\n- "Repair cost"\n- "Replacement cost"',
 '6: "Structures"\n- "Home structures"\n- "Business structures"\n- "Coverage for structures"',
 '1: "Loss"\n- "Property damage"\n- "Financial loss"\n- "Emotional loss"',
 '2: "Coverage"\n- "Liability coverage"\n- "Property coverage"\n- "Comprehensive coverage"',
 '3: "Revised"\n- "Updated laws and regulations"\n- "Changes in policy"\n- "Revised coverage options"',
 '4: "Settlement"\n- "Insurance claim settlement"\n- "Settlement agreement"\n- "Settlement negotiations"',
 '5: "Cost"\n- "Insurance cost"\n- "Repair cost"\n- "Replacement cost"',
 

In [194]:
summarized_answer

['The text outlines various settlement options for different types of losses related to property damage, including actual cash value loss settlements for specific structures away from the residence premises, windstorm or hail losses to roof surfacing, and sinkhole collapse coverage. It also includes coverage for limited water backup, sump discharge or overflow, home day care, refrigerated property, special personal property, and additional insured individuals living away from the residence premises. The text also mentions coverage for owned motorized golf carts, functional replacement cost loss settlement, modified functional replacement cost loss settlement, and extended theft coverage for residence premises occasionally rented to others. The concept of "loss" is discussed in terms of property damage, financial loss, and emotional loss.',
 'The text provides a list of various insurance coverages, including liability coverage, property coverage, and comprehensive coverage. It includes 

In [169]:
type(process_llm_response(llm_response))

The text discusses various revisions and new endorsements in homeowners insurance policies. Key changes
include the introduction of a new optional endorsement for excluding specified structures from coverage to
provide underwriting flexibility. This allows insurers to accept risks they might otherwise decline, such as
dilapidated sheds or barns. Other revisions include updates to coverage for green upgrades, mechanical
breakdowns, cosmetic damage exclusions for windstorm or hail, and increased limits for damage to property of
others. Additionally, there are withdrawals of endorsements related to home-sharing host activities. The
changes aim to enhance coverage and liability options for policyholders, offering a choice between property
coverage only or a combination of coverage and liability.


NoneType

In [176]:
llm_response

{'query': '1: "Loss"\n- "Property damage"\n- "Financial loss"\n- "Emotional loss"',
 'result': 'The text outlines various settlement options for different types of losses related to property damage, including actual cash value loss settlements for specific structures away from the residence premises, windstorm or hail losses to roof surfacing, and limited water backup and sump discharge coverage. It also includes coverage for sinkhole collapse, special personal property coverage, and additional insured coverage for students living away from the residence premises. The text also mentions coverage for owned motorized golf carts, functional replacement cost loss settlement, and extended theft coverage for residence premises occasionally rented to others. The concept of "loss" is discussed in terms of property damage, financial loss, and emotional loss.',
 'source_documents': [Document(page_content='Residence Premises  - Actual Cash Value Loss  \nSettlemen t \nHO 04 92 03 22  02 17  Specif

In [177]:
llm_response['result']

'The text outlines various settlement options for different types of losses related to property damage, including actual cash value loss settlements for specific structures away from the residence premises, windstorm or hail losses to roof surfacing, and limited water backup and sump discharge coverage. It also includes coverage for sinkhole collapse, special personal property coverage, and additional insured coverage for students living away from the residence premises. The text also mentions coverage for owned motorized golf carts, functional replacement cost loss settlement, and extended theft coverage for residence premises occasionally rented to others. The concept of "loss" is discussed in terms of property damage, financial loss, and emotional loss.'