<a href="https://colab.research.google.com/github/ak2742/mlplay/blob/RAG/04)_Summarize_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount Google Drive**


In [None]:
# Mount Google Drive into this Colab runtime at /content/drive so that
# later cells can read files stored on the Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Workaround for the following error seen in this notebook:
#   NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968
# Every request for the preferred encoding is answered with "UTF-8".
import locale

# Keep the stdlib signature (do_setlocale=True) so callers that use the
# common `locale.getpreferredencoding(False)` form do not hit a TypeError
# after the monkey-patch; the argument itself is ignored.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"


# **Install Libraries**

In [None]:
# Huggingface libraries to run LLM.
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes

#LangChain related libraries
!pip install -q -U langchain==0.1.2

# PDF library used for loading and transforming the pages of PDF files
!pip install -q -U pypdf

In [None]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig

import torch

from langchain.llms import HuggingFacePipeline


# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Device:", device)
# On a GPU runtime also report which card (index 0) was found.
if device == 'cuda':
    gpu_name = torch.cuda.get_device_name(0)
    print(gpu_name)



# **Get Model**

In [None]:
# Two Hub repositories are referenced: the original Mistral repo (used for
# the tokenizer later in the notebook) and a separate "-sharded" repo that
# the model weights are loaded from.
origin_model_path = "mistralai/Mistral-7B-Instruct-v0.1"
model_path = "filipealmeida/Mistral-7B-Instruct-v0.1-sharded"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# trust_remote_code is kept off: the Mistral architecture ships with
# transformers, so nothing from the third-party weights repo needs to be
# executed as Python code on this machine.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=False,
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
# The tokenizer is loaded from the original Mistral repo (origin_model_path),
# not from the sharded repo (model_path) that the model weights come from.
tokenizer = AutoTokenizer.from_pretrained(origin_model_path)


# Create **Pipeline**

In [None]:
# Build the text-generation pipeline around the quantized model and wrap it
# for LangChain.
text_generation_pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    batch_size=4,
    # Decoding: low-temperature sampling with a mild repetition penalty,
    # at most 600 new tokens, and no prompt echo in the returned text.
    do_sample=True,
    temperature=0.3,
    repetition_penalty=1.1,
    max_new_tokens=600,
    return_full_text=False,
    # The EOS token id is reused as the padding token id.
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# Also set the pad token id on the pipeline's tokenizer, taken from the
# model config's EOS id.
text_generation_pipeline.tokenizer.pad_token_id = (
    text_generation_pipeline.model.config.eos_token_id
)

# LangChain-facing LLM object used by every chain below.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

## **Text Summarization Task**

In [None]:
# Libraries to count tokens programmatically
!pip install -q -U tiktoken
!pip install -q -U openai

# **Approach 1**

In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens of ``string`` under a named tiktoken encoding.

    Besides returning the count, the function prints the character count
    and the token count of the input.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``.

    Returns:
        Number of tokens produced by encoding ``string``.
    """
    num_tokens = len(tiktoken.get_encoding(encoding_name).encode(string))

    # Report both sizes so the caller can eyeball the chars-to-tokens ratio.
    print(f'Chars Count: {len(string)}')
    print(f'Token Count: {num_tokens}')
    return num_tokens


# Stuff all your documents into a single prompt

In [None]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.document import Document


def stuff_text_summarization(text):
    """Summarize ``text`` by stuffing all of it into a single prompt.

    The text is wrapped in one LangChain ``Document`` and run through a
    ``StuffDocumentsChain`` backed by ``mistral_llm``. Before the chain
    runs, the character and token counts of ``text`` (cl100k_base
    encoding) are printed by ``num_tokens_from_string``.

    Args:
        text: The raw text to summarize.

    Returns:
        The value stored under ``'output_text'`` in the chain's result.
    """
    # Called only for its printed char/token report; the count is not used.
    num_tokens_from_string(text, "cl100k_base")

    # StuffDocumentsChain takes Document objects rather than raw strings.
    document = Document(page_content=text, metadata={"source": "local"})

    # Mistral-Instruct expects the instruction between [INST] and [/INST];
    # this matches the map/reduce prompts used elsewhere in this notebook.
    prompt_template = """[INST] Write a concise summary of the following:
    "{docs}"
    CONCISE SUMMARY: [/INST] """
    prompt = PromptTemplate.from_template(prompt_template)

    # The LLM chain fills {docs}; the stuff chain supplies the documents.
    llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)
    stuff_chain = StuffDocumentsChain(
        llm_chain=llm_chain, document_variable_name="docs"
    )

    # Invoke with a one-element list: the chain's input is a list of documents.
    summary = stuff_chain.invoke([document])
    return summary['output_text']

In [None]:
  text = """It was a Thursday, but it felt like a Monday to John. And John loved Mondays. He thrived at work. He dismissed the old cliché of dreading Monday mornings and refused to engage in water-cooler complaints about “the grind” and empty conversations that included the familiar parry “How was your weekend?” “Too short!”. Yes, John liked his work and was unashamed.

  I should probably get another latte. I’ve just been sitting here with this empty cup. But then I’ll start to get jittery. I’ll get a decaf. No, that’s stupid, it feels stupid to pay for a decaf. I can’t justify that.

  John was always impatient on the weekends; he missed the formal structure of the business week. When he was younger he used to stay late after school on Fridays and come in early on Mondays, a pattern his mother referred to with equal parts admiration and disdain as “studying overtime.”

  Jesus, I’ve written another loser.

  Now, John spent his weekends doing yard work at the Tudor house Rebecca left him after their divorce. Rebecca, with her almond eyes—both in shape and in color—could never be his enemy.

  That barista keeps looking at me. She’ll probably ask me to leave if I don’t buy something. She’s kind of attractive. Not her hair—her hair seems stringy—but her face is nice. I should really buy something.

  Their divorce was remarkably amicable. In fact, John would often tell his parents, “Rebecca and I are better friends now than when we were married!” In fact, John looked forward to the days when he and Rebecca, with their new partners, would reminisce about their marriage, seeing it in a positive light, like two mature adults.

  Maybe I’ll just get a pumpkin-spice loaf. That way I can still sit here without going through a whole production of buying a coffee and giving my name and feeling like an asshole while it gets made.

  But if John were being honest, the house did get lonely on the weekends. Rebecca’s parents had been generous enough to leave John the house even though they had paid for it. John was still struggling to get his short-story writing—I mean, his painting—career off the ground, and Rebecca and her family had been more than supportive, even during the breakup."""

# Approach 1: summarize the sample passage with the single-prompt chain.
stuff_summary = stuff_text_summarization(text)

print(stuff_summary)

# **Approach 2**

In [None]:
from langchain.document_loaders import PyPDFLoader

def load_pdf(file_path):
    """Load a PDF and report its size in documents, characters and tokens.

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        A ``(documents, token_count)`` tuple: the list returned by
        ``PyPDFLoader.load()`` and the cl100k_base token count of the
        documents' combined ``page_content``.
    """
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Count tokens over the extracted text only. Tokenizing str(documents)
    # would measure the list repr (class names, metadata, escaped newlines)
    # and overstate the size of the actual content.
    full_text = "\n".join(doc.page_content for doc in documents)
    token_count = num_tokens_from_string(full_text, "cl100k_base")

    print(f'Docs Count: {len(documents)}')
    return documents, token_count


# Split large Docs using Map-Reduce

In [None]:
from langchain.chains import MapReduceDocumentsChain, LLMChain, ReduceDocumentsChain, StuffDocumentsChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate

def map_reduce_summarize_document(file_path):
    """Summarize a PDF file with a map-reduce chain.

    The PDF is loaded via ``load_pdf``, split into chunks, each chunk is
    sent through the map prompt, and the per-chunk answers are combined
    through the reduce prompt into one summary. All LLM calls go through
    ``mistral_llm``.

    Args:
        file_path: Path of the PDF file to summarize.

    Returns:
        The value stored under ``'output_text'`` in the chain's result.
    """
    # load_pdf also returns a token count; it is not needed here.
    pages, _ = load_pdf(file_path)

    # Map step: applied to every chunk on its own.
    map_template = """[INST] The following is a part of an pdf document:
    {docs}
    Based on this, please identify the main points.
    Answer:  [/INST] """
    mapper = LLMChain(
        llm=mistral_llm,
        prompt=PromptTemplate.from_template(map_template),
    )

    # Reduce step: merges the per-chunk answers.
    reduce_template = """[INST] The following is set of summaries from the article:
    {doc_summaries}
    Take these and distill it into a final, consolidated summary of the main points.
    Construct it as a well organized summary of the main points and should be between 3 and 5 paragraphs.
    Answer:  [/INST] """
    reducer = LLMChain(
        llm=mistral_llm,
        prompt=PromptTemplate.from_template(reduce_template),
    )

    # Joins a list of documents into one string and feeds it to the reducer.
    summaries_stuffer = StuffDocumentsChain(
        llm_chain=reducer, document_variable_name="doc_summaries"
    )

    # The same stuff chain serves both as the final combine step and as the
    # collapse step; documents are grouped up to token_max tokens.
    reduce_stage = ReduceDocumentsChain(
        combine_documents_chain=summaries_stuffer,
        collapse_documents_chain=summaries_stuffer,
        token_max=4000,
    )

    # Full map-reduce chain: "docs" is the variable the map prompt expects,
    # and the map-step results are kept in the chain output.
    summarizer = MapReduceDocumentsChain(
        llm_chain=mapper,
        reduce_documents_chain=reduce_stage,
        document_variable_name="docs",
        return_intermediate_steps=True,
    )

    # Chunk the loaded pages before handing them to the chain.
    splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=100)
    chunks = splitter.split_documents(pages)
    print(f'Split Docs Count: {len(chunks)}')

    outcome = summarizer.invoke(chunks, return_only_outputs=False)
    return outcome['output_text']

In [None]:
# PDF located on the mounted Google Drive; change this path to point at
# your own file.
file_path = '/content/drive/MyDrive/Colab Notebooks/ImpactofIndianPremierLeagueonTestCricketinIndia.pdf'

# Approach 2: summarize the PDF with the map-reduce chain.
summary_response = map_reduce_summarize_document(file_path)

In [None]:
# Display the summary text returned by the map-reduce chain.
print(summary_response)