In [4]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

### Basic Prompt
This method is not flexible or scalable because it is limited to the model context length, which is around 4096 tokens.

The pros of this method are that it only makes a single call to the LM, and when generating text, the LM has access to all of the data at once.

In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import(
    AIMessage,
    HumanMessage,
    SystemMessage
)

In [24]:
# Short sample passage to summarize; it is interpolated verbatim into the human message below.
text= """
Mojo exceptional ability, good luck, success in its earliest sense denoted an object that was believed to carry a magic spell. From there, the word expanded to indicate magic itself and personal use of magic, and mojo's popular use today in reference to seemingly magical influence or ability is informal.
"""

# Chat-style prompt: the system message sets the persona, the human message carries
# the summarization request together with the text.
messages = [
    SystemMessage(content='You are an expert copywriter with expertize in summarizing documents'),
    HumanMessage(content=f'Please provide a short and concise summary of the following text:\n TEXT: {text}')
]

# Chat model shared by the following cells; temperature=0 to keep outputs as repeatable as possible.
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo')

In [25]:
llm.get_num_tokens(text)

60

In [26]:
# Single call to the chat model with the full message list; the reply text is read from `.content` below.
summary_output = llm(messages)

[SystemMessage(content='You are an expert copywriter with expertize in summarizing documents'), HumanMessage(content="Please provide a short and concise summary of the following text:\n TEXT: \nMojo exceptional ability, good luck, success in its earliest sense denoted an object that was believed to carry a magic spell. From there, the word expanded to indicate magic itself and personal use of magic, and mojo's popular use today in reference to seemingly magical influence or ability is informal.\n")]


In [27]:
print(summary_output.content)

Summary:
Originally referring to an object believed to possess magical powers, "mojo" now commonly signifies a person's exceptional ability or luck, often used informally to describe a seemingly magical influence or skill.


### Summarizing Using Prompt Template 

In [28]:
from langchain import PromptTemplate
from langchain.chains import LLMChain

In [29]:
# Prompt with two placeholders: `{text}` is the passage to summarize and
# `{language}` is the language the summary should be translated to.
# Fix: the instruction previously read "Translate the summary of {language}",
# which is not a well-formed request; it now says "to {language}".
template = '''
Write a concise and short summary of the following text:
TEXT: `{text}`
Translate the summary to {language}.
'''
# NOTE: the variable name `promp` is kept as-is because later cells reference it.
promp = PromptTemplate(
    input_variables=['text', 'language'],
    template=template
)

In [30]:
llm.get_num_tokens(promp.format(text=text, language='English'))

81

In [31]:
# Bind the model to the two-variable prompt, then fill both placeholders in a single run.
chain = LLMChain(llm=llm, prompt=promp)
summary = chain.run({'text': text, 'language':'yoruba'})

In [32]:
print(summary)

Summary: Mojo originally referred to an object believed to have magical powers, but now it is commonly used to describe someone's seemingly magical influence or ability.

Yoruba Translation: Mojo ti o ti o je ohun ti o gba ase iranlowo, sugbon ni o je lati lo lati so pe eniyan nikan ni o je ohun ti o gba ase iranlowo tabi abiliti ti o je bi ase.


### Summarizing using StuffDocumentsChain

The downside of this method is that it only works with smaller pieces of data.

In [33]:
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document

In [38]:
# Read the whole input file into memory as one string.
with open('./text.txt', encoding='utf-8') as f:
    text = f.read()

# The 'stuff' chain below is run on a list of Documents, so the raw string is wrapped in one.

docs = [Document(page_content=text)]
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo')

In [39]:
# Single-placeholder prompt: `{text}` is the only input variable declared below.
template = '''
Write a concise and short summary of the following text. 
TEXT: `{text}`
'''

prompt = PromptTemplate(
    input_variables=['text'],
    template=template
)

In [40]:
# Build a summarize chain of type 'stuff' that uses the custom prompt defined above,
# then run it on the single-document list.
chain = load_summarize_chain(
    llm,
    chain_type='stuff',
    prompt=prompt,
    verbose=False
)
output_summary = chain.run(docs)

In [41]:
print(output_summary)

The author apologizes for not being able to provide specific text files related to famous individuals due to copyright concerns. However, they offer to provide information and guidance on various topics related to famous people such as biographical details, quotes, and general information. They encourage the reader to ask for assistance in any other way they may need.


### Summarizing Large Documents Using map_reduce
MapReduce method will split the document into small chunks that fit within the token limit of the model.
It will first summarize each chunk and then it will get a summary of the summaries.
MapReduce uses two prompts, an initial one to summarize each chunk of data, and then a different prompt to combine each summary into the final one.

Pros of the Method:
It scales to larger documents, and the calls to the LM on individual chunks are independent and can therefore be parallelized.

Cons of the Method:
It requires many more calls to the LM than the stuff document chain.
It also loses some information during the final combining call.

In [42]:
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [43]:
# Re-read the input file so this section does not depend on `text` from earlier cells.
with open('./text.txt', encoding='utf-8') as f:
    text = f.read()

# Fresh chat model instance for the map_reduce examples.
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo')

In [44]:
llm.get_num_tokens(text)

71

In [47]:
# Deliberately small chunk_size so that even this short text is split into several
# chunks; raise or lower chunk_size to change how many chunks are produced.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=50)
# create_documents takes a list of strings; its result is what the chain is run on below.
chunks = text_splitter.create_documents([text])

In [48]:
len(chunks)

7

In [49]:
# map_reduce chain with no custom prompts supplied, so the chain's default
# prompts are used (they are inspected in the following cells).
chain = load_summarize_chain(
    llm,
    chain_type='map_reduce',
    verbose=False
)
output_summary = chain.run(chunks)

In [50]:
print(output_summary)

The text apologizes for not being able to provide specific text files related to famous individuals due to copyright restrictions. It offers guidance on providing biographical details, quotes, or general information about famous people and asks for specific instructions on how to do so. Additional information or assistance is also requested.


In [51]:
chain.llm_chain.prompt.template

'Write a concise summary of the following:\n\n\n"{text}"\n\n\nCONCISE SUMMARY:'

In [52]:
chain.combine_document_chain.llm_chain.prompt.template

'Write a concise summary of the following:\n\n\n"{text}"\n\n\nCONCISE SUMMARY:'

### MapReduce with Custom Prompt

In [53]:
# Prompt passed as `map_prompt` to the custom map_reduce chain; `{text}` is its only placeholder.
map_prompt = ''' 
Write a short and concise summary of the following:
Text: `{text}`
CONCISE SUMMARY
'''
map_prompt_template = PromptTemplate(
    input_variables=['text'],
    template=map_prompt
)

In [54]:
# Prompt passed as `combine_prompt` to the custom map_reduce chain; it asks for a
# title, an introduction, bullet points and a concluding phrase.
combine_prompt = ''' 
Write a concise summary of the following text that covers the key points.
Add a title to the summary.
Start your summary with an INTRODUCTION PARAGRAPH that gives an overview of the topic FOLLOWED by BULLET POINTS if possible AND end the summary with a CONCLUSION PHRASE.
Text: `{text}` 
'''
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=['text'])

In [55]:
# map_reduce chain with both stages customized: `map_prompt` and `combine_prompt`
# replace the defaults used in the previous section.
summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    verbose=False
)

output = summary_chain.run(chunks)

In [56]:
print(output)

Title: Copyright Issues and Biographical Information Requests

Introduction:
The text addresses the issue of copyright when providing information about famous individuals, as well as the request for biographical details and quotes.

Key Points:
- Apology for confusion, unable to provide specific text files
- Mention of specific text files related to famous individuals
- Discussion on potential copyright issues related to providing information about famous individuals
- Providing information and guidance on various topics related to copyrighted material
- Request for biographical details, quotes, or general information
- Request for additional assistance or information

Conclusion:
The text highlights the importance of considering copyright issues when sharing information about famous individuals and emphasizes the request for biographical details and quotes.


### Summarizing Using the refine CombineDocumentChain
The method starts by generating a summary for the first chunk of data.  
The summary of the first chunk is passed in along with the second chunk, and the LM is asked to refine the summary based on the new document.   

Pros:  
- Uses a more relevant context (better summarization)
- less lossy than map_reduce

Cons:
- It requires many more calls to the LLM
- the calls are not independent and can not be parallelized


In [2]:
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader

In [3]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [4]:
pip install unstructured -q

Note: you may need to restart the kernel to use updated packages.


In [65]:
pip install pdf2image

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Note: you may need to restart the kernel to use updated packages.


In [76]:
pip install --upgrade pdfminer.six


Collecting pdfminer.sixNote: you may need to restart the kernel to use updated packages.

  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six)
  Downloading cryptography-42.0.5-cp39-abi3-win_amd64.whl.metadata (5.4 kB)
Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six)
  Downloading cffi-1.16.0-cp311-cp311-win_amd64.whl.metadata (1.5 kB)
Collecting pycparser (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six)
  Downloading pycparser-2.22-py3-none-any.whl.metadata (943 bytes)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.6 MB 146.3 kB/s eta 0:00:39
   ------------------

In [5]:
from langchain.document_loaders import PyPDFLoader
# loader = UnstructuredPDFLoader('./ngConstitution.pdf')


In [6]:
# Load the PDF from the notebook's working directory.
loader = PyPDFLoader('./ngConstitution.pdf')

# `data` is indexable and each item exposes `.page_content` (used in the next cell);
# presumably one item per PDF page — confirm against the loader documentation.
data = loader.load()

In [7]:
print(data[0].page_content)

Constitution of the Federal Republic of Nigeria 
1999 
  
  
  
We the people of the Federal Republic of Nigeria  
  
Having firmly and solemnly resolve, to live in unity and harmony as one indivisible and 
indissoluble sovereign nation under God, dedicated to the pr omotion of inter-African 
solidarity, world peace, interna tional co-operation and understanding  
  
And to provide for a Constitution for the pur pose of promoting the good government and 
welfare of all persons in our c ountry, on the principles of freedom, equality and justice, and 
for the purpose of consolida ting the unity of our people  
  
Do hereby make, enact and give to ourselves the following Constitution:-  
  
 
Chapter I 
General Provisions 
  
Part I  
  
  
Federal Republic of Nigeria 
  
1. (1) This Constitution is supreme and its provisions sha ll have binding force on the authorities and persons 
throughout the Federal Republic of Nigeria. 
 
(2) The Federal Republic of Nigeria shall not be governed, n

In [10]:
# NOTE(review): chunk_size is 10,000,000, which is presumably far larger than any
# loaded document, so the splitter likely leaves each one intact and the chunk
# count simply mirrors len(data) — confirm, and lower chunk_size if real splitting is wanted.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000000, chunk_overlap=100)
chunks = text_splitter.split_documents(data)

In [11]:
len(chunks)

118

In [12]:
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo')

  warn_deprecated(


In [13]:
def print_embedding_cost(texts, model_name='gpt-3.5-turbo', usd_per_1k_tokens=0.002):
    """Print the total token count of ``texts`` and a cost estimate in USD.

    Despite its name, this helper does not call any API: it only tokenizes the
    text locally and multiplies the token count by a per-1000-token price.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (e.g. the chunks produced by the text splitter).
        model_name: Model name used to select the tiktoken encoding.
            Defaults to ``'gpt-3.5-turbo'``, the value previously hard-coded.
        usd_per_1k_tokens: Price in USD applied per 1000 tokens.
            Defaults to ``0.002``, the value previously hard-coded.

    Returns:
        None. Two lines are written to stdout: the total token count and the
        estimated cost formatted with six decimal places.
    """
    # Imported lazily so tiktoken is only required when this helper is actually called.
    import tiktoken

    enc = tiktoken.encoding_for_model(model_name)
    # Generator expression: no intermediate list of per-document counts is built.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * usd_per_1k_tokens:.6f}')

print_embedding_cost(chunks)

Total Tokens: 83946
Embedding Cost in USD: 0.167892


In [15]:
# Refine chain with no custom prompts supplied, so the chain's default prompts are used.
chain = load_summarize_chain(
    llm=llm,
    chain_type='refine',
    verbose=False
)
# Runs over every chunk of the PDF; see the token count printed above for the size of this job.
output_summary = chain.run(chunks)

In [None]:
print(output_summary)

### Refine With Custom Prompts

In [None]:
# Prompt passed as `question_prompt` to the refine chain; `{text}` is its only placeholder.
prompt_template = ''' 
Write a concise summary of the following extracting the key information:
Text: `{text}`
CONCISE SUMMARY:
'''
# Fix: the keyword is `input_variables` (plural); it was misspelled `input_variable`.
initial_prompt = PromptTemplate(template=prompt_template, input_variables=['text'])

# Prompt passed as `refine_prompt` to the refine chain: `{existing_answer}` carries
# the summary produced so far and `{text}` carries the additional context.
refine_template = '''
    Your job is to produce a final summary.
    I have provided an existing summary up to a certain point: {existing_answer}.
    Please refine the existing summary with some more context below.
    -----------
    {text}
    -----------
    Start the final summary with an INTRODUCTION PARAGRAPH that gives an overview of the topic FOLLOWED by BULLET POINTS if possible AND end the summary with a CONCLUSION PHRASE.
'''

# Same keyword fix here: `input_variables`, listing both placeholders used by the template.
refine_prompt = PromptTemplate(template=refine_template, input_variables=['existing_answer', 'text'])

In [None]:
# Refine chain with both prompts customized: `question_prompt` and `refine_prompt`
# replace the defaults used in the previous section.
chain = load_summarize_chain(
    llm=llm,
    chain_type='refine',
    question_prompt=initial_prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=False
)

output_summary = chain.run(chunks)

In [None]:
print(output_summary)

### Summarizing Using LangChain Agents

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, Tool
from langchain.utilities import WikipediaAPIWrapper

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

In [None]:
# Chat model that drives the agent, plus the Wikipedia wrapper exposed to it as a tool below.
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo')
wikipedia = WikipediaAPIWrapper()

In [None]:
# Tool list handed to the agent: a single tool whose function is `wikipedia.run`.
tools = [
    Tool(
        name='Wikipedia',
        func=wikipedia.run,
        description='Useful for when you need to get information from wikipedia about a topic'
    )
]

In [None]:
# Build the agent from the tool list and the chat model; verbose=True is passed so the run is logged.
agent_executor = initialize_agent(tools, llm, agent='zero-shot-react-description', verbose=True)

In [None]:
# Ask the agent for the summary; the returned value is printed in the next cell.
output = agent_executor.run('Can you please provide a short summary of Nigeria Constitution')

In [None]:
print(output)