In [None]:
! pip install langchain tiktoken

In [1]:
# Directory that the State of the Union transcripts (`.txt` files) are read from.
data_dir = "data/state_of_the_union/"

In [2]:
import os

# Full path of every entry in the data directory, in os.listdir order.
paths = [os.path.join(data_dir, f) for f in os.listdir(data_dir)]

# Read every `.txt` transcript, collapsing double newlines into single ones.
# Each file is opened in a `with` block so the handle is closed deterministically,
# and decoded explicitly as UTF-8 (the transcripts contain non-ASCII punctuation,
# which a platform-dependent default encoding could garble).
texts = []
for path in paths:
    if not path.endswith('.txt'):
        continue
    with open(path, 'r', encoding='utf-8') as transcript_file:
        texts.append(transcript_file.read().replace('\n\n', '\n'))

In [3]:
from langchain.text_splitter import CharacterTextSplitter

In [4]:
# Split only the first loaded speech, breaking on "." with
# chunk_size=3000 and chunk_overlap=100.
text_splitter = CharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=3000,
    separator=".",
)
first_speech = texts[0]
docs = text_splitter.create_documents([first_speech])

In [5]:
from langchain.prompts.base import BasePromptTemplate
from langchain import LLMChain, OpenAI, PromptTemplate
from langchain.text_splitter import TextSplitter
from typing import Any, Dict, Optional, List
from langchain.callbacks.manager import CallbackManagerForChainRun
from langchain.chains.summarize import load_summarize_chain, refine_prompts
from langchain.schema import LLMResult, Generation


class RefineLLMChain(LLMChain):
    """LLMChain variant that answers every input with a "refine" summarize pass.

    Instead of rendering the prompt and sending it to the LLM in one call, the
    text stored under the prompt's single input variable is split with
    ``text_splitter`` and the resulting documents are run through a refine
    summarize chain built with ``load_summarize_chain(..., chain_type="refine")``.
    The prompt is only used to name that input variable, so it must declare
    exactly one.

    Example:
        .. code-block:: python

            from langchain import OpenAI, PromptTemplate
            from langchain.text_splitter import CharacterTextSplitter

            prompt = PromptTemplate(input_variables=["text"], template="{text}")
            splitter = CharacterTextSplitter(
                separator=".", chunk_size=1000, chunk_overlap=100
            )
            chain = RefineLLMChain(
                llm=OpenAI(), prompt=prompt, text_splitter=splitter
            )
            chain({"text": long_text})
    """

    text_splitter: TextSplitter
    """TextSplitter used to split up the input text if necessary (e.g. input text is too long to use for a single call
    to LLM)
    """
    question_prompt: BasePromptTemplate = refine_prompts.PROMPT
    """Question prompt for the refine chain (forwarded as ``question_prompt``)."""
    refine_prompt: BasePromptTemplate = refine_prompts.REFINE_PROMPT
    """Refine prompt for the refine chain (forwarded as ``refine_prompt``)."""

    def generate(  # this is a hack and should be refactored
        self,
        input_list: List[Dict[str, Any]],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> LLMResult:
        """Generate one refine-summarized result per input.

        Args:
            input_list: Input dicts; each must contain the prompt's single
                input variable, whose value is the text to summarize.
            run_manager: Optional run manager; when given, its child callback
                manager is handed to the nested refine chain.

        Returns:
            An ``LLMResult`` holding, for every input (in order), a single
            ``Generation`` whose text is the refine chain's ``output_text``.

        Raises:
            ValueError: If the prompt does not declare exactly one input variable.
            KeyError: If an input dict lacks the prompt's input variable.
        """
        input_variables = self.prompt.input_variables
        if len(input_variables) != 1:
            raise ValueError(
                "RefineLLMChain requires a prompt with exactly one input variable, "
                f"got {len(input_variables)}: {input_variables}"
            )
        input_key = input_variables[0]

        refine_chain = load_summarize_chain(
            self.llm,
            chain_type="refine",
            refine_prompt=self.refine_prompt,
            question_prompt=self.question_prompt,
        )
        # Propagate the caller's callbacks so the nested chain runs are
        # reported as children of this run instead of being dropped.
        callbacks = run_manager.get_child() if run_manager else None

        generations = []
        for index, input_item in enumerate(input_list):
            # Progress marker; each input may trigger several LLM calls.
            print("processing", index)
            docs = self.text_splitter.create_documents([input_item[input_key]])
            output = refine_chain(
                {"input_documents": docs},
                return_only_outputs=True,
                callbacks=callbacks,
            )
            generations.append([Generation(text=output["output_text"])])
        return LLMResult(generations=generations)

    # TODO: implement the async counterpart; no async method is overridden
    # here, so async calls do not go through the refine strategy above.

In [6]:
# Two separately constructed OpenAI wrappers, both with max_tokens=500.
# llm1 backs the RefineLLMChain instances; llm2 backs the combine chains.
# (A single shared instance would probably do.)
llm1, llm2 = OpenAI(max_tokens=500), OpenAI(max_tokens=500)

In [7]:
# Splitter handed to RefineLLMChain: breaks on "." with
# chunk_size=1000 and chunk_overlap=100.
refine_text_splitter = CharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
    separator=".",
)

In [8]:
# Pass-through prompt: it only declares the single input variable ("text")
# that RefineLLMChain reads the raw document from. Kind of hacky, but it works.
prompt_template = "{text}"
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
refineLLMChain = RefineLLMChain(
    text_splitter=refine_text_splitter,
    prompt=prompt,
    llm=llm1,
)

Test the output of the custom RefineLLMChain

In [9]:
# Run the custom chain on the first chunk only and display the returned dict.
refineLLMChain(dict(text=docs[0].page_content))

processing 0


{'text': '\n\nThree years ago, the US began a "great American comeback" which has resulted in job growth, rising incomes, reduced poverty and crime, and a regained respect from other nations. Since taking office, the President has moved rapidly to revive the US economy by slashing job-killing regulations, enacting tax cuts, and fighting for fair and reciprocal trade agreements. This pro-worker, pro-family, and pro-growth agenda has resulted in 7 million new jobs, with the unemployment rate at its lowest in over half a century. The unemployment rate for African Americans, Hispanic Americans, and Asian Americans has also reached the lowest levels in history. We have totally rejected the downsizing mentality and are moving forward at a pace that was unimaginable just a short time ago, building the world’s most prosperous and inclusive society, where every citizen can join in America’s success and every community can take part in America’s extraordinary rise.'}

In [10]:
# Refine summarize chain with the library's default prompts, built on llm2;
# used below as the combine step of the map-reduce chain.
refine_chain = load_summarize_chain(chain_type="refine", llm=llm2)

In [11]:
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain

# Map step: RefineLLMChain per document. Combine step: the refine chain.
# Intermediate (per-document) results are kept in the output.
map_reduce_chain = MapReduceDocumentsChain(
    return_intermediate_steps=True,
    combine_document_chain=refine_chain,
    llm_chain=refineLLMChain,
)

Test the custom MapReduceDocumentsChain, which uses a refine summarizing strategy for both the map and reduce steps

In [12]:
# Exercise the chain on just the first three chunks.
output = map_reduce_chain(dict(input_documents=docs[:3]))

processing 0
processing 1
processing 2


In [13]:
# Display the full result dict returned by the map-reduce chain call above.
output

{'input_documents': [Document(page_content='Three years ago, we launched the great American comeback.  Tonight, I stand before you to share the incredible results.  Jobs are booming, incomes are soaring, poverty is plummeting, crime is falling, confidence is surging, and our country is thriving and highly respected again.  (Applause.)  America’s enemies are on the run, America’s fortunes are on the rise, and America’s future is blazing bright.\nThe years of economic decay are over.  (Applause.)  The days of our country being used, taken advantage of, and even scorned by other nations are long behind us.  (Applause.)  Gone too are the broken promises, jobless recoveries, tired platitudes, and constant excuses for the depletion of American wealth, power, and prestige.\nIn just three short years, we have shattered the mentality of American decline, and we have rejected the downsizing of America’s destiny.  We have totally rejected the downsizing.  We are moving forward at a pace that was 

### Now we will use a custom MapReduceDocumentsChain to summarize and contrast the State of the Union speeches of Presidents Obama, Trump, and Biden

In [14]:
# Question prompt passed to the refine chain as `question_prompt`.
prompt_template = """Write a concise summary of the following:

{text}

CONCISE SUMMARY:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])

# Refine prompt passed to the refine chain as `refine_prompt`.
# Adjacent string literals are concatenated verbatim, so every fragment that
# continues a sentence must carry its own separating space.
refine_template = (
    "Your job is to produce a final summary contrasting the state of the union speeches of President Biden, Trump, and Obama.\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary. "
    "If the context isn't useful, return the original summary."
)
refine_prompt = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)

# Combine-step chain that contrasts the per-president summaries.
refine_chain = load_summarize_chain(llm2, chain_type="refine", question_prompt=PROMPT, refine_prompt=refine_prompt)

In [15]:
# Rebuild the splitter for the full speeches: breaks on "." with
# chunk_size=3000 and chunk_overlap=100.
refine_text_splitter = CharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=3000,
    separator=".",
)

In [16]:
# Recreate the pass-through prompt and the custom chain so that the chain
# picks up the refine_text_splitter defined in the previous cell.
# Kind of hacky, but it works.
prompt_template = "{text}"
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
refineLLMChain = RefineLLMChain(
    text_splitter=refine_text_splitter,
    prompt=prompt,
    llm=llm1,
)

In [17]:
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain

# Map step: RefineLLMChain per speech. Combine step: the contrasting refine
# chain defined above. Per-speech results are kept in the output.
map_reduce_chain = MapReduceDocumentsChain(
    return_intermediate_steps=True,
    combine_document_chain=refine_chain,
    llm_chain=refineLLMChain,
)

In [18]:
# Derive each president's name from the transcript file name
# (e.g. ".../Trump.txt" -> "Trump"). The same `.txt` filter as for `texts` is
# applied so that both lists stay index-aligned. basename/splitext strip only
# the directory part and the final extension, unlike str.replace, which would
# remove every ".txt" occurrence in the name.
president_names = [
    os.path.splitext(os.path.basename(path))[0]
    for path in paths
    if path.endswith('.txt')
]

In [19]:
from langchain.schema import Document

# One Document per speech, prefixed with the speaker's name so the summaries
# can attribute content to the right president.
docs = [
    Document(page_content=f"President {president_names[position]}:\n{texts[position]}")
    for position in range(len(texts))
]

In [20]:
# Summarize every speech, then combine the summaries with the refine chain.
output = map_reduce_chain(dict(input_documents=docs))

processing 0
processing 1
processing 2


In [21]:
# The documents that were fed into the chain.
output["input_documents"]

[Document(page_content='President Trump:\nThree years ago, we launched the great American comeback.  Tonight, I stand before you to share the incredible results.  Jobs are booming, incomes are soaring, poverty is plummeting, crime is falling, confidence is surging, and our country is thriving and highly respected again.  (Applause.)  America’s enemies are on the run, America’s fortunes are on the rise, and America’s future is blazing bright.\nThe years of economic decay are over.  (Applause.)  The days of our country being used, taken advantage of, and even scorned by other nations are long behind us.  (Applause.)  Gone too are the broken promises, jobless recoveries, tired platitudes, and constant excuses for the depletion of American wealth, power, and prestige.\nIn just three short years, we have shattered the mentality of American decline, and we have rejected the downsizing of America’s destiny.  We have totally rejected the downsizing.  We are moving forward at a pace that was un

In [22]:
# Intermediate results, kept because return_intermediate_steps=True.
output["intermediate_steps"]

["\n\nPresident Trump has overseen an incredible revival of the U.S. economy since taking office, resulting in record-low unemployment, job growth, and poverty levels. African American youth unemployment has reached an all-time low, African American poverty has declined to the lowest rate ever recorded, and the unemployment rate for women has reached the lowest level in almost 70 years. Under the Trump administration, 7 million Americans have come off food stamps, and 10 million people have been lifted off of welfare. Workers without a high school diploma have achieved the lowest unemployment rate recorded in U.S. history, and a record number of young Americans are now employed. U.S. stock markets have soared 70 percent, adding more than $12 trillion to our nation’s wealth, and consumer confidence has reached amazing new highs. Opportunity Zones are supporting Americans in neglected neighborhoods, providing job opportunities and investments. The USMCA trade deal has created nearly 100,

In [23]:
# Final combined summary text.
output["output_text"]

"\n\nSince taking office, President Trump has overseen an incredible revival of the US economy, resulting in record-low unemployment, job growth, and poverty levels, with gains particularly for African Americans and women. Tariffs have been successful in bringing billions of dollars into the US treasury, and the USMCA trade deal has created nearly 100,000 new jobs. The US has invested a record-breaking $2.2 trillion in the military, and the Trump administration has taken action against terrorism, pharmaceutical companies, and sanctuary cities. This commitment to putting America first has brought a stronger future with investments into national security, the space program, and education. \n\nPresident Obama, in his final State of the Union address, called for a focus on the future and encouraged Americans to embrace change and work together for progress on issues like criminal justice reform, immigration reform, and gun violence prevention. He also proposed providing pre-K for all, offe

Let's try using a stuff chain (`chain_type="stuff"`) as the combine chain of the MapReduceDocumentsChain. The MapReduceDocumentsChain is recursive if necessary: it breaks a longer list of documents (after the initial map step) up into multiple shorter lists that can be processed without LLM context window issues.

In [24]:
# "stuff" summarize chain on llm2, used below as the combine step.
stuff_chain = load_summarize_chain(chain_type="stuff", llm=llm2)

In [25]:
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain

# Same map step as before (RefineLLMChain), but the combine step is now the
# stuff chain. Intermediate results are kept in the output.
map_reduce_chain = MapReduceDocumentsChain(
    return_intermediate_steps=True,
    combine_document_chain=stuff_chain,
    llm_chain=refineLLMChain,
)

In [None]:
# Re-run the full set of speeches through the stuff-based map-reduce chain.
output = map_reduce_chain(dict(input_documents=docs))

processing 0
processing 1
processing 2


In [None]:
# Intermediate results, kept because return_intermediate_steps=True.
output["intermediate_steps"]

In [None]:
# Final combined summary text produced by the stuff chain.
output["output_text"]