In [None]:
! pip install langchain

In [1]:
data_dir = 'data/state_of_the_union/'

In [2]:
import os

paths = [os.path.join(data_dir, f) for f in os.listdir(data_dir)]
texts = [open(path, 'r').read().replace('\n\n', '\n') for path in paths if path.endswith('.txt')]

In [3]:
from langchain.text_splitter import CharacterTextSplitter

In [4]:
text_splitter = CharacterTextSplitter(separator = ".", chunk_size=3000, chunk_overlap=100)
docs = text_splitter.create_documents([texts[0]])

In [5]:
from langchain.prompts.base import BasePromptTemplate
from langchain import LLMChain, OpenAI, PromptTemplate
from langchain.text_splitter import TextSplitter
from typing import Any, Dict, Optional, List
from langchain.callbacks.manager import CallbackManagerForChainRun
from langchain.chains.summarize import load_summarize_chain, refine_prompts
from langchain.schema import LLMResult, Generation


class RefineLLMChain(LLMChain):
    """Chain to run queries against LLMs. Queries can be large and will be broken up and
    processed incrementally using the Refine summarize strategy. Only one input key allowed.

    Example:
        .. code-block:: python

            from langchain import LLMChain, OpenAI, PromptTemplate
            prompt_template = "Tell me a {adjective} joke"
            prompt = PromptTemplate(
                input_variables=["adjective"], template=prompt_template
            )
            llm = LLMChain(llm=OpenAI(), prompt=prompt)
    """
    
    text_splitter: TextSplitter
    """TextSplitter used to split up the input text if necessary (e.g. input text is too long to use for a single call
    to LLM)
    """
    question_prompt: BasePromptTemplate = refine_prompts.PROMPT
    """Question prompt for refine chain
    """
    refine_prompt: BasePromptTemplate = refine_prompts.REFINE_PROMPT
    """Refine prompt for refine chain
    """

    # def _call(
    #     self,
    #     inputs: Dict[str, Any],
    #     run_manager: Optional[CallbackManagerForChainRun] = None,
    # ) -> Dict[str, str]:
    #     text = inputs[self.prompt.input_variables[0]] # do some input validation here. checking that there is only one key in the dict
    #     docs = self.text_splitter.create_documents([text])
    #     with open("doc_nums.txt", "a") as myfile:
    #         myfile.write(f"appended text {len(docs)}")
    #     refine_chain = load_summarize_chain(self.llm, chain_type="refine")
    #     output = refine_chain({"input_documents": docs}, return_only_outputs=True)
    #     return {'text': output['output_text']}


    def generate( # this is a hack and should be refactored
        self,
        input_list: List[Dict[str, Any]],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> LLMResult:
        """Generate LLM result from inputs."""
        # prompts, stop = self.prep_prompts(input_list, run_manager=run_manager)
        # output = self.llm.generate_prompt(
        #     prompts,
        #     stop,
        #     callbacks=run_manager.get_child() if run_manager else None,
        #     **self.llm_kwargs,
        # )
        print('question prompot', self.question_prompt)
        refine_chain = load_summarize_chain(
            self.llm, chain_type="refine",
            refine_prompt=self.refine_prompt,
            question_prompt=self.question_prompt, 
        )
        generations = []
        for input_item in input_list:
            text = input_item[self.prompt.input_variables[0]] # do some input validation here. checking that there is only one key in the dict
            docs = self.text_splitter.create_documents([text])
            output = refine_chain(
                {"input_documents": docs}, return_only_outputs=True
            )
            generations.append([Generation(text=output["output_text"])])
        return LLMResult(generations=generations)
    
    """
    todo: implement async call
    """

In [6]:
llm1 = OpenAI()
llm2 = OpenAI() # can probably just use one variable here

In [7]:
refine_text_splitter = CharacterTextSplitter(separator = ".", chunk_size=1000, chunk_overlap=100)

In [8]:
prompt_template = "{text}"
prompt = PromptTemplate(
    input_variables=["text"], template=prompt_template
) # kinda hacky but works
refineLLMChain = RefineLLMChain(llm=llm1, prompt=prompt, text_splitter=refine_text_splitter)

Test the output of the custom RefineLLMChain

In [9]:
refineLLMChain({"text": docs[0].page_content})

question prompot input_variables=['text'] output_parser=None partial_variables={} template='Write a concise summary of the following:\n\n\n"{text}"\n\n\nCONCISE SUMMARY:' template_format='f-string' validate_template=True


{'text': '\n\nThree years ago, the US launched the "great American comeback" and the results are impressive. Jobs are booming, poverty is plummeting, and the US is highly respected again. The mentality of American decline and the downsizing of America\'s destiny have been rejected, resulting in an increase in wealth, power, and prestige. Since taking office, President Trump has slashed a record number of job-killing regulations, enacted historic and record-setting tax cuts, and fought for fair and reciprocal trade agreements, resulting in the creation of 7 million new jobs and an unemployment rate lower than any other administration in the history of the US. We have totally rejected the downsizing and are moving forward at a pace that was unimaginable just a short time ago. The economy is the best it has ever been, the military is rebuilt with unmatched power, our borders are secure, families are flourishing, values are renewed, pride is restored, and the state of our Union is stronger

In [10]:
refine_chain = load_summarize_chain(llm2, chain_type="refine")

In [11]:
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain

map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=refineLLMChain,
    combine_document_chain=refine_chain,
    return_intermediate_steps=True
)

Test the custom MapReduceDocumentChain that uses a refine summarizing strategy for both the map and reduce steps

In [None]:
output = map_reduce_chain({"input_documents": docs[:3]})

question prompot input_variables=['text'] output_parser=None partial_variables={} template='Write a concise summary of the following:\n\n\n"{text}"\n\n\nCONCISE SUMMARY:' template_format='f-string' validate_template=True


In [None]:
output

### Now we will use a custom MapReduceDocumentsChain to summarize and contrast the State of the Union speeches of President Obama, Trump, and Biden

In [None]:
prompt_template = """Write a concise summary of the following:

{text}

CONCISE SUMMARY:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
refine_template = (
    "Your job is to produce a final summary constrasting the state of the union speeches of President Biden, Trump, and Obama.\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary"
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary."
    "If the context isn't useful, return the original summary."
)
refine_prompt = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)

refine_chain = load_summarize_chain(llm2, chain_type="refine", question_prompt=prompt_template, refine_prompt=refine_prompt)

In [None]:
refine_text_splitter = CharacterTextSplitter(separator = ".", chunk_size=3000, chunk_overlap=100)

In [None]:
prompt_template = "{text}"
prompt = PromptTemplate(
    input_variables=["text"], template=prompt_template
) # kinda hacky but works
refineLLMChain = RefineLLMChain(llm=llm1, prompt=prompt, text_splitter=refine_text_splitter)

In [None]:
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain

map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=refineLLMChain,
    combine_document_chain=refine_chain,
    return_intermediate_steps=True
)

In [None]:
output = map_reduce_chain({"input_documents": docs})