In [2]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from dotenv import load_dotenv
import os
from langchain.document_loaders import PyPDFLoader
import PyPDF2

load_dotenv(dotenv_path='.env')
openai_api_key = os.getenv("OPENAI_API_KEY")

In [None]:

def summarize_the_pdf(
    file_dir: str,
    max_final_token: int,
    token_threshold: int,
    gpt_model: str,
    temperature: float,
    summarizer_llm_system_role: str,
    final_summarizer_llm_system_role: str,
    character_overlap: int
):
    """
    Summarizes the content of a PDF file using OpenAI's ChatGPT engine.

    Args:
        file_dir (str): The path to the PDF file.
        max_final_token (int): The maximum number of tokens in the final summary.
        token_threshold (int): The threshold for token count reduction.
        gpt_model (str): The ChatGPT engine model name.
        temperature (float): The temperature parameter for ChatGPT response generation.
        summarizer_llm_system_role (str): The system role for the summarizer.

    Returns:
        str: The final summarized content.
    """
    docs = []
    docs.extend(PyPDFLoader(file_dir).load())
    print(f"Document length: {len(docs)}")
    max_summarizer_output_token = int(
        max_final_token/len(docs)) - token_threshold
    full_summary = ""
    counter = 1
    print("Generating the summary..")
    # if the document has more than one pages
    if len(docs) > 1:
        for i in range(len(docs)):
            # NOTE: This part can be optimized by considering a better technique for creating the prompt. (e.g: lanchain "chunksize" and "chunkoverlap" arguments.)

            if i == 0:  # For the first page
                prompt = docs[i].page_content + \
                    docs[i+1].page_content[:character_overlap]
            # For pages except the fist and the last one.
            elif i < len(docs)-1:
                prompt = docs[i-1].page_content[-character_overlap:] + \
                    docs[i].page_content + \
                    docs[i+1].page_content[:character_overlap]
            else:  # For the last page
                prompt = docs[i-1].page_content[-character_overlap:] + \
                    docs[i].page_content
            summarizer_llm_system_role = summarizer_llm_system_role.format(
                max_summarizer_output_token)
            full_summary += Summarizer.get_llm_response(
                gpt_model,
                temperature,
                summarizer_llm_system_role,
                prompt=prompt
            )
    else:  # if the document has only one page
        full_summary = docs[0].page_content

        print(f"Page {counter} was summarized. ", end="")
        counter += 1
    print("\nFull summary token length:", count_num_tokens(
        full_summary, model=gpt_model))
    final_summary = Summarizer.get_llm_response(
        gpt_model,
        temperature,
        final_summarizer_llm_system_role,
        prompt=full_summary
    )
    return final_summary


In [6]:
file_dir = 'dados/bitcoin.pdf'
docs = []
docs.extend(PyPDFLoader(file_dir).load())
print(f"Document length: {len(docs)}")
max_final_token = 1000
token_threshold = 100
max_summarizer_output_token = int(
    max_final_token/len(docs)) - token_threshold
full_summary = ""
counter = 1
print("Generating the summary..")

Document length: 9
Generating the summary..


In [7]:
character_overlap=50
i=0
if i == 0:  # For the first page
    prompt = docs[i].page_content + \
        docs[i+1].page_content[:character_overlap]
# For pages except the fist and the last one.
elif i < len(docs)-1:
    prompt = docs[i-1].page_content[-character_overlap:] + \
        docs[i].page_content + \
        docs[i+1].page_content[:character_overlap]
else:  # For the last page
    prompt = docs[i-1].page_content[-character_overlap:] + \
        docs[i].page_content
print(prompt)


Bitcoin: Um Sistema de Dinheiro Eletrônico Peer-to-Peer
Satoshi Nakamoto
satoshin@gmx.com
www.bitcoin.org
Traduzido para Português de bitcoin.org/bitcoin.pdf
por Rodrigo Silva Pinto - http://linkedin.com/in/rodrigosilvap
Resumo. Uma versão puramente peer-to-peer de dinheiro eletrônico permitiria que
pagamentos  on-line  fossem enviados  diretamente  de uma parte  para  outra,  sem
passar por uma instituição financeira. As assinaturas digitais fornecem parte da
solução, mas os principais benefícios são perdidos se um terceiro confiável ainda é
necessário para evitar o gasto duplo. Nós propomos uma solução para o problema de
gasto duplo usando uma rede peer-to-peer. A rede insere data e hora nas transações
através de um hash em uma cadeia contínua de prova-de-trabalho à base de hash,
formando um registro que não pode ser alterado sem refazer a prova-de-trabalho. A
cadeia mais longa não só serve como prova da seqüência de eventos testemunhados,
mas prova de que ela veio do maior pool de C

In [8]:
summarizer_llm_system_role = summarizer_llm_system_role.format(
                max_summarizer_output_token)

NameError: name 'summarizer_llm_system_role' is not defined