In [32]:
!pip install --upgrade langchain langchain-core langchain-text-splitters openai
!pip install langchain-community
!pip install fpdf
!pip install pypdf
!pip install -U langchain
!pip install tiktoken
!pip install faiss-cpu
!pip install PyMuPDF tiktoken tqdm fpdf openai==0.28


Collecting openai==0.28
  Using cached openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Using cached openai-0.28.0-py3-none-any.whl (76 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.57.2
    Uninstalling openai-1.57.2:
      Successfully uninstalled openai-1.57.2
Successfully installed openai-0.28.0


In [42]:
import openai
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from fpdf import FPDF


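# OpenAI API key, read by the functions below through the module-level name `api_key`.
# Uncomment and set it before running; prefer loading it from an environment variable
# over pasting a real key into the notebook.
# api_key = 'test-using-your-own-api'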


def index_pdf(pdf_path, chunk_size=1000, chunk_overlap=200, openai_api_key=None):
    """
    Index the content of a PDF file for retrieval.

    :param pdf_path: Path to the PDF file.
    :param chunk_size: Maximum chunk size handed to the text splitter.
    :param chunk_overlap: Overlap between consecutive chunks.
    :param openai_api_key: Optional OpenAI API key. When omitted, the notebook-level
        ``api_key`` is used if it has been defined; otherwise ``None`` is passed on so
        the embeddings client can resolve the key itself (LangChain documents an
        ``OPENAI_API_KEY`` environment-variable fallback).
    :return: A FAISS index of the document content.
    :raises ValueError: If the PDF yields no text chunks to index.
    """
    if openai_api_key is None:
        # Look the global up defensively: a bare `api_key` reference raises NameError
        # on a fresh kernel where the assignment is still commented out.
        openai_api_key = globals().get("api_key")

    pages = PyPDFLoader(pdf_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(pages)
    if not chunks:
        # Fail with a clear message instead of an obscure error from the vector store.
        raise ValueError(f"No extractable text found in '{pdf_path}'; nothing to index.")

    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return FAISS.from_documents(chunks, embeddings)

def generate_dynamic_query(book_title, focus_areas=None, target_length=None, chars_per_page=750):
    """
    Dynamically generate a query for summarization based on the book title and focus areas.

    :param book_title: Title of the book to summarize.
    :param focus_areas: Optional list of focus areas (themes, characters, events, etc.).
        A single string is treated as one focus area.
    :param target_length: Optional target length for the summary (in characters).
    :param chars_per_page: Characters assumed per page when converting ``target_length``
        into the page estimate quoted in the query (750 gives 20 pages for 15000 characters).
    :return: A dynamic query string for LangChain summarization.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(focus_areas, str):
        focus_areas = [focus_areas]

    parts = [
        f"Provide a detailed summary of '{book_title}', covering key events, themes, "
        "character development, and philosophical insights."
    ]

    if focus_areas:
        parts.append(" Focus on the following aspects: " + ", ".join(focus_areas) + ".")

    parts.append(
        " The summary should be in-depth, explaining the major turning points and "
        "the overall impact of the novel's conclusion."
    )

    if target_length:
        # Derive the page estimate from the requested length instead of always saying 20.
        pages = max(1, round(target_length / chars_per_page))
        page_word = "page" if pages == 1 else "pages"
        parts.append(
            f" Target the summary length to be around {target_length} characters, "
            f"representing approximately {pages} {page_word}."
        )

    return "".join(parts)

def generate_summary_with_langchain(index, query, target_length=None, k=10,
                                    chain_type="map_reduce", model="gpt-4",
                                    openai_api_key=None):
    """
    Use LangChain to generate a summary based on indexed content.

    The query is used only to retrieve chunks from the index; the retrieved chunks
    are then passed to a LangChain summarization chain.

    :param index: FAISS index of the document content.
    :param query: Query to retrieve relevant content for summarization.
    :param target_length: Optional maximum length for the summary (in characters).
        Longer summaries are cut back to the last whole word that fits.
    :param k: Number of chunks to retrieve from the index.
    :param chain_type: Chain type passed to ``load_summarize_chain``.
    :param model: Chat model name passed to ``ChatOpenAI``.
    :param openai_api_key: Optional OpenAI API key. When omitted, the notebook-level
        ``api_key`` is used if it has been defined; otherwise ``None`` is passed on so
        the client can resolve the key itself (LangChain documents an
        ``OPENAI_API_KEY`` environment-variable fallback).
    :return: The generated summary text.
    :raises ValueError: If the index returns no documents for the query.
    """
    if openai_api_key is None:
        # Look the global up defensively: a bare `api_key` reference raises NameError
        # on a fresh kernel where the assignment is still commented out.
        openai_api_key = globals().get("api_key")

    # Retrieve relevant chunks from the index
    docs = index.similarity_search(query, k=k)
    if not docs:
        raise ValueError("No content was retrieved from the index for the given query.")

    # Initialize LangChain's summarization chain
    llm = ChatOpenAI(model=model, openai_api_key=openai_api_key)
    chain = load_summarize_chain(llm, chain_type=chain_type)

    # Generate summary using LangChain
    summary = chain.run(docs)

    # Enforce the length cap without ending in the middle of a word.
    if target_length is not None and 0 < target_length < len(summary):
        clipped = summary[:target_length]
        cut_inside_word = not clipped[-1].isspace() and not summary[target_length].isspace()
        if cut_inside_word:
            # Drop the partial trailing word; a single unbroken word keeps the hard cut.
            clipped = clipped.rsplit(None, 1)[0]
        summary = clipped.rstrip()

    return summary


def save_summary_as_pdf(summary, filename="summary.pdf"):
    """
    Save the provided summary as a PDF file.

    The text is first reduced to Latin-1, because FPDF's built-in fonts cannot encode
    anything else and would otherwise fail with UnicodeEncodeError when writing the file.
    Common typographic punctuation is mapped to ASCII equivalents; any remaining
    characters outside Latin-1 are dropped.

    :param summary: The text of the summary to save.
    :param filename: The name of the output PDF file.
    """
    typographic_to_ascii = str.maketrans({
        "\u2018": "'", "\u2019": "'",   # curly single quotes
        "\u201c": '"', "\u201d": '"',   # curly double quotes
        "\u2013": "-", "\u2014": "-",   # en dash / em dash
        "\u2026": "...",                # ellipsis
    })
    printable = (
        summary.translate(typographic_to_ascii)
        .encode("latin-1", "ignore")
        .decode("latin-1")
    )

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Add summary content to the PDF, one multi_cell per source line
    for line in printable.split("\n"):
        pdf.multi_cell(0, 10, line)

    pdf.output(filename)

if __name__ == "__main__":
    pdf_path = "crime-and-punishment.pdf"  # Replace with your PDF file path

    # Define the book title and optional focus areas
    book_title = "Crime and Punishment"
    focus_areas = [
        "moral dilemmas",
        "psychological complexities of Rodion Raskolnikov",
        "character interactions with Sonia, Svidrigailov, and Porfiry"
    ]
    target_length = 15000  # Approx. characters for 20 PDF pages

    try:
        # Step 1: Index the PDF content
        print("Indexing PDF content...")
        index = index_pdf(pdf_path)

        # Step 2: Generate dynamic query for the summary
        query = generate_dynamic_query(book_title, focus_areas, target_length)
        print(f"Generated Query: {query}")

        # Step 3: Generate detailed summary
        print("Generating summary...")
        summary = generate_summary_with_langchain(index, query, target_length)

        # Step 4: Print the summary before touching the filesystem, so the result
        # is not lost if writing the PDF fails.
        print("Generated Summary:")
        print(summary)

        # Step 5: Save the summary as a PDF
        output_filename = "Crime_and_Punishment_Summary.pdf"
        print("Saving summary to PDF...")
        save_summary_as_pdf(summary, filename=output_filename)

        print(f"Summary saved as '{output_filename}'")
    except Exception as e:
        print(f"An error occurred: {e}")


Indexing PDF content...
Generated Query: Provide a detailed summary of 'Crime and Punishment', covering key events, themes, character development, and philosophical insights. Focus on the following aspects: moral dilemmas, psychological complexities of Rodion Raskolnikov, character interactions with Sonia, Svidrigailov, and Porfiry. The summary should be in-depth, explaining the major turning points and the overall impact of the novel's conclusion. Target the summary length to be around 15000 characters, representing approximately 20 pages.
Generating summary...
Saving summary to PDF...
Generated Summary:
In Dostoevsky's "Crime and Punishment," protagonist Rodion Raskolnikov, a former university student, is serving a prison sentence in Siberia for murder. Despite his crimes, Raskolnikov is shown to have a compassionate side, having financially supported an ill fellow student and his elderly father, as well as saving two children from a burning building. He has severed ties with his fam

In [44]:
import openai
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from fpdf import FPDF


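# OpenAI API key, read by the functions below through the module-level name `api_key`.
# Uncomment and set it before running; prefer loading it from an environment variable
# over pasting a real key into the notebook.
# api_key = 'test-using-your-own-api'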


def index_pdf(pdf_path, chunk_size=1000, chunk_overlap=200, openai_api_key=None):
    """
    Index the content of a PDF file for retrieval.

    :param pdf_path: Path to the PDF file.
    :param chunk_size: Maximum chunk size handed to the text splitter.
    :param chunk_overlap: Overlap between consecutive chunks.
    :param openai_api_key: Optional OpenAI API key. When omitted, the notebook-level
        ``api_key`` is used if it has been defined; otherwise ``None`` is passed on so
        the embeddings client can resolve the key itself (LangChain documents an
        ``OPENAI_API_KEY`` environment-variable fallback).
    :return: A FAISS index of the document content.
    :raises ValueError: If the PDF yields no text chunks to index.
    """
    if openai_api_key is None:
        # Look the global up defensively: a bare `api_key` reference raises NameError
        # on a fresh kernel where the assignment is still commented out.
        openai_api_key = globals().get("api_key")

    pages = PyPDFLoader(pdf_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(pages)
    if not chunks:
        # Fail with a clear message instead of an obscure error from the vector store.
        raise ValueError(f"No extractable text found in '{pdf_path}'; nothing to index.")

    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return FAISS.from_documents(chunks, embeddings)

def generate_summary_with_langchain(index, query, target_length=None, k=20,
                                    chain_type="refine", model="gpt-4",
                                    openai_api_key=None):
    """
    Use LangChain to generate a summary based on indexed content.

    The query is used only to retrieve chunks from the index; the retrieved chunks
    are then passed to a LangChain summarization chain.

    :param index: FAISS index of the document content.
    :param query: Query to retrieve relevant content for summarization.
    :param target_length: Optional maximum length for the summary (in characters).
        Longer summaries are cut back to the last whole word that fits.
    :param k: Number of chunks to retrieve from the index; raise it for more content.
    :param chain_type: Chain type passed to ``load_summarize_chain``
        (for example 'refine', 'stuff' or 'map_reduce').
    :param model: Chat model name passed to ``ChatOpenAI``.
    :param openai_api_key: Optional OpenAI API key. When omitted, the notebook-level
        ``api_key`` is used if it has been defined; otherwise ``None`` is passed on so
        the client can resolve the key itself (LangChain documents an
        ``OPENAI_API_KEY`` environment-variable fallback).
    :return: The generated summary text.
    :raises ValueError: If the index returns no documents for the query.
    """
    if openai_api_key is None:
        # Look the global up defensively: a bare `api_key` reference raises NameError
        # on a fresh kernel where the assignment is still commented out.
        openai_api_key = globals().get("api_key")

    # Retrieve relevant chunks from the index
    docs = index.similarity_search(query, k=k)
    if not docs:
        raise ValueError("No content was retrieved from the index for the given query.")

    # Initialize LangChain's summarization chain
    llm = ChatOpenAI(model=model, openai_api_key=openai_api_key)
    chain = load_summarize_chain(llm, chain_type=chain_type)

    # Generate summary using LangChain
    summary = chain.run(docs)

    # Enforce the length cap without ending in the middle of a word.
    if target_length is not None and 0 < target_length < len(summary):
        clipped = summary[:target_length]
        cut_inside_word = not clipped[-1].isspace() and not summary[target_length].isspace()
        if cut_inside_word:
            # Drop the partial trailing word; a single unbroken word keeps the hard cut.
            clipped = clipped.rsplit(None, 1)[0]
        summary = clipped.rstrip()

    return summary


# Function to save the summary to a PDF
def save_summary_to_pdf(summary: str, output_pdf_path: str):
    """
    Save the provided summary as a PDF file.

    FPDF's built-in fonts (Helvetica included) cannot encode text outside Latin-1,
    so the summary is reduced to Latin-1 before it is written. Common typographic
    punctuation is mapped to ASCII equivalents first so it is not silently lost;
    any remaining characters outside Latin-1 are dropped.

    :param summary: The text of the summary to save.
    :param output_pdf_path: The path of the output PDF file.
    """
    typographic_to_ascii = str.maketrans({
        "\u2018": "'", "\u2019": "'",   # curly single quotes
        "\u201c": '"', "\u201d": '"',   # curly double quotes
        "\u2013": "-", "\u2014": "-",   # en dash / em dash
        "\u2026": "...",                # ellipsis
    })
    printable = (
        summary.translate(typographic_to_ascii)
        .encode('latin-1', 'ignore')
        .decode('latin-1')
    )

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Helvetica", size=12)

    # multi_cell wraps long lines and honours the newlines in the text
    pdf.multi_cell(0, 10, printable)

    pdf.output(output_pdf_path)

if __name__ == "__main__":
    pdf_path = "crime-and-punishment.pdf"  # Replace with your PDF file path

    try:
        # Step 1: Index the PDF content
        print("Indexing PDF content...")
        index = index_pdf(pdf_path)

        # Step 2: Generate detailed summary.
        # Adjacent string literals are concatenated as-is, so every sentence except
        # the last carries its own trailing space.
        query = (
            "Provide a detailed summary of 'Crime and Punishment', covering key events, "
            "themes, character development, and philosophical insights. Focus on the moral dilemmas "
            "and psychological complexities of the protagonist, Rodion Raskolnikov, and how his actions affect the plot. "
            "Also, highlight the interactions with key characters like Sonia, Svidrigailov, and Porfiry. "
            "The summary should be in-depth, explaining the major turning points and the overall impact of the novel's conclusion. "
            "Target the summary length to be around 25,000 characters, representing approximately 30 pages."
        )
        target_length = 25000  # Upper bound on the summary length, in characters
        print("Generating summary...")
        summary = generate_summary_with_langchain(index, query, target_length)

        # Step 3: Print the summary before touching the filesystem, so the result
        # is not lost if writing the PDF fails.
        print("Generated Summary:")
        print(summary)

        # Step 4: Save the summary as a PDF using the writer defined in this cell
        output_filename = "Crime_and_Punishment_Summary.pdf"
        print("Saving summary to PDF...")
        save_summary_to_pdf(summary, output_filename)

        print(f"Summary saved as '{output_filename}'")
    except Exception as e:
        print(f"An error occurred: {e}")


Indexing PDF content...
Generating summary...
Saving summary to PDF...
Generated Summary:
In the remote Russian fortress, former university student Rodion Raskolnikov is serving an eight-year sentence. His past altruistic deeds haunt him, as does his mother, Pulcheria Alexandrovna, who is deeply grieved by her daughter's marriage and Raskolnikov's disturbed mental state. Raskolnikov's suffering is exacerbated by his mother's illness, vivid nightmares, and his unending obsession with Svidrigaïlov, a menacing figure from his past. He finds solace in deep introspection with Sonia, the daughter of the late Mr. Marmeladov who was tragically killed before Raskolnikov's eyes. The narrative becomes more complex with the sudden appearance of Pyotr Petrovitch, a potential suitor, who brings news of an unexpected inheritance for Dounia, further complicating the family dynamics. Despite the strained relationships and emotional turmoil, Raskolnikov announces his separation from his family, stating 