# PDF2Podcast Test Notebook

This notebook demonstrates the features of the pdf2podcast library, including:  
- Text extraction from PDF  
- Text chunking  
- Semantic search  
- Podcast generation  

In [None]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv

# Put the parent of the current working directory on the import path so the
# local modules imported below can be resolved.
module_path = str(Path.cwd().parent)
if module_path not in sys.path:
    sys.path.append(module_path)

# Direct imports from local modules
from pdf2podcast import PodcastGenerator
from pdf2podcast.core.rag import AdvancedPDFProcessor as SimplePDFProcessor
from pdf2podcast.core.llm import GeminiLLM
from pdf2podcast.core.tts import AWSPollyTTS, GoogleTTS
from pdf2podcast.core.prompts import PodcastPromptBuilder
from pdf2podcast.core.processing import SimpleChunker, SemanticRetriever

## Setup

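Load the environment variables from `../.env`, check that `GENAI_API_KEY` is set, and verify that the test PDF (`./transformers.pdf`) exists.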

In [None]:
# Load environment variables from the .env file referenced by the relative
# path "../.env"
load_dotenv("../.env")

# Fail fast when the API key is missing: it is passed to the LLM
# configuration in the podcast generation cells.
api_key = os.getenv("GENAI_API_KEY")
if not api_key:
    raise ValueError("GENAI_API_KEY not found in the .env file")

# Verify that the test PDF exists. isfile() is used instead of exists() so
# that a directory with the same name is rejected as well.
PDF_PATH = "./transformers.pdf"
if not os.path.isfile(PDF_PATH):
    raise ValueError(f"PDF file not found: {PDF_PATH}")

## Basic Test: Text Extraction

In [None]:
# Build the PDF processor. SimplePDFProcessor is this notebook's import alias
# for AdvancedPDFProcessor.
processor = SimplePDFProcessor(
    metadata=True,
    extract_images=True,
    max_chars_per_chunk=4000,
)

# Process the test PDF and show a short preview of what came back
text = processor.process_document(PDF_PATH)
print(f"Extracted text length: {len(text)}")
print("\nFirst 500 characters:\n")
print(text[0:500])

## Chunking

In [None]:
# Create a chunker with its default constructor arguments
chunker = SimpleChunker()

# Split the extracted text, requesting chunk_size=4000
chunks = chunker.chunk_text(text, chunk_size=4000)

# Report how many chunks were produced, then show the first one
print("Number of chunks:", len(chunks))
print("\nFirst chunk:\n")
first_chunk = chunks[0]
print(first_chunk)

## Semantic Search Test

In [None]:
# Create a retriever and hand it the chunks produced by the chunking cell
retriever = SemanticRetriever()
retriever.add_texts(chunks)

# Run a sample query, requesting k=5 chunks
query = "Explain the main concepts of the paper"
relevant_chunks = retriever.get_relevant_chunks(query, k=5)

# Print the query followed by every returned chunk, numbered from 1
print(f"Most relevant chunks for the query: {query}")
for i, chunk in enumerate(relevant_chunks, start=1):
    print(f"\nChunk {i}:\n{chunk}")

## Complete Test: Podcast Generation

In [None]:
# Complete test: build a PodcastGenerator and produce a podcast from the
# test PDF. New chunker / retriever / prompt builder instances are created
# for this run.
chunker = SimpleChunker()
retriever = SemanticRetriever()
prompt_builder = PodcastPromptBuilder()

# PDF processor wired to the chunker and retriever created above
processor = SimplePDFProcessor(
    chunker=chunker,
    retriever=retriever,
    max_chars_per_chunk=6000,
    extract_images=True,
    metadata=True,
)

# Provider settings are named up front so the generator call stays short
llm_settings = {
    "api_key": api_key,
    "max_output_tokens": 8000,
    "temperature": 0.1,
    "prompt_builder": prompt_builder,
}
# Using Google TTS for testing
tts_settings = {
    "language": "en",
    "tld": "com",
    "slow": False,
}

generator = PodcastGenerator(
    rag_system=processor,
    llm_provider="gemini",
    tts_provider="google",
    llm_config=llm_settings,
    tts_config=tts_settings,
    chunker=chunker,
    retriever=retriever,
    k=5,
)

# Generate the podcast for one specific query
result = generator.generate(
    pdf_path=PDF_PATH,
    output_path="output.mp3",
    complexity="advanced",
    audience="experts",
    query="Explain in detail the functioning of the transformers architecture",
)

# Show the script, then the path and size reported for the audio
print("Generated script:\n")
print(result["script"])

print("\nAudio details:")
audio_info = result["audio"]
print(f"File: {audio_info['path']}")
print(f"Size: {audio_info['size']} bytes")

# Bare expression: as the last line of a notebook cell its value is displayed
len(result["script"])


## Custom Prompt

In [None]:
from typing import Dict, Any
from pdf2podcast.core.base import BasePromptBuilder


# Implementation of the Builders
class StorytellingPromptBuilder(BasePromptBuilder):
    """Prompt builder that asks the model for a storytelling-style episode."""

    def build_prompt(self, text: str, **kwargs) -> str:
        """Return the storytelling prompt with ``text`` embedded in it.

        Args:
            text: Content to be narrated; inserted under "Text to process:".
            **kwargs: Accepted but not used by this builder.

        Returns:
            The complete prompt string.
        """
        storytelling_prompt = f"""
        You are a storyteller tasked with creating an engaging and captivating podcast episode.
        Your goal is to narrate the content in a way that captures the listener's imagination and keeps them hooked.
        Use vivid descriptions, emotional language, and a clear structure to convey the main ideas and themes of the text.

        Text to process:
        {text}

        Please ensure the storytelling style is immersive and suitable for a podcast audience.
        """
        return storytelling_prompt

    def build_expand_prompt(self, text: str, **kwargs) -> str:
        """Return ``self.templates.get_expand_prompt(text, **kwargs)`` unchanged.

        NOTE(review): ``self.templates`` is not set in this class; presumably
        it is provided by ``BasePromptBuilder`` - confirm.
        """
        expand_prompt = self.templates.get_expand_prompt(text, **kwargs)
        return expand_prompt

In [None]:
# Test with StorytellingPromptBuilder: same providers and settings as the
# previous generation cell, but with the custom prompt builder. It reuses the
# `processor`, `chunker` and `retriever` objects created in that cell.
storytelling_generator = PodcastGenerator(
    rag_system=processor,
    llm_provider="gemini",
    tts_provider="google",
    llm_config={
        "api_key": api_key,
        "max_output_tokens": 8000,
        "temperature": 0.1,
        "prompt_builder": StorytellingPromptBuilder()
    },
    tts_config={
        "language": "en",
        "tld": "com",
        "slow": False
    },
    chunker=chunker,
    retriever=retriever,
)

# Generate the podcast with a specific query.
# The audio goes to its own file: reusing "output.mp3" would overwrite the
# podcast produced by the previous generation cell.
result = storytelling_generator.generate(
    pdf_path=PDF_PATH,
    output_path="output_storytelling.mp3",
    complexity="simple",
    audience="students",
    query="Explain the main concepts of the paper and the main results"
)

# Display the results
print("Generated script:\n")
print(result["script"])

print("\nAudio details:")
print(f"File: {result['audio']['path']}")
print(f"Size: {result['audio']['size']} bytes")

# Bare expression: as the last line of a notebook cell its value is displayed
len(result["script"])