# PDF Extraction Tool - Notebook Version

This notebook processes PDF files using the Gemini AI API to answer research questions.


In [None]:
import os
import sys
import yaml
import logging
from datetime import datetime

# Import the PDF analyzer from modules directory
from modules.llm_extractor import LLMExtractor

# Startup banner: tool name, wall-clock start time, and a readiness note.
print("=== PDF Analysis Tool - Notebook Version ===")
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Started at: {started_at}")
print("✓ Environment initialized - PDF analyzer module will handle logging")

In [None]:
# Cell 2: Configuration Loading
def load_config(config_file="config/config.yaml"):
    """Parse a YAML configuration file and return its contents.

    Args:
        config_file: Path of the YAML file to read (opened as UTF-8).

    Returns:
        Whatever ``yaml.safe_load`` produces for the file, or ``None`` when
        the file does not exist or is not valid YAML. In both failure cases
        an error message is printed instead of an exception being raised.
    """
    parsed = None
    try:
        with open(config_file, mode='r', encoding='utf-8') as handle:
            parsed = yaml.safe_load(handle)
    except FileNotFoundError:
        # Missing file: tell the user how to create one from the template.
        print(f"ERROR: Configuration file '{config_file}' not found")
        print("Please create config.yaml based on config.template.yaml")
    except yaml.YAMLError as yaml_error:
        # The file exists but could not be parsed.
        print(f"ERROR: Invalid YAML in configuration file: {yaml_error}")
    return parsed

# Read the project configuration; load_config returns None on failure (and an
# empty file is falsy too), so either case stops the notebook here.
print("Loading configuration...")
config = load_config("config/config.yaml")

if config:
    print("✓ Configuration loaded successfully")
else:
    raise Exception("Failed to load configuration")

In [None]:
# Cell 3: API Key and Model Setup
# Reads the 'gemini', 'paths' and 'options' sections of `config`, resolves the
# API key, model name and file paths, and creates the output directories.
print("=== Setting up Gemini AI ===")


def _ensure_parent_dir(path):
    """Create the directory that will contain `path`, if it has one.

    A bare filename such as 'output.md' has an empty dirname, and
    os.makedirs('') raises FileNotFoundError, so that case is skipped:
    the file will simply be created in the current working directory.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)


# Get configuration values. `or {}` also covers a section that is present but
# empty in the YAML (its value is then None, not a missing key).
gemini_config = config.get('gemini') or {}
paths_config = config.get('paths') or {}
options_config = config.get('options') or {}

# Get API key from config, falling back to the environment
api_key = gemini_config.get('api_key') or os.getenv('GEMINI_API_KEY')

if not api_key:
    raise Exception("No Gemini API key found. Either set 'api_key' in config.yaml or set GEMINI_API_KEY environment variable")

# Get model name and input locations
model_name = gemini_config.get('model', 'gemini-2.0-flash-exp')
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
pdf_dir = paths_config.get('pdf_directory', 'pdfs')
questions_file = paths_config.get('questions_file', 'config/questions.yaml')

# Substitute the '{timestamp}' placeholder in output paths. str.replace is a
# no-op when the placeholder is absent, so no membership test is needed.
output_file = paths_config.get('output_file', 'output.md')
output_file = output_file.replace('{timestamp}', timestamp)

output_xlsx = paths_config.get('output_xlsx', None)  # Optional Excel output path
if output_xlsx:
    output_xlsx = output_xlsx.replace('{timestamp}', timestamp)

log_file = paths_config.get('log_file', None)  # Optional log file path

# Create necessary directories (optional paths are skipped when unset)
_ensure_parent_dir(output_file)
if output_xlsx:
    _ensure_parent_dir(output_xlsx)
if log_file:
    _ensure_parent_dir(log_file)

max_text_length = options_config.get('max_text_length', 30000)

print(f"✓ Model: {model_name}")
print(f"✓ Output: {output_file}")
if output_xlsx:
    print(f"✓ Excel Output: {output_xlsx}")
print(f"✓ Log File: {log_file if log_file else 'Console only'}")
print(f"✓ Ready to process PDFs")

In [None]:
# Cell 4: File Validation and PDF Discovery
# Verifies the input locations exist and builds `pdf_files`, the list of PDF
# file names (not full paths) that the later cells will process.
print("=== Discovering PDFs ===")

# Upper bound on how many PDFs a single run handles (as per original script)
max_pdfs = 10

# Check if required inputs exist. isdir (rather than exists) also rejects a
# regular file at that path, which os.listdir below could not handle.
if not os.path.isdir(pdf_dir):
    raise Exception(f"PDF directory '{pdf_dir}' not found")

if not os.path.exists(questions_file):
    raise Exception(f"Questions file '{questions_file}' not found")

# Collect PDF files. os.listdir returns entries in arbitrary order, so sort
# them: otherwise "the first 10" could be a different subset on each run.
pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf'))

if len(pdf_files) > max_pdfs:
    print(f"📋 Limiting to first {max_pdfs} files for processing")
    pdf_files = pdf_files[:max_pdfs]

if not pdf_files:
    raise Exception("No PDF files found to process")

print(f"✓ Found {len(pdf_files)} PDF files to process")

In [None]:
# Cell 5: Initialize Analyzer and Load Questions
print("=== Initializing Analyzer ===")

try:
    # Build the extractor from the API key, model name and configured log file
    analyzer = LLMExtractor(api_key, model_name, log_file)

    # load_questions returns a pair: the questions and the additional instructions
    loaded = analyzer.load_questions(questions_file)
    questions, additional_instructions = loaded

    print(f"✓ Analyzer ready with {len(questions)} questions")
    if log_file:
        print(f"✓ Logging configured to file (with timestamped filename)")

except Exception as init_error:
    # Report the failure in the cell output, then let it propagate so the
    # notebook stops instead of continuing without a usable analyzer.
    print(f"❌ Error initializing analyzer: {init_error}")
    raise

In [None]:
# Cell 6: Prepare Output File
print("=== Preparing Output ===")

# Start the report from scratch: mode 'w' truncates anything already at this
# path before the header is written.
with open(output_file, 'w', encoding='utf-8') as report:
    header_lines = [
        "# PDF Analysis Results\n\n",
        f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"Model used: {model_name}\n",
        f"Files to process: {len(pdf_files)}\n\n",
        "---\n\n",
    ]
    report.writelines(header_lines)

print(f"✓ Output file ready: {output_file}")

### Process files! If you want a progress bar, skip this cell and go to the next one

In [None]:
# Cell 7: Process PDFs (PDF Analyzer Module Handles All Logging)
print("=== Starting PDF Analysis ===")
print(f"📊 Total work: {len(pdf_files)} PDFs × {len(questions)} questions = {len(pdf_files) * len(questions)} tasks")
analyzer.set_quiet_mode(False)

# Tallies reported in the closing summary
processed_count, error_count = 0, 0
results_summary = []

# Re-read the questions and additional instructions from the questions file
questions, additional_instructions = analyzer.load_questions(questions_file)

# Handle the PDFs one at a time; a failure on one file is counted and the
# loop moves on to the next.
for i, pdf_file in enumerate(pdf_files, 1):
    pdf_path = os.path.join(pdf_dir, pdf_file)
    pdf_title = analyzer.get_pdf_title(pdf_file)

    try:
        print(f"\n🔍 Processing ({i}/{len(pdf_files)}): {pdf_file}")
        # Ask every question about this PDF in one call
        qa_results = analyzer.process_pdf(pdf_path, questions, additional_instructions)
        if not qa_results:
            error_count += 1
            print(f"⚠️ No results generated for {pdf_file}")
        else:
            # Count the file first, then write its answers to the markdown report
            processed_count += 1
            analyzer.update_markdown_file(output_file, pdf_title, qa_results)

    except Exception as pdf_error:
        print(f"❌ Error processing {pdf_file}: {pdf_error}")
        error_count += 1

# Final summary: success rate over every file that was attempted
attempted = processed_count + error_count
if attempted > 0:
    final_rate = f"{processed_count/attempted*100:.1f}%"
else:
    final_rate = "No files processed"
print(f"\n🎉 Analysis complete!")
print(f"📊 Final Results: {processed_count} processed, {error_count} errors")
print(f"📈 Success Rate: {final_rate}")

## BONUS: with progress bar!

In [None]:
# Cell 8: Process PDFs with Progress Bar
from tqdm.notebook import tqdm

print("=== Starting PDF Analysis with Progress Bar ===")

# One task = one question asked about one PDF
questions_per_pdf = len(questions)
total_tasks = len(pdf_files) * questions_per_pdf
print(f"📊 Total work: {len(pdf_files)} PDFs × {questions_per_pdf} questions = {total_tasks} tasks")

# Initialize counters for final summary
processed_count = 0
error_count = 0
results_summary = []
failed_files = []  # Track which files failed
all_results = {}  # pdf_title -> {question: answer}, kept only for Excel export

# Generate column names for Excel export if xlsx output is configured
column_names = None
if output_xlsx:
    column_names = analyzer.generate_column_names_from_questions(questions)

# Enable quiet mode - logging will only go to file, not console.
# The try/finally guarantees it is switched back off even if the run is
# interrupted or an unexpected error escapes the loop.
analyzer.set_quiet_mode(True)
try:
    with tqdm(total=total_tasks, desc="Overall Progress", unit="task") as pbar:

        for i, pdf_file in enumerate(pdf_files, 1):

            pdf_path = os.path.join(pdf_dir, pdf_file)
            qa_results = {}
            ticks_done = 0  # progress-bar ticks already emitted for this PDF

            try:
                pbar.set_description(f"Processing PDF {i}/{len(pdf_files)}: {pdf_file[:30]}...")

                # Title lookup and upload live inside the try so that a single
                # unreadable PDF is recorded as failed instead of ending the run.
                pdf_title = analyzer.get_pdf_title(pdf_file)

                # Upload PDF file once; every question reuses the same upload
                uploaded_file = analyzer.upload_pdf_file(pdf_path)

                for question in questions:
                    # Second element of the returned pair is not used here
                    answer, _question_duration = analyzer.ask_question(
                        i, questions_per_pdf, uploaded_file, pdf_title, question, additional_instructions)
                    if answer:
                        qa_results[question] = answer
                        # Write after each answer so partial results reach the
                        # report even if a later question fails.
                        # NOTE(review): this passes the cumulative dict each
                        # time; confirm update_markdown_file replaces rather
                        # than appends the section for pdf_title.
                        analyzer.update_markdown_file(output_file, pdf_title, qa_results)

                    # Advance the bar by one task (one question answered or skipped)
                    pbar.update(1)
                    ticks_done += 1

                if qa_results:
                    # No extra markdown write here: qa_results has not changed
                    # since the last in-loop write.
                    if output_xlsx:
                        all_results[pdf_title] = qa_results
                    processed_count += 1
                else:
                    error_count += 1
                    failed_files.append(f"{pdf_file} (no results)")
            except Exception as e:
                error_count += 1
                failed_files.append(f"{pdf_file} (error: {str(e)[:50]}...)")
                # Advance only by the tasks not yet counted for this PDF so the
                # bar never overshoots its total.
                pbar.update(questions_per_pdf - ticks_done)
finally:
    # Restore normal logging (console + file)
    analyzer.set_quiet_mode(False)

# Export to Excel if configured and we have results
xlsx_exported = False
if output_xlsx and all_results and column_names:
    try:
        analyzer.export_to_xlsx(output_xlsx, all_results, questions, column_names)
        xlsx_exported = True
        print(f"✓ Excel results exported to: {output_xlsx}")
    except Exception as e:
        print(f"❌ Failed to export Excel file: {e}")

# Final summary (after progress bar is complete)
final_rate = f"{processed_count/(processed_count+error_count)*100:.1f}%" if (processed_count+error_count) > 0 else "No files processed"
print(f"\n🎉 Analysis complete!")
print(f"📊 Final Results: {processed_count} PDFs processed, {error_count} PDFs failed")
print(f"📈 Success Rate: {final_rate}")
print(f"📝 Results saved to: {output_file}")
# Only claim the Excel file was saved when the export actually succeeded
if xlsx_exported:
    print(f"📊 Excel results saved to: {output_xlsx}")

# Show failed files if any
if failed_files:
    print(f"\n⚠️ Failed files:")
    for failed_file in failed_files:
        print(f"  - {failed_file}")