# PDF Information Extractor using Anthropic's Claude

This notebook extracts the text of every PDF in a directory, sends it to Anthropic's Claude model together with a query, and saves the model's responses to a CSV file.

In [None]:
import os
from pathlib import Path
import PyPDF2
import anthropic
import pandas as pd
from tqdm import tqdm

## Setup Anthropic Client

In [None]:
# Read the API key from the environment; fail fast when it is unset or empty
api_key = os.environ.get('ANTHROPIC_API_KEY')
if api_key is None or api_key == "":
    raise ValueError("Please set ANTHROPIC_API_KEY environment variable")

# Build the module-level Anthropic client used by the cells below
client = anthropic.Anthropic(api_key=api_key)

## PDF Processing Functions

In [None]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages in order, each page followed by a
        newline. A page that yields no text contributes only the newline.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction stays inside the `with` block so the file is still open
        # while the pages are processed. A single join avoids the repeated
        # string copying of `+=` in a loop.
        # `or ""` is defensive: it keeps the concatenation valid should
        # extract_text() ever return None instead of a string.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

def process_with_claude(
    text: str,
    query: str,
    temperature: float = 0.7,
    max_tokens: int = 1000,
    model: str = "claude-3-sonnet-20240229",
) -> str:
    """Send a query about the given text to Claude and return its answer.

    Uses the module-level ``client``.

    Args:
        text: Document text the query refers to.
        query: Instruction or question; it is inserted after
            "Based on the following text, " in the prompt.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens to generate, forwarded to the API.
        model: Model identifier forwarded to the API.

    Returns:
        The concatenated text of all text blocks in the response.

    Raises:
        ValueError: If the response contains no text block.
    """
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[{
            "role": "user",
            "content": f"Based on the following text, {query}\n\nText: {text}"
        }]
    )
    # The response content is a list of blocks; do not assume the first one
    # exists or is a text block.
    text_parts = [
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    ]
    if not text_parts:
        raise ValueError("Claude response contained no text content")
    return "".join(text_parts)

## Process PDFs

In [None]:
# Configuration
pdf_dir = "pdfs"  # Directory containing PDF files
query = "What are the main findings of this research?"
temperature = 0.7
max_tokens = 1000
output_csv = "extraction_results.csv"

# Process PDFs
results = []
# Sort the paths so the processing order (and the CSV row order) is
# reproducible; the order returned by glob depends on the filesystem.
pdf_files = sorted(Path(pdf_dir).glob('*.pdf'))
if not pdf_files:
    # Make an empty or missing input directory visible instead of silently
    # producing an empty result.
    print(f"No PDF files found in {pdf_dir!r}")

for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    # Best-effort per file: a failure is reported and the loop continues
    # with the remaining PDFs.
    try:
        text = extract_text_from_pdf(str(pdf_path))
        response = process_with_claude(text, query, temperature, max_tokens)
    except Exception as e:
        print(f"Error processing {pdf_path.name}: {e}")
    else:
        results.append({
            'pdf_name': pdf_path.name,
            'query': query,
            'response': response
        })

# Save results to CSV
# Explicit columns keep the header row in the CSV even when no PDF was
# processed successfully.
df = pd.DataFrame(results, columns=['pdf_name', 'query', 'response'])
df.to_csv(output_csv, index=False)
print(f"Results saved to {output_csv}")

# Display results
display(df)

## Try Different Queries

To extract different types of information from your PDFs, change `query` (and, if needed, `temperature` and `max_tokens`) in the configuration cell above and re-run it.