In [1]:
# PDF Information Extractor 
import csv
import os
import re
from datetime import datetime
from io import StringIO
from pathlib import Path

import anthropic
import pandas as pd
import PyPDF2
import tiktoken
from tqdm import tqdm

In [2]:
## Setup Anthropic Client
# Read the API key from the ANTHROPIC_API_KEY environment variable so the real
# key never has to be written into (and committed with) the notebook.
# The placeholder is kept only as a fallback value.
api_key = os.environ.get("ANTHROPIC_API_KEY") or 'YOUR_API_KEY_HERE'
if api_key == 'YOUR_API_KEY_HERE':
    # Surface the misconfiguration here instead of as one error per PDF later on.
    print("Warning: ANTHROPIC_API_KEY is not set; API requests will fail until a valid key is provided.")

# Initialize Anthropic client
client = anthropic.Anthropic(api_key=api_key)

In [3]:
## PDF Processing Functions
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in document order, each page followed by a
        newline. A page that yields no text contributes only its newline.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Collect the text while the file is still open. `or ""` guards against
        # a page whose extract_text() yields None (or an empty value), which
        # would otherwise make the string concatenation raise TypeError.
        page_texts = [(page.extract_text() or "") for page in reader.pages]
    # Build the result once instead of growing a string page by page.
    return "".join(f"{page_text}\n" for page_text in page_texts)

def process_with_claude(text: str, query: str, temperature: float = 0, max_tokens: int = 1000) -> str:
    """Send the query and the paper text to Claude and return the reply text.

    Args:
        text: Paper content appended to the prompt.
        query: Instructions placed before the paper content.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on the length of the reply, passed to the API.

    Returns:
        The concatenated text of all text blocks in the reply; an empty string
        if the reply contains no text block.
    """
    # cl100k_base is not Claude's tokenizer, so this count (padded by 10%) is
    # only a rough size estimate printed for information; it limits nothing.
    encoding = tiktoken.get_encoding("cl100k_base")
    # disallowed_special=() makes tiktoken treat strings such as "<|endoftext|>"
    # as ordinary text. With the default setting encode() raises ValueError when
    # the input happens to contain a special-token string.
    token_count = len(encoding.encode(text, disallowed_special=()))
    token_count = int(token_count * 1.1)
    print(f"Token count is approximately {token_count}")

    response = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[{
            "role": "user",
            "content": f"{query}\n\nPaper content:\n{text}"
        }]
    )

    # A reply cut off at max_tokens would yield an incomplete CSV row; say so
    # instead of letting it pass silently.
    if getattr(response, "stop_reason", None) == "max_tokens":
        print(f"Warning: response was truncated at max_tokens={max_tokens}; output may be incomplete")

    # Join every block that carries text rather than indexing content[0], which
    # raises IndexError on an empty content list.
    return "".join(getattr(block, "text", "") or "" for block in response.content)


In [4]:
## Prompt
# Instruction text that is sent to the model ahead of each paper's content.
# It asks for 40 numbered fields returned as a single CSV row without headers;
# the field order here must stay in step with the `columns` list, which is
# used to label the parsed values.
query = """I need you to analyze the following research paper and extract specific information about regional resilience to catastrophic risks. Our research question is: "What specific geographical, institutional, and infrastructural factors have been empirically or theoretically identified as enhancing regional resilience to nuclear winter, large magnitude volcanic eruptions, extreme pandemics, and infrastructure collapse catastrophes, and how do these resilience factors vary across catastrophe types?"

After analyzing the paper thoroughly, provide your output in a single row CSV format with the following structure:

1. paper_citation: Full citation (author, year, title)
2. publication_type: [Journal article/Preprint/Report/Book chapter]
3. gcr_types: Types of catastrophic risks addressed [Nuclear/Volcanic/Asteroid/Infrastructure/Pandemic/Climate/Multiple]
4. geographic_focus: [Global/Regional/National/Local/Islands - specify]
5. regions_compared: [Yes/No] with brief description if yes
6. geographic_factors: List key geographic factors (location, climate, resources, etc.)
7. institutional_factors: List key institutional factors (governance, policies, social systems, etc.)
8. infrastructural_factors: List key infrastructure factors (energy, food, communications, etc.)
9. other_resilience_factors: Any resilience factors not fitting above categories
10. study_approach: [Model/Empirical/Review/Case study/Theoretical]
11. evidence_strength: [Low/Medium/High] with brief justification
12. evidence_causal: [TRUE/FALSE] for direct causal evidence
13. evidence_predictive: [TRUE/FALSE] for predictive evidence
14. evidence_correlational: [TRUE/FALSE] for correlational evidence
15. evidence_theoretical: [TRUE/FALSE] for theoretical/expert opinion
16. evidence_case_study: [TRUE/FALSE] for case study observations
17. evidence_model: [TRUE/FALSE] for model-based projections
18. validation_external: [TRUE/FALSE] for external validation against outcomes
19. validation_alternative: [TRUE/FALSE] for validation against alternative datasets
20. validation_temporal: [TRUE/FALSE] for validation across multiple time periods
21. validation_cross_regional: [TRUE/FALSE] for cross-regional validation
22. validation_none: [TRUE/FALSE] if no validation attempted
23. counterfactual_robust: [TRUE/FALSE] for robust counterfactual analysis
24. counterfactual_limited: [TRUE/FALSE] for limited counterfactual discussion
25. counterfactual_none: [TRUE/FALSE] if no counterfactuals considered
26. limitations_thorough: [TRUE/FALSE] if authors thoroughly address limitations
27. limitations_limited: [TRUE/FALSE] if limited discussion of limitations
28. limitations_none: [TRUE/FALSE] if no significant discussion of limitations
29. confidence_assessment: [High/Medium/Low/Very low] for key resilience factors
30. evidence_gaps: Brief description of critical missing validation elements
31. resilience_phase: [Preparedness/Robustness/Recovery/Adaptation]
32. implemented_measures: Brief description of measures already implemented (if any)
33. proposed_measures: Brief description of proposed measures (if any)
34. main_resilience_factors: Brief summary of main resilience-enhancing factors
35. differential_effectiveness: [Yes/No] with description if factors vary across GCR types
36. resilience_tradeoffs: [Yes/No] with description of any identified trade-offs
37. vulnerable_resilient_regions: List of particularly vulnerable or resilient regions identified
38. overall_relevance: [Low/Medium/High] relevance to our research question
39. key_quotes: 1-2 most relevant quotes supporting findings
40. additional_notes: Any other important observations

For text fields, place the content in double quotes to properly handle any commas. For boolean fields, use TRUE or FALSE. For fields with multiple options like evidence types, mark TRUE for all that apply.

Please analyze the paper thoroughly before extracting the information. Respond with ONLY the CSV row (no column headers)."""

# Output column names, in the same order as the numbered fields of the prompt.
columns = [
    # Bibliographic details and scope
    "paper_citation",
    "publication_type",
    "gcr_types",
    "geographic_focus",
    "regions_compared",
    # Resilience factors
    "geographic_factors",
    "institutional_factors",
    "infrastructural_factors",
    "other_resilience_factors",
    # Study design and evidence
    "study_approach",
    "evidence_strength",
    "evidence_causal",
    "evidence_predictive",
    "evidence_correlational",
    "evidence_theoretical",
    "evidence_case_study",
    "evidence_model",
    # Validation
    "validation_external",
    "validation_alternative",
    "validation_temporal",
    "validation_cross_regional",
    "validation_none",
    # Counterfactuals
    "counterfactual_robust",
    "counterfactual_limited",
    "counterfactual_none",
    # Limitations
    "limitations_thorough",
    "limitations_limited",
    "limitations_none",
    # Assessment
    "confidence_assessment",
    "evidence_gaps",
    # Measures and findings
    "resilience_phase",
    "implemented_measures",
    "proposed_measures",
    "main_resilience_factors",
    "differential_effectiveness",
    "resilience_tradeoffs",
    "vulnerable_resilient_regions",
    "overall_relevance",
    "key_quotes",
    "additional_notes",
]

def parse_csv_response(response_text):
    """Parse the model's CSV reply into a dict keyed by the names in `columns`.

    The whole reply is handed to the csv module, so quoted fields may contain
    commas and line breaks. If the reply holds several rows (for example an
    introductory sentence or code-fence lines around the data), the row with
    the most fields is taken as the data row.

    Args:
        response_text: Raw text returned by the model (None is treated as "").

    Returns:
        On success, a dict mapping column names to field values. If the number
        of fields differs from the number of columns, the values that could be
        paired are returned together with a "parse_warning" entry, because they
        may be misaligned. If no row can be read, a dict with the keys "error"
        and "original_text".
    """
    clean_text = (response_text or "").strip()

    try:
        # newline="" leaves line endings to the csv module, as its documentation
        # asks. skipinitialspace=True lets `"a", "b"` (a space after the comma)
        # still be read as two quoted fields.
        reader = csv.reader(StringIO(clean_text, newline=""), skipinitialspace=True)
        # Blank lines come back as empty lists; drop them.
        parsed_rows = [row for row in reader if row]
    except csv.Error as exc:
        return {"error": f"Failed to parse CSV response: {exc}", "original_text": clean_text}

    if not parsed_rows:
        return {"error": "Failed to parse CSV response", "original_text": clean_text}

    row = max(parsed_rows, key=len)
    result = dict(zip(columns, row))
    # zip() stops at the shorter sequence, so a wrong field count would
    # otherwise pass unnoticed and leave values under the wrong names.
    if len(row) != len(columns):
        result["parse_warning"] = (
            f"Expected {len(columns)} fields but got {len(row)}; values may be misaligned"
        )
    return result

In [None]:
# Configuration
pdf_dir = "pdf"  # Directory containing PDF files
temperature = 0   # Keep it as deterministic as possible
max_tokens = 4000  # Increased token limit for more detailed responses
output_csv = "gcr_resilience_results.csv"

# Create pdf directory if it doesn't exist
os.makedirs(pdf_dir, exist_ok=True)

# Process PDFs in sorted order: glob() order depends on the filesystem, so
# sorting keeps the row order of the output the same from run to run.
results = []
pdf_files = sorted(Path(pdf_dir).glob('*.pdf'))

for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    try:
        print(f"Processing {pdf_path.name}...")
        text = extract_text_from_pdf(str(pdf_path))
        print(f"Extracted {len(text)} characters from PDF.")

        # Process with Claude
        response = process_with_claude(text, query, temperature, max_tokens)
        print(f"Received response of length {len(response)}")

        # Parse the CSV response
        parsed_result = parse_csv_response(response)

        # Add the filename for reference
        parsed_result['filename'] = pdf_path.name

        # Add to results
        results.append(parsed_result)

        # parse_csv_response signals failure through an "error" key rather than
        # an exception, so check for it before reporting success.
        if "error" in parsed_result:
            print(f"Could not parse response for {pdf_path.name}: {parsed_result['error']}")
        else:
            print(f"Successfully processed {pdf_path.name}")
    except Exception as e:
        # Best effort: one failing PDF must not abort the whole batch, so the
        # failure is recorded as a row of its own and the loop continues.
        print(f"Error processing {pdf_path.name}: {str(e)}")
        # Add error record
        results.append({
            "error": str(e),
            "filename": pdf_path.name
        })

# Create DataFrame. Put the extraction columns first (in prompt order) and any
# other columns (filename, error, ...) after them, so the column layout does not
# depend on which kind of record happens to come first.
results_df = pd.DataFrame(results)
ordered_columns = (
    [name for name in columns if name in results_df.columns]
    + [name for name in results_df.columns if name not in columns]
)
df = results_df[ordered_columns]

# Add timestamp to create a versioned output file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
versioned_output_csv = f"{os.path.splitext(output_csv)[0]}_{timestamp}.csv"

# Save both versioned and standard output
df.to_csv(output_csv, index=False)
df.to_csv(versioned_output_csv, index=False)

print(f"Results saved to {output_csv} and versioned copy at {versioned_output_csv}")

# Display results (last expression of the cell, rendered as a table)
df.head()

In [1]:
# Prints a fixed marker string; it reads and defines no variables.
print("Validate for GitHub")

Validate for GitHub
