In [1]:
# PDF Information Extractor 
import os
from pathlib import Path
import PyPDF2
import anthropic
import pandas as pd
from tqdm import tqdm
import tiktoken
import csv
import json
from io import StringIO
import hashlib
import datetime

In [2]:
# Show the working directory: the relative paths used in later cells
# ("../config/api_key.txt", "pdf", "prompt_cache") are resolved against it.
# Clear this cell's output before sharing; it prints an absolute local path.
str(Path.cwd())

'/Users/maximilianroessler/Documents/Hobby/Volunteering/ALLFED/LiteraryReview/gcr-resilience-map/scripts'

In [3]:
## Setup Anthropic Client

# Read the API key from the local key file; if that file is missing or empty,
# fall back to the ANTHROPIC_API_KEY environment variable.
api_key_file = Path("../config/api_key.txt")
api_key = ""
if api_key_file.is_file():
    api_key = api_key_file.read_text(encoding="utf-8").strip()
if not api_key:
    api_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()

# Fail here with a clear message rather than later with an authentication error.
if not api_key:
    raise RuntimeError(
        f"No Anthropic API key found: put it in {api_key_file} "
        "or set the ANTHROPIC_API_KEY environment variable."
    )

# Initialize Anthropic client
client = anthropic.Anthropic(api_key=api_key)

In [4]:
## PDF Processing Functions
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in order, each page followed by a newline.
        A page for which no text is returned contributes only the newline.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps a page whose extract_text() yields None from aborting the
        # whole document; collecting then joining avoids repeated string rebuilds.
        page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
    return "".join(page_texts)


def process_with_claude(text: str, query: str, temperature: float = 0, max_tokens: int = 1000) -> str:
    """Send a document plus a query to Claude and return the reply text.

    The document is placed in a system block marked with ``cache_control`` so that
    a later call with the same document can reuse it; the query goes in the user
    message. Token usage reported by the API is printed for inspection.

    Args:
        text: Document content to analyze.
        query: Instruction sent as the user message.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on generated tokens passed to the API.

    Returns:
        The ``text`` of the first content block of the response.
    """
    # tiktoken estimate padded by 10%; the usage printed after the call is the
    # count actually reported by the API.
    encoding = tiktoken.get_encoding("cl100k_base")
    token_count = int(len(encoding.encode(text)) * 1.1)
    print(f"Token count is approximately {token_count}")

    response = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=max_tokens,
        temperature=temperature,
        system=[
            {
                "type": "text",
                "text": "You are an AI assistant tasked with analyzing documents.",
            },
            {
                "type": "text",
                "text": f"Document content:\n{text}",
                "cache_control": {"type": "ephemeral"}
            }
        ],
        messages=[
            {
                "role": "user",
                "content": query
            }
        ]
    )

    # A reply cut off by the token limit would yield a truncated CSV row downstream.
    if getattr(response, 'stop_reason', None) == "max_tokens":
        print(f"Warning: response was truncated at max_tokens={max_tokens}")

    # Log cache information if available
    usage = getattr(response, 'usage', None)
    if usage is not None:
        # The cache counters can be present but None; `or 0` normalizes both the
        # missing and the None case so the comparison below cannot raise TypeError.
        cache_creation = getattr(usage, 'cache_creation_input_tokens', None) or 0
        cache_read = getattr(usage, 'cache_read_input_tokens', None) or 0

        print("API Response Token Usage:")
        print(f"  - Total input tokens: {usage.input_tokens}")
        print(f"  - Cache creation tokens: {cache_creation}")
        print(f"  - Cache read tokens: {cache_read}")

        # Calculate percentage saved if using cache
        if cache_read > 0:
            total_possible = usage.input_tokens + cache_read
            percentage_saved = (cache_read / total_possible) * 100
            print(
                f"  - Approximate cache savings: {percentage_saved:.1f}% of input tokens")

    return response.content[0].text

In [5]:
## Extraction prompt
# User-message text for the 'extraction' pass: asks the model for one CSV row
# holding the 40 numbered fields below, in the same order as extraction_columns.
# The processing loop hashes this text together with the document text to build
# its cache key, so any edit here stops earlier cached responses from being reused.
extraction_query = """I need you to analyze the provided research paper and extract specific information about regional resilience to catastrophic risks. Our research question is: "What specific geographical, institutional, and infrastructural factors have been empirically or theoretically identified as enhancing regional resilience to nuclear winter, large magnitude volcanic eruptions, extreme pandemics, and infrastructure collapse catastrophes, and how do these resilience factors vary across catastrophe types?"

After analyzing the paper thoroughly, provide your output in a single row CSV format with the following structure:

1. paper_citation: Full citation (author, year, title)
2. publication_type: [Journal article/Preprint/Report/Book chapter]
3. gcr_types: Types of catastrophic risks addressed [Nuclear/Volcanic/Asteroid/Infrastructure/Pandemic/Climate/Multiple]
4. geographic_focus: [Global/Regional/National/Local/Islands - specify]
5. regions_compared: [Yes/No] with brief description if yes
6. geographic_factors: List key geographic factors (location, climate, resources, etc.)
7. institutional_factors: List key institutional factors (governance, policies, social systems, etc.)
8. infrastructural_factors: List key infrastructure factors (energy, food, communications, etc.)
9. other_resilience_factors: Any resilience factors not fitting above categories
10. study_approach: [Model/Empirical/Review/Case study/Theoretical]
11. evidence_strength: [Low/Medium/High] with brief justification
12. evidence_causal: [TRUE/FALSE] for direct causal evidence
13. evidence_predictive: [TRUE/FALSE] for predictive evidence
14. evidence_correlational: [TRUE/FALSE] for correlational evidence
15. evidence_theoretical: [TRUE/FALSE] for theoretical/expert opinion
16. evidence_case_study: [TRUE/FALSE] for case study observations
17. evidence_model: [TRUE/FALSE] for model-based projections
18. validation_external: [TRUE/FALSE] for external validation against outcomes
19. validation_alternative: [TRUE/FALSE] for validation against alternative datasets
20. validation_temporal: [TRUE/FALSE] for validation across multiple time periods
21. validation_cross_regional: [TRUE/FALSE] for cross-regional validation
22. validation_none: [TRUE/FALSE] if no validation attempted
23. counterfactual_robust: [TRUE/FALSE] for robust counterfactual analysis
24. counterfactual_limited: [TRUE/FALSE] for limited counterfactual discussion
25. counterfactual_none: [TRUE/FALSE] if no counterfactuals considered
26. limitations_thorough: [TRUE/FALSE] if authors thoroughly address limitations
27. limitations_limited: [TRUE/FALSE] if limited discussion of limitations
28. limitations_none: [TRUE/FALSE] if no significant discussion of limitations
29. confidence_assessment: [High/Medium/Low/Very low] for key resilience factors
30. evidence_gaps: Brief description of critical missing validation elements
31. resilience_phase: [Preparedness/Robustness/Recovery/Adaptation]
32. implemented_measures: Brief description of measures already implemented (if any)
33. proposed_measures: Brief description of proposed measures (if any)
34. main_resilience_factors: Brief summary of main resilience-enhancing factors
35. differential_effectiveness: [Yes/No] with description if factors vary across GCR types
36. resilience_tradeoffs: [Yes/No] with description of any identified trade-offs
37. vulnerable_resilient_regions: List of particularly vulnerable or resilient regions identified
38. overall_relevance: [Low/Medium/High] relevance to our research question
39. key_quotes: 1-2 most relevant quotes supporting findings
40. additional_notes: Any other important observations

For text fields, place the content in double quotes to properly handle any commas. For boolean fields, use TRUE or FALSE. For fields with multiple options like evidence types, mark TRUE for all that apply.

Please analyze the paper thoroughly before extracting the information. Respond with ONLY the CSV row (no column headers)."""

# Column names for the extraction CSV row, listed in the order of the numbered
# fields in the extraction prompt.
extraction_columns = [
    # bibliographic and scope fields
    "paper_citation",
    "publication_type",
    "gcr_types",
    "geographic_focus",
    "regions_compared",
    # resilience factors
    "geographic_factors",
    "institutional_factors",
    "infrastructural_factors",
    "other_resilience_factors",
    # study design and evidence flags
    "study_approach",
    "evidence_strength",
    "evidence_causal",
    "evidence_predictive",
    "evidence_correlational",
    "evidence_theoretical",
    "evidence_case_study",
    "evidence_model",
    # validation flags
    "validation_external",
    "validation_alternative",
    "validation_temporal",
    "validation_cross_regional",
    "validation_none",
    # counterfactual flags
    "counterfactual_robust",
    "counterfactual_limited",
    "counterfactual_none",
    # limitations flags
    "limitations_thorough",
    "limitations_limited",
    "limitations_none",
    # assessment and findings
    "confidence_assessment",
    "evidence_gaps",
    "resilience_phase",
    "implemented_measures",
    "proposed_measures",
    "main_resilience_factors",
    "differential_effectiveness",
    "resilience_tradeoffs",
    "vulnerable_resilient_regions",
    "overall_relevance",
    "key_quotes",
    "additional_notes",
]

In [6]:
## Relevance prompt

# User-message text for the 'relevance' pass: asks for a single TRUE/FALSE field,
# serious_weakness. Editing this text changes the cache key built in the processing loop.
# NOTE(review): the opening sentence and the formatting paragraph are identical to the
# extraction prompt (text fields, "multiple options") although only one boolean is
# requested, and the field concerns methodological weakness rather than relevance —
# confirm this is intentional.
relevance_query = """I need you to analyze the provided research paper and extract specific information about regional resilience to catastrophic risks. Our research question is: "What specific geographical, institutional, and infrastructural factors have been empirically or theoretically identified as enhancing regional resilience to nuclear winter, large magnitude volcanic eruptions, extreme pandemics, and infrastructure collapse catastrophes, and how do these resilience factors vary across catastrophe types?"

After analyzing the paper thoroughly, provide your output in a single row CSV format with the following structure:

1. serious_weakness: Does this paper exhibit serious methodological weaknesses? [TRUE/FALSE]

For text fields, place the content in double quotes to properly handle any commas. For boolean fields, use TRUE or FALSE. For fields with multiple options like evidence types, mark TRUE for all that apply.

Please analyze the paper thoroughly before extracting the information. Respond with ONLY the CSV row (no column headers)."""

# Column name for the single field requested by the relevance prompt
relevance_columns = ["serious_weakness"]



In [7]:

def parse_csv_response(response_text, columns):
    """Parse a single-row CSV reply into a dict keyed by column name.

    Args:
        response_text: Raw text returned by the model.
        columns: Column names, in the order the fields appear in the row.

    Returns:
        A dict mapping each column name to its field value. If the row has fewer
        fields than ``columns`` the missing columns are absent; extra fields are
        dropped. If nothing can be parsed, a dict with the keys ``"error"`` and
        ``"original_text"`` is returned instead.
    """
    failure = {"error": "Failed to parse CSV response",
               "original_text": response_text.strip()}

    # Clean the response text and drop markdown code-fence lines, which would
    # otherwise be mistaken for data when the row itself contains no commas.
    clean_text = response_text.strip()
    body_lines = [line for line in clean_text.split('\n')
                  if not line.strip().startswith("```")]
    candidate_text = "\n".join(body_lines).strip()
    if not candidate_text:
        return failure

    # Parse the whole text first so quoted fields that contain newlines stay in
    # one row. skipinitialspace lets `"a, b", "c"` (space after the comma) parse
    # as two quoted fields instead of splitting inside the second one.
    try:
        rows = [row for row in csv.reader(StringIO(candidate_text), skipinitialspace=True)
                if row]
    except csv.Error:
        rows = []

    # Prefer a row whose field count matches the expected columns exactly.
    row = next((candidate for candidate in rows if len(candidate) == len(columns)), None)

    if row is None:
        # Fallback: the single line with the most commas is most likely the data.
        best_line = max(body_lines, key=lambda line: line.count(','))
        try:
            row = next(csv.reader(StringIO(best_line), skipinitialspace=True))
        except (StopIteration, csv.Error):
            return failure

    # Map values to column names
    return dict(zip(columns, row))

In [8]:
# --- Run settings ---
pdf_dir = "pdf"              # folder scanned for PDF inputs
cache_dir = "prompt_cache"   # folder holding the cached prompt responses
temperature = 0              # keep output as deterministic as possible
max_tokens = 4000            # raised limit to allow more detailed responses

# One settings entry per pass made over each paper
process_types = ['extraction', 'relevance']

extraction_settings = {
    'output_file': "gcr_resilience_extraction_results.csv",
    'cache_file': os.path.join(cache_dir, "extraction_prompt_cache.json"),
    'query': extraction_query,
    'columns': extraction_columns
}
relevance_settings = {
    'output_file': "gcr_resilience_relevance_results.csv",
    'cache_file': os.path.join(cache_dir, "relevance_prompt_cache.json"),
    'query': relevance_query,
    'columns': relevance_columns
}
config = {
    'extraction': extraction_settings,
    'relevance': relevance_settings
}

# Make sure both working folders exist before anything reads or writes them
for required_dir in (pdf_dir, cache_dir):
    os.makedirs(required_dir, exist_ok=True)

# Initialize caches, results, and processed files (each keyed by process type)
caches = {}
results = {}
processed_files = {}

# Load caches and existing results
for process_type in process_types:
    # Load the response cache; a missing or unreadable file leaves it empty
    cache_file = config[process_type]['cache_file']
    caches[process_type] = {}
    if os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                caches[process_type] = json.load(f)
            print(f"Loaded {len(caches[process_type])} cached {process_type} responses")
        except Exception as e:
            print(f"Error loading {process_type} cache: {str(e)}")

    # Load results written by an earlier run so finished files are skipped
    output_file = config[process_type]['output_file']
    results[process_type] = []
    processed_files[process_type] = set()
    if os.path.exists(output_file):
        try:
            df = pd.read_csv(output_file)

            # Rows recorded with an error are dropped instead of being counted
            # as processed; otherwise a file that failed once is never retried.
            if 'error' in df.columns:
                failed = df['error'].notna()
                if failed.any():
                    print(f"Dropping {int(failed.sum())} failed {process_type} rows so those files are retried")
                df = df[~failed]

            # Resolve the filename column before assigning anything, so a missing
            # column cannot leave results loaded while the processed set is empty.
            done_names = set(df['filename'].tolist())
            results[process_type] = df.to_dict('records')
            processed_files[process_type] = done_names
            print(f"Loaded {len(results[process_type])} existing {process_type} results")
        except Exception as e:
            print(f"Error loading existing {process_type} results: {str(e)}")

# Process PDFs (sorted so every run visits the files in the same order)
pdf_files = sorted(Path(pdf_dir).glob('*.pdf'))

for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    # Extract text only once for both queries
    text = None

    # Process each configuration
    for process_type in process_types:
        current_config = config[process_type]
        current_cache = caches[process_type]
        current_results = results[process_type]
        current_processed_files = processed_files[process_type]

        if pdf_path.name in current_processed_files:
            print(f"Skipping already processed file for {process_type}: {pdf_path.name}")
            continue

        try:
            print(f"Processing {pdf_path.name} for {process_type}...")

            # Extract text if not already done
            if text is None:
                text = extract_text_from_pdf(str(pdf_path))
                print(f"Extracted {len(text)} characters from PDF.")

            # Cache key covers both the document text and the prompt, so a change
            # to either one results in a fresh API call
            cache_key = hashlib.md5((text + current_config['query']).encode()).hexdigest()

            # Check for cached response
            if cache_key in current_cache:
                print(f"Using cached response for {pdf_path.name} ({process_type})")
                response = current_cache[cache_key]
            else:
                # Process with Claude
                response = process_with_claude(text, current_config['query'], temperature, max_tokens)
                print(f"Received response of length {len(response)}")

                # Cache the response
                current_cache[cache_key] = response

                # Save cache after each new response. Write to a temporary file and
                # swap it in, so an interrupted run cannot leave a truncated JSON
                # file that would discard every cached response on the next load.
                tmp_cache_file = current_config['cache_file'] + ".tmp"
                with open(tmp_cache_file, 'w') as f:
                    json.dump(current_cache, f)
                os.replace(tmp_cache_file, current_config['cache_file'])

            # Parse the CSV response
            parsed_result = parse_csv_response(response, current_config['columns'])

            # Add the filename for reference
            parsed_result['filename'] = pdf_path.name

            # Add to results
            current_results.append(parsed_result)

            # Save intermediate results
            interim_df = pd.DataFrame(current_results)
            interim_df.to_csv(current_config['output_file'], index=False)

            print(f"Successfully processed {pdf_path.name} for {process_type}")

        except Exception as e:
            print(f"Error processing {pdf_path.name} for {process_type}: {str(e)}")
            # Add error record
            error_result = {
                "error": str(e),
                "filename": pdf_path.name
            }
            current_results.append(error_result)

            # Save intermediate results even after errors
            interim_df = pd.DataFrame(current_results)
            interim_df.to_csv(current_config['output_file'], index=False)

# Build one DataFrame per process type from the accumulated records
extraction_df = pd.DataFrame(results['extraction'])
relevance_df = pd.DataFrame(results['relevance'])

# A shared timestamp suffix names this run's snapshot files
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
versioned_extraction_csv = f"{os.path.splitext(config['extraction']['output_file'])[0]}_{timestamp}.csv"
versioned_relevance_csv = f"{os.path.splitext(config['relevance']['output_file'])[0]}_{timestamp}.csv"

# Write each frame twice: the rolling output file and the timestamped snapshot
for result_frame, latest_csv, versioned_csv in (
    (extraction_df, config['extraction']['output_file'], versioned_extraction_csv),
    (relevance_df, config['relevance']['output_file'], versioned_relevance_csv),
):
    result_frame.to_csv(latest_csv, index=False)
    result_frame.to_csv(versioned_csv, index=False)

print(f"Extraction results saved to {config['extraction']['output_file']} and {versioned_extraction_csv}")
print(f"Relevance results saved to {config['relevance']['output_file']} and {versioned_relevance_csv}")

# Show the first rows of each result table
for heading, result_frame in (
    ("Extraction results:", extraction_df),
    ("\nRelevance results:", relevance_df),
):
    print(heading)
    print(result_frame.head())

Processing PDFs:   0%|          | 0/1 [00:00<?, ?it/s]

Processing s13561-023-00428-9.pdf for extraction...
Extracted 64009 characters from PDF.
Token count is approximately 17781
API Response Token Usage:
  - Total input tokens: 943
  - Cache creation tokens: 18351
  - Cache read tokens: 0
Received response of length 2833
Successfully processed s13561-023-00428-9.pdf for extraction
Processing s13561-023-00428-9.pdf for relevance...
Token count is approximately 17781


Processing PDFs: 100%|██████████| 1/1 [00:20<00:00, 20.75s/it]

API Response Token Usage:
  - Total input tokens: 204
  - Cache creation tokens: 0
  - Cache read tokens: 18351
  - Approximate cache savings: 98.9% of input tokens
Received response of length 5
Successfully processed s13561-023-00428-9.pdf for relevance
Extraction results saved to gcr_resilience_extraction_results.csv and gcr_resilience_extraction_results_20250413_200146.csv
Relevance results saved to gcr_resilience_relevance_results.csv and gcr_resilience_relevance_results_20250413_200146.csv
Extraction results:
                                      paper_citation publication_type  \
0  Da'ar, O. B., & Kalmey, F. (2023). The level o...  Journal article   

  gcr_types geographic_focus  \
0  Pandemic           Global   

                                    regions_compared  \
0  Yes, WHO regions compared (Africa, Americas, E...   

                  geographic_factors  \
0  Geographic location (WHO regions)   

                               institutional_factors  \
0  Governance effe




In [10]:
# NOTE(review): appears to be a placeholder cell with no effect on the pipeline above.
# Its execution count (10) skips 9, so a cell seems to have been removed or run out of
# order — confirm whether this cell is still needed.
print("Validate for GitHub")

Validate for GitHub
