# PDF Information Extractor 

In [None]:
import os
from pathlib import Path
import PyPDF2
import anthropic
import pandas as pd
from tqdm import tqdm
import tiktoken

## Set Up the Anthropic Client

In [None]:
# API key: taken from the ANTHROPIC_API_KEY environment variable so the secret
# never has to be typed into (or committed with) this notebook.
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
    # Warn instead of raising so cells that do not call the API can still run.
    print("Warning: ANTHROPIC_API_KEY is not set; calls to the Anthropic API will fail.")

# Initialize the Anthropic client used by process_with_claude()
client = anthropic.Anthropic(api_key=api_key)

## PDF Processing Functions

In [None]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text of each page, in page order, with a newline
        appended after every page (including the last one). An empty string
        when the document has no pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_pages = PyPDF2.PdfReader(pdf_file).pages
        # Build the per-page chunks before the file handle is closed.
        page_chunks = [pdf_page.extract_text() + "\n" for pdf_page in pdf_pages]
    return "".join(page_chunks)

def process_with_claude(text: str, query: str, temperature: float = 0, max_tokens: int = 1000) -> str:
    """Send a document and an instruction to Claude and return the reply text.

    Args:
        text: Document text, placed after ``Text:`` at the end of the prompt.
        query: Instruction inserted after "Based on the following text, ".
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Reply-length limit forwarded to the API.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    # The estimate below is only printed; it is never sent to the API.
    # NOTE(review): cl100k_base is presumably just an approximation of Claude's
    # tokenization and the 1.1 factor looks like a safety margin — confirm.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    token_count = int(len(tokenizer.encode(text)) * 1.1)
    print(f"token count is {token_count}")

    prompt = f"Based on the following text, {query}\n\nText: {text}"
    user_message = {"role": "user", "content": prompt}
    reply = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[user_message],
    )
    return reply.content[0].text

## Process PDFs

In [None]:
# Extraction prompt applied to every paper. It states the review's research
# question and asks for a single CSV record (values only, no header) whose
# fields follow the column order listed inside the prompt; that order is the
# same as the `columns` list defined later in this notebook.
# This text is sent to the model verbatim, so any edit changes the request.
query = """I need you to extract information from a research paper for a systematic literature review on regional resilience to catastrophic risks. Our research question is: What specific geographical, institutional, and infrastructural factors have been empirically or theoretically identified as enhancing regional resilience to nuclear winter, large magnitude volcanic eruptions, extreme pandemics, and infrastructure collapse catastrophes, and how do these resilience factors vary across catastrophe types?
After analyzing the paper, provide your output in CSV format with the following columns, without repeating the column names:
paper_id_citation,publication_type,gcr_types_addressed,geographic_focus,regions_compared,geographic_factors,institutional_factors,infrastructural_factors,other_resilience_factors,study_approach,evidence_strength,evidence_type_direct_causal,evidence_type_predictive,evidence_type_correlational,evidence_type_theoretical,evidence_type_case_study,evidence_type_model_based,validation_external,validation_alternative_datasets,validation_multiple_periods,validation_cross_regional,validation_none,counterfactual_robust,counterfactual_limited,counterfactual_none,limitations_thorough,limitations_limited,limitations_none,confidence_assessment,key_evidence_gaps,resilience_phase,measures_implemented,proposed_measures,main_resilience_factors,differential_effectiveness,resilience_tradeoffs,vulnerable_resilient_regions,overall_relevance,key_supporting_quotes,additional_notes,evidence_type_causal,evidence_type_correlation,evidence_type_theoretical_simulation,evidence_type_expert_opinion,evidence_type_case_study_no_controls,scale_gcr_magnitude,scale_smaller_with_scaleup,scale_implicit_smallscale,scale_unclear,generalizability_geographic_context,generalizability_time_period,generalizability_single_disaster,generalizability_appropriate_caveats,limitations_correlation_causation,limitations_extrapolation,limitations_theoretical_empirical,limitations_proxy_measures,limitations_cherry_picking,limitations_side_effects,limitations_implementation,limitations_other,gcr_relevance_rating,gcr_relevance_justification
For the evidence type, validation approach, counterfactual analysis, limitations acknowledgment, and critical evaluation checklist sections, use TRUE or FALSE to indicate which options apply.
For text fields, place the content in double quotes to properly handle any commas in the text.
Please analyze the paper thoroughly before extracting information according to this template. Answer with just the values of the csv"""

In [None]:
# Run settings
pdf_dir = "pdf"  # folder whose direct children matching *.pdf are processed
temperature = 0  # 0 to keep the answers as deterministic as possible
max_tokens = 1000
output_csv = "extraction_results.csv"

# Gather one response per PDF. A file that fails is reported and skipped so
# the remaining files are still processed.
pdf_files = [*Path(pdf_dir).glob('*.pdf')]
results = []

for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    try:
        text = extract_text_from_pdf(str(pdf_path))
        # NOTE(review): the Claude call is stubbed out, so every PDF yields the
        # placeholder "test". The disabled call was:
        #   process_with_claude(text, query, temperature, max_tokens)
        response = "test"
    except Exception as e:
        print(f"Error processing {pdf_path.name}: {str(e)}")
    else:
        results.append(response)

# Show the raw responses before they are parsed
print(results)
# Output schema: one name per CSV value, in the same order as the column list
# embedded in the extraction query.
columns = [
    # Paper identification and scope
    "paper_id_citation",
    "publication_type",
    "gcr_types_addressed",
    "geographic_focus",
    "regions_compared",
    # Resilience factors and study design
    "geographic_factors",
    "institutional_factors",
    "infrastructural_factors",
    "other_resilience_factors",
    "study_approach",
    "evidence_strength",
    # evidence_type_* flags (first set)
    "evidence_type_direct_causal",
    "evidence_type_predictive",
    "evidence_type_correlational",
    "evidence_type_theoretical",
    "evidence_type_case_study",
    "evidence_type_model_based",
    # validation_* flags
    "validation_external",
    "validation_alternative_datasets",
    "validation_multiple_periods",
    "validation_cross_regional",
    "validation_none",
    # counterfactual_* flags
    "counterfactual_robust",
    "counterfactual_limited",
    "counterfactual_none",
    # limitations_* acknowledgment flags
    "limitations_thorough",
    "limitations_limited",
    "limitations_none",
    # Assessment and findings
    "confidence_assessment",
    "key_evidence_gaps",
    "resilience_phase",
    "measures_implemented",
    "proposed_measures",
    "main_resilience_factors",
    "differential_effectiveness",
    "resilience_tradeoffs",
    "vulnerable_resilient_regions",
    "overall_relevance",
    "key_supporting_quotes",
    "additional_notes",
    # evidence_type_* flags (second set)
    "evidence_type_causal",
    "evidence_type_correlation",
    "evidence_type_theoretical_simulation",
    "evidence_type_expert_opinion",
    "evidence_type_case_study_no_controls",
    # scale_* flags
    "scale_gcr_magnitude",
    "scale_smaller_with_scaleup",
    "scale_implicit_smallscale",
    "scale_unclear",
    # generalizability_* flags
    "generalizability_geographic_context",
    "generalizability_time_period",
    "generalizability_single_disaster",
    "generalizability_appropriate_caveats",
    # limitations_* checklist flags
    "limitations_correlation_causation",
    "limitations_extrapolation",
    "limitations_theoretical_empirical",
    "limitations_proxy_measures",
    "limitations_cherry_picking",
    "limitations_side_effects",
    "limitations_implementation",
    "limitations_other",
    # Overall relevance to the review
    "gcr_relevance_rating",
    "gcr_relevance_justification",
]

# NOTE(review): this assignment discards whatever the PDF loop collected and
# replaces it with one hard-coded sample response, apparently kept to exercise
# the parsing code that follows — remove it to process real results.
# In this literal every `\""` yields two quote characters, so each quoted text
# field is wrapped in six consecutive double quotes, and the record contains
# one embedded line break.
results =["""Da'ar and Kalmey (2023),journal article,extreme pandemics,global,multiple regions,\""\""\""geographic factors not directly addressed\""\""\"",\""\""\""effective governance, health financing, communication infrastructure\""\""\"",\""\""\""supply chain capacity for medicines and technologies, health workforce (doctors and nurses per 1000 population), hospital beds per 1000 population\""\""\"",\""\""\""Sustainable Development Goals (SDGs), Human Development Index (HDI), income level\""\""\"",quantitative analysis,MEDIUM,FALSE,FALSE,TRUE,TRUE,FALSE,TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,MEDIUM,\""\""\""Need for more direct causal evidence on how health system factors translate to pandemic resilience\""\""\"",preparedness,\""\""\""Not directly addressed\""\""\"",\""\""\""Strengthening supply chain capacity, health financing, communication infrastructure; maintaining health workforce and services; boosting SDGs particularly health-related sub-scales; curbing complacency in high-income countries\""\""\"",\""\""\""Supply chain capacity, effective governance, health financing, and SDGs were most significant for preparedness\""\""\"",\""\""\""Effects of health system building blocks were considerably larger for countries with higher levels of preparedness\""\""\"",\""\""\""Complacency in high-income countries can reduce preparedness despite resource advantages\""\""\"",\""\""\""African region was least prepared; Americas, European, and Southeast Asia regions had higher preparedness\""\""\"",HIGH,\""\""\""The positive gradient trends signaled a sense of capacity on the part of countries with higher global health security.\"" \""The effects of SDGs were greater for countries with higher levels of preparedness to health risks.\"" \""Relative to poor countries, middle- and high-income countries had lower levels of preparedness to health risks, an indication of a sense of complacency.\""\""\"",\""\""\""Study examines differential effects 
of health system building blocks and socioeconomic factors on pandemic preparedness across 195 countries, comparing pre-pandemic (2019) and pandemic (2021) periods\""\""\"",FALSE,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE,TRUE,HIGH,\""\""\""The paper directly examines factors affecting preparedness for extreme pandemics across multiple regions, with quantitative analysis of 195 countries. It identifies specific institutional and infrastructural factors that enhance resilience, showing how these vary across different levels of preparedness.\""\""\""""]

# Convert each raw response into {column: value} records for the DataFrame.
#
# The extraction query asks for standard CSV with text fields wrapped in double
# quotes, so parsing is delegated to the csv module: commas and line breaks
# inside a quoted field stay in that field, and doubled quotes are unescaped.
import csv
import io

processed_results = []
for result in results:
    # Anything that is not comma-separated text is kept verbatim.
    if not (isinstance(result, str) and "," in result):
        processed_results.append({"raw_result": result})
        continue

    # newline="" passes line breaks through untouched so the csv module can
    # decide whether they end a record or belong to a quoted field.
    # skipinitialspace tolerates a space between a comma and an opening quote.
    reader = csv.reader(io.StringIO(result.strip(), newline=""), skipinitialspace=True)
    for fields in reader:
        if not fields:
            continue  # blank line between records

        # Trim whitespace and any quotes left around a value (for example
        # when a field was wrapped in triple double quotes).
        values = [field.strip().strip('"') for field in fields]

        # zip() below stops at the shorter sequence; report a mismatch instead
        # of silently dropping or omitting values.
        if len(values) != len(columns):
            print(f"Warning: expected {len(columns)} fields but parsed {len(values)}")

        processed_results.append(dict(zip(columns, values)))

# Replace the original results with the processed ones
results = processed_results

# Write the parsed records to disk twice: once under the fixed name in
# `output_csv` (overwritten on every run) and once under a timestamped name so
# earlier runs are kept.
from datetime import datetime

df = pd.DataFrame(results)
df.to_csv(output_csv, index=False)

# Build the versioned name from the file's stem rather than splitting the whole
# path on ".", so dots in directory names or in the file name itself do not
# truncate it (e.g. "out.dir/results.csv" -> "out.dir/results_<timestamp>.csv").
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = Path(output_csv)
versioned_output_csv = str(output_path.with_name(f"{output_path.stem}_{timestamp}.csv"))

df.to_csv(versioned_output_csv, index=False)
print(f"Results saved to {output_csv} and versioned copy at {versioned_output_csv}")

# Display results (display() is supplied by the notebook environment)
display(df)

In [None]:
# Print the DataFrame `df` built in the previous cell as plain text.
print(df)