In [30]:
import os
import pandas as pd
import pypdfium2 as pdfium
import re

In [68]:
def process_pdf(file_path):
    """Extract the cleaned body text, its word count, and the outbreak title of a PDF.

    Each page's text is cleaned independently and the cleaned pages are
    concatenated. Per page, the cleaning:

    * drops everything before the "Description of the Situation" heading
      (or, failing that, the "Situation at a Glance" heading), keeping the
      heading itself;
    * truncates the page at the first line starting with "Figure", then at
      the first line starting with "Table";
    * removes remaining "Figure N ..." / "Table N ..." caption lines and the
      "See all DONs related to this event" line.

    The outbreak title is the text between the first "Disease Outbreak News"
    marker and the end of the first date line ("D Month YYYY") that follows
    it on the same page, with CRLF line breaks replaced by spaces.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A tuple ``(total_words, all_text, outbreak_text)``. ``outbreak_text``
        is ``None`` when no marker/date pair is found on any page. If the
        file cannot be processed, the error is printed and
        ``(None, None, None)`` is returned, so callers can always unpack
        three values.
    """
    outbreak_marker = "Disease Outbreak News"
    date_pattern = r'\n\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'

    try:
        pdf = pdfium.PdfDocument(file_path)
        try:
            page_texts = []
            # Initialised up front so a PDF without the marker still yields a
            # result instead of failing on an unbound name.
            outbreak_text = None

            for page_num in range(len(pdf)):
                page = pdf[page_num]
                textpage = page.get_textpage()
                try:
                    text_all = textpage.get_text_range()
                finally:
                    # Release child handles before their parents.
                    textpage.close()
                    page.close()

                text = text_all

                # Keep the first title found; later pages may mention the
                # marker in running text and must not overwrite it.
                if outbreak_text is None:
                    outbreak_index = text_all.find(outbreak_marker)
                    if outbreak_index != -1:
                        after_marker = text_all[outbreak_index:]
                        date_match = re.search(date_pattern, after_marker)
                        if date_match:
                            # Everything between the marker and the end of the date.
                            outbreak_text = after_marker[len(outbreak_marker):date_match.end()].strip()
                            outbreak_text = outbreak_text.replace("\r\n", " ")

                # Drop whatever precedes the section heading (the heading is kept).
                header_index = text.find("Description of the Situation")
                if header_index == -1:
                    header_index = text.find("Situation at a Glance")
                if header_index != -1:
                    text = text[header_index:]

                # Cut the page at the first line that starts a figure or table.
                fig_index = text.find("\r\nFigure")
                if fig_index != -1:
                    text = text[:fig_index]

                table_index = text.find("\r\nTable")
                if table_index != -1:
                    text = text[:table_index]

                # Remove any remaining Figure/Table caption lines.
                text = re.sub(r'(Figure|Table)\s+\d+.*?\n', '', text)

                text = text.replace("\r\nSee all DONs related to this event", "")

                page_texts.append(text)
        finally:
            pdf.close()

        all_text = "".join(page_texts)

        # Word count over whitespace-separated tokens.
        total_words = len(all_text.split())

        return total_words, all_text, outbreak_text

    except Exception as e:
        # Best-effort: report the failure and let the caller skip this file.
        print(f"Error processing {file_path}: {e}")
        return None, None, None

In [69]:
# Directory containing all PDF documents
pdf_directory = "pdf-test/"

data = []

# Iterate over the PDF files in sorted order: os.listdir returns entries in
# arbitrary order, which would make the row order differ between runs.
for filename in sorted(os.listdir(pdf_directory)):
    # Accept ".pdf" in any letter case (e.g. "REPORT.PDF").
    if not filename.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(pdf_directory, filename)
    result = process_pdf(file_path)

    # process_pdf reports a failure with a tuple of Nones; check before
    # unpacking so a failed file is skipped rather than raising here.
    if result is None or result[0] is None or result[1] is None:
        continue

    word_count, all_text, outbreak_text = result
    data.append({'ID': outbreak_text, 'PDF Name': filename, 'Word Count': word_count, 'Text': all_text})

# Create a DataFrame; explicit columns keep the schema stable even when no
# PDF could be processed.
df = pd.DataFrame(data, columns=['ID', 'PDF Name', 'Word Count', 'Text'])

df.head()

Unnamed: 0,ID,PDF Name,Word Count,Text
0,1996 - Bosnia and Herzegovina 5 February 1996,1996 - Haemorrhagic fever with renal syndrome ...,201,Description of the Situation\r\n05 February 19...
1,Multi-country monkeypox outbreak: situation up...,Multi-country monkeypox outbreak_ situation up...,3058,Situation at a Glance\r\nThis Disease Outbreak...
2,Legionellosis - Poland 14 September 2023,Legionellosis - Poland.pdf,1188,"Description of the SituationOn 18 August 2023,..."
