In [1]:
import os
import pandas as pd
import pypdfium2 as pdfium
import re
from datetime import datetime

In [2]:
def process_new_pdf(file_path):
    try:
        pdf = pdfium.PdfDocument(file_path)
        all_text = ""
        outbreak_text = ""  # Initialize outbreak_text outside the loop
        
        # Extract text from the entire PDF document
        for page_num in range(len(pdf)):
            page = pdf[page_num]
            textpage = page.get_textpage()
            text = textpage.get_text_range()
            # Convert all_text to lowercase
            text = text.lower()
            all_text += text + " "
            
        # Find the index of "Disease Outbreak News"
        outbreak_index = all_text.find("disease outbreak news")
        if outbreak_index != -1:
            # Find the first occurrence of a date after "Disease Outbreak News"
            date_match = re.search(r'\n\b\d{1,2}\s+(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{4}\b', all_text[outbreak_index:])
            if date_match:
                # Extract everything from "Disease Outbreak News" until the date
                outbreak_text = all_text[outbreak_index + len("disease outbreak news"):outbreak_index + date_match.end()].strip()
                outbreak_text = outbreak_text.replace("\r\n", " ")
                extracted_date = date_match.group(0)
                extracted_date = extracted_date.replace("\n", "")

    
        # Find "Description of the Situation" or "Situation at a Glance" header
        header_index = all_text.find("description of the situation")
        if header_index == -1:
            header_index = all_text.find("situation at a glance")
        
        if header_index != -1:
            all_text = all_text[header_index:]
        
            # Remove Figures and Tables
            all_text = re.sub(r'(\r\n)?figure\s+\d+.*?\n', '', all_text)
            all_text = re.sub(r'(\r\n)?table\s+\d+.*?\n', '', all_text)
            all_text = re.sub(r'\r\nsource:.*?\r\n', ' ', all_text)
        
            # Remove "See all DONs related to this event"
            all_text = all_text.replace("\r\nsee all dons related to this event", "")
        
            # Replace newline characters with space
            all_text = all_text.replace("\r\n", " ")
            
            total_words = len(all_text.split())
        
        else:
            print("Header not found in the PDF: " + {file_path})
        
        return total_words, all_text, outbreak_text, extracted_date
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None, None, None, None

In [3]:
# Directory containing all PDF documents
pdf_directory = "new-pdfs"

data = []

# Iterate over all PDF files in the directory
for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
        file_path = os.path.join(pdf_directory, filename)
        total_words, all_text, outbreak_text, extracted_date = process_new_pdf(file_path)
        if total_words is not None and all_text is not None and outbreak_text is not None:
            data.append({'ID':outbreak_text,'PDF Name': filename, 'Word Count': total_words, 'Text': all_text, 'Date': extracted_date})

# Create a DataFrame
new_df = pd.DataFrame(data)

In [4]:
new_df.head()

Unnamed: 0,ID,PDF Name,Word Count,Text,Date
0,dengue - global situation 21 december 2023,Dengue- Global situation.pdf,5317,description of the situation global overview c...,21 december 2023
1,avian influenza a h5n1 - united kingdom of gre...,30maAvian Influenza A H5N1 - United Kingdom of...,1703,"description of the situation in late april, th...",30 may 2023
2,measles - occupied palestinian territory 10 ja...,Measles.pdf,922,description of the situation from 1 january th...,10 january 2020
3,"ebola virus disease - african region (afro), d...",26juEbola virus disease – Democratic Republic ...,1874,"description of the situation on 25 june 2020, ...",26 june 2020
4,japanese encephalitis - australia 28 april 2022,Japanese Encephalitis - Australia.pdf,1720,"description of the situation on 7 march 2022, ...",28 april 2022


In [5]:
new_df.to_csv("new_dons_length.csv")

In [6]:
def process_old_pdf(file_path):
    try:
        pdf = pdfium.PdfDocument(file_path)
        all_text = ""
        outbreak_text = ""
        extracted_date = ""
        
        # Extract text from the entire PDF document
        for page_num in range(len(pdf)):
            page = pdf[page_num]
            textpage = page.get_textpage()
            text = textpage.get_text_range()
            # Convert all_text to lowercase
            text = text.lower()
            all_text += text + " "
        
        # Define the patterns to search for
        patterns = ["disease outbreak news", "diseaseoutbreaknews", "disease outbreak news (dons)"]
        
        # Initialize the index to store the result
        outbreak_index = -1
        
        # Iterate over the patterns and search for each one
        for pattern in patterns:
            index = all_text.find(pattern, 0, 150)
            if index != -1:
                outbreak_index = index
                break
        
        if outbreak_index != -1:
            # Find the first occurrence of a date after "Disease Outbreak News"
            date_match = re.search(r'(?:^|\n)\b\d{1,2}\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s*\d{4}\b', all_text[outbreak_index:])
            if date_match:
                # Extract everything from "Disease Outbreak News" until the date
                outbreak_text = all_text[:outbreak_index].strip()
                outbreak_text = outbreak_text.replace("\r\n", " ")
                
                extracted_date = date_match.group(0)
                extracted_date = extracted_date.replace("\n", "")
                
                # Find the position of the matched date string in all_text
                date_index = all_text.find(extracted_date)
                if date_index != -1:
                    # Cut out everything from all_text before and including the date
                    all_text = all_text[date_index + len(extracted_date):]
                    
                    # Remove stuff
                    all_text = re.sub(r'(\r\n)?figure\s+\d+.*?\n', '', all_text)
                    all_text = re.sub(r'(\r\n)?table\s+\d+.*?\n', '', all_text)
                    all_text = re.sub(r'\r\nsource:.*?\r\n', ' ', all_text)
                
                    all_text = all_text.replace("see all dons related to this event", "")
                    all_text = all_text.replace("\r\nhome", "")
                    all_text = all_text.replace("\r\nalert and response operations", "")
                    all_text = all_text.replace("diseases\r\nbiorisk reduction\r\ndisease outbreak news", "")
                    all_text = all_text.replace("\r\nemergencies preparedness, response", "")
                    all_text = all_text.replace("\r\nmenu", "")
                    all_text = all_text.replace("what we do regions about us","")
                    all_text = all_text.replace("subscribe to our newsletters","")
                    all_text = all_text.replace("privacy legal notice","")
                    all_text = all_text.replace("© 2017 who","")
                    all_text = all_text.replace("© 2018 who","")
                    all_text = all_text.replace("© 2019 who","")
                    all_text = all_text.replace("© 2020 who","")
                    all_text = all_text.replace("© 2021 who","")
        
                    all_text = all_text.replace("\r\n", " ")
        
                    all_text = all_text.replace(" - ", " ")
        
                    total_words = len(all_text.split())
        
        else:
                
            # Find "Description of the Situation" or "Situation at a Glance" header
            header_index = all_text.find("weekly update\r\n",0,150)
            # if header_index == -1:
            #     header_index = all_text.find("situation at a glance")
            
            if header_index != -1:
                all_text = all_text[header_index:]
                        
                # Remove stuff
                all_text = re.sub(r'(\r\n)?figure\s+\d+.*?\n', '', all_text)
                all_text = re.sub(r'(\r\n)?table\s+\d+.*?\n', '', all_text)
                all_text = re.sub(r'\r\nsource:.*?\r\n', ' ', all_text)
            
                all_text = all_text.replace("see all dons related to this event", "")
                all_text = all_text.replace("\r\nhome", "")
                all_text = all_text.replace("\r\nalert and response operations", "")
                all_text = all_text.replace("diseases\r\nbiorisk reduction\r\ndisease outbreak news", "")
                all_text = all_text.replace("\r\nemergencies preparedness, response", "")
                all_text = all_text.replace("\r\nmenu", "")
                all_text = all_text.replace("what we do regions about us","")
                all_text = all_text.replace("subscribe to our newsletters","")
                all_text = all_text.replace("privacy legal notice","")
                all_text = all_text.replace("© 2017 who","")
                all_text = all_text.replace("© 2018 who","")
                all_text = all_text.replace("© 2019 who","")
                all_text = all_text.replace("© 2020 who","")
                all_text = all_text.replace("© 2021 who","")
        
                all_text = all_text.replace("\r\n", " ")
        
                all_text = all_text.replace(" - ", " ")
        
                total_words = len(all_text.split())
                
            else:
                date_match = re.search(r'(?:^|\n)\b\d{1,2}\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s*\d{4}\b', all_text)
                if date_match:
                    # Extract the matched date
                    extracted_date = date_match.group(0)
                
                    # Find the index of the extracted date in the text
                    date_index = all_text.find(extracted_date)
                
                    if date_index != -1:
                        # Find the index of the next newline character after the date
                        next_newline_index = all_text.find('\r\n', date_index)
                        
                        if next_newline_index != -1:
                            table_madness = all_text.find("following table and map.\r\n")
        
                            
                            if table_madness != -1:
                                table_total = all_text.find('grand total', table_madness)
                                table_end = all_text.find('\r\n', table_total)
                                all_text = all_text[:table_madness + len("following table and map.\r\n")] + all_text[table_end:]
                                
                            # Cut out everything before the date and the date itself until the next line
                            all_text = all_text[next_newline_index+1:].strip()
                            # Remove stuff
                            all_text = re.sub(r'(\r\n)?figure\s+\d+.*?\n', '', all_text)
                            all_text = re.sub(r'(\r\n)?table\s+\d+.*?\n', '', all_text)
                            all_text = re.sub(r'\r\nsource:.*?\r\n', ' ', all_text)
                        
                            all_text = all_text.replace("see all dons related to this event", "")
                            all_text = all_text.replace("\r\nhome", "")
                            all_text = all_text.replace("\r\nalert and response operations", "")
                            all_text = all_text.replace("diseases\r\nbiorisk reduction\r\ndisease outbreak news", "")
                            all_text = all_text.replace("\r\nemergencies preparedness, response", "")
                            all_text = all_text.replace("\r\nmenu", "")
                            all_text = all_text.replace("what we do regions about us","")
                            all_text = all_text.replace("subscribe to our newsletters","")
                            all_text = all_text.replace("privacy legal notice","")
                            all_text = all_text.replace("© 2017 who","")
                            all_text = all_text.replace("© 2018 who","")
                            all_text = all_text.replace("© 2019 who","")
                            all_text = all_text.replace("© 2020 who","")
                            all_text = all_text.replace("© 2021 who","")
        
                            all_text = all_text.replace("\r\n", " ")
                            all_text = all_text.replace(" - ", " ")
                    
                            total_words = len(all_text.split())
                            
        return total_words, all_text, outbreak_text, extracted_date
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None, None, None, None

In [7]:
# Directory containing all PDF documents
pdf_directory = "old-pdfs"

data = []

# Iterate over all PDF files in the directory
for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
        file_path = os.path.join(pdf_directory, filename)
        total_words, all_text, outbreak_text, extracted_date = process_old_pdf(file_path)

        # Regular expression pattern to extract date components from file path
        file_path_pattern = r'csr_don_(\d{4})_(\d{2})_(\d{2})_en_\.pdf'
        
        # Search for the date components in the file path
        file_path_match = re.search(file_path_pattern, file_path)
        
        # Extracted date from file path
        if extracted_date == "" and file_path_match:
            # Extract year, month, and day components
            year, month, day = file_path_match.groups()
            # Construct the date string in the desired format
            extracted_date = datetime(int(year), int(month), int(day)).strftime("%d %B %Y")        

        if total_words is not None and all_text is not None and outbreak_text is not None:
            data.append({'ID':outbreak_text,'PDF Name': filename, 'Word Count': total_words, 'Text': all_text, 'Date': extracted_date})

# Create a DataFrame
old_df = pd.DataFrame(data)

Error processing old-pdfs/csr_don_2003_07_03_en_.pdf: cannot access local variable 'total_words' where it is not associated with a value
Error processing old-pdfs/csr_don_2003_06_23_en_.pdf: cannot access local variable 'total_words' where it is not associated with a value
Error processing old-pdfs/csr_don_2003_07_02_en_.pdf: cannot access local variable 'total_words' where it is not associated with a value
Error processing old-pdfs/csr_don_2003_06_20_en_.pdf: cannot access local variable 'total_words' where it is not associated with a value
Error processing old-pdfs/csr_don_2003_06_30_en_.pdf: cannot access local variable 'total_words' where it is not associated with a value
Error processing old-pdfs/csr_don_2003_06_16_en_.pdf: cannot access local variable 'total_words' where it is not associated with a value
Error processing old-pdfs/csr_don_2003_06_17_en_.pdf: cannot access local variable 'total_words' where it is not associated with a value
Error processing old-pdfs/csr_don_2009_11

In [8]:
old_df.head()

Unnamed: 0,ID,PDF Name,Word Count,Text,Date
0,"emergencies preparedness, response",csr_don_2003_01_21_en_.pdf,98,disease outbreak reported following the previ...,21 january 2003
1,"emergencies preparedness, response",csr_don_2004_02_03_en_.pdf,147,the ministry of health in viet nam has today ...,3 february 2004
2,middle east respiratory syndrome coronavirus (...,csr_don_31-august-2018-mers-united-kingdom_en_...,1022,"on 22 august 2018, the international health r...",31 august 2018
3,"emergencies preparedness, response",csr_don_2004_02_13_en_.pdf,97,situation (human) in thailand the ministry of...,13 february 2004
4,global alert and response (gar),csr_don_2009_02_09_en_.pdf,112,the ministry of health and population of egyp...,9 february 2009


In [9]:
old_df.to_csv("old_dons_length.csv")

In [14]:
concatenated_df = pd.concat([new_df, old_df])

In [15]:
concatenated_df = concatenated_df.drop(columns={"Text","ID"})

In [13]:
concatenated_df.to_csv("dons_words.csv")

## testing

In [159]:
# pdf = pdfium.PdfDocument("old-pdfs/csr_don_2009_06_24_en_.pdf")
# all_text = ""
# outbreak_text = ""
# extracted_date = ""

# # Extract text from the entire PDF document
# for page_num in range(len(pdf)):
#     page = pdf[page_num]
#     textpage = page.get_textpage()
#     text = textpage.get_text_range()
#     # Convert all_text to lowercase
#     text = text.lower()
#     all_text += text + " "

# # Define the patterns to search for
# patterns = ["disease outbreak news", "diseaseoutbreaknews", "disease outbreak news (dons)"]

# # Initialize the index to store the result
# outbreak_index = -1

# # Iterate over the patterns and search for each one
# for pattern in patterns:
#     index = all_text.find(pattern, 0, 150)
#     if index != -1:
#         outbreak_index = index
#         break

# if outbreak_index != -1:
#     # Find the first occurrence of a date after "Disease Outbreak News"
#     date_match = re.search(r'(?:^|\n)\b\d{1,2}\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s*\d{4}\b', all_text[outbreak_index:])
#     if date_match:
#         # Extract everything from "Disease Outbreak News" until the date
#         outbreak_text = all_text[:outbreak_index].strip()
#         outbreak_text = outbreak_text.replace("\r\n", " ")
        
#         extracted_date = date_match.group(0)
#         extracted_date = extracted_date.replace("\n", "")
        
#         # Find the position of the matched date string in all_text
#         date_index = all_text.find(extracted_date)
#         if date_index != -1:
#             # Cut out everything from all_text before and including the date
#             all_text = all_text[date_index + len(extracted_date):]
            
#             # Remove stuff
#             all_text = re.sub(r'(\r\n)?figure\s+\d+.*?\n', '', all_text)
#             all_text = re.sub(r'(\r\n)?table\s+\d+.*?\n', '', all_text)
#             all_text = re.sub(r'\r\nsource:.*?\r\n', ' ', all_text)
        
#             all_text = all_text.replace("see all dons related to this event", "")
#             all_text = all_text.replace("\r\nhome", "")
#             all_text = all_text.replace("\r\nalert and response operations", "")
#             all_text = all_text.replace("diseases\r\nbiorisk reduction\r\ndisease outbreak news", "")
#             all_text = all_text.replace("\r\nemergencies preparedness, response", "")
#             all_text = all_text.replace("\r\nmenu", "")
#             all_text = all_text.replace("what we do regions about us","")
#             all_text = all_text.replace("subscribe to our newsletters","")
#             all_text = all_text.replace("privacy legal notice","")
#             all_text = all_text.replace("© 2020 who","")
#             all_text = all_text.replace("© 2021 who","")

#             all_text = all_text.replace("\r\n", " ")

#             all_text = all_text.replace(" - ", " ")

#             total_words = len(all_text.split())

# else:
        
#     # Find "Description of the Situation" or "Situation at a Glance" header
#     header_index = all_text.find("weekly update\r\n",0,150)
#     # if header_index == -1:
#     #     header_index = all_text.find("situation at a glance")
    
#     if header_index != -1:
#         all_text = all_text[header_index:]
                
#         # Remove stuff
#         all_text = re.sub(r'(\r\n)?figure\s+\d+.*?\n', '', all_text)
#         all_text = re.sub(r'(\r\n)?table\s+\d+.*?\n', '', all_text)
#         all_text = re.sub(r'\r\nsource:.*?\r\n', ' ', all_text)
    
#         all_text = all_text.replace("see all dons related to this event", "")
#         all_text = all_text.replace("\r\nhome", "")
#         all_text = all_text.replace("\r\nalert and response operations", "")
#         all_text = all_text.replace("diseases\r\nbiorisk reduction\r\ndisease outbreak news", "")
#         all_text = all_text.replace("\r\nemergencies preparedness, response", "")
#         all_text = all_text.replace("\r\nmenu", "")
#         all_text = all_text.replace("what we do regions about us","")
#         all_text = all_text.replace("subscribe to our newsletters","")
#         all_text = all_text.replace("privacy legal notice","")
#         all_text = all_text.replace("© 2020 who","")
#         all_text = all_text.replace("© 2021 who","")

#         all_text = all_text.replace("\r\n", " ")

#         all_text = all_text.replace(" - ", " ")

#         total_words = len(all_text.split())
        
#     else:
#         date_match = re.search(r'(?:^|\n)\b\d{1,2}\s*(?:january|february|march|april|may|june|july|august|september|october|november|december)\s*\d{4}\b', all_text)
#         if date_match:
#             # Extract the matched date
#             extracted_date = date_match.group(0)
        
#             # Find the index of the extracted date in the text
#             date_index = all_text.find(extracted_date)
        
#             if date_index != -1:
#                 # Find the index of the next newline character after the date
#                 next_newline_index = all_text.find('\r\n', date_index)
                
#                 if next_newline_index != -1:
#                     table_madness = all_text.find("following table and map.\r\n")

                    
#                     if table_madness != -1:
#                         table_total = all_text.find('grand total', table_madness)
#                         table_end = all_text.find('\r\n', table_total)
#                         all_text = all_text[:table_madness + len("following table and map.\r\n")] + all_text[table_end:]
                        
#                     # Cut out everything before the date and the date itself until the next line
#                     all_text = all_text[next_newline_index+1:].strip()
#                     # Remove stuff
#                     all_text = re.sub(r'(\r\n)?figure\s+\d+.*?\n', '', all_text)
#                     all_text = re.sub(r'(\r\n)?table\s+\d+.*?\n', '', all_text)
#                     all_text = re.sub(r'\r\nsource:.*?\r\n', ' ', all_text)
                
#                     all_text = all_text.replace("see all dons related to this event", "")
#                     all_text = all_text.replace("\r\nhome", "")
#                     all_text = all_text.replace("\r\nalert and response operations", "")
#                     all_text = all_text.replace("diseases\r\nbiorisk reduction\r\ndisease outbreak news", "")
#                     all_text = all_text.replace("\r\nemergencies preparedness, response", "")
#                     all_text = all_text.replace("\r\nmenu", "")
#                     all_text = all_text.replace("what we do regions about us","")
#                     all_text = all_text.replace("subscribe to our newsletters","")
#                     all_text = all_text.replace("privacy legal notice","")
#                     all_text = all_text.replace("© 2020 who","")
#                     all_text = all_text.replace("© 2021 who","")
#                     all_text = all_text.replace("\r\n", " ")
#                     all_text = all_text.replace(" - ", " ")
            
#                     total_words = len(all_text.split())