In [31]:
# program that reads all the PDF files from a specified folder and extracts 
# the required data from them into a pandas DataFrame

import os
import re
import pandas as pd
import fitz

# Layout of the results table: each processed report contributes one row,
# and each entry below names one field extracted from that report.
columns = [
    'date',
    'national_preparedness_level',
    'initial_attack_activity',
    'initial_attack_activity_fires',
    'new_large_incidents',
    'large_fires_contained',
    'uncontained_large_fires',
    'area_command_teams_committed',
    'nimos_committed',
    'type_1_imts_committed',
    'type_2_imts_committed',
    'complex_imts_committed',
]

# Start from an empty table with that layout; rows are added per PDF later on
df = pd.DataFrame(columns=columns)

# Define a function to extract data from a PDF file and append it to the DataFrame
def _first_group(pattern, text):
    """Return capture group 1 of the first match of `pattern` in `text`, or '' if there is no match."""
    match = re.search(pattern, text)
    return match.group(1) if match else ''


def _extract_report_fields(text):
    """Extract the report fields from the full text of one report.

    Returns a list of strings ordered like the module-level `columns` list.
    A field whose pattern does not occur in `text` is returned as ''.
    """
    date = _first_group(
        r'Incident Management Situation Report\s+([\w ,:–-]+)\s+', text)
    national_preparedness_level = _first_group(
        r'National Preparedness Level\s+(\d)\s+', text)

    # Activity level and fire count come from a single phrase. The count
    # pattern also accepts thousands separators ("1,030 fires") and a singular
    # "fire", which a bare `\d+ fires` would silently skip; separators are
    # stripped so the stored value is digits only.
    activity_match = re.search(
        r'Initial attack activity:\s+(\w+)\s+\(([\d,]+) fires?\)', text)
    if activity_match:
        activity_level = activity_match.group(1)
        activity_fires = activity_match.group(2).replace(',', '')
    else:
        activity_level, activity_fires = '', ''

    # The remaining fields are all "<label>: <integer>"; keep them in column order.
    count_patterns = [
        r'New large incidents:\s+(\d+)',
        r'Large fires contained:\s+(\d+)',
        r'Uncontained large fires:\s+\*\*\s+(\d+)',
        r'Area Command teams committed:\s+(\d+)',
        r'NIMOs committed:\s+(\d+)',
        r'Type 1 IMTs committed:\s+(\d+)',
        r'Type 2 IMTs committed:\s+(\d+)',
        r'Complex IMTs\s+committed:\s+(\d+)',
    ]
    counts = [_first_group(pattern, text) for pattern in count_patterns]

    return [date, national_preparedness_level,
            activity_level, activity_fires] + counts


def process_pdf(file_path):
    """Read one PDF report and append its extracted fields to the global `df`.

    Parameters
    ----------
    file_path : str
        Path of the PDF file to open with PyMuPDF (`fitz`).

    Side effects
    ------------
    Rebinds the module-level `df` to a new DataFrame that has one extra row.
    Every value in that row is a string; fields that cannot be found are ''.
    """
    global df

    # Open the PDF file with PyMuPDF and gather the text of every page first.
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)

    # Run the regular expressions once, on the complete text. Doing this per
    # page would rescan the growing text for every page, and a document with
    # no pages would leave the fields undefined.
    row = _extract_report_fields(text)

    # Append the new row to the existing DataFrame
    df_new = pd.DataFrame([row], columns=columns)
    df = pd.concat([df, df_new], ignore_index=True)

# Define the folder path
folder_path = './imsr_folder_2023/'

# Process all PDF files in the folder.
# os.listdir() returns names in arbitrary order, so sort them to keep the row
# order of `df` the same from one run (or machine) to the next.
for filename in sorted(os.listdir(folder_path)):
    # Compare the extension case-insensitively so '.PDF' files are not skipped
    if filename.lower().endswith('.pdf'):
        file_path = os.path.join(folder_path, filename)
        process_pdf(file_path)

# Print the resulting DataFrame
print(df)

                                     date national_preparedness_level  \
0     Friday, January 6, 2023 – 0730 MDT                            1   
1    Friday, January 13, 2023 – 0730 MDT                            1   
2    Friday, January 20, 2023 – 0730 MDT                            1   
3    Friday, January 27, 2023 – 0730 MDT                            1   
4    Friday, February 3, 2023 – 0730 MDT                            1   
5   Friday, February 10, 2023 – 0730 MDT                            1   
6   Friday, February 17, 2023 – 0730 MDT                            1   
7   Friday, February 24, 2023 – 0730 MDT                            1   
8       Friday, March 3, 2023 – 0730 MDT                            1   
9      Friday, March 10, 2023 – 0730 MDT                            1   
10     Friday, March 17, 2023 – 0730 MDT                            1   
11     Friday, March 24, 2023 – 0730 MDT                            1   
12     Friday, March 31, 2023 – 0730 MDT           

In [32]:
# Display the DataFrame built by the previous cell as this cell's output
df

Unnamed: 0,date,national_preparedness_level,initial_attack_activity,initial_attack_activity_fires,new_large_incidents,large_fires_contained,uncontained_large_fires,area_command_teams_committed,nimos_committed,type_1_imts_committed,type_2_imts_committed,complex_imts_committed
0,"Friday, January 6, 2023 – 0730 MDT",1,Light,165.0,1,1,0,0,0,0,0,0
1,"Friday, January 13, 2023 – 0730 MDT",1,Light,343.0,3,2,1,0,0,0,0,0
2,"Friday, January 20, 2023 – 0730 MDT",1,Light,321.0,1,1,1,0,0,1,0,0
3,"Friday, January 27, 2023 – 0730 MDT",1,Light,224.0,0,1,0,0,0,0,0,0
4,"Friday, February 3, 2023 – 0730 MDT",1,Light,227.0,0,0,0,0,0,0,0,0
5,"Friday, February 10, 2023 – 0730 MDT",1,,,4,0,4,0,0,0,0,0
6,"Friday, February 17, 2023 – 0730 MDT",1,,,3,7,0,0,0,0,0,0
7,"Friday, February 24, 2023 – 0730 MDT",1,,,4,2,2,0,0,0,0,0
8,"Friday, March 3, 2023 – 0730 MDT",1,,,16,9,7,0,0,0,0,0
9,"Friday, March 10, 2023 – 0730 MDT",1,,,17,20,2,0,0,0,0,0
