In [25]:
# Reads every PDF file in the folder for the configured year, extracts the
# required fields from each one into a pandas DataFrame, and then writes
# that DataFrame to a CSV file in ./imsr_csv_folder.

import os
import re
import pandas as pd
import fitz
import datetime

# Year of the report folder to load (used to build the folder path)
year = 2020

# Column names of the result DataFrame, one per extracted field.
# The order here is the column order of the DataFrame and of the CSV.
columns = [
    'date',
    'national_preparedness_level',
    'initial_attack_activity',
    'initial_attack_activity_fires',
    'new_large_incidents',
    'large_fires_contained',
    'uncontained_large_fires',
    'area_command_teams_committed',
    'nimos_committed',
    'type_1_imts_committed',
    'type_2_imts_committed',
    'complex_imts_committed',
]

# Start from an empty DataFrame; one row is appended per processed PDF
df = pd.DataFrame(columns=columns)

# Helper: first regex hit in the text, or a default when nothing matches
def _first_match(pattern, text, default=''):
    """Return the first ``re.findall`` result for *pattern* in *text*.

    With one capture group the result is a string; with several groups it is
    a tuple of strings. *default* is returned when the pattern does not match.
    """
    matches = re.findall(pattern, text)
    return matches[0] if matches else default


# Define a function to extract data from a PDF file and append it to the DataFrame
def process_pdf(file_path):
    """Extract the report fields from one PDF and append them to ``df`` as a row.

    The text of all pages is concatenated and each field is located with a
    regular expression; the first match in the document wins. A field whose
    pattern does not match is stored as an empty string. All values are kept
    as the strings captured by the patterns (no numeric conversion).

    Args:
        file_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Side effects:
        Rebinds the module-level ``df`` to a new DataFrame that contains one
        additional row, laid out according to the module-level ``columns``.
    """
    global df

    # Open the PDF file with PyMuPDF and collect the text of every page once.
    # The extraction below runs a single time on the complete text rather
    # than once per page, and is also well defined for a zero-page document.
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)

    # Extract data using regular expressions
    date = _first_match(
        r'Incident Management Situation Report\s+([\w ,:–-]+)\s+', text)
    national_preparedness_level = _first_match(
        r'National Preparedness Level\s+(\d)\s+', text)

    # Two capture groups: the activity word and the fire count in parentheses
    initial_attack_activity = _first_match(
        r'Initial attack activity:\s+(\w+)\s+\((\d+) fires\)', text,
        default=('', ''))

    new_large_incidents = _first_match(r'New large incidents:\s+(\d+)', text)
    large_fires_contained = _first_match(r'Large fires contained:\s+(\d+)', text)
    # This pattern requires a literal "**" between the label and the number
    uncontained_large_fires = _first_match(
        r'Uncontained large fires:\s+\*\*\s+(\d+)', text)
    area_command_teams_committed = _first_match(
        r'Area Command teams committed:\s+(\d+)', text)
    nimos_committed = _first_match(r'NIMOs committed:\s+(\d+)', text)
    type_1_imts_committed = _first_match(r'Type 1 IMTs committed:\s+(\d+)', text)
    type_2_imts_committed = _first_match(r'Type 2 IMTs committed:\s+(\d+)', text)
    complex_imts_committed = _first_match(
        r'Complex IMTs\s+committed:\s+(\d+)', text)

    # Create a one-row DataFrame from the extracted data (order follows `columns`)
    data = [[date, national_preparedness_level, initial_attack_activity[0],
             initial_attack_activity[1], new_large_incidents,
             large_fires_contained, uncontained_large_fires,
             area_command_teams_committed, nimos_committed,
             type_1_imts_committed, type_2_imts_committed,
             complex_imts_committed]]
    df_new = pd.DataFrame(data, columns=columns)

    # Append the new data to the existing DataFrame
    df = pd.concat([df, df_new], ignore_index=True)

# Define the folder path **THIS IS WHERE TO DEFINE FOLDER BEING LOADED INTO DF**
folder_path = './imsr_folder_{}/'.format(year)

# Process all PDF files in the folder.
# os.listdir() returns names in arbitrary order, so the listing is sorted to
# make the row order of the DataFrame (and of the CSV) reproducible. The sort
# is by file name, which is not necessarily chronological.
for filename in sorted(os.listdir(folder_path)):
    # Compare the extension case-insensitively so '*.PDF' files are not skipped
    if filename.lower().endswith('.pdf'):
        file_path = os.path.join(folder_path, filename)
        process_pdf(file_path)

# Print the resulting DataFrame
print(df)

# Define the folder path for CSV files
csv_folder_path = './imsr_csv_folder'

# Create the folder if it doesn't exist (exist_ok avoids a separate
# existence check and the race between checking and creating)
os.makedirs(csv_folder_path, exist_ok=True)

# Extract the year from the folder path. If the folder path has been changed
# and no longer matches 'imsr_folder_YYYY', keep the configured `year` value
# instead of failing with an IndexError.
year_match = re.search(r'imsr_folder_(\d{4})', folder_path)
if year_match:
    year = year_match.group(1)

# Generate the CSV file name with the year and a creation date
# (datetime.date formats as ISO YYYY-MM-DD)
csv_file_name = 'imsr_csv_{}_{}.csv'.format(year, datetime.date.today())

# Generate the full file path
csv_file_path = os.path.join(csv_folder_path, csv_file_name)

# Write the DataFrame to a CSV file, without the row index
df.to_csv(csv_file_path, index=False)

# Check if the CSV file was successfully created
if os.path.exists(csv_file_path):
    print("CSV file successfully created!")
else:
    print("CSV file creation failed.")

                                      date national_preparedness_level  \
0      Friday, January 8, 2021 – 0800 MDT                            1   
1     Friday, January 15, 2021 – 0800 MDT                            1   
2     Friday, January 22, 2021 – 0800 MDT                            1   
3     Friday, January 29, 2021 – 0800 MDT                            1   
4     Friday, February 5, 2021 – 0800 MDT                            1   
..                                     ...                         ...   
201   Friday, December 3, 2021 – 0730 MDT                            1   
202  Friday, December 10, 2021 – 0730 MDT                            1   
203  Friday, December 17, 2021 – 0730 MDT                            1   
204  Friday, December 24, 2021 – 0730 MDT                            1   
205  Friday, December 31, 2021 – 0730 MDT                            1   

    initial_attack_activity initial_attack_activity_fires new_large_incidents  \
0                             