In [1]:
# Updated version of program that works for all years until 2016
import tabula
import os
import pandas as pd
from tqdm import tqdm
from datetime import datetime

# Batch-convert PDF reports to CSV: for every PDF in pdf_folder_path, pull out
# the table that contains the title text below, prepend a 'Date' column taken
# from the file name, and write the result to csv_folder_path.

# Set the path of the folder containing the PDF files
pdf_folder_path = "./imsr_folder_2023"

# Set the path of the folder where the CSV files will be saved
csv_folder_path = "./imsr_table_csv_folder/imsr_table_csv_folder_2023"

# Text that identifies the table to extract from each PDF
table_title = 'Active Incident Resource Summary'

# Create the CSV folder if it doesn't exist (exist_ok avoids a separate
# check-then-create step)
os.makedirs(csv_folder_path, exist_ok=True)

# Get a list of the PDF files in the PDF folder
pdf_files = [f for f in os.listdir(pdf_folder_path) if f.endswith('.pdf')]

# The context manager closes the progress bar even if a file raises
with tqdm(total=len(pdf_files)) as pbar:
    for pdf_file in pdf_files:
        # Set the path of the input PDF file
        pdf_path = os.path.join(pdf_folder_path, pdf_file)

        # Set the path of the output CSV file (same base name, .csv extension)
        csv_file = os.path.splitext(pdf_file)[0] + ".csv"
        csv_path = os.path.join(csv_folder_path, csv_file)

        # The first 10 characters of the file name must be YYYY_MM_DD; the
        # date is stored in the CSV as MM/DD/YYYY
        try:
            pdf_date = datetime.strptime(pdf_file[:10], '%Y_%m_%d').strftime('%m/%d/%Y')
        except ValueError:
            # Skip this file instead of aborting the whole batch
            tqdm.write(f"Skipping {pdf_file}: file name does not start with a YYYY_MM_DD date")
            pbar.update(1)
            continue

        # Extract the first table (from any page) that contains the title text
        table_df = pd.DataFrame()
        for table in tabula.read_pdf(pdf_path, pages='all', lattice=True, pandas_options={'header': None}):
            # Check cell by cell: str() of a whole array is abbreviated with
            # "..." once it exceeds 1000 elements, which could hide the title
            if any(table_title in str(cell) for cell in table.to_numpy().ravel()):
                table_df = pd.DataFrame(table)
                break
        else:
            # No break: no table matched. The CSV is still written (it will
            # contain only the 'Date' header), but say so instead of staying silent.
            tqdm.write(f"Warning: no '{table_title}' table found in {pdf_file}")

        # Add the date as the first column of the table DataFrame
        table_df.insert(0, 'Date', pdf_date)

        # Save the table DataFrame to a CSV file
        table_df.to_csv(csv_path, index=False)

        # Update the progress bar
        pbar.update(1)

print("Done!")

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:58<00:00,  3.92s/it]

Done!



