In [11]:
import tabula
import os
import pandas as pd
import warnings

# Input and output locations. Raw strings keep the Windows backslashes literal.
input_pdf_directory = r"C:\Users\Loop\Desktop\Process Tools\PDF Convertor\PDF"
output_excel_path = r"C:\Users\Loop\Desktop\Process Tools\PDF Convertor\Output Excel\output.xlsx"

# Ensure the output directory exists. os.path.dirname() returns "" when the
# path has no directory component (e.g. a bare "output.xlsx"), and
# os.makedirs("") raises FileNotFoundError, so only create a directory when
# there is one to create.
output_directory = os.path.dirname(output_excel_path)
if output_directory:
    os.makedirs(output_directory, exist_ok=True)

# Silence every FutureWarning from here on. The filter is process-wide, not
# limited to a single library.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Accumulates the cleaned DataFrames extracted from all PDF files
all_dfs = []

# Extract every table from every PDF in the input directory, clean it, and
# append it to all_dfs.
# sorted() makes the row order of the combined output reproducible, because
# os.listdir() returns entries in arbitrary order.
for file_name in sorted(os.listdir(input_pdf_directory)):
    # Compare case-insensitively so files such as "REPORT.PDF" are not
    # silently skipped.
    if not file_name.lower().endswith('.pdf'):
        continue

    input_pdf_path = os.path.join(input_pdf_directory, file_name)
    try:
        # Lattice mode is used for extraction; per the tabula-py docs it
        # targets tables whose cells are separated by ruling lines.
        dfs = tabula.read_pdf(input_pdf_path, pages='all', multiple_tables=True, lattice=True, stream=False)

        # Column labels of the first non-empty table in this file; later
        # tables of the same width are relabelled to match it.
        header = None
        for table_number, df in enumerate(dfs, start=1):
            # Drop fully empty columns, then fully empty rows, then renumber
            # the rows so the index is contiguous again.
            df = (
                df.dropna(axis=1, how='all')
                .dropna(axis=0, how='all')
                .reset_index(drop=True)
            )

            if df.empty:
                continue  # Nothing left after cleaning

            if header is None:
                header = df.columns
            elif len(df.columns) == len(header):
                # NOTE(review): this discards the table's own column labels.
                # If a continuation table does not repeat the header row,
                # those labels are presumably its first data row and are
                # lost here; confirm against the input PDFs.
                df.columns = header
            else:
                # The position reported is the table's index in the list
                # returned by tabula, which is not necessarily a page number.
                print(f"Skipping inconsistent table structure in {file_name} (table {table_number})")
                continue  # Skip tables whose width differs from the header

            # Keep the cleaned table for the final concatenation
            all_dfs.append(df)
    except Exception as e:
        # Best-effort per-file boundary: report the failure and carry on
        # with the remaining PDFs.
        print(f"Error reading {input_pdf_path}: {e}")

# Write the result: one workbook containing every extracted table stacked
# row-wise, or a notice when nothing was extracted.
if not all_dfs:
    print("No tables were extracted from the PDF files.")
else:
    # Stack the tables and renumber the rows from zero
    combined_df = pd.concat(all_dfs, ignore_index=True)
    # index=False keeps the row numbers out of the spreadsheet
    combined_df.to_excel(output_excel_path, index=False)
    print(f"PDF files have been successfully converted to {output_excel_path}")


PDF files have been successfully converted to C:\Users\Loop\Desktop\Process Tools\PDF Convertor\Output Excel\output.xlsx
