In [3]:
import os
from collections import defaultdict

# Directory whose contents are summarised by this cell
folder_path = "./test data/"

# Bail out early with a message when the target is missing or not a directory
if not os.path.exists(folder_path) or not os.path.isdir(folder_path):
    print("The folder does not exist or is not a directory.")
else:
    try:
        # Map every directory that directly holds files to the full paths of
        # those files; directories without files never get an entry.
        unique_paths = defaultdict(list)
        for current_dir, _subdirs, names in os.walk(folder_path):
            if names:
                unique_paths[current_dir].extend(
                    os.path.join(current_dir, name) for name in names
                )

        # Overall number of files across all directories
        total_file_count = sum(map(len, unique_paths.values()))
        print(f"Total files count: {total_file_count}")

        # Per-directory report, labelled relative to folder_path
        print("\nFile summary for each unique path:")
        for directory, paths_in_dir in unique_paths.items():
            label = os.path.relpath(directory, folder_path)
            print(f"{label} (Number of files: {len(paths_in_dir)})")

            # Tally files by extension; files without one are counted under ''
            extension_count = defaultdict(int)
            for member in paths_in_dir:
                extension_count[os.path.splitext(member)[1]] += 1

            for suffix, tally in extension_count.items():
                print(f"    Extension '{suffix}' count: {tally}")

    except Exception as exc:
        print(f"An error occurred: {exc}")


Total files count: 96

File summary for each unique path:
PDF AUTOMATION (Number of files: 1)
    Extension '' count: 1
PDF AUTOMATION/KPP (Number of files: 1)
    Extension '.zip' count: 1
PDF AUTOMATION/Sri bhavani plastics (Number of files: 8)
    Extension '.pdf' count: 8
PDF AUTOMATION/Opulent Duende (Number of files: 14)
    Extension '.pdf' count: 14
PDF AUTOMATION/JARVICONS PRIVATE LIMITED (Number of files: 1)
    Extension '.zip' count: 1
PDF AUTOMATION/Venture commerce (Number of files: 1)
    Extension '.zip' count: 1
PDF AUTOMATION/SRI HARI ENTERPRISE (Number of files: 1)
    Extension '.pdf' count: 1
PDF AUTOMATION/Campos Technologies (Number of files: 8)
    Extension '.pdf' count: 8
PDF AUTOMATION/Agni Integrated farm (Number of files: 6)
    Extension '.pdf' count: 6
PDF AUTOMATION/XCELLENT XEROX AND ONLINE SERVICES (Number of files: 1)
    Extension '.pdf' count: 1
PDF AUTOMATION/100 CUBES (Number of files: 11)
    Extension '.pdf' count: 11
Jan to Mar (Number of files

In [5]:
import pdfplumber
import pandas as pd
import os
import re

# Compiled once at definition time. Maps each output key to the regex whose
# first capture group holds that field's value.
_INVOICE_FIELD_PATTERNS = {
    'Invoice Number': re.compile(r'Invoice Number:\s*(\S+)'),
    'Date': re.compile(r'Date:\s*(\S+)'),
    # Accept comma-grouped amounts ("1,234.56", "1,23,456.00"). The previous
    # pattern stopped at the first comma and kept only the leading digits.
    'Total Amount': re.compile(r'Total Amount:\s*\$?(\d+(?:,\d+)*(?:\.\d+)?)'),
    'Vendor': re.compile(r'Vendor:\s*(.*)'),
}


def extract_invoice_data(pdf_path):
    """Extract labelled invoice fields from the text of a PDF.

    Every page is scanned for the labels "Invoice Number:", "Date:",
    "Total Amount:" and "Vendor:". Pages that yield no text are skipped.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        dict: Contains only the fields that were found, under the keys
        'Invoice Number', 'Date', 'Total Amount' and 'Vendor'. All values are
        strings. 'Total Amount' has its thousands separators removed
        ("1,234.56" -> "1234.56") and 'Vendor' has surrounding whitespace
        stripped. When a label occurs on several pages, the value from the
        last such page wins.

    Raises:
        Whatever ``pdfplumber.open`` or ``page.extract_text`` raise, for
        example on an unreadable file; nothing is caught here.
    """
    invoice_data = {}

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page (None or empty string).
                continue

            for field, pattern in _INVOICE_FIELD_PATTERNS.items():
                match = pattern.search(text)
                if match is None:
                    continue

                value = match.group(1).strip()
                if field == 'Total Amount':
                    # Drop grouping commas so the value converts to a number.
                    value = value.replace(',', '')
                # Assignment (not setdefault): later pages overwrite earlier ones.
                invoice_data[field] = value

    return invoice_data

def process_invoices(folder_path):
    """Run ``extract_invoice_data`` on every PDF under a folder.

    The folder is walked recursively. A file counts as a PDF when its name
    ends in ".pdf" in any letter case, so "SCAN.PDF" is included.
    Directories and files are visited in sorted order so the resulting row
    order does not depend on the filesystem.

    Args:
        folder_path: Root directory to search.

    Returns:
        pandas.DataFrame: One row per PDF, holding whatever fields
        ``extract_invoice_data`` returned plus 'File Name' (base name) and
        'File Path' (root joined with the name). Empty when no PDF is found.
    """
    all_invoices = []

    for root, dirs, files in os.walk(folder_path):
        # In-place sort: os.walk (top-down) descends into `dirs` in this order.
        dirs.sort()
        for filename in sorted(files):
            # Case-insensitive match; a plain endswith('.pdf') skips "X.PDF".
            if not filename.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(root, filename)
            invoice_data = extract_invoice_data(pdf_path)
            invoice_data['File Name'] = filename  # base name only
            invoice_data['File Path'] = pdf_path  # path including root
            all_invoices.append(invoice_data)

    # Build the frame once from the collected records.
    return pd.DataFrame(all_invoices)

# Root directory that is searched recursively for invoice PDFs
folder_path = "./test data/"

# One row per PDF found under folder_path
invoice_dataframe = process_invoices(folder_path=folder_path)

# Plain-text dump of the extracted fields
print(invoice_dataframe)


Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20231228->pdfplumber)
  Using cached cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0meta [36m0