In [1]:
import os
from collections import defaultdict

# Path to the folder
folder_path = "C:/Users/Tiger/Desktop/personal/vakilsearch/Test_Data-Zolvit"

# Check if the folder exists
if os.path.exists(folder_path) and os.path.isdir(folder_path):
    try:
        # Initialize a dictionary to store unique paths and their files
        unique_paths = defaultdict(list)

        # Walk through the directory
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                unique_paths[root].append(file_path)

        # Count total files
        total_file_count = sum(len(files) for files in unique_paths.values())

        # Print the total number of files
        print(f"Total files count: {total_file_count}")

        # Print the count of all files in each unique path
        print("\nFile summary for each unique path:")
        for path, files in unique_paths.items():
            file_count = len(files)
            # Get the relative path
            relative_path = os.path.relpath(path, folder_path)
            print(f"{relative_path} (Number of files: {file_count})")

            # Optional: Count file extensions
            extension_count = defaultdict(int)
            for file in files:
                ext = os.path.splitext(file)[1]  # Get the file extension
                extension_count[ext] += 1
            
            # Print extension summary
            for ext, count in extension_count.items():
                print(f"    Extension '{ext}' count: {count}")

    except Exception as e:
        print(f"An error occurred: {e}")
else:
    print("The folder does not exist or is not a directory.")


Total files count: 97

File summary for each unique path:
. (Number of files: 1)
    Extension '.ipynb' count: 1
test data\Jan to Mar (Number of files: 43)
    Extension '' count: 1
    Extension '.pdf' count: 41
    Extension '.xlsx' count: 1
test data\PDF AUTOMATION (Number of files: 1)
    Extension '' count: 1
test data\PDF AUTOMATION\100 CUBES (Number of files: 11)
    Extension '.pdf' count: 11
test data\PDF AUTOMATION\Agni Integrated farm (Number of files: 6)
    Extension '.pdf' count: 6
test data\PDF AUTOMATION\Campos Technologies (Number of files: 8)
    Extension '.pdf' count: 8
test data\PDF AUTOMATION\JARVICONS PRIVATE LIMITED (Number of files: 1)
    Extension '.zip' count: 1
test data\PDF AUTOMATION\KPP (Number of files: 1)
    Extension '.zip' count: 1
test data\PDF AUTOMATION\Opulent Duende (Number of files: 14)
    Extension '.pdf' count: 14
test data\PDF AUTOMATION\Sri bhavani plastics (Number of files: 8)
    Extension '.pdf' count: 8
test data\PDF AUTOMATION\SRI HA

In [8]:
import pdfplumber
import pandas as pd
import os
import re

def extract_invoice_data(pdf_path):
    invoice_data = {}
    
    with pdfplumber.open(pdf_path) as pdf:
        # Iterate through the pages
        for page in pdf.pages:
            text = page.extract_text()

            if text:
                # Use regex or specific text patterns to extract required information
                invoice_number = re.search(r'Invoice Number:\s*(\S+)', text)
                invoice_date = re.search(r'Date:\s*(\S+)', text)
                total_amount = re.search(r'Total Amount:\s*\$?(\d+\.?\d*)', text)
                vendor_name = re.search(r'Vendor:\s*(.*)', text)

                # Save extracted data if found
                if invoice_number:
                    invoice_data['Invoice Number'] = invoice_number.group(1)
                if invoice_date:
                    invoice_data['Date'] = invoice_date.group(1)
                if total_amount:
                    invoice_data['Total Amount'] = total_amount.group(1)
                if vendor_name:
                    invoice_data['Vendor'] = vendor_name.group(1)

    return invoice_data

def process_invoices(folder_path):
    all_invoices = []

    # Walk through all directories and files
    for root, _, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.pdf'):
                pdf_path = os.path.join(root, filename)
                invoice_data = extract_invoice_data(pdf_path)
                invoice_data['File Name'] = filename  # Include the file name
                invoice_data['File Path'] = pdf_path  # Include the full file path
                all_invoices.append(invoice_data)

    # Convert the list of dictionaries to a DataFrame
    return pd.DataFrame(all_invoices)

# Set your folder path
folder_path = "C:/Users/Tiger/Desktop/personal/vakilsearch/Test_Data-Zolvit"

# Process the invoices and get the DataFrame
invoice_dataframe = process_invoices(folder_path)

# Display the DataFrame
print(invoice_dataframe)


          Date                       File Name  \
0           24      INV-100_Agrani Kandele.pdf   
1           24   INV-101_Abhikaran Jalonha.pdf   
2           24      INV-102_Kasturi Kalwar.pdf   
3           27  INV-103_Jaiprakash Kumawat.pdf   
4           27       INV-104_Joseph Wincet.pdf   
..         ...                             ...   
85         NaN                     INV-807.pdf   
86         NaN                     INV-808.pdf   
87         NaN                     INV-809.pdf   
88         NaN           EDISON INVOICE 39.pdf   
89  31-07-2024           GST Sales July 24.pdf   

                                            File Path  
0   C:/Users/Tiger/Desktop/personal/vakilsearch/Te...  
1   C:/Users/Tiger/Desktop/personal/vakilsearch/Te...  
2   C:/Users/Tiger/Desktop/personal/vakilsearch/Te...  
3   C:/Users/Tiger/Desktop/personal/vakilsearch/Te...  
4   C:/Users/Tiger/Desktop/personal/vakilsearch/Te...  
..                                                ...  
85  C:/