In [12]:
import os
import zipfile
import pandas as pd
from datetime import datetime
from pathlib import Path

In [25]:
class InvoiceProcessor:
    def __init__(self, input_zip, output_zip):
        self.input_zip = input_zip
        self.output_zip = output_zip
        self.extract_dir = "extracted_files"
        self.csv_dir = os.path.join(self.extract_dir, "csv_files")
        self.pdf_dir = os.path.join(self.extract_dir, "pdf_files")
        Path(self.extract_dir).mkdir(exist_ok=True)
        Path(self.csv_dir).mkdir(exist_ok=True)
        Path(self.pdf_dir).mkdir(exist_ok=True)
        self.records = []

    # extract zip file and create separate dir for csv and invoice
    def extract_zip(self): 
        with zipfile.ZipFile(self.input_zip, 'r') as zip_ref:
            for file_info in zip_ref.infolist():
                if file_info.filename.endswith('.xlsx') or file_info.filename.endswith('.xls'):
                    zip_ref.extract(file_info, self.csv_dir)
                elif file_info.filename.endswith('.pdf'):
                    zip_ref.extract(file_info, self.pdf_dir)
        print(f"Extracted '{self.input_zip}' to '{self.extract_dir}'")

    # search info in excel file
    def parse_bank_statements(self):
        for root, _, files in os.walk(self.csv_dir):
            for file in files:
                if file.endswith('.xlsx') or file.endswith('.xls'):
                    excel_file = os.path.join(root, file)
                    df = pd.read_excel(excel_file)
                    
                    required_columns = ['Posting Date', 'Merchant Category', 'Amount']
                    if not all(col in df.columns for col in required_columns):
                        raise ValueError(f"Excel file must contain the following columns: {', '.join(required_columns)}")

                    for index, row in df.iterrows():
                        posting_date = row['Posting Date']
                        merchant = row['Merchant Category']
                        amount = row['Amount']
                        
                        if isinstance(posting_date, datetime):
                            posting_date_str = posting_date.strftime('%Y-%m-%d')
                        else:
                            posting_date_str = str(posting_date)

                        self.records.append({
                            'date': posting_date_str,
                            'merchant': merchant,
                            'amount': amount,
                            'file_name': f"{posting_date_str}_{merchant}_${amount}".replace('/', '-')
                        })

    def rename_invoices(self):
        for root, _, files in os.walk(self.pdf_dir):
            for file in files:
                for record in self.records:
                    if record['amount'] in file:
                        old_path = os.path.join(root, file)
                        file_extension = os.path.splitext(file)[1]
                        new_path = os.path.join(root, record['file_name'] + file_extension)
                        os.rename(old_path, new_path)
                        print(f"Renamed '{old_path}' to '{new_path}'")

    def create_zip(self):
        with zipfile.ZipFile(self.output_zip, 'w') as zip_ref:
            for root, _, files in os.walk(self.invoice_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, self.invoice_dir)
                    zip_ref.write(file_path, arcname)
        print(f"Created '{self.output_zip}' from '{self.invoice_dir}'")

In [26]:
if __name__ == "__main__":
    input_zip = "test.zip"  # Replace with the actual input zip file path
    output_zip = "test_output.zip"  # Replace with the desired output zip file path

    # Create an instance of FileProcessor
    processor = InvoiceProcessor(input_zip, output_zip)

    # Extract the zip file
    processor.extract_zip()

    # Parse the CSV files to get renaming information
    processor.parse_bank_statements()

    # Rename the invoices based on the parsed CSV data
    processor.rename_invoices()

    # Create a new zip file with the renamed invoices
    # processor.create_zip()


Extracted 'test.zip' to 'extracted_files'


TypeError: 'in <string>' requires string as left operand, not float