In [1]:
import fitz # PyMuPDF
import pandas as pd
import re
import os
from datetime import datetime

In [2]:
# Load all the files needed
# Folder with the PDFs
pdf_folder = r'G:\ACCOUNT PAYABLE\001 - INVOICES - FATTURE\LOGISTIC INVOICES\1- NEW\MSC invoices received from 06-01-2025'
# Full path of every PDF directly inside pdf_folder.
# The extension is compared case-insensitively so names such as "INVOICE.PDF"
# are not silently skipped, and the list is sorted because os.listdir returns
# entries in arbitrary order (keeps the output row order stable between runs).
pdf_files = sorted(
    os.path.join(pdf_folder, f)
    for f in os.listdir(pdf_folder)
    if f.lower().endswith(".pdf")
)

# Statement of account (SOA) workbook read by the cells below.
# The file must first be converted from "97-2003 Worksheet" to a regular .xlsx,
# and this path must be updated to point at the most recent statement.
soa_file = r"g:\ACCOUNT PAYABLE\001 - INVOICES - FATTURE\LOGISTIC INVOICES\1- NEW\MSC invoices received from 06-01-2025\SOA for PRODOTTI MEDITERRANEI INC- 7-23-2025_1.xlsx"

# Bruzzone consolidated-invoices workbook (source of PO and Customer per BOL).
bruzzone_file = r"c:\Users\ACarrion\OneDrive - F.lli De Cecco di Filippo Fara San Martino S.p.A\Documents\Logistics Invoicing\Bruzzone\2025-YTD-Consolidated-Invoices.xlsx"

# MSC invoice tracker workbook (source of Invoice Status and Notes).
msc_invoice_tracker = r"g:\ACCOUNT PAYABLE\001 - INVOICES - FATTURE\LOGISTIC INVOICES\1- NEW\MSC invoices received from 06-01-2025\MSC Invoice Tracker.xlsx"

In [3]:
# Bill-of-lading numbers: a whole word that starts with "MEDU" followed by one
# or more word characters.
bol_pattern = re.compile(r'\bMEDU\w+\b')

# Keyword found in the invoice text -> invoice type label.
# Order matters: the first keyword that is found decides the type.
type_keywords = dict([
    ("Rail Detention", 'Rail'),
    ("IMPORT CHASSIS PER DIEM", "Chassis Per Diem"),
    ("PER DIEM IMPORT", "Per Diem"),
    ("FREIGHT INVOICE", "Chassis/Freight"),
])

In [4]:
# One record per PDF: file name, invoice number, first BOL found and invoice type.
invoice_data_minimal = []

for filepath in pdf_files:
    # Concatenate the text of every page of the PDF.
    with fitz.open(filepath) as doc:
        text = ''.join(page.get_text() for page in doc)

    # The invoice number is the file name with its extension removed.
    filename = os.path.basename(filepath)
    invoice_number, _extension = os.path.splitext(filename)

    # First bill-of-lading match in the text, or an empty string when there is none.
    first_bol = bol_pattern.search(text)
    bol = first_bol.group(0) if first_bol else ""

    # The first keyword (in dict order) that appears in the text, compared
    # case-insensitively, decides the type; "Unknown" when none appears.
    lowered_text = text.lower()
    invoice_type = next(
        (label for keyword, label in type_keywords.items() if keyword.lower() in lowered_text),
        "Unknown",
    )

    invoice_data_minimal.append({
        "Filename": filename,
        "Invoice Number": invoice_number,
        "BOL Number": bol,
        "Invoice Type": invoice_type
    })

# Phase 1 output: one row per PDF in the folder.
df_minimal = pd.DataFrame(invoice_data_minimal)

# Timestamped workbook name, placed in the same folder as the PDFs.
timestamp = datetime.now().strftime('%Y%m%d_%H%M')
file_name = f'LIST_OPEN_INVOICES_{timestamp}.xlsx'
save_path = os.path.join(pdf_folder, file_name)

# The workbook itself is written in the next cell, not here.

In [5]:
# Write the list of invoices found in the folder to save_path (no index column),
# then show (rows, columns) as the cell output.
df_minimal.to_excel(excel_writer=save_path, index=False)
df_minimal.shape

(144, 4)

In [6]:

# Load the "Ocean Freight" sheet of the statement.
# The sheet's BOL column is used as the invoice number, and a BOL column with
# the same values is added back so the frame has both keys.
ocean_freight_columns = [
    'Invoice Number', 'BOL', 'Invoice Date', 'Due Date', 'Manifested Amt',
    'Paid', 'Outstanding amt#', 'Containers', 'Disputed?', 'Dispute Status',
]
df_soa_oceanFreight = (
    pd.read_excel(soa_file, sheet_name='Ocean Freight')
    .rename(columns={'BOL': 'Invoice Number'})
    .assign(BOL=lambda sheet: sheet['Invoice Number'])
    [ocean_freight_columns]  # keep relevant columns only, in this order
)

In [7]:
# Load the "Non-OFT" sheet of the statement and rename its "BOLs" column to
# "BOL" so it lines up with the Ocean Freight frame.
non_oft_columns = [
    'Invoice Number', 'Invoice Date', 'Manifested Amt', 'Paid',
    'Outstanding amt#', 'BOL', 'Containers', 'Disputed?', 'Dispute Status',
]
non_oft_sheet = pd.read_excel(soa_file, sheet_name='Non-OFT')
# Keep relevant columns only, in this order.
df_soa_non_oft = non_oft_sheet.rename(columns={'BOLs': 'BOL'})[non_oft_columns]


In [8]:
# Combine both sheets into one statement frame (rows stacked, union of columns).
# ignore_index=True gives the result a fresh 0..n-1 index; without it each sheet
# keeps its own 0-based labels and the combined frame has duplicate row labels.
soa_combined = pd.concat(
    [df_soa_oceanFreight, df_soa_non_oft],
    axis=0,
    join='outer',
    ignore_index=True,
)

In [9]:
# Build the BOL -> (PO, Customer) lookup from the Bruzzone workbook.
df_bruzzone = (
    pd.read_excel(bruzzone_file)
    .rename(columns={
        'Customer Ref # 1': 'PO',
        'BILL OF LADING #': 'BOL',
        'DELIVER TO NAME': 'Customer',
    })
    # Keep relevant columns
    [['BOL', 'PO', 'Customer']]
    # A row without a BOL cannot serve as a lookup entry, and pandas merge
    # matches null keys to each other, so it would attach an arbitrary
    # PO/Customer to statement rows whose BOL is missing.
    .dropna(subset=['BOL'])
    # One row per BOL (first occurrence wins) so the left merge in the next
    # cell cannot multiply statement rows.
    .drop_duplicates(subset=['BOL'])
)
df_bruzzone

Unnamed: 0,BOL,PO,Customer
0,MEDUGD708889,4500195790,DAIRYLAND BEL CANTO WAREHOUSE
1,MEDUGD683371,4500200541,SADDLE CREEK LOGISTICS SVCS
2,MEDUGD683926,4500200436,UNFI MORENO VALLEY
3,MEDUGD734729,4500200323,UNFI RICHURG
4,MEDUGD735015,4500200831,INTERNATIONAL GOURMET FOOD INC
...,...,...,...
5623,MEDUGK238293,4500215138,KEHE FOOD DISTRIBUTORS INC
5624,MEDUGK244549,4500215131,PERFORMANCE FOOD SERVICE INC
5625,MEDUGK243434,4500214779,BAUGH NORTHEAST CO-OP INC
5626,MEDUGK249043,4500215243,GRECO & SONS INC


In [10]:
# Add PO and Customer to every statement line by looking up its BOL;
# statement lines with no match are kept (left join).
df_customer = soa_combined.merge(df_bruzzone, how='left', on='BOL')
df_customer.columns

Index(['Invoice Number', 'BOL', 'Invoice Date', 'Due Date', 'Manifested Amt',
       'Paid', 'Outstanding amt#', 'Containers', 'Disputed?', 'Dispute Status',
       'PO', 'Customer'],
      dtype='object')

In [11]:
# Check if we have the PDFs for the invoices in the statement
minimal_invoices = set(df_minimal['Invoice Number'])


def _on_file_flag(invoice_number):
    """Return 'Yes' when a PDF with this invoice number was found, else 'No'."""
    if invoice_number in minimal_invoices:
        return 'Yes'
    return 'No'


df_customer['Invoice on File'] = df_customer['Invoice Number'].map(_on_file_flag)

In [12]:
# Check if invoices in the folder are NOT in the statement.
# Left join from the PDF list; the _merge column reads "left_only" for a PDF
# whose invoice number does not appear in the statement and "both" otherwise.
statement_keys = df_customer[['Invoice Number', 'BOL']]
invoices_check = df_minimal.merge(
    statement_keys,
    how='left',
    on='Invoice Number',
    indicator=True,
)
invoices_check

Unnamed: 0,Filename,Invoice Number,BOL Number,Invoice Type,BOL,_merge
0,100000393087R.pdf,100000393087R,MEDUGD729539,Rail,MEDUGD729539,both
1,100000400964R.pdf,100000400964R,MEDUFK205765,Rail,MEDUFK205765,both
2,100000401437R.pdf,100000401437R,MEDUFK440420,Rail,MEDUFK440420,both
3,100000404538R.pdf,100000404538R,MEDUFK205633,Rail,MEDUFK205633,both
4,100000404846R.pdf,100000404846R,MEDUFK600460,Rail,MEDUFK600460,both
...,...,...,...,...,...,...
139,MEDUGK144244.pdf,MEDUGK144244,MEDUGK144244,Chassis/Freight,MEDUGK144244,both
140,MEDUGK181899.pdf,MEDUGK181899,MEDUGK181899,Chassis/Freight,,left_only
141,MEDUGK220176.pdf,MEDUGK220176,MEDUGK220176,Chassis/Freight,MEDUGK220176,both
142,MEDUGK243434.pdf,MEDUGK243434,MEDUGK243434,Chassis/Freight,,left_only


In [13]:
# Load the "MSC Open Invoices" sheet of the tracker workbook and list its columns.
df_invoice_tracker = pd.read_excel(
    msc_invoice_tracker,
    sheet_name='MSC Open Invoices',
)
df_invoice_tracker.columns

Index(['Invoice Date', 'Invoice Number', 'BOL', 'Container Number', 'Service',
       'Amount ', 'Invoice Status', 'Notes', 'Customer', 'PO'],
      dtype='object')

In [14]:
# Attach the tracker's status and notes to every statement line (left join on
# invoice number), then export the result.
tracker_columns = ['Invoice Number', 'Invoice Status', 'Notes']
final_merge = df_customer.merge(
    df_invoice_tracker[tracker_columns],
    how='left',
    on='Invoice Number',
)
# The file name has no directory part, so the workbook is written to the
# current working directory.
final_merge.to_excel(f'SOA_CHECK_{timestamp}.xlsx', index=False)