In [1]:
import fitz # PyMuPDF
import pandas as pd
import re
import os
from datetime import datetime

In [2]:
# --- Input locations ---------------------------------------------------------
# Root folder holding the invoice PDFs (sub-folders are searched as well)
root_pdf_dir = r'G:\ACCOUNT PAYABLE\001 - INVOICES - FATTURE\LOGISTIC INVOICES\1- NEW\MSC invoices received from 06-01-2025'

# Full path of every file under the root whose name ends in ".pdf"
# (the extension is compared case-insensitively)
pdf_files = [
    os.path.join(dirpath, filename)
    for dirpath, dirnames, filenames in os.walk(root_pdf_dir)
    for filename in filenames
    if filename.lower().endswith('.pdf')
]

# Most recent SOA workbook.
# It NEEDS TO BE CONVERTED from "97-2003 Worksheet" to a regular workbook first,
# and this file name has to be updated whenever a newer statement arrives.
soa_file = r'g:\ACCOUNT PAYABLE\001 - INVOICES - FATTURE\LOGISTIC INVOICES\1- NEW\MSC invoices received from 06-01-2025\SOA for PRODOTTI MEDITERRANEI INC- 7-23-2025_1.xlsx'

# MSC invoice tracker workbook
msc_invoice_tracker = r'g:\ACCOUNT PAYABLE\001 - INVOICES - FATTURE\LOGISTIC INVOICES\1- NEW\MSC invoices received from 06-01-2025\MSC Invoice Tracker.xlsx'

In [3]:
# Bill of lading numbers: a whole word that starts with "MEDU" and is followed
# by at least one more word character
bol_pattern = re.compile(r'\bMEDU\w+\b')

# Phrase looked for in the PDF text -> invoice type label.
# Entry order is kept as written; the classification loop stops at the first
# phrase it finds.
type_keywords = dict([
    ("Rail Detention", 'Rail'),
    ("IMPORT CHASSIS PER DIEM", "Chassis Per Diem"),
    ("PER DIEM IMPORT", "Per Diem"),
    ("FREIGHT INVOICE", "Chassis/Freight"),
])

In [4]:
# Phase 1: one record per PDF -> folder, file name, invoice number,
# first BOL found in the text, and invoice type.
# The column list is fixed up front so that df_minimal has these columns even
# when no PDF was found (an empty record list would otherwise give a frame
# without columns and later lookups such as df_minimal['Invoice Number'] fail).
minimal_columns = ['Folder', 'Filename', 'Invoice Number', 'BOL Number', 'Invoice Type']
invoice_data_minimal = []

for filepath in pdf_files:
    # Identify the folder where the PDF resides
    folder = os.path.basename(os.path.dirname(filepath))

    # Invoice number = file name without its extension
    filename = os.path.basename(filepath)
    invoice_number = os.path.splitext(filename)[0]

    # Text of all pages, used for BOL extraction and type classification.
    # The document is only kept open while its text is being read.
    with fitz.open(filepath) as doc:
        text = ''.join(page.get_text() for page in doc)

    # First BOL number in the text, or "" when the text contains none
    bol_match = bol_pattern.search(text)
    bol = bol_match.group(0) if bol_match else ""

    # Classify type from keywords: case-insensitive, first keyword found wins.
    # The text is lower-cased once instead of once per keyword.
    text_lower = text.lower()
    invoice_type = "Unknown"
    for keyword, label in type_keywords.items():
        if keyword.lower() in text_lower:
            invoice_type = label
            break

    invoice_data_minimal.append({
        'Folder': folder,
        "Filename": filename,
        "Invoice Number": invoice_number,
        "BOL Number": bol,
        "Invoice Type": invoice_type
    })

# Create dataframe for phase 1 output
df_minimal = pd.DataFrame(invoice_data_minimal, columns=minimal_columns)

# The minute-resolution timestamp is reused for later output file names
timestamp  = datetime.now().strftime('%Y%m%d_%H%M')
file_name = f'LIST_OPEN_INVOICES_{timestamp}.xlsx'
# NOTE(review): the path is relative, so the workbook is written to the current
# working directory; it only lands in the "MSC invoices received from
# 06-01-2025" folder if the notebook is run from there.
df_minimal.to_excel(file_name, index=False)

In [5]:

# Sanity check: (number of rows, number of columns) of the phase 1 table
df_minimal.shape

(151, 5)

In [6]:

# SOA sheet "Ocean Freight".
# The sheet's 'BOL' column is renamed to 'Invoice Number' and then copied back
# under 'BOL', so the same value is available under both names.
ocean_freight_columns = [
    'Invoice Number', 'BOL', 'Invoice Date', 'Due Date', 'Manifested Amt',
    'Paid', 'Outstanding amt#', 'Containers', 'Disputed?', 'Dispute Status',
]
df_soa_oceanFreight = (
    pd.read_excel(soa_file, sheet_name='Ocean Freight')
    .rename(columns={'BOL': 'Invoice Number'})
    .assign(BOL=lambda sheet: sheet['Invoice Number'])
    .loc[:, ocean_freight_columns]  # keep only the relevant columns
)

In [7]:
# SOA sheet "Non-OFT": its 'BOLs' column is renamed to 'BOL' so that it lines up
# with the Ocean Freight sheet, then only the relevant columns are kept.
non_oft_columns = [
    'Invoice Number', 'Invoice Date', 'Manifested Amt', 'Paid',
    'Outstanding amt#', 'BOL', 'Containers', 'Disputed?', 'Dispute Status',
]
df_soa_non_oft = (
    pd.read_excel(soa_file, sheet_name='Non-OFT')
    .rename(columns={'BOLs': 'BOL'})
    .loc[:, non_oft_columns]
)


In [8]:
# Stack the Ocean Freight rows on top of the Non-OFT rows (row-wise, outer join
# on columns - both are the pd.concat defaults). A column present in only one
# sheet, such as 'Due Date', is kept and left empty for the other sheet's rows.
soa_combined = pd.concat((df_soa_oceanFreight, df_soa_non_oft))

In [9]:
# Phase 3: Read multiple Bruzzone Files (YTD and last year) and build a single
# BOL -> PO / customer lookup table.
bruzzone_files = [
    r'c:\Users\ACarrion\OneDrive - F.lli De Cecco di Filippo Fara San Martino S.p.A\Documents\Logistics Invoicing\Bruzzone\2025-YTD-Consolidated-Invoices.xlsx',
    r'c:\Users\ACarrion\OneDrive - F.lli De Cecco di Filippo Fara San Martino S.p.A\Documents\Logistics Invoicing\Bruzzone\2024\2024-Consolidated-Invoices.xlsx'
]

# Column name in the Bruzzone workbooks -> uniform name used in this notebook
bruzzone_renames = {
    'BILL OF LADING #': 'BOL',
    'DELIVER TO NAME': 'Customer from Bruzzone File',
    'Customer Ref # 1': 'PO'
}
needed_cols = ['BILL OF LADING #', 'DELIVER TO NAME', 'Customer Ref # 1']
# Columns every per-file frame must end up with, in output order
bruzzone_output_cols = ['BOL', 'PO', 'Customer from Bruzzone File']

bruzzone_dfs = []
for bf in bruzzone_files:
    df_temp = pd.read_excel(bf)
    # Keep only the needed columns if they exist
    available = [c for c in needed_cols if c in df_temp.columns]
    # Rename to uniform names (rename returns a new frame, nothing is modified in place)
    df_temp = df_temp[available].rename(columns=bruzzone_renames)

    # Ensure all three output columns exist, filling the missing ones with NA.
    # The check uses the renamed customer column ('Customer from Bruzzone File');
    # checking plain 'Customer' would add an unused column and make the
    # selection below fail when 'DELIVER TO NAME' is absent from a file.
    for col in bruzzone_output_cols:
        if col not in df_temp.columns:
            df_temp[col] = pd.NA
    df_temp = df_temp[bruzzone_output_cols]
    bruzzone_dfs.append(df_temp)

df_bruzzone = (
    pd.concat(bruzzone_dfs, ignore_index=True)
    # Rows without a BOL cannot identify a shipment; pandas matches null keys
    # to each other in a merge, so keeping them could attach an arbitrary
    # PO/customer to statement rows whose BOL is blank.
    .dropna(subset=['BOL'])
    # One row per BOL (first occurrence wins, i.e. the earlier file in the list)
    .drop_duplicates(subset=['BOL'])
)

In [10]:
# Enrich every statement line with the PO and customer found for its BOL in the
# Bruzzone files; statement lines without a match are kept (left join).
df_customer = soa_combined.merge(df_bruzzone, how='left', on='BOL')

In [11]:
# Locate the SAP extracts in the current working directory by file-name prefix.
# When several entries share a prefix, the last one in os.listdir() order is
# used; when none does, the name stays an empty string.
cwd_entries = os.listdir()
pipeline_candidates = [entry for entry in cwd_entries if entry.startswith('SAP_Pipeline')]
transportation_candidates = [entry for entry in cwd_entries if entry.startswith('SAP_Transportation')]

pipeline_file = pipeline_candidates[-1] if pipeline_candidates else ''
transportation_file = transportation_candidates[-1] if transportation_candidates else ''

print(pipeline_file)
print(transportation_file)

SAP_Pipeline_8.4.xlsx
SAP_Transportation_8.4.xlsx


In [12]:
# SAP pipeline extract: purchase order -> sales document and sold-to customer
pipe_renames = {'Purchasing Document': 'PO', 'Sold-to Name': 'Customer Name in SAP'}
df_pipe = (
    pd.read_excel(pipeline_file)
    .loc[:, ['Purchasing Document', 'Sales Document', 'Sold-to Name']]
    .rename(columns=pipe_renames)
)
# Where no sales document is recorded, the PO number is used in its place
df_pipe['Sales Document'] = df_pipe['Sales Document'].fillna(df_pipe['PO'])

# Attach the SAP columns to the statement lines by PO (left join).
# NOTE(review): if the pipeline file lists a PO more than once, this merge
# repeats the matching statement rows - confirm PO is unique in that file.
df_customer = df_customer.merge(df_pipe, how='left', on='PO')

  for idx, row in parser.parse():


In [14]:
# SAP transportation extract: sales document -> service agent
df_transportation = pd.read_excel(transportation_file).loc[
    :, ['Sales Document', 'Service agent description']
]

# Attach the service agent to each statement line by sales document (left join)
# and expose it under the name 'Carrier'
df_customer = (
    df_customer
    .merge(df_transportation, how='left', on='Sales Document')
    .rename(columns={'Service agent description': 'Carrier'})
)
df_customer

Unnamed: 0,Invoice Number,BOL,Invoice Date,Due Date,Manifested Amt,Paid,Outstanding amt#,Containers,Disputed?,Dispute Status,PO,Customer from Bruzzone File,Sales Document,Customer Name in SAP,Carrier
0,MEDUGD857751,MEDUGD857751,2025-01-17 00:00:00,2025-01-17,350.0,0.0,350.0,CAAU5579511,False,,4500199830,DAIRYLAND,2.971580e+05,DAIRYLAND USA CORP. East,
1,MEDUGD845764,MEDUGD845764,2025-01-20 00:00:00,2025-01-20,700.0,0.0,700.0,MSNU7467194; MSNU9627178,False,,4500201306,HUB GROUP CASESTACK,4.500201e+09,PMI - Casestack Plant,
2,MEDUGD915278,MEDUGD915278,2025-01-22 00:00:00,2025-01-22,350.0,0.0,350.0,CAAU7810430; GAOU7373877,False,,4500201383,PRODOTTI MEDITERRANEI INC,2.994730e+05,Restaurant Depot Enterprices,
3,MEDUGD970489,MEDUGD970489,2025-01-29 00:00:00,2025-01-29,700.0,0.0,700.0,BEAU5709125; TRHU5118612,False,,4500201439,EUROPEAN IMP C/O CENTRAL WHSE,2.994860e+05,EUROPEAN IMPORTS LTD,Contra America Corp
4,MEDUGD845368,MEDUGD845368,2025-02-06 00:00:00,2025-02-06,350.0,0.0,350.0,MSNU7887432; TIIU4840631,False,,4500201226,UNFI RICHURG,2.987670e+05,"UNITED NATURAL FOODS, INC. / N",Contra America Corp
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,100000413561R,MEDUGK082691,2025-07-02 00:00:00,NaT,185.0,0.0,185.0,GAOU7418020,False,,4500214002,KEHE FOOD DISTRIBUTORS INC,3.149310e+05,KEHE DISTRIBUTORS LLC.,Contra America Corp
129,100003301211P,MEDUFK982082,2025-07-02 00:00:00,NaT,250.0,0.0,250.0,FFAU2383869,False,,4500210686,MELROSE PARK GROCERY,3.094890e+05,JEWEL-OSCO,Contra America Corp
130,100003304506P,MEDUFK816884,2025-07-07 00:00:00,NaT,1185.0,0.0,1185.0,MSNU7086080,False,,4500210963,BAUGH NORTHEAST CO-OP INC,3.091640e+05,Sysco Merchandising and Supply,Contra America Corp
131,100000413782R,MEDUFK982017,2025-07-08 00:00:00,NaT,765.0,0.0,765.0,TIIU4795770,False,,4500211360,EUROPEAN IMPORTS INC,3.106050e+05,EUROPEAN IMPORTS LTD,MAREXPORT SRL


In [15]:
# Flag each statement line with 'Yes' when a PDF carrying the same invoice
# number (i.e. the PDF file name without extension) was found, otherwise 'No'
minimal_invoices = set(df_minimal['Invoice Number'])
df_customer['Invoice on File'] = [
    'Yes' if invoice_number in minimal_invoices else 'No'
    for invoice_number in df_customer['Invoice Number']
]

In [16]:
# Reverse check: which PDFs in the folder are NOT on the statement?
# With indicator=True the added '_merge' column reads 'left_only' for PDFs whose
# invoice number has no counterpart in the statement, and 'both' otherwise.
invoices_check = df_minimal.merge(
    df_customer[['Invoice Number', 'BOL']],
    how='left',
    on='Invoice Number',
    indicator=True,
)
invoices_check.to_excel('Invoices_Check.xlsx', index=False)

In [17]:
# Load the 'MSC Open Invoices' sheet of the MSC invoice tracker workbook
df_invoice_tracker = pd.read_excel(
    msc_invoice_tracker,
    sheet_name='MSC Open Invoices',
)
# To list the available columns: df_invoice_tracker.columns

Index(['Invoice Date', 'Invoice Number', 'BOL', 'Container Number', 'Service',
       'Amount ', 'Invoice Status', 'Notes', 'Customer', 'PO'],
      dtype='object')

In [18]:
# Add the tracker's status and notes to every statement line (left join on the
# invoice number) and export the result, reusing the run timestamp in the name
tracker_columns = ['Invoice Number', 'Invoice Status', 'Notes']
final_merge = df_customer.merge(
    df_invoice_tracker[tracker_columns],
    how='left',
    on='Invoice Number',
)
final_merge.to_excel(f'SOA_CHECK_{timestamp}.xlsx', index=False)