In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
# Read the clock once so `date` and `timestamp` always describe the same instant
# (two separate now() calls could fall on either side of midnight).
_run_started_at = datetime.now()

# Full timestamp as a string in YYYYMMDD_HHMMSS form
timestamp = _run_started_at.strftime("%Y%m%d_%H%M%S")

# Date component only, also as a string (YYYYMMDD) - not a datetime object
date = _run_started_at.strftime("%Y%m%d")

In [3]:


# Load the freight cost analysis CSV into `raw`.
# The path is relative to the notebook's working directory.
file_path = "../../data/input/Freight_Cost_Analysis_CY2024-03.25.csv"
# The file is decoded as latin1; low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
raw= pd.read_csv(file_path, encoding="latin1", low_memory=False)
# Preview the first two rows
raw.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PART DESCRIPTION,COMM 1,COMM 2,PO PURCH QTY,PURCH UOM,PO INV QTY,INV UOM,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,,,,,,-600.0,
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,PROTECT ALL PRE-NOTCHED Z-BAR INSIDE CORNER AL...,1ACC,1ACC,4.0,EA,4.0,EA,4.0,18.4,18.4


In [4]:
# Keep only the rows whose SITE is 'DIT'
filtered_site_df = raw[raw['SITE'] == 'DIT']
# Save the filtered_site_df DataFrame to a CSV file named 'dit_input.csv'
# NOTE(review): the file is written to the current working directory, not under ../../data,
# and `filtered_site_df` is not referenced by the later cells visible here - confirm it is still needed.
filtered_site_df.to_csv('dit_input.csv', index=False)
# Preview the first rows of the DIT subset
filtered_site_df.head()

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PART DESCRIPTION,COMM 1,COMM 2,PO PURCH QTY,PURCH UOM,PO INV QTY,INV UOM,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE
49292,DIT,Diverzify Itasca,101928,VAN GELDER INC,430794,267481-00 48431,2-Jan-24,2309113311,River Center Entry Mats,2008,...,Van Gelder Medium Profile Edging (30390070MP) ...,150,150,18.0,LF,18.0,LF,18.0,30.6,30.6
49293,DIT,Diverzify Itasca,101928,VAN GELDER INC,430803,267481-00 DIT39490,2-Jan-24,2309113311,River Center Entry Mats,2008,...,VAN GELDER\PLAIN EAGLE MAT CUSTOM\BEIGE\MEDIUM...,1MATS,2MATS,1.0,EA,1.0,EA,1.0,891.51,891.51
49294,DIT,Diverzify Itasca,101928,VAN GELDER INC,430803,267481-00 DIT39490,2-Jan-24,2309113311,River Center Entry Mats,2008,...,VAN GELDER\PLAIN EAGLE MAT CUSTOM\BEIGE\MEDIUM...,1MATS,2MATS,3.0,EA,3.0,EA,1.0,960.02,960.02
49295,DIT,Diverzify Itasca,101928,VAN GELDER INC,430803,267481-00 DIT39490,2-Jan-24,2309113311,River Center Entry Mats,2008,...,VAN GELDER\PLAIN EAGLE MAT CUSTOM\BEIGE\MEDIUM...,1MATS,2MATS,1.0,EA,1.0,EA,1.0,455.08,455.08
49296,DIT,Diverzify Itasca,101928,VAN GELDER INC,430803,267481-00 DIT39490,2-Jan-24,2309113311,River Center Entry Mats,2008,...,VAN GELDER\PLAIN EAGLE MAT CUSTOM\BEIGE\MEDIUM...,1MATS,2MATS,1.0,EA,1.0,EA,1.0,960.02,960.02


In [5]:
# (rows, columns) of the DIT-only subset
filtered_site_df.shape

(23937, 28)

In [12]:
# Path to the standardized conversion table.
# NOTE(review): this variable is not used by any cell visible here; the later call to
# flag_fully_converted_invoices passes '../../app/conversion_table_standardized.csv' instead.
# Confirm which location is the intended one.
conversion_path = "data/input/freight_model/conversion_table_standardized.csv"


In [16]:
#  === Load Commodity Groups ===
# Read the 'Commodity Groups' sheet of the IFS Cloud commodity workbook into `commodity`.
# The preview below shows the columns Commodity Group, Description, Old/New and Priority.
commodity= pd.read_excel('../../data/input/IFS Cloud Commodity Groups.xlsx', sheet_name='Commodity Groups')
commodity.head()


Unnamed: 0,Commodity Group,Description,Old/New,Priority
0,1ACC,Accessories,New Commodity,No
1,1ADH,Adhesive,New Commodity,No
2,2ADH,Adhesive,New Commodity,No
3,280,Adhesives,Old Commodity,No
4,2ALL,All,New Commodity,No


In [15]:

#  === Load Manufacturers ===
# Read 'Sheet1' of the manufacturer list workbook into `manufacturer`.
# The preview below shows the columns Supplier No and Supplier Name; supplier numbers
# are not all numeric (e.g. 'X100072' appears in the preview).
manufacturer = pd.read_excel('../../data/input/Manufacturer List.xlsx', sheet_name='Sheet1')
manufacturer.head()


Unnamed: 0,Supplier No,Supplier Name
0,104471,Adleta Corporation
1,128340,"AHF, LLC dba AHF Products"
2,X100072,"Altro USA, Inc"
3,110988,"Altro USA, Inc."
4,1026,American Sports Surfacing


In [167]:
# data cleaning function to standardise the description conversion
# This function will classify the commodity based on the description
def classify_commodity(row):
    """Return the standardised commodity description for one invoice line.

    Parameters:
    - row: mapping-like object (e.g. a DataFrame row) with a 'description' entry;
      'comm_2' is read only when the description is 'vinyl'.

    Returns:
    - str: for 'vinyl' (case-insensitive) the alphabetic characters of comm_2;
      'Carpet Roll' for 'carpet' / 'carpet bl'; 'Carpet Tiles' for 'carpet tile';
      otherwise the stripped description itself. A missing description is
      stringified first, so NaN comes back as the text 'nan'.
    """
    label = str(row['description']).strip()
    key = label.lower()

    # Vinyl lines take their label from the letters of the secondary commodity code.
    if key == 'vinyl':
        return ''.join(ch for ch in str(row['comm_2']) if ch.isalpha())

    carpet_labels = {
        'carpet bl': 'Carpet Roll',
        'carpet tile': 'Carpet Tiles',
        'carpet': 'Carpet Roll',
    }
    # Anything not listed keeps its original (stripped, case-preserved) text.
    return carpet_labels.get(key, label)


In [168]:
# This function will classify the commodity from old codes to new codes
def map_commodity_group(x):
    """Translate an old numeric commodity code to its new code.

    The value is compared as stripped text: '10' -> '1CBL', '100' -> '1CPT',
    '40' -> '1VNL'. Any other value is returned unchanged (the original
    object, not its string form).
    """
    old_to_new = {'10': '1CBL', '100': '1CPT', '40': '1VNL'}
    return old_to_new.get(str(x).strip(), x)

In [169]:
def _with_snake_case_columns(frame):
    """Return a copy of `frame` whose column names are stripped, lower-cased and use '_' for spaces."""
    renamed = frame.copy()
    renamed.columns = renamed.columns.str.strip().str.lower().str.replace(" ", "_")
    return renamed


def data_cleaning(input_df, commodity_df, manufacturer_df):
    """Join invoice lines to the commodity table and derive the cleaning columns.

    The three input frames are not modified; all work is done on copies.

    Parameters:
    - input_df: invoice lines; after column normalisation it must provide
      'comm_1', 'comm_2', 'inv_uom' and 'supplier_no'.
    - commodity_df: commodity table; must provide 'commodity_group' and 'description'
      (after normalisation).
    - manufacturer_df: manufacturer list; must provide 'supplier_no' (after normalisation).

    Returns:
    - DataFrame with one row per invoice line (times the number of matching commodity
      rows), carrying the commodity columns plus:
        match_commodity            'Commodity Found' / 'Commodity Not Found'
        match_supplier             'Supplier registered' when supplier_no is in the
                                   manufacturer list, else 'No supplier found'
        classification             'Classified' (SQFT/SQYD), 'No UOM', or 'Unclassified'
        new_commodity_description  result of classify_commodity per row
        new_commodity_group        result of map_commodity_group per commodity_group
        conversion_code            description_group_uom, spaces in the description
                                   replaced by '_'
    """
    invoices = _with_snake_case_columns(input_df)
    commodities = _with_snake_case_columns(commodity_df)
    manufacturers = _with_snake_case_columns(manufacturer_df)

    # --- Commodity join: both keys are compared as text ('280' vs 280 would not match).
    commodities['comm_1'] = commodities['commodity_group'].astype(str)
    invoices['comm_1'] = invoices['comm_1'].astype(str)
    cleaned = invoices.merge(commodities, on='comm_1', how='left')

    # A left-join miss leaves commodity_group empty.
    cleaned['match_commodity'] = cleaned['commodity_group'].notna().map(
        {True: 'Commodity Found', False: 'Commodity Not Found'}
    )

    # --- Unit of measure: normalise case/whitespace FIRST so that values such as
    # ' sf' are also mapped to the long form, then expand the abbreviations.
    uom = (
        cleaned['inv_uom']
        .str.strip()
        .str.upper()
        .replace({'SF': 'SQFT', 'SY': 'SQYD'})
    )
    cleaned['inv_uom'] = uom

    # --- Supplier check: membership test against the manufacturer list.
    # (A merge on the key alone adds no information and would duplicate invoice
    # lines whenever the list contains a supplier number more than once.)
    known_suppliers = set(manufacturers['supplier_no'].astype(str).str.strip())
    cleaned['supplier_no'] = cleaned['supplier_no'].astype(str)
    cleaned['match_supplier'] = (
        cleaned['supplier_no'].str.strip().isin(known_suppliers)
        .map({True: 'Supplier registered', False: 'No supplier found'})
    )

    # --- Classification by unit of measure. Later assignments take precedence,
    # so area units win over the missing-value rule exactly as in a
    # "Classified, else No UOM, else Unclassified" chain.
    classification = pd.Series('Unclassified', index=cleaned.index)
    classification[uom.isna() | (uom == '')] = 'No UOM'
    classification[uom.isin(['SQFT', 'SQYD'])] = 'Classified'
    cleaned['classification'] = classification

    # --- Standardised description / group (helpers are defined in earlier cells).
    cleaned['new_commodity_description'] = cleaned.apply(classify_commodity, axis=1)
    cleaned['new_commodity_group'] = cleaned['commodity_group'].apply(map_commodity_group)

    # --- Conversion code: <description>_<group>_<uom>. Missing parts are
    # stringified, which is how the 'nan_nan_nan' code seen downstream arises.
    cleaned['conversion_code'] = (
        cleaned['new_commodity_description'].str.replace(' ', '_', regex=False).astype(str)
        + '_' + cleaned['new_commodity_group'].astype(str)
        + '_' + cleaned['inv_uom'].astype(str)
    )

    return cleaned

In [170]:
# Run the cleaning step on the full `raw` extract (not the DIT-only subset) and preview two rows
testing_df = data_cleaning(raw,commodity,manufacturer)
testing_df.head(2)

Unnamed: 0,site,site_description,supplier_no,supplier_name,invoice_id,invoice_no,date_posted,project_id,project_name,account,...,commodity_group,description,old/new,priority,match_commodity,match_supplier,classification,new_commodity_description,new_commodity_group,conversion_code
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,,Commodity Not Found,Supplier registered,No UOM,,,nan_nan_nan
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,1ACC,Accessories,New Commodity,No,Commodity Found,Supplier registered,Unclassified,Accessories,1ACC,Accessories_1ACC_EA


In [171]:
def uom_cleaning(df):
    """Flag invoices whose account-2008 lines are all UOM-classified.

    Parameters:
    - df: DataFrame with 'account', 'invoice_id' and 'classification' columns.
      It is not modified.

    Returns:
    - a copy of df with a boolean column 'all_accounts_2008_uom_classified':
      True on every row of an invoice for which each account == 2008 line has
      classification == 'Classified'. Invoices without any 2008 line are False.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    uom_output = df.copy()

    # Step 1: per invoice, are ALL account-2008 rows classified?
    rows_2008 = uom_output[uom_output['account'] == 2008]
    all_classified = (
        rows_2008['classification'].eq('Classified')
        .groupby(rows_2008['invoice_id'])
        .all()
    )

    # Step 2: keep only the invoice ids for which that holds
    fully_classified_ids = all_classified[all_classified].index

    # Step 3: propagate the invoice-level result to every row of the invoice
    uom_output['all_accounts_2008_uom_classified'] = uom_output['invoice_id'].isin(fully_classified_ids)

    return uom_output

In [172]:
# Add the invoice-level 'all_accounts_2008_uom_classified' flag and preview two rows
testing_uom = uom_cleaning(testing_df)
testing_uom.head(2)

Unnamed: 0,site,site_description,supplier_no,supplier_name,invoice_id,invoice_no,date_posted,project_id,project_name,account,...,description,old/new,priority,match_commodity,match_supplier,classification,new_commodity_description,new_commodity_group,conversion_code,all_accounts_2008_uom_classified
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,Commodity Not Found,Supplier registered,No UOM,,,nan_nan_nan,False
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,Accessories,New Commodity,No,Commodity Found,Supplier registered,Unclassified,Accessories,1ACC,Accessories_1ACC_EA,False


In [173]:
def flag_fully_converted_invoices(df: pd.DataFrame, conversion_csv_path: str) -> pd.DataFrame:
    """
    Flags invoices where all account == 2008 rows have valid conversion codes.

    A code is valid when it appears in the 'conversion_code' column of the CSV
    at `conversion_csv_path`. Codes on both sides are compared as strings.
    The input frame is not modified.

    Parameters:
    - df: main DataFrame with invoice lines ('account', 'invoice_id', 'conversion_code')
    - conversion_csv_path: path to the CSV file with valid conversion codes

    Returns:
    - a copy of df with 'conversion_code' cast to str and a boolean column
      'all_2008_accounts_converted' (False for invoices with no 2008 rows)

    Side effects:
    - prints the number of invoices that were flagged
    """
    # Load the conversion table and collect the set of valid codes
    rates_df = pd.read_csv(conversion_csv_path)
    valid_codes = set(rates_df['conversion_code'].astype(str))

    # Work on a copy so the caller's frame is left untouched
    flagged = df.copy()
    flagged['conversion_code'] = flagged['conversion_code'].astype(str)

    # Only account-2008 rows take part in the validity check
    rows_2008 = flagged[flagged['account'] == 2008]

    # Per invoice: is every 2008 row's code in the valid set?
    invoice_validity = (
        rows_2008['conversion_code'].isin(valid_codes)
        .groupby(rows_2008['invoice_id'])
        .all()
    )

    # Propagate the invoice-level result to all rows of the invoice
    fully_valid_invoice_ids = invoice_validity[invoice_validity].index
    flagged['all_2008_accounts_converted'] = flagged['invoice_id'].isin(fully_valid_invoice_ids)

    # Report how many invoices passed
    count_all_valid_invoices = flagged.loc[flagged['all_2008_accounts_converted'], 'invoice_id'].nunique()
    print(f"✅ {count_all_valid_invoices} invoices have all account == 2008 rows with valid conversion codes")

    return flagged


In [174]:
# Flag invoices whose account-2008 lines all have a known conversion code, then preview two rows.
# NOTE(review): the table is read from '../../app/...', which differs from the `conversion_path`
# variable defined in an earlier cell ('data/input/freight_model/...') - confirm which is intended.
testing_acc = flag_fully_converted_invoices(testing_uom,'../../app/conversion_table_standardized.csv')
testing_acc.head(2)

✅ 8590 invoices have all account == 2008 rows with valid conversion codes


Unnamed: 0,site,site_description,supplier_no,supplier_name,invoice_id,invoice_no,date_posted,project_id,project_name,account,...,old/new,priority,match_commodity,match_supplier,classification,new_commodity_description,new_commodity_group,conversion_code,all_accounts_2008_uom_classified,all_2008_accounts_converted
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,Commodity Not Found,Supplier registered,No UOM,,,nan_nan_nan,False,False
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,New Commodity,No,Commodity Found,Supplier registered,Unclassified,Accessories,1ACC,Accessories_1ACC_EA,False,False


In [175]:
def _flag_rows_by_invoice(frame: pd.DataFrame, column: str, per_invoice: pd.Series) -> int:
    """Set frame[column] to True on rows whose invoice_id is True in `per_invoice`; return the flagged invoice count."""
    flagged_ids = per_invoice[per_invoice].index
    # isin() always yields a real boolean column (no NaN to fill, no object dtype)
    frame[column] = frame['invoice_id'].isin(flagged_ids)
    return frame.loc[frame[column], 'invoice_id'].nunique()


def enrich_invoice_flags(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds invoice-level boolean flags to every invoice line.

    Freight lines are rows with account == 5504; product lines are rows with
    account == 2008. The input frame is not modified.

    Parameters:
    - df: DataFrame with 'invoice_id', 'account', 'part_no',
      'new_commodity_group' and 'priority' columns

    Returns:
    - a copy of df with these boolean columns:
        has_freight_line                        invoice has at least one freight line
        multiple_freight_lines                  invoice has more than one freight line
        multiple_parts                          product lines cover more than one distinct part_no
        multiple_commodities                    product lines cover more than one distinct new_commodity_group
        all__invoice_priority_products_(2008)   every product line has priority == 'Yes'
        any__invoice_priority_products_(2008)   at least one product line has priority == 'Yes'

    Side effects:
    - prints one invoice count per flag
    """
    enriched = df.copy()

    freight_rows = enriched[enriched['account'] == 5504]
    rows_2008 = enriched[enriched['account'] == 2008]
    ids_2008 = rows_2008['invoice_id']

    # Step 1: Flag invoices with at least one freight line (ACCOUNT == 5504)
    enriched['has_freight_line'] = enriched['invoice_id'].isin(freight_rows['invoice_id'].unique())
    count_freight_invoices = enriched.loc[enriched['has_freight_line'], 'invoice_id'].nunique()
    print(f"Number of invoices with at least one freight line: {count_freight_invoices}")

    # Step 2: Flag invoices with multiple freight lines
    count_multiple_freight_invoices = _flag_rows_by_invoice(
        enriched, 'multiple_freight_lines', freight_rows.groupby('invoice_id').size() > 1
    )
    print(f"Number of invoices with multiple freight lines: {count_multiple_freight_invoices}")

    # Step 3: Flag invoices with multiple distinct PART NO (ACCOUNT == 2008)
    count_multiple_parts_invoices = _flag_rows_by_invoice(
        enriched, 'multiple_parts', rows_2008.groupby('invoice_id')['part_no'].nunique() > 1
    )
    print(f"Number of invoices with multiple distinct parts: {count_multiple_parts_invoices}")

    # Step 4: Flag invoices with multiple distinct COMMODITY GROUP (ACCOUNT == 2008)
    count_multiple_commodities_invoices = _flag_rows_by_invoice(
        enriched, 'multiple_commodities', rows_2008.groupby('invoice_id')['new_commodity_group'].nunique() > 1
    )
    print(f"Number of invoices with multiple distinct commodities: {count_multiple_commodities_invoices}")

    # Steps 5 and 6 share the same per-row test; missing priority counts as "not Yes"
    is_priority = rows_2008['priority'].eq('Yes')

    # Step 5: Flag invoices where all ACCOUNT == 2008 rows have Priority == 'Yes'
    count_priority_invoices = _flag_rows_by_invoice(
        enriched, 'all__invoice_priority_products_(2008)', is_priority.groupby(ids_2008).all()
    )
    print(f"Number of invoices where all ACCOUNT == 2008 have Priority == 'Yes': {count_priority_invoices}")

    # Step 6: Flag invoices where any ACCOUNT == 2008 row has Priority == 'Yes'
    count_any_priority_invoices = _flag_rows_by_invoice(
        enriched, 'any__invoice_priority_products_(2008)', is_priority.groupby(ids_2008).any()
    )
    print(f"Number of invoices where at least one ACCOUNT == 2008 has Priority == 'Yes': {count_any_priority_invoices}")

    return enriched


In [176]:
# Add the freight / parts / commodity / priority invoice flags and preview two rows
testing_enriched = enrich_invoice_flags(testing_acc)
testing_enriched.head(2)

Number of invoices with at least one freight line: 55942
Number of invoices with multiple freight lines: 197
Number of invoices with multiple distinct parts: 47577
Number of invoices with multiple distinct commodities: 16091
Number of invoices where all ACCOUNT == 2008 have Priority == 'Yes': 17084
Number of invoices where at least one ACCOUNT == 2008 has Priority == 'Yes': 23536


Unnamed: 0,site,site_description,supplier_no,supplier_name,invoice_id,invoice_no,date_posted,project_id,project_name,account,...,new_commodity_group,conversion_code,all_accounts_2008_uom_classified,all_2008_accounts_converted,has_freight_line,multiple_freight_lines,multiple_parts,multiple_commodities,all__invoice_priority_products_(2008),any__invoice_priority_products_(2008)
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,nan_nan_nan,False,False,False,False,False,False,False,False
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,1ACC,Accessories_1ACC_EA,False,False,True,False,True,True,False,True


In [177]:
# List every column available after enrichment (source columns plus derived flags)
testing_enriched.columns

Index(['site', 'site_description', 'supplier_no', 'supplier_name',
       'invoice_id', 'invoice_no', 'date_posted', 'project_id', 'project_name',
       'account', 'account_description', 'planned_delivery_date',
       'ship_to_zip', 'po_no', 'po_line_no', 'po_rel_no', 'receipt_no',
       'part_no', 'part_description', 'comm_1', 'comm_2', 'po_purch_qty',
       'purch_uom', 'po_inv_qty', 'inv_uom', 'invoiced_line_qty',
       'invoice_line_total', 'po_price', 'commodity_group', 'description',
       'old/new', 'priority', 'match_commodity', 'match_supplier',
       'classification', 'new_commodity_description', 'new_commodity_group',
       'conversion_code', 'all_accounts_2008_uom_classified',
       'all_2008_accounts_converted', 'has_freight_line',
       'multiple_freight_lines', 'multiple_parts', 'multiple_commodities',
       'all__invoice_priority_products_(2008)',
       'any__invoice_priority_products_(2008)'],
      dtype='object')

## Structuring the sample sizing

In [178]:
# Start from the fully flagged invoice lines
mapped_df = testing_enriched

# === Step 0: Define sites and filter columns ===
#site_list = ['SPJ', 'SPT', 'SPW']
site_list = ['DIT', 'SPJ', 'SPN', 'SPT', 'SPW']

# Invoice-level flags that must all be True for a row to be kept
filter_columns = [
    'all_accounts_2008_uom_classified',
    'all_2008_accounts_converted',
    'all__invoice_priority_products_(2008)',
    'has_freight_line',
]

# === Step B: Apply combined filters ===
# A row is kept when every flag in `filter_columns` is True and its site is in `site_list`
passes_all_flags = mapped_df[filter_columns].eq(True).all(axis=1)
in_selected_sites = mapped_df['site'].isin(site_list)
filtered_df = mapped_df[passes_all_flags & in_selected_sites]




In [179]:
def add_freight_per_invoice(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attaches the invoice's total freight cost to each of its rows.

    Freight lines are the rows with account == 5504; their 'invoice_line_total'
    values are summed per invoice_id and joined back onto every row of that
    invoice. Invoices without freight lines get 0.

    Parameters:
    - df: DataFrame with at least 'invoice_id', 'account', and 'invoice_line_total' columns

    Returns:
    - a new DataFrame (left merge of df with the per-invoice totals) that has an
      additional 'freight_per_invoice' column
    """
    is_freight = df['account'] == 5504

    # Total freight per invoice, one row per invoice_id
    totals = df.loc[is_freight].groupby('invoice_id', as_index=False)['invoice_line_total'].sum()
    totals = totals.rename(columns={'invoice_line_total': 'freight_per_invoice'})

    # Left merge keeps every input row; invoices with no freight come back as NaN
    merged = df.merge(totals, on='invoice_id', how='left')
    merged['freight_per_invoice'] = merged['freight_per_invoice'].fillna(0)

    return merged


In [180]:
# Attach each invoice's total freight (account 5504) to the filtered sample and preview two rows
output_df = add_freight_per_invoice(filtered_df)
output_df.head(2)

Unnamed: 0,site,site_description,supplier_no,supplier_name,invoice_id,invoice_no,date_posted,project_id,project_name,account,...,conversion_code,all_accounts_2008_uom_classified,all_2008_accounts_converted,has_freight_line,multiple_freight_lines,multiple_parts,multiple_commodities,all__invoice_priority_products_(2008),any__invoice_priority_products_(2008),freight_per_invoice
0,DIT,Diverzify Itasca,890,All Surfaces,443670,5378334,10-Jan-24,2311123624,Rick Sidor Residence,5504,...,nan_nan_nan,True,True,True,False,False,False,True,True,54.55
1,DIT,Diverzify Itasca,890,All Surfaces,443670,5378334,10-Jan-24,2311123624,Rick Sidor Residence,2008,...,LVT_1VNL_SQFT,True,True,True,False,False,False,True,True,54.55


In [181]:


# Drop rows whose conversion code is 'nan_nan_nan' (in the previews above this is the
# code carried by rows with no description, commodity group or UOM, e.g. freight lines).
# NOTE(review): this rebinds `filtered_df`, which an earlier cell already uses for the
# flag/site-filtered sample; re-running the add_freight_per_invoice cell afterwards
# would pick up this frame instead - confirm that is acceptable or use a new name.
filtered_df = output_df[output_df['conversion_code'] != 'nan_nan_nan']
# Save the filtered DataFrame to an Excel file and to a CSV file.
# Both file names embed the run date and the full run timestamp.
filtered_df.to_excel(f'../../data/output/{date}_modelling_input_v_{timestamp}.xlsx', index=False, engine='openpyxl')
filtered_df.to_csv(f'../../data/output/{date}_modelling_input_v_{timestamp}.csv', index=False,)