In [186]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [187]:


# Load your dataset
file_path = "../../data/input/Freight_Cost_Analysis_CY2024-03.25.csv"
df = pd.read_csv(file_path, encoding="latin1", low_memory=False)
df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PART DESCRIPTION,COMM 1,COMM 2,PO PURCH QTY,PURCH UOM,PO INV QTY,INV UOM,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,,,,,,,,19.07,
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,Metroflor Deja New San Marcus Oak 20 mil Micro...,1VNL,2LVP,675.0,SF,675.0,SF,675.0,1343.25,1343.25


In [188]:

#  === Load Commodity Groups ===
# Load the commodity groups from the Excel file
commodity_df = pd.read_excel('../../data/input/IFS Cloud Commodity Groups.xlsx', sheet_name='Commodity Groups')
commodity_df.head()


# Convert 'Commodity Group' to string and create a new column 'COMM 1'
commodity_df['COMM 1'] = commodity_df['Commodity Group'].astype(str)

# Convert 'Commodity Group' to string in the main DataFrame
df['COMM 1'] = df['COMM 1'].astype(str)

# Perform the join on the 'COMM 1' column
merged_df = df.merge(commodity_df, on='COMM 1', how='left')
# Flag matched and unmatched rows clearly
merged_df['Match Commodity'] = merged_df['Commodity Group'].apply(
    lambda x: 'Commodity Found' if pd.notna(x) else 'Commodity Not Found'
)
merged_df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PO INV QTY,INV UOM,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE,Commodity Group,Description,Old/New,Priority,Match Commodity
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,,,19.07,,,,,,Commodity Not Found
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,675.0,SF,675.0,1343.25,1343.25,1VNL,Vinyl,New Commodity,Yes,Commodity Found


In [189]:
# Replace values in the 'uom' column
merged_df['INV UOM'] = merged_df['INV UOM'].replace({'SF': 'SQFT', 'SY': 'SQYD'})
merged_df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PO INV QTY,INV UOM,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE,Commodity Group,Description,Old/New,Priority,Match Commodity
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,,,19.07,,,,,,Commodity Not Found
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,675.0,SQFT,675.0,1343.25,1343.25,1VNL,Vinyl,New Commodity,Yes,Commodity Found


In [190]:

#  === Load Manufacturers ===
# Load the manufacturers from the Excel file
manufacturer_df = pd.read_excel('../../data/input/Manufacturer List.xlsx', sheet_name='Sheet1')
manufacturer_df.head()

# Convert 'Commodity Group' to string and create a new column 'COMM 1'
manufacturer_df['SUPPLIER NO'] = manufacturer_df['Supplier No'].astype(str)
manufacturer_df.head(2)


Unnamed: 0,Supplier No,Supplier Name,SUPPLIER NO
0,104471,Adleta Corporation,104471
1,128340,"AHF, LLC dba AHF Products",128340


In [191]:

# Convert 'Commodity Group' to string in the main DataFrame
merged_df['SUPPLIER NO'] = merged_df['SUPPLIER NO'].astype(str)
# Perform the join on the 'COMM 1' column
merged_df2 = merged_df.merge(manufacturer_df[['SUPPLIER NO','Supplier Name']], on='SUPPLIER NO', how='left')

merged_df2['Match Supplier'] = merged_df2['Supplier Name'].apply(
    lambda x: 'Supplier registered' if pd.notna(x) else 'No supplier found'
)

# Display the updated DataFrame
merged_df2.head(2)


Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE,Commodity Group,Description,Old/New,Priority,Match Commodity,Supplier Name,Match Supplier
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,19.07,,,,,,Commodity Not Found,Mohawk Industries,Supplier registered
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,675.0,1343.25,1343.25,1VNL,Vinyl,New Commodity,Yes,Commodity Found,,No supplier found


In [192]:
# data cleaning function to standardise the description conversion
# This function will classify the commodity based on the description
def classify_commodity(row):
    desc = str(row['Description']).strip()
    desc_lower = desc.lower()

    if desc_lower == 'vinyl':
        return ''.join(filter(str.isalpha, str(row['COMM 2'])))
    elif desc_lower == 'carpet bl':
        return 'Carpet Roll'
    elif desc_lower == 'carpet tile':
        return 'Carpet Tiles'
    elif desc_lower == 'carpet':
        return 'Carpet Roll'
    else:
        return desc  # Default fallback to original


# Apply to new column
merged_df2['new commodity description'] = merged_df2.apply(classify_commodity, axis=1)
# Display the updated DataFrame
merged_df2.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,INVOICE LINE TOTAL,PO PRICE,Commodity Group,Description,Old/New,Priority,Match Commodity,Supplier Name,Match Supplier,new commodity description
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,19.07,,,,,,Commodity Not Found,Mohawk Industries,Supplier registered,
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,1343.25,1343.25,1VNL,Vinyl,New Commodity,Yes,Commodity Found,,No supplier found,LVP


In [193]:
# This function will classify the commodity from old codes to new codes
def map_commodity_group(x):
    x_str = str(x).strip()  # Strip whitespace, just in case

    if x_str == '10':
        return '1CBL'
    elif x_str == '100':
        return '1CPT'
    elif x_str == '40':
        return '1VNL'
    else:
        return x  # Keep original value if none of the above match

# Apply the function to update the column
merged_df2['new commodity group'] = merged_df2['Commodity Group'].apply(map_commodity_group)


In [194]:

# Display the updated DataFrame
merged_df2.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PO PRICE,Commodity Group,Description,Old/New,Priority,Match Commodity,Supplier Name,Match Supplier,new commodity description,new commodity group
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,,,,,Commodity Not Found,Mohawk Industries,Supplier registered,,
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,1343.25,1VNL,Vinyl,New Commodity,Yes,Commodity Found,,No supplier found,LVP,1VNL


In [195]:
merged_df2['new commodity description'].unique()

array(['nan', 'LVP', 'Carpet Roll', 'VCT', 'LVT', 'Accessories',
       'Transitions', 'ROLL', 'Base', 'Vinyl Accessories', 'Adhesive',
       'Carpet Tiles', 'Vinyl Tile', 'Padding', 'Miscellaneous', 'Stairs',
       'Ceramic/Natural Stone', 'Sundry', 'Floor Prep', 'Rubber Flooring',
       'Ceramic Sundries', 'Ceramic', 'Mapei Products',
       'Installation Supplies', 'Specialty Tile', 'Sheet Vinyl',
       'Wallcovering', 'Linoleum', 'Equipment', 'Sundries',
       'Metal Transitions', 'Mats', 'Tools', 'Adhesives',
       'Wood Accessories', 'Rubber Roll Goods', 'Product Care', 'Other',
       'Underlayment', 'Vinyl Transitions', 'Rugs', 'Stair Accessories',
       'Backing', 'Epoxy', 'Wood', 'Grout', 'Trim Piece',
       'General Inventory', 'Weld Rod', 'Rubber Accessories', 'Laminate',
       'Rubber Tile', 'Specialty Flooring', 'Cage - Adhesives & Supplies',
       'Wood Transitions', 'ESD', 'Natural Stone', 'Terrazzo',
       'Linoleum Tiles', 'Turf', 'Product Care Supplies',
 

In [196]:
# From this point df becomes default DB
# Set the DataFrame to merged_df2 for further processing
df = merged_df2

# Normalize the 'INV UOM' column to handle case sensitivity and strip spaces
df['INV UOM'] = df['INV UOM'].str.strip().str.upper()

# Classify rows based on 'INV UOM' values
df['Classification'] = df.apply(
    lambda row: 'Classified' if row['INV UOM'] in ['SQFT', 'SQYD']
    else ('No UOM' if pd.isna(row['INV UOM']) or row['INV UOM'] == '' else 'Unclassified'),
    axis=1
)
# Create a new column 'conversion_code' based on the 'Description' + 'Comodity Group' + 'INV UOM' column
df['conversion_code'] = df['new commodity description'].str.replace(' ', '_', regex=True).astype(str) + '_' + df['new commodity group'].astype(str) + '_' + df['INV UOM'].astype(str)
df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,Description,Old/New,Priority,Match Commodity,Supplier Name,Match Supplier,new commodity description,new commodity group,Classification,conversion_code
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,,,Commodity Not Found,Mohawk Industries,Supplier registered,,,No UOM,nan_nan_nan
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,Vinyl,New Commodity,Yes,Commodity Found,,No supplier found,LVP,1VNL,Classified,LVP_1VNL_SQFT


In [197]:
# checking which of the rows in an invoice matching 2008 has unclassified items
# Check if all rows with ACCOUNT == 2008 are classified
# Step 1: Identify invoice IDs where ALL rows with ACCOUNT == 2008 are classified
classified_invoice_ids = (
    df[df['ACCOUNT'] == 2008]
    .groupby('INVOICE ID')['Classification']
    .apply(lambda x: all(x == 'Classified'))
)

# Step 2: Filter to only invoice IDs where ALL 2008 accounts are classified
fully_classified_ids = classified_invoice_ids[classified_invoice_ids].index

# Step 3: Create a new column to mark if entire invoice is considered classified (based on the 2008 rule)
df['All Accounts 2008 Classified'] = df['INVOICE ID'].isin(fully_classified_ids)

# Step 4: Count how many invoices meet this condition
count_all_classified_invoices = df[df['All Accounts 2008 Classified']]['INVOICE ID'].nunique()

print(f"Number of invoices where all ACCOUNT == 2008 are classified: {count_all_classified_invoices}")


Number of invoices where all ACCOUNT == 2008 are classified: 28052


In [198]:
# Checking which of the rows in an invoice matching 2008 has existing conversion codes to area
# Step 1: Identify invoice IDs where ALL rows with ACCOUNT == 2008 have existing conversion codes
# 1. Load the rates table
rates_df = pd.read_csv('../../data/input/freight_model/conversion_table_standardized.csv')  # adjust path and filename
# Step 1: Ensure consistent data types
rates_df['conversion_code'] = rates_df['conversion_code'].astype(str)
df['conversion_code'] = df['conversion_code'].astype(str)

# Step 2: Create a set of valid conversion codes from the rates table
valid_codes = set(rates_df['conversion_code'].unique())

# Step 3: Filter only rows with ACCOUNT == 2008
df_2008 = df[df['ACCOUNT'] == 2008].copy()

# Step 4: For each INVOICE ID, check if all conversion codes for ACCOUNT 2008 are valid
invoice_validity = df_2008.groupby('INVOICE ID')['conversion_code'].apply(
    lambda codes: all(code in valid_codes for code in codes)
)

# Step 5: Get list of INVOICE IDs where all ACCOUNT 2008 codes are valid
fully_valid_invoice_ids = invoice_validity[invoice_validity].index

# Step 6: Create a new column in the main df that flags all rows for those invoices
df['All 2008 Accounts Converted'] = df['INVOICE ID'].isin(fully_valid_invoice_ids)

# Step 7: Count how many invoices meet this condition
count_all_valid_invoices = df[df['All 2008 Accounts Converted']]['INVOICE ID'].nunique()

print(f"Number of invoices where all ACCOUNT == 2008 have valid conversion codes: {count_all_valid_invoices}")


Number of invoices where all ACCOUNT == 2008 have valid conversion codes: 8590


In [199]:
# Step 1: Get INVOICE IDs that have at least one freight line (ACCOUNT == 5504)
freight_invoice_ids = df[df['ACCOUNT'] == 5504]['INVOICE ID'].unique()

# Step 2: Flag all rows where the INVOICE ID appears in that list
df['Has Freight Line'] = df['INVOICE ID'].isin(freight_invoice_ids)
# Step 3: Count how many invoices have at least one freight line
count_freight_invoices = df[df['Has Freight Line']]['INVOICE ID'].nunique()

print(f"Number of invoices with at least one freight line: {count_freight_invoices}")

Number of invoices with at least one freight line: 55942


In [200]:
# 
# Group by INVOICE ID and count the number of rows where ACCOUNT == 5504
freight_count = df[df['ACCOUNT'] == 5504].groupby('INVOICE ID').size()

# Flag invoices with more than 1 Project Freight line item
df['Multiple Freight Lines'] = df['INVOICE ID'].map(freight_count > 1).fillna(False)
# Step 4: Count how many invoices have multiple freight lines
count_multiple_freight_invoices = df[df['Multiple Freight Lines']]['INVOICE ID'].nunique()

print(f"Number of invoices with multiple freight lines: {count_multiple_freight_invoices}")

# Display the updated DataFrame
df.head(2)

Number of invoices with multiple freight lines: 197


  df['Multiple Freight Lines'] = df['INVOICE ID'].map(freight_count > 1).fillna(False)


Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,Supplier Name,Match Supplier,new commodity description,new commodity group,Classification,conversion_code,All Accounts 2008 Classified,All 2008 Accounts Converted,Has Freight Line,Multiple Freight Lines
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,Mohawk Industries,Supplier registered,,,No UOM,nan_nan_nan,True,True,True,False
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,,No supplier found,LVP,1VNL,Classified,LVP_1VNL_SQFT,True,True,True,False


In [201]:
# Step 1: Filter the DataFrame to only include rows where ACCOUNT == 2008
# These represent "Part Component" line items we're interested in
df_2008 = df[df['ACCOUNT'] == 2008]

# Step 2: Group by INVOICE ID and count the number of distinct partnumbers per invoice
# This tells us how many unique parts are associated with each invoice
component_count = df_2008.groupby('INVOICE ID')['PART NO'].nunique()

# Step 3: Create a new column 'Multiple Parts' in the main DataFrame
# For each INVOICE ID, mark True if it has more than one unique partnumber; otherwise False
# Invoices without any ACCOUNT == 2008 lines will get NaN, so we fill those with False
df['Multiple Parts'] = df['INVOICE ID'].map(component_count > 1).fillna(False)

# Step 4: Count how many invoices have multiple distinct part components
count_multiple_parts_invoices = df[df['Multiple Parts']]['INVOICE ID'].nunique()

# Step 5: Print the result for quick validation
print(f"Number of invoices with multiple distinct parts: {count_multiple_parts_invoices}")

# Step 6: Preview the updated DataFrame
df.head(2)


Number of invoices with multiple distinct parts: 47577


  df['Multiple Parts'] = df['INVOICE ID'].map(component_count > 1).fillna(False)


Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,Match Supplier,new commodity description,new commodity group,Classification,conversion_code,All Accounts 2008 Classified,All 2008 Accounts Converted,Has Freight Line,Multiple Freight Lines,Multiple Parts
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,Supplier registered,,,No UOM,nan_nan_nan,True,True,True,False,False
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,No supplier found,LVP,1VNL,Classified,LVP_1VNL_SQFT,True,True,True,False,False


In [202]:
# Step 1: Filter to only ACCOUNT == 2008 (Part Component line items)
df_2008 = df[df['ACCOUNT'] == 2008]

# Step 2: Group by INVOICE ID and count unique COMMODITY_GROUP per invoice
commodity_count = df_2008.groupby('INVOICE ID')['new commodity group'].nunique()

# Step 3: Create a new column 'Multiple Commodities' in the main DataFrame
df['Multiple Commodities'] = df['INVOICE ID'].map(commodity_count > 1).fillna(False)

# Step 4: Count how many invoices have multiple distinct commodity components
count_multiple_commodities_invoices = df[df['Multiple Commodities']]['INVOICE ID'].nunique()

# Step 5: Print the result for validation
print(f"Number of invoices with multiple distinct commodities: {count_multiple_commodities_invoices}")

# Step 6: Preview updated DataFrame
df.head(2)


Number of invoices with multiple distinct commodities: 16091


  df['Multiple Commodities'] = df['INVOICE ID'].map(commodity_count > 1).fillna(False)


Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,new commodity description,new commodity group,Classification,conversion_code,All Accounts 2008 Classified,All 2008 Accounts Converted,Has Freight Line,Multiple Freight Lines,Multiple Parts,Multiple Commodities
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,,No UOM,nan_nan_nan,True,True,True,False,False,False
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,LVP,1VNL,Classified,LVP_1VNL_SQFT,True,True,True,False,False,False


In [203]:
# Step 1: Filter to ACCOUNT == 2008
df_2008 = df[df['ACCOUNT'] == 2008]

# Step 2: For each invoice, check if all rows with ACCOUNT == 2008 have Priority == 'Yes'
priority_flag = df_2008.groupby('INVOICE ID')['Priority'].apply(
    lambda x: all(x == 'Yes')
)

# Step 3: Get invoice IDs where all ACCOUNT 2008 rows have Priority == 'Yes'
priority_invoice_ids = priority_flag[priority_flag].index

# Step 4: Flag those invoice IDs across the full dataframe
df['All Priority Products (2008)'] = df['INVOICE ID'].isin(priority_invoice_ids)   
# Step 5: Count how many invoices meet this condition
count_priority_invoices = df[df['All Priority Products (2008)']]['INVOICE ID'].nunique()

# Step 6: Print the result for quick validation
print(f"Number of invoices where all ACCOUNT == 2008 have Priority == 'Yes': {count_priority_invoices}")


Number of invoices where all ACCOUNT == 2008 have Priority == 'Yes': 17084


In [204]:
# True if at least one ACCOUNT == 2008 line in the invoice has Priority == 'Yes'
priority_flag_any = df_2008.groupby('INVOICE ID')['Priority'].apply(lambda x: any(x == 'Yes'))
priority_invoice_ids_any = priority_flag_any[priority_flag_any].index
df['Any Priority Product (2008)'] = df['INVOICE ID'].isin(priority_invoice_ids_any)
# Step 5: Count how many invoices meet this condition
count_any_priority_invoices = df[df['Any Priority Product (2008)']]['INVOICE ID'].nunique()

# Step 6: Print the result for quick validation
print(f"Number of invoices where at least one ACCOUNT == 2008 has Priority == 'Yes': {count_any_priority_invoices}")

Number of invoices where at least one ACCOUNT == 2008 has Priority == 'Yes': 23536


In [205]:

template_columns = ['PROJECT ID','PROJECT NAME','PO NO','ACCOUNT', 'ACCOUNT DESCRIPTION',
                     'SITE','SITE DESCRIPTION',
                     'SUPPLIER NO', 'SUPPLIER NAME', 
                     'PART NO', 'PART DESCRIPTION',
                    'INVOICED LINE QTY','INVOICE ID',
                    'INVOICE NO','INV UOM','COMM 1','COMM 2',
                    'Commodity Group', 'Description',
                    'Old/New', 'Priority', 'Classification',
                    'conversion_code','INVOICE LINE TOTAL',
                    'Has Freight Line','Multiple Freight Lines',
                    'Multiple Parts','All Priority Products (2008)',
                    'Any Priority Product (2008)','Match Commodity','Match Supplier','new commodity description',
                    'new commodity group','All Accounts 2008 Classified','All 2008 Accounts Converted','Multiple Commodities']

In [206]:
# Define the mapping of template columns to the desired column names
column_mapping = {
    
    'PROJECT ID':'project_id',
    'PROJECT NAME':'project_name',
    'PO NO': 'po_no',
    'INVOICE ID': 'invoice_id',
    'INVOICE NO': 'invoice_no',
    'ACCOUNT':'account', 
    'ACCOUNT DESCRIPTION':'account_description',
    'SITE': 'siteid',
    'SITE DESCRIPTION': 'site',
    'SUPPLIER NO': 'supplierid',
    'SUPPLIER NAME': 'suppliername',
    'INVOICED LINE QTY': 'quantity',
    'PART NO': 'partnumber',
    'PART DESCRIPTION':'partdescription',
    'COMM 1': 'comm1',S
    'COMM 2': 'comm2',
    'Commodity Group': 'commodity_group',
    'Description': 'commoditydescription',
    'INV UOM': 'uom',
    'Priority':'priority', 
    'Classification': 'classification',
    'conversion_code': 'conversion_code',
    'Old/New': 'old_new',
    'Has Freight Line':'freight_invoice',
    'INVOICE LINE TOTAL': 'invoice_line_total',
    'Multiple Freight Lines':'multiple_freight_lines',
    'Multiple Parts':'multiple_parts',
    'All Priority Products (2008)':'all_priority_products',
    'Any Priority Product (2008)':'any_priority_products',
    'Match Commodity':'match_commodity',
    'Match Supplier':'match_supplier',
    'new commodity description':'new_commodity_description',
    'new commodity group':'new_commodity_group',
    'All Accounts 2008 Classified':'all_accounts_2008_classified',
    'All 2008 Accounts Converted':'all_2008_accounts_converted',
    'Multiple Commodities':'multiple_commodities'
}

# Rename the columns in the DataFrame
mapped_df = df[template_columns].rename(columns=column_mapping)

# Display the first few rows of the mapped DataFrame
mapped_df.head()

Unnamed: 0,project_id,project_name,po_no,account,account_description,siteid,site,supplierid,suppliername,partnumber,...,multiple_parts,all_priority_products,any_priority_products,match_commodity,match_supplier,new_commodity_description,new_commodity_group,all_accounts_2008_classified,all_2008_accounts_converted,multiple_commodities
0,2312127222,Blue Chip Casino Deli,100317,5504,PROJECT Freight,DIT,Diverzify Itasca,121550,Mohawk Industries,,...,False,True,True,Commodity Not Found,Supplier registered,,,True,True,False
1,2401132763,FAROPOINT LOBBY RENOVATION,55700,2008,Received Not Yet Invoiced,SPN,Spectra Norcross,103277,William M. Bird,198962-003,...,False,True,True,Commodity Found,No supplier found,LVP,1VNL,True,True,False
2,2401132815,PCI-SCU Griffin Carpet Replacement,55857,2008,Received Not Yet Invoiced,SPN,Spectra Norcross,103423,"Shaw Industries, Inc.",175523-002,...,False,True,True,Commodity Found,Supplier registered,Carpet Roll,1CBL,True,True,False
3,2407168896,Stanley D Lindsey Associates,131646,5504,PROJECT Freight,SPN,Spectra Norcross,107776,J.J. Haines & Company,,...,False,True,True,Commodity Not Found,Supplier registered,,,True,True,False
4,2310117701,IMP Suite 116,50923,2008,Received Not Yet Invoiced,SPT,Spectra Tampa,107776,J.J. Haines & Company,126732-137,...,False,True,True,Commodity Found,Supplier registered,VCT,1VNL,True,True,False


In [207]:
# Count the number of unique invoices where account == 5504
unique_invoices_5504 = mapped_df[mapped_df['account'] == 5504]['invoice_id'].nunique()

print(f"Number of unique invoices with account == 5504: {unique_invoices_5504}")

Number of unique invoices with account == 5504: 55942


In [208]:
# Filter for rows where account == 5504
account_5504_df = mapped_df[mapped_df['account'] == 5504]

# Calculate the total invoice_line_total for these rows
total_invoice_line_total_5504 = account_5504_df['invoice_line_total'].sum()

print(f"Total invoice_line_total for unique invoices with account == 5504: {total_invoice_line_total_5504}")

Total invoice_line_total for unique invoices with account == 5504: 16678535.819999998


In [209]:
import numpy as np
# Step 1: Filter rows where account == 5504
account_5504_df = mapped_df[mapped_df['account'] == 5504]

# Step 2: Group by invoice_id and calculate the total invoice_line_total for each invoice
total_invoice_per_invoice = mapped_df.groupby('invoice_id')['invoice_line_total'].sum().reset_index()
total_invoice_per_invoice.rename(columns={'invoice_line_total': 'total_invoice_line_total'}, inplace=True)

# Step 3: Merge the total invoice_line_total back into the account_5504_df
account_5504_df = account_5504_df.merge(total_invoice_per_invoice, on='invoice_id', how='left')

# Step 4: Calculate the percentage of total_invoice_line_total_5504 to the total invoice_line_total
account_5504_df['percentage_of_total'] = (account_5504_df['invoice_line_total'] / account_5504_df['total_invoice_line_total']) * 100
# Replace NaN values with 0 in the 'percentage_of_total' column
account_5504_df['percentage_of_total'] = account_5504_df['percentage_of_total'].fillna(0)
# # Clean: Drop infinities
# account_5504_df = account_5504_df[np.isfinite(account_5504_df['percentage_of_total'])]

# Step 5: Calculate the average percentage for all invoices with account == 5504
average_percentage = account_5504_df['percentage_of_total'].mean()

print(f"Average percentage of total_invoice_line_total_5504 to the total invoice_line_total: {average_percentage:.2f}%")

Average percentage of total_invoice_line_total_5504 to the total invoice_line_total: nan%


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


In [210]:
account_5504_df['percentage_of_total'].describe()

count    5.614500e+04
mean              NaN
std               NaN
min              -inf
25%      2.109168e+00
50%      6.168169e+00
75%      1.638568e+01
max               inf
Name: percentage_of_total, dtype: float64

In [211]:
mapped_df.columns

Index(['project_id', 'project_name', 'po_no', 'account', 'account_description',
       'siteid', 'site', 'supplierid', 'suppliername', 'partnumber',
       'partdescription', 'quantity', 'invoice_id', 'invoice_no', 'uom',
       'comm1', 'comm2', 'commodity_group', 'commoditydescription', 'old_new',
       'priority', 'classification', 'conversion_code', 'invoice_line_total',
       'freight_invoice', 'multiple_freight_lines', 'multiple_parts',
       'all_priority_products', 'any_priority_products', 'match_commodity',
       'match_supplier', 'new_commodity_description', 'new_commodity_group',
       'all_accounts_2008_classified', 'all_2008_accounts_converted',
       'multiple_commodities'],
      dtype='object')

In [212]:
# Filter the DataFrame
filtered_invoices = mapped_df[(mapped_df['freight_invoice'] == True) & (mapped_df['any_priority_products'] == True)]

# Count unique invoice IDs
unique_invoice_count = filtered_invoices['invoice_id'].nunique()

print(f"Number of unique invoices where freight_invoice == True and any_priority_products == True: {unique_invoice_count}")

Number of unique invoices where freight_invoice == True and any_priority_products == True: 18551


In [213]:
# Group by invoice_id and sum the invoice_line_total where Account == 2008 and Priority == 'Yes'
priority_yes_totals = filtered_invoices[
    (filtered_invoices['account'] == 2008) & (filtered_invoices['priority'] == 'Yes')
].groupby('invoice_id')['invoice_line_total'].sum().reset_index()
priority_yes_totals.rename(columns={'invoice_line_total': 'priority_yes_total'}, inplace=True)

# Count the number of unique invoices for Priority == 'Yes'
priority_yes_unique_invoices = priority_yes_totals['invoice_id'].nunique()

# Group by invoice_id and sum the invoice_line_total where Account == 2008 and Priority == 'No'
priority_no_totals = filtered_invoices[
    (filtered_invoices['account'] == 2008) & (filtered_invoices['priority'] == 'No')
].groupby('invoice_id')['invoice_line_total'].sum().reset_index()
priority_no_totals.rename(columns={'invoice_line_total': 'priority_no_total'}, inplace=True)

# Count the number of unique invoices for Priority == 'No'
priority_no_unique_invoices = priority_no_totals['invoice_id'].nunique()

# # Print the results
# print(f"Total unique invoices for Priority == 'Yes': {priority_yes_unique_invoices}")
# print(f"Total unique invoices for Priority == 'No': {priority_no_unique_invoices}")
priority_yes_totals.head(2)

Unnamed: 0,invoice_id,priority_yes_total
0,357583,817.2
1,357863,3484.01


In [214]:
priority_no_totals.head(2)

Unnamed: 0,invoice_id,priority_no_total
0,357583,283.0
1,357863,135.2


In [215]:
# Merge the priority_yes_totals and priority_no_totals dataframes on invoice_id
priority_totals = pd.merge(priority_yes_totals, priority_no_totals, on='invoice_id', how='outer').fillna(0)

# Calculate the total invoice_line_total for each invoice
priority_totals['total_invoice_line_total'] = priority_totals['priority_yes_total'] + priority_totals['priority_no_total']

# Calculate the percentage of "Yes" and "No" totals for each invoice
priority_totals['percentage_yes'] = (priority_totals['priority_yes_total'] / priority_totals['total_invoice_line_total']) * 100
priority_totals['percentage_no'] = (priority_totals['priority_no_total'] / priority_totals['total_invoice_line_total']) * 100

# Replace NaN values with 0 in the percentage columns
priority_totals['percentage_yes'] = priority_totals['percentage_yes'].fillna(0)
priority_totals['percentage_no'] = priority_totals['percentage_no'].fillna(0)

# Display the resulting dataframe
priority_totals.head()

Unnamed: 0,invoice_id,priority_yes_total,priority_no_total,total_invoice_line_total,percentage_yes,percentage_no
0,357583,817.2,283.0,1100.2,74.277404,25.722596
1,357863,3484.01,135.2,3619.21,96.264378,3.735622
2,357870,14475.5,0.0,14475.5,100.0,0.0
3,357875,10615.6,0.0,10615.6,100.0,0.0
4,357881,54.52,0.0,54.52,100.0,0.0


In [216]:
mapped_dic = {
           'priority_yes_total': 'Priority_product_invoice_total',
           'total_invoice_line_total': 'All_priority_invoice_total',
           'percentage_yes': 'pct_priority_product_invoice_total'
        }

df_renamed =priority_totals.rename(columns=mapped_dic)
df_renamed.head()

Unnamed: 0,invoice_id,Priority_product_invoice_total,priority_no_total,All_priority_invoice_total,pct_priority_product_invoice_total,percentage_no
0,357583,817.2,283.0,1100.2,74.277404,25.722596
1,357863,3484.01,135.2,3619.21,96.264378,3.735622
2,357870,14475.5,0.0,14475.5,100.0,0.0
3,357875,10615.6,0.0,10615.6,100.0,0.0
4,357881,54.52,0.0,54.52,100.0,0.0


In [217]:
# Create a new flag column
df_renamed['low_mix_priority_flag'] = df_renamed['pct_priority_product_invoice_total'] > 70

# Display the updated DataFrame
df_renamed.head()

Unnamed: 0,invoice_id,Priority_product_invoice_total,priority_no_total,All_priority_invoice_total,pct_priority_product_invoice_total,percentage_no,low_mix_priority_flag
0,357583,817.2,283.0,1100.2,74.277404,25.722596,True
1,357863,3484.01,135.2,3619.21,96.264378,3.735622,True
2,357870,14475.5,0.0,14475.5,100.0,0.0,True
3,357875,10615.6,0.0,10615.6,100.0,0.0,True
4,357881,54.52,0.0,54.52,100.0,0.0,True


In [230]:
df_renamed['pct_priority_product_invoice_total'].describe()

count    18551.000000
mean        96.492913
std         10.308001
min          0.000000
25%         98.584908
50%        100.000000
75%        100.000000
max        100.000000
Name: pct_priority_product_invoice_total, dtype: float64

In [218]:
# Count unique invoice IDs grouped by low_mix_Priority_Flag
low_mix_priority_counts = df_renamed.groupby('low_mix_priority_flag')['invoice_id'].nunique()

# Display the counts
print(low_mix_priority_counts)

low_mix_priority_flag
False      538
True     18013
Name: invoice_id, dtype: int64


In [219]:
# Assuming the flag column is in a DataFrame named `priority_totals`
# and the flag column is named 'High_Priority_Flag'

# Merge the flag column into the filtered_invoices DataFrame
mapped_df = mapped_df.merge(
    df_renamed[['invoice_id', 'low_mix_priority_flag']],
    on='invoice_id',
    how='left'
)

# Display the updated DataFrame
mapped_df.head()

Unnamed: 0,project_id,project_name,po_no,account,account_description,siteid,site,supplierid,suppliername,partnumber,...,all_priority_products,any_priority_products,match_commodity,match_supplier,new_commodity_description,new_commodity_group,all_accounts_2008_classified,all_2008_accounts_converted,multiple_commodities,low_mix_priority_flag
0,2312127222,Blue Chip Casino Deli,100317,5504,PROJECT Freight,DIT,Diverzify Itasca,121550,Mohawk Industries,,...,True,True,Commodity Not Found,Supplier registered,,,True,True,False,True
1,2401132763,FAROPOINT LOBBY RENOVATION,55700,2008,Received Not Yet Invoiced,SPN,Spectra Norcross,103277,William M. Bird,198962-003,...,True,True,Commodity Found,No supplier found,LVP,1VNL,True,True,False,True
2,2401132815,PCI-SCU Griffin Carpet Replacement,55857,2008,Received Not Yet Invoiced,SPN,Spectra Norcross,103423,"Shaw Industries, Inc.",175523-002,...,True,True,Commodity Found,Supplier registered,Carpet Roll,1CBL,True,True,False,True
3,2407168896,Stanley D Lindsey Associates,131646,5504,PROJECT Freight,SPN,Spectra Norcross,107776,J.J. Haines & Company,,...,True,True,Commodity Not Found,Supplier registered,,,True,True,False,True
4,2310117701,IMP Suite 116,50923,2008,Received Not Yet Invoiced,SPT,Spectra Tampa,107776,J.J. Haines & Company,126732-137,...,True,True,Commodity Found,Supplier registered,VCT,1VNL,True,True,False,True


In [220]:
# Export the `mapped_df` DataFrame to a CSV file in the `data/downloads` directory
mapped_df.to_csv('../../data/output/mapped_df.csv', index=False)

## Structuring the sample sizing

In [221]:
# === Step 0: Define sites and filter columns ===
#site_list = ['SPJ', 'SPT', 'SPW']
site_list = ['DIT', 'SPJ', 'SPN', 'SPT', 'SPW']

filter_columns = [
    'all_accounts_2008_classified',
    'all_2008_accounts_converted',
    'all_priority_products',
    'freight_invoice'
]

# === Step A: Pre-filter Summary Table ===
summary_table = pd.DataFrame(index=filter_columns, columns=site_list)

for f in filter_columns:
    for site in site_list:
        count = mapped_df[
            (mapped_df['siteid'] == site) & (mapped_df[f] == True)
        ]['invoice_id'].nunique()
        summary_table.loc[f, site] = count

# Print pre-filter summary table
print("\n=== Pre-Filter Summary Table (Unique Invoice Counts per Filter per Site) ===")
print(summary_table)

# === Step B: Apply combined filters ===
filtered_df = mapped_df[
    (mapped_df['all_accounts_2008_classified'] == True) &
    (mapped_df['all_2008_accounts_converted'] == True) &
    (mapped_df['all_priority_products'] == True) &
    (mapped_df['freight_invoice'] == True)
]

# === Step C: Apply site-specific filter ===
filtered_df = filtered_df[filtered_df['siteid'].isin(site_list)]

# === Step D: Count and print summary info ===
unique_invoices_count = filtered_df['invoice_id'].nunique()
print(f"\nTotal number of unique invoices in the filtered DataFrame: {unique_invoices_count}")

unique_invoices_per_site = filtered_df.groupby('siteid')['invoice_id'].nunique()
print("\nUnique invoice count by site (after full filtering):")
print(unique_invoices_per_site)

# === Step E: Post-filter Summary Table ===
summary_table_post_filter = pd.DataFrame(index=filter_columns, columns=site_list)

for f in filter_columns:
    for site in site_list:
        count = filtered_df[
            (filtered_df['siteid'] == site) & (filtered_df[f] == True)
        ]['invoice_id'].nunique()
        summary_table_post_filter.loc[f, site] = count

# Print post-filter summary table
print("\n=== Post-Filter Summary Table (Unique Invoice Counts per Filter per Site) ===")
print(summary_table_post_filter)

# === Step F: Preview filtered DataFrame ===
print("\n=== Preview of Filtered Data ===")
print(filtered_df.head())



=== Pre-Filter Summary Table (Unique Invoice Counts per Filter per Site) ===
                               DIT   SPJ   SPN   SPT   SPW
all_accounts_2008_classified   680   817  3745  2224  2319
all_2008_accounts_converted    145   293  1205   618   709
all_priority_products          832   451  2056  1136  1313
freight_invoice               4848  1192  4532  3908  4102

Total number of unique invoices in the filtered DataFrame: 2040

Unique invoice count by site (after full filtering):
siteid
DIT    108
SPJ    259
SPN    477
SPT    543
SPW    653
Name: invoice_id, dtype: int64

=== Post-Filter Summary Table (Unique Invoice Counts per Filter per Site) ===
                              DIT  SPJ  SPN  SPT  SPW
all_accounts_2008_classified  108  259  477  543  653
all_2008_accounts_converted   108  259  477  543  653
all_priority_products         108  259  477  543  653
freight_invoice               108  259  477  543  653

=== Preview of Filtered Data ===
   project_id                   

In [222]:
filtered_df.columns

Index(['project_id', 'project_name', 'po_no', 'account', 'account_description',
       'siteid', 'site', 'supplierid', 'suppliername', 'partnumber',
       'partdescription', 'quantity', 'invoice_id', 'invoice_no', 'uom',
       'comm1', 'comm2', 'commodity_group', 'commoditydescription', 'old_new',
       'priority', 'classification', 'conversion_code', 'invoice_line_total',
       'freight_invoice', 'multiple_freight_lines', 'multiple_parts',
       'all_priority_products', 'any_priority_products', 'match_commodity',
       'match_supplier', 'new_commodity_description', 'new_commodity_group',
       'all_accounts_2008_classified', 'all_2008_accounts_converted',
       'multiple_commodities', 'low_mix_priority_flag'],
      dtype='object')

In [223]:
# Step 1: Filter rows where account == 5504 (freight lines)
freight_lines = filtered_df[filtered_df['account'] == 5504]

# Step 2: Group by invoice_id and calculate the total freight cost per invoice
freight_per_invoice = freight_lines.groupby('invoice_id', as_index=False).agg({'invoice_line_total': 'sum'})
freight_per_invoice.rename(columns={'invoice_line_total': 'freight_per_invoice'}, inplace=True)

# Step 3: Merge the freight cost back into the original DataFrame
filtered_df = filtered_df.merge(freight_per_invoice, on='invoice_id', how='left')

# Step 4: Fill NaN values in the new column with 0
filtered_df['freight_per_invoice'] = filtered_df['freight_per_invoice'].fillna(0)
# Display the updated DataFrame
filtered_df.head()

Unnamed: 0,project_id,project_name,po_no,account,account_description,siteid,site,supplierid,suppliername,partnumber,...,any_priority_products,match_commodity,match_supplier,new_commodity_description,new_commodity_group,all_accounts_2008_classified,all_2008_accounts_converted,multiple_commodities,low_mix_priority_flag,freight_per_invoice
0,2312127222,Blue Chip Casino Deli,100317,5504,PROJECT Freight,DIT,Diverzify Itasca,121550,Mohawk Industries,,...,True,Commodity Not Found,Supplier registered,,,True,True,False,True,19.07
1,2401132763,FAROPOINT LOBBY RENOVATION,55700,2008,Received Not Yet Invoiced,SPN,Spectra Norcross,103277,William M. Bird,198962-003,...,True,Commodity Found,No supplier found,LVP,1VNL,True,True,False,True,5.79
2,2401132815,PCI-SCU Griffin Carpet Replacement,55857,2008,Received Not Yet Invoiced,SPN,Spectra Norcross,103423,"Shaw Industries, Inc.",175523-002,...,True,Commodity Found,Supplier registered,Carpet Roll,1CBL,True,True,False,True,2.02
3,2407168896,Stanley D Lindsey Associates,131646,5504,PROJECT Freight,SPN,Spectra Norcross,107776,J.J. Haines & Company,,...,True,Commodity Not Found,Supplier registered,,,True,True,False,True,1.13
4,2310117701,IMP Suite 116,50923,2008,Received Not Yet Invoiced,SPT,Spectra Tampa,107776,J.J. Haines & Company,126732-137,...,True,Commodity Found,Supplier registered,VCT,1VNL,True,True,False,True,2.23


In [224]:
# Group by siteid and conversion_code, then calculate the unique total for each group based on freight_per_invoice
grouped_summary = filtered_df.groupby(['siteid','new_commodity_group','new_commodity_description',]).agg(
    unique_freight_total=('invoice_id', lambda x: freight_per_invoice.loc[freight_per_invoice['invoice_id'].isin(x), 'freight_per_invoice'].sum().round(2)),
    unique_invoices=('invoice_id', 'nunique')
).reset_index()



# Display the grouped summary
grouped_summary.head(50)


Unnamed: 0,siteid,new_commodity_group,new_commodity_description,unique_freight_total,unique_invoices
0,DIT,1CBL,Carpet Roll,16381.65,44
1,DIT,1VNL,LVP,8194.05,27
2,DIT,1VNL,LVT,18910.37,33
3,DIT,1VNL,VCT,4791.19,6
4,SPJ,1CBL,Carpet Roll,7960.92,41
5,SPJ,1VNL,LVP,21829.93,55
6,SPJ,1VNL,LVT,41303.94,105
7,SPJ,1VNL,VCT,23431.56,64
8,SPN,1CBL,Carpet Roll,18486.17,35
9,SPN,1VNL,LVP,48006.12,108


In [225]:
filtered_df.columns

Index(['project_id', 'project_name', 'po_no', 'account', 'account_description',
       'siteid', 'site', 'supplierid', 'suppliername', 'partnumber',
       'partdescription', 'quantity', 'invoice_id', 'invoice_no', 'uom',
       'comm1', 'comm2', 'commodity_group', 'commoditydescription', 'old_new',
       'priority', 'classification', 'conversion_code', 'invoice_line_total',
       'freight_invoice', 'multiple_freight_lines', 'multiple_parts',
       'all_priority_products', 'any_priority_products', 'match_commodity',
       'match_supplier', 'new_commodity_description', 'new_commodity_group',
       'all_accounts_2008_classified', 'all_2008_accounts_converted',
       'multiple_commodities', 'low_mix_priority_flag', 'freight_per_invoice'],
      dtype='object')

In [226]:
# Get full timestamp as string
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Get just the date component as a datetime object
date= datetime.now().strftime("%Y%m%d")

filtered_df = filtered_df[filtered_df['conversion_code'] != 'nan_nan_nan']
# Create a new filename with the current timestamp
filtered_df.to_excel(f'../../data/output/{date}_modelling_input_v_{timestamp}.xlsx', index=False, engine='openpyxl')
filtered_df.to_csv(f'../../data/output/{date}_modelling_input_v_{timestamp}.csv', index=False,)

# Save the filtered DataFrame to an Excel file