In [35]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the CY2024 freight cost extract into the main DataFrame.
# latin1 is used for decoding; low_memory=False makes pandas read the file in
# one pass so each column's dtype is inferred from all of its values.
file_path = "../../data/input/Freight_Cost_Analysis_CY2024-03.25.csv"
df = pd.read_csv(
    file_path,
    low_memory=False,
    encoding="latin1",
)

# === Load commodity groups and attach them to the invoice lines ===
# The lookup table lives on the 'Commodity Groups' sheet of the workbook.
commodity_df = pd.read_excel(
    '../../data/input/IFS Cloud Commodity Groups.xlsx',
    sheet_name='Commodity Groups',
)

# Build a string-typed join key named 'COMM 1' on both sides so the merge
# compares like with like.
# NOTE(review): astype(str) turns missing values into the literal string 'nan',
# so a blank 'Commodity Group' row in the workbook would match every invoice
# line with a missing 'COMM 1' - confirm the lookup has no blank rows.
commodity_df['COMM 1'] = commodity_df['Commodity Group'].astype(str)
df['COMM 1'] = df['COMM 1'].astype(str)

# Left join: keep every invoice line, with or without a commodity match.
merged_df = pd.merge(df, commodity_df, how='left', on='COMM 1')

# From here on `df` is the commodity-enriched frame (same object as merged_df).
df = merged_df

# Standardise the unit of measure.
# Whitespace and case are normalised FIRST so that variants such as 'sf' or
# ' SF ' are also caught by the abbreviation mapping; running the replacement
# before normalising would leave those variants unmapped.
df['INV UOM'] = (
    df['INV UOM']
    .str.strip()
    .str.upper()
    .replace({'SF': 'SQFT', 'SY': 'SQYD'})
)

# Label each row by whether its unit of measure is an area unit we can convert.
# Vectorised isin + map replaces a row-wise apply: same labels, but no Python
# call per row, and it also works on an empty frame. Missing 'INV UOM' values
# are not in the list, so they are labelled 'Unclassified'.
df['Classification'] = (
    df['INV UOM']
    .isin(['SQFT', 'SQYD'])
    .map({True: 'Classified', False: 'Unclassified'})
)

# Build 'conversion_code' as '<Description>_<Commodity Group>_<INV UOM>', with
# spaces in the description replaced by underscores.
# Missing parts are rendered as the string 'nan' by astype(str), so a row with
# no commodity match and no unit of measure gets the code 'nan_nan_nan'.
df['conversion_code'] = (
    df['Description'].str.replace(' ', '_', regex=False).astype(str)
    + '_' + df['Commodity Group'].astype(str)
    + '_' + df['INV UOM'].astype(str)
)
df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,INV UOM,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE,Commodity Group,Description,Old/New,Priority,Classification,conversion_code
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,-600.0,,,,,,Unclassified,nan_nan_nan
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,EA,4.0,18.4,18.4,1ACC,Accessories,New Commodity,No,Unclassified,Accessories_1ACC_EA


In [36]:
# Rows posted to account 5504.
freight_invoices = df.loc[df['ACCOUNT'].eq(5504)]

# Flag every row whose INVOICE ID also appears on at least one account-5504 row.
df['Has Matching Account 5504'] = df['INVOICE ID'].isin(freight_invoices['INVOICE ID'])

# Preview the frame with the new flag column.
df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE,Commodity Group,Description,Old/New,Priority,Classification,conversion_code,Has Matching Account 5504
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,-600.0,,,,,,Unclassified,nan_nan_nan,False
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,4.0,18.4,18.4,1ACC,Accessories,New Commodity,No,Unclassified,Accessories_1ACC_EA,True


In [37]:
# Report the (rows, columns) dimensions of df.
df.shape

(441707, 35)

In [38]:

# Freight cost table: sum of INVOICE LINE TOTAL over the account-5504 rows,
# one row per project / invoice / supplier combination.
# NOTE(review): groupby drops rows whose key columns are missing by default, so
# account-5504 lines without e.g. a PROJECT NAME would not appear here - confirm
# that is intended.
freight_group_keys = ['PROJECT NAME', 'PROJECT ID', 'INVOICE ID', 'SUPPLIER NO', 'SUPPLIER NAME']
freight_5504 = (
    df.loc[df['ACCOUNT'].eq(5504)]
    .groupby(freight_group_keys)['INVOICE LINE TOTAL']
    .sum()
    .rename('Freight Cost')
    .reset_index()
)
freight_5504.head(2)

Unnamed: 0,PROJECT NAME,PROJECT ID,INVOICE ID,SUPPLIER NO,SUPPLIER NAME,Freight Cost
0,BHC1901 Bunker Hill CommuZ024,2403145698,1288202,101558,"The Belknap White Group, LLC dba UCX Flooring",17.35
1,"LEVIAT NE Trade Center, Bldg 1",2405158431,1252144,107297,Trinity Tile,1631.19


In [39]:
# Keep only the join key and the aggregated amount from the freight table.
invoice = freight_5504.loc[:, ['INVOICE ID', 'Freight Cost']]
invoice.head()

Unnamed: 0,INVOICE ID,Freight Cost
0,1288202,17.35
1,1252144,1631.19
2,1272161,1285.02
3,1301523,18.2
4,1667076,225.91


In [40]:
# Attach the per-invoice freight cost to every invoice line (left join keeps
# lines whose invoice has no freight; they get NaN in 'Freight Cost').
# `invoice` comes from a table grouped on project/invoice/supplier, so an
# INVOICE ID could appear there more than once; validate='many_to_one' makes
# pandas raise a MergeError in that case instead of silently duplicating
# invoice lines.
merged_df = df.merge(invoice, on='INVOICE ID', how='left', validate='many_to_one')

# Row count should match df; only the 'Freight Cost' column is added.
merged_df.shape

(441707, 36)

In [41]:
# Flag invoices that carry more than one account-5504 line.

# Number of account-5504 rows per INVOICE ID.
account_5504_counts = (
    df[df['ACCOUNT'] == 5504]
    .groupby('INVOICE ID')
    .size()
    .reset_index(name='Freight count')
)

# INVOICE IDs with more than one account-5504 row.
# The filter must test the 'Freight count' column and return the 'INVOICE ID'
# values: masking the whole frame and taking .index yields row positions
# (0..n-1), not invoice IDs.
multiple_5504_ids = pd.Index(
    account_5504_counts.loc[account_5504_counts['Freight count'] > 1, 'INVOICE ID']
)

# Boolean flag on the base frame.
df['Multiple Account 5504'] = df['INVOICE ID'].isin(multiple_5504_ids)

# Set the same flag on merged_df directly. Merging df[['INVOICE ID', flag]] back
# on INVOICE ID would join many rows to many rows (k rows per invoice become
# k*k) and would create _x/_y duplicate columns when the cell is re-run.
merged_df['Multiple Account 5504'] = merged_df['INVOICE ID'].isin(multiple_5504_ids)
merged_df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PO PRICE,Commodity Group,Description,Old/New,Priority,Classification,conversion_code,Has Matching Account 5504,Freight Cost,Multiple Account 5504
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,,,Unclassified,nan_nan_nan,False,,False
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,18.4,1ACC,Accessories,New Commodity,No,Unclassified,Accessories_1ACC_EA,True,50.0,False


In [43]:
# Number of distinct conversion codes per invoice.
# NOTE(review): rows with no commodity/UOM carry the string code 'nan_nan_nan',
# which nunique counts as a code of its own - confirm such rows should make an
# invoice count as "multiple".
conversion_code_counts = merged_df.groupby('INVOICE ID')['conversion_code'].nunique()

# INVOICE IDs that have more than one distinct conversion code.
multiple_conversion_codes = conversion_code_counts.loc[lambda counts: counts.gt(1)].index

# Row-level flag: True when the row's invoice is in that set.
merged_df['Multiple Conversion Codes'] = merged_df['INVOICE ID'].isin(multiple_conversion_codes)

# Preview the frame with the new flag column.
merged_df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,Commodity Group,Description,Old/New,Priority,Classification,conversion_code,Has Matching Account 5504,Freight Cost,Multiple Account 5504,Multiple Conversion Codes
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,,Unclassified,nan_nan_nan,False,,False,False
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,1ACC,Accessories,New Commodity,No,Unclassified,Accessories_1ACC_EA,True,50.0,False,True


In [None]:
# Merge the conversion code counts back to the merged Data