In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load your dataset
file_path = "../data/input/Freight_Cost_Analysis_CY2024-03.25.csv"
df = pd.read_csv(file_path, encoding="latin1", low_memory=False)

# === Flag if a PO has any 'PROJECT Freight' using ACCOUNT code 5504 ===
df['IS_PROJECT_FREIGHT_LINE'] = df['ACCOUNT'] == 5504
project_freight_flag = df[df['IS_PROJECT_FREIGHT_LINE']].groupby('PO NO').size().reset_index(name='PROJECT_FREIGHT_COUNT')
project_freight_flag['PO_HAS_PROJECT_FREIGHT'] = True
df = df.merge(project_freight_flag[['PO NO', 'PO_HAS_PROJECT_FREIGHT']], on='PO NO', how='left')
df['PO_HAS_PROJECT_FREIGHT'] = df['PO_HAS_PROJECT_FREIGHT'].fillna(False).astype(bool)

# === Step 1: Freight Spend Profiling ===
freight_lines = df[df['IS_PROJECT_FREIGHT_LINE']]

total_po_value = df.groupby('PO NO', as_index=False)['INVOICE LINE TOTAL'].sum().rename(
    columns={'INVOICE LINE TOTAL': 'TOTAL_PO_VALUE'}
)
freight_cost = freight_lines.groupby('PO NO', as_index=False)['INVOICE LINE TOTAL'].sum().rename(
    columns={'INVOICE LINE TOTAL': 'PROJECT_FREIGHT_COST'}
)

df = df.merge(total_po_value, on='PO NO', how='left')
df = df.merge(freight_cost, on='PO NO', how='left')

# Convert to numeric types
df['TOTAL_PO_VALUE'] = pd.to_numeric(df['TOTAL_PO_VALUE'], errors='coerce')
df['PROJECT_FREIGHT_COST'] = pd.to_numeric(df['PROJECT_FREIGHT_COST'], errors='coerce').fillna(0)

# Calculations with enforced numeric types
df['PRODUCT_ONLY_PO_VALUE'] = df['TOTAL_PO_VALUE'] - df['PROJECT_FREIGHT_COST']
df['PRODUCT_ONLY_PO_VALUE'] = pd.to_numeric(df['PRODUCT_ONLY_PO_VALUE'], errors='coerce')

df['PROJECT_FREIGHT_PERCENT'] = df['PROJECT_FREIGHT_COST'] / df['TOTAL_PO_VALUE']
df['PROJECT_FREIGHT_PERCENT'] = pd.to_numeric(df['PROJECT_FREIGHT_PERCENT'], errors='coerce')

df['FREIGHT_PERCENT_EXCL_PRODUCT'] = df['PROJECT_FREIGHT_COST'] / df['PRODUCT_ONLY_PO_VALUE']
df['FREIGHT_PERCENT_EXCL_PRODUCT'] = pd.to_numeric(df['FREIGHT_PERCENT_EXCL_PRODUCT'], errors='coerce')

# Flags
df['FREIGHT_≥90%_OF_PO'] = (df['PROJECT_FREIGHT_PERCENT'] >= 0.9).astype(bool)
df['FREIGHT_GT_PRODUCT'] = (df['PROJECT_FREIGHT_COST'] > df['PRODUCT_ONLY_PO_VALUE']).astype(bool)
df['NEGATIVE_FREIGHT_PERCENT'] = (df['PROJECT_FREIGHT_PERCENT'] < 0).astype(bool)

# Count number of ACCOUNT 5504 lines per PO
freight_line_counts = df[df['ACCOUNT'] == 5504].groupby('PO NO').agg(
    PROJECT_FREIGHT_LINE_COUNT=('ACCOUNT', 'count')
).reset_index()

df = df.merge(freight_line_counts, on='PO NO', how='left')
df['PROJECT_FREIGHT_LINE_COUNT'] = df['PROJECT_FREIGHT_LINE_COUNT'].fillna(0).astype(int)
df['PO_HAS_MULTIPLE_PROJECT_FREIGHT_LINES'] = df['PROJECT_FREIGHT_LINE_COUNT'] > 1

# === Step 2: ZIP & Supplier Analysis ===
df['SHIP TO ZIP'] = df['SHIP TO ZIP'].astype(str).str.extract(r'(\d{5})')
freight_df = df[df['PROJECT_FREIGHT_COST'] > 0]
zip_supplier_summary = freight_df.groupby(['SHIP TO ZIP', 'SUPPLIER NO']).agg(
    AVG_FREIGHT_PERCENT=('PROJECT_FREIGHT_PERCENT', 'mean'),
    PO_COUNT=('PO NO', 'nunique')
).reset_index()
zip_supplier_summary['HIGH_FREIGHT_FLAG'] = zip_supplier_summary['AVG_FREIGHT_PERCENT'] > 0.5
df = df.merge(zip_supplier_summary[['SHIP TO ZIP', 'SUPPLIER NO', 'HIGH_FREIGHT_FLAG']],
              on=['SHIP TO ZIP', 'SUPPLIER NO'], how='left')

# === Step 3: Product Analysis ===
labor_keywords = ['LABOR', 'INSTALL', 'SERVICE', 'WAGE', 'CONTRACT', 'EMPLOYEE']
df['PART DESCRIPTION CLEAN'] = df['PART DESCRIPTION'].astype(str).str.upper()
df['IS_LABOR'] = df['PART DESCRIPTION CLEAN'].apply(lambda desc: any(k in desc for k in labor_keywords))
product_counts = df[~df['IS_LABOR'] & df['PART DESCRIPTION'].notna()].groupby(
    'PART DESCRIPTION'
).size().reset_index(name='PRODUCT_ORDER_COUNT')
product_counts['PRODUCT_ORDER_RANK'] = product_counts['PRODUCT_ORDER_COUNT'].rank(method='dense', ascending=False).astype(int)
df = df.merge(product_counts, on='PART DESCRIPTION', how='left')

# === Step 4: UOM Consistency ===
def compare_uom(row):
    if pd.isna(row['INV UOM']):
        return 'Missing INV UOM'
    elif row['PURCH UOM'] == row['INV UOM']:
        return 'Match'
    else:
        return 'Mismatch'
df['UOM_COMPARISON_STATUS'] = df.apply(compare_uom, axis=1)

# === Step 5: Key Metrics Summary ===
po_summary = df[['PO NO', 'PROJECT_FREIGHT_COST', 'PRODUCT_ONLY_PO_VALUE']].drop_duplicates().dropna()
po_summary['FREIGHT_PERCENT_EXCL_PRODUCT'] = po_summary['PROJECT_FREIGHT_COST'] / po_summary['PRODUCT_ONLY_PO_VALUE']
pct_le_10 = (po_summary['FREIGHT_PERCENT_EXCL_PRODUCT'] <= 0.10).mean() * 100
pct_gt_50 = (po_summary['FREIGHT_PERCENT_EXCL_PRODUCT'] > 0.50).mean() * 100
print(f"% of POs with freight ≤ 10% of product spend: {pct_le_10:.2f}%")
print(f"% of POs with freight > 50% of product spend: {pct_gt_50:.2f}%")

# === Step 6: Composite Key Consistency ===
df['PO_INVOICE_COMPOSITE_KEY'] = df['PO NO'].astype(str) + '|' + df['INVOICE ID'].astype(str) + '|' + df['INVOICE NO'].astype(str)
composite_counts = df.groupby('PO NO')['PO_INVOICE_COMPOSITE_KEY'].nunique().reset_index()
composite_counts['PO_COMPOSITE_KEY_CONSISTENCY'] = composite_counts['PO_INVOICE_COMPOSITE_KEY'].apply(
    lambda x: 'Consistent' if x == 1 else 'Inconsistent'
)
df = df.merge(composite_counts[['PO NO', 'PO_COMPOSITE_KEY_CONSISTENCY']], on='PO NO', how='left')

# === Step 7: Quantity Consistency Check ===
# Ensure numeric comparisons
df['INVOICED LINE QTY'] = pd.to_numeric(df['INVOICED LINE QTY'], errors='coerce')
df['PO PURCH QTY'] = pd.to_numeric(df['PO PURCH QTY'], errors='coerce')
df['PO INV QTY'] = pd.to_numeric(df['PO INV QTY'], errors='coerce')

# Compare all three quantities
df['QTY_CONSISTENCY_FLAG'] = (
    (df['INVOICED LINE QTY'] == df['PO PURCH QTY']) &
    (df['PO PURCH QTY'] == df['PO INV QTY'])
)

# === Final Type Cleanup Before Export ===
numeric_cols = [
    'TOTAL_PO_VALUE', 'PROJECT_FREIGHT_COST', 'PROJECT_FREIGHT_PERCENT',
    'PRODUCT_ONLY_PO_VALUE', 'FREIGHT_PERCENT_EXCL_PRODUCT'
]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').round(6)

# === Export enriched dataset to data/output/ ===
output_path = "../data/output/Freight_Analysis_Enriched_Output.csv"
#df.to_csv(output_path, index=False, float_format='%.6f')
print(f"\n✅ Enriched dataset exported to: {output_path}")


% of POs with freight ≤ 10% of product spend: 87.67%
% of POs with freight > 50% of product spend: 2.61%

✅ Enriched dataset exported to: ../data/output/Freight_Analysis_Enriched_Output.csv


In [2]:
# Group by 'COMM 1' and aggregate unique UOMs
uom_summary = df.groupby(['COMM 1','COMM 2'])['INV UOM'].unique().reset_index()

# Rename the column for clarity
uom_summary.rename(columns={'INV UOM': 'UNIQUE_UOMS'}, inplace=True)

# Display the result
uom_summary.head()

Unnamed: 0,COMM 1,COMM 2,UNIQUE_UOMS
0,10,10,"[SY, SF, LF, EA, BOX]"
1,10,100,[SY]
2,100,10,[SY]
3,100,100,"[SY, SF, EA, PCS, CTN, yd2]"
4,1000,1000,"[EA, SF]"


In [3]:
df.columns


Index(['SITE', 'SITE DESCRIPTION', 'SUPPLIER NO', 'SUPPLIER NAME',
       'INVOICE ID', 'INVOICE NO', 'DATE POSTED', 'PROJECT ID', 'PROJECT NAME',
       'ACCOUNT', 'ACCOUNT DESCRIPTION', 'PLANNED DELIVERY DATE',
       'SHIP TO ZIP', 'PO NO', 'PO LINE NO', 'PO REL NO', 'RECEIPT NO',
       'PART NO', 'PART DESCRIPTION', 'COMM 1', 'COMM 2', 'PO PURCH QTY',
       'PURCH UOM', 'PO INV QTY', 'INV UOM', 'INVOICED LINE QTY',
       'INVOICE LINE TOTAL', 'PO PRICE', 'IS_PROJECT_FREIGHT_LINE',
       'PO_HAS_PROJECT_FREIGHT', 'TOTAL_PO_VALUE', 'PROJECT_FREIGHT_COST',
       'PRODUCT_ONLY_PO_VALUE', 'PROJECT_FREIGHT_PERCENT',
       'FREIGHT_PERCENT_EXCL_PRODUCT', 'FREIGHT_≥90%_OF_PO',
       'FREIGHT_GT_PRODUCT', 'NEGATIVE_FREIGHT_PERCENT',
       'PROJECT_FREIGHT_LINE_COUNT', 'PO_HAS_MULTIPLE_PROJECT_FREIGHT_LINES',
       'HIGH_FREIGHT_FLAG', 'PART DESCRIPTION CLEAN', 'IS_LABOR',
       'PRODUCT_ORDER_COUNT', 'PRODUCT_ORDER_RANK', 'UOM_COMPARISON_STATUS',
       'PO_INVOICE_COMPOSITE_KEY',

In [4]:
#  === Load Commodity Groups ===
# Load the commodity groups from the Excel file
commodity_df = pd.read_excel('../data/input/IFS Cloud Commodity Groups.xlsx', sheet_name='Commodity Groups')
commodity_df.head()

Unnamed: 0,Commodity Group,Description,Old/New,Priority
0,0,Zero Cost,Old Commodity,No
1,10,Carpet,Old Commodity,Yes
2,20,Sheet Vinyl,Old Commodity,Yes
3,30,Product Care Supplies,Old Commodity,No
4,40,Vinyl Tile,Old Commodity,Yes


In [5]:
# Convert 'Commodity Group' to string and create a new column 'COMM 1'
commodity_df['COMM 1'] = commodity_df['Commodity Group'].astype(str)

In [6]:
# Convert 'Commodity Group' to string in the main DataFrame
df['COMM 1'] = df['COMM 1'].astype(str)

In [7]:
# Perform the join on the 'COMM 1' column
merged_df = df.merge(commodity_df, on='COMM 1', how='left')

# Display the first few rows of the merged DataFrame
merged_df.head()

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PRODUCT_ORDER_COUNT,PRODUCT_ORDER_RANK,UOM_COMPARISON_STATUS,PO_INVOICE_COMPOSITE_KEY,PO_COMPOSITE_KEY_CONSISTENCY,QTY_CONSISTENCY_FLAG,Commodity Group,Description,Old/New,Priority
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,Missing INV UOM,nan|433731|0007795-CM,,False,,,,
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,1.0,129.0,Match,48180|433340|173373,Inconsistent,True,1ACC,Accessories,New Commodity,No
2,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,1.0,129.0,Match,48180|433340|173373,Inconsistent,True,1TRAN,Transitions,New Commodity,No
3,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,5504,...,,,Missing INV UOM,48180|433340|173373,Inconsistent,False,,,,
4,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2015,...,,,Missing INV UOM,48180|433340|173373,Inconsistent,False,,,,


In [8]:
# === Load Manual Lines ===
# Load the manual lines from the Excel file
manual_lines_df = pd.read_excel('../data/input/Manual Lines.xlsx',sheet_name='Sheet0')
manual_lines_df.head()

Unnamed: 0,Supplier Id,Supplier Name,Po Ref Number,Status,Invoice Series,Invoice No,Invoice Date,Voucher Type,Voucher No,Voucher Date,...,Project Desc,Site Code,Site Code Desc,Fixed Asset,Fixed Asset Desc,Brand Part,Brand Part Desc,Project Activity,Line Ref,Posting Line Amount
0,104323,Nydree Flooring,168676,Paid Posted,SX,0125342-IN,2024-12-17,I,2025001129,2025-01-02,...,Amli Atlantic Station Public Area,SPN,Spectra Norcross,,,SPC,Spectra,100044728.0,,81.31
1,107786,Dal Tile Corporation,130449,Paid Posted,SX,0143516889,2024-08-15,I,2025009505,2025-01-23,...,North Chase 300,FSC,Floor Sol Charleston,,,FLS,Flooring Solutions,100155932.0,,10.45
2,107786,Dal Tile Corporation,130449,Paid Posted,SX,0143516889,2024-08-15,I,2025009505,2025-01-23,...,North Chase 300,FSC,Floor Sol Charleston,,,FLS,Flooring Solutions,100155932.0,,25.98
3,107786,Dal Tile Corporation,130449,Paid Posted,SX,0143516889,2024-08-15,I,2025009505,2025-01-23,...,North Chase 300,FSC,Floor Sol Charleston,,,FLS,Flooring Solutions,100155932.0,,24.19
4,126695,Tarkett USA Inc,107284,Paid Posted,SI,8201991030,2024-07-17,I,2025003166,2025-01-08,...,LCPS Loudoun County High School,CCSG,Contract Carpet Sol Government,,,CCSG,Contract Carpet Systems Government,100147054.0,,6.31


In [9]:
# Convert 'Po Ref Number' to string and create a new column 'PO NO'
manual_lines_df['PO NO'] = manual_lines_df['Po Ref Number'].astype(str)

In [10]:
# Convert 'PO NO' to string in the main DataFrame
merged_df['PO NO'] = merged_df['PO NO'].astype(str)

In [11]:
# Create a new column to indicate if the PO NO is manual
merged_df['IS_MANUAL'] = merged_df['PO NO'].isin(manual_lines_df['PO NO'])

In [12]:
# Group by 'PO NO' and check if all rows in each group have 'Priority' == 'Yes'
merged_df['ALL_PRIORITY'] = merged_df.groupby('PO NO')['Priority'].transform(lambda x: (x == 'Yes').any())

# Flag the PO NOs where all rows have 'Priority' == 'Yes'
merged_df['PRIORITY_PO_FLAG'] = merged_df['ALL_PRIORITY']

In [13]:
# Group by 'COMM 1' and aggregate unique UOMs
uom_summary = merged_df.groupby(['COMM 1','COMM 2','Commodity Group','Description', 
'Old/New', 
'Priority',])['INV UOM'].unique().reset_index()

# Rename the column for clarity
uom_summary.rename(columns={'INV UOM': 'UNIQUE_UOMS'}, inplace=True)

# Display the result
uom_summary.head()

Unnamed: 0,COMM 1,COMM 2,Commodity Group,Description,Old/New,Priority,UNIQUE_UOMS
0,10,10,10,Carpet,Old Commodity,Yes,"[SY, SF, LF, EA, BOX]"
1,10,100,10,Carpet,Old Commodity,Yes,[SY]
2,100,10,100,Carpet Tiles,Old Commodity,Yes,[SY]
3,100,100,100,Carpet Tiles,Old Commodity,Yes,"[SY, SF, EA, PCS, CTN, yd2]"
4,1000,1000,1000,Mapei Products,Old Commodity,No,"[EA, SF]"


In [14]:
# Explode the UNIQUE_UOMS column into separate rows
uom_summary_exploded = uom_summary.explode('UNIQUE_UOMS')

# Display the first few rows of the resulting DataFrame
uom_summary_exploded.head()

Unnamed: 0,COMM 1,COMM 2,Commodity Group,Description,Old/New,Priority,UNIQUE_UOMS
0,10,10,10,Carpet,Old Commodity,Yes,SY
0,10,10,10,Carpet,Old Commodity,Yes,SF
0,10,10,10,Carpet,Old Commodity,Yes,LF
0,10,10,10,Carpet,Old Commodity,Yes,EA
0,10,10,10,Carpet,Old Commodity,Yes,BOX


In [15]:
#uom_summary_exploded.to_csv('../data/output/UOM_Summary.csv', index=False, float_format='%.6f')

In [16]:
important_columns = [
'SITE',
'SITE DESCRIPTION',
'SUPPLIER NO',
'SUPPLIER NAME',
'PO NO',
'ACCOUNT',
'ACCOUNT DESCRIPTION',
'SHIP TO ZIP',
'PART NO', 
'PART DESCRIPTION',
'PO PURCH QTY',
'PO INV QTY', 
'INVOICED LINE QTY',
'PURCH UOM', 
'INV UOM', 
'INVOICE LINE TOTAL', 
'PO PRICE', 
'TOTAL_PO_VALUE',
'PO_HAS_PROJECT_FREIGHT',
'PROJECT_FREIGHT_COST', 
'PROJECT_FREIGHT_PERCENT',
'PRODUCT_ONLY_PO_VALUE', 
'FREIGHT_≥90%_OF_PO',
'FREIGHT_GT_PRODUCT', 
'NEGATIVE_FREIGHT_PERCENT',
'PO_HAS_MULTIPLE_PROJECT_FREIGHT_LINES',
'COMM 1',
'Commodity Group', 
'Description', 
'Old/New', 
'Priority',
'IS_MANUAL',
'PRIORITY_PO_FLAG',
'QTY_CONSISTENCY_FLAG',
'UOM_COMPARISON_STATUS'

]

In [17]:
# Filter the DataFrame to include only the important columns
df_filtered = merged_df[important_columns]
df_filtered.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,PO NO,ACCOUNT,ACCOUNT DESCRIPTION,SHIP TO ZIP,PART NO,PART DESCRIPTION,...,PO_HAS_MULTIPLE_PROJECT_FREIGHT_LINES,COMM 1,Commodity Group,Description,Old/New,Priority,IS_MANUAL,PRIORITY_PO_FLAG,QTY_CONSISTENCY_FLAG,UOM_COMPARISON_STATUS
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,,5400,PROJECT Sub-Contract Labor,,,,...,False,,,,,,True,False,False,Missing INV UOM
1,BNB,Beckers New Brighton,104716,Hank's Specialties,48180.0,2008,Received Not Yet Invoiced,55430.0,1000007968.0,PROTECT ALL PRE-NOTCHED Z-BAR INSIDE CORNER AL...,...,False,1ACC,1ACC,Accessories,New Commodity,No,False,True,True,Match


In [18]:
df_filtered['PO NO'].nunique()

140304

In [19]:
negative_freight_summary = df_filtered.pivot_table(
    index='NEGATIVE_FREIGHT_PERCENT',
    values='PO NO',
    aggfunc=lambda X: len(X.unique()),
).reset_index()

negative_freight_summary.rename(columns={'PO NO': 'COUNT'}, inplace=True)
negative_freight_summary

Unnamed: 0,NEGATIVE_FREIGHT_PERCENT,COUNT
0,False,140077
1,True,227


In [20]:
negative_freight_summary = df_filtered.pivot_table(
    index='PO_HAS_PROJECT_FREIGHT',
    values='PO NO',
    aggfunc=lambda X: len(X.unique()),
).reset_index()

negative_freight_summary.rename(columns={'PO NO': 'COUNT'}, inplace=True)
negative_freight_summary

Unnamed: 0,PO_HAS_PROJECT_FREIGHT,COUNT
0,False,94521
1,True,45783


In [21]:
negative_freight_summary = df_filtered.pivot_table(
    index='PO_HAS_MULTIPLE_PROJECT_FREIGHT_LINES',
    values='PO NO',
    aggfunc=lambda X: len(X.unique()),
).reset_index()

negative_freight_summary.rename(columns={'PO NO': 'COUNT'}, inplace=True)
negative_freight_summary

Unnamed: 0,PO_HAS_MULTIPLE_PROJECT_FREIGHT_LINES,COUNT
0,False,135369
1,True,4935


In [22]:
negative_freight_summary = df_filtered.pivot_table(
    index='PRIORITY_PO_FLAG',
    values='PO NO',
    aggfunc=lambda X: len(X.unique()),
).reset_index()

negative_freight_summary.rename(columns={'PO NO': 'COUNT'}, inplace=True)
negative_freight_summary

Unnamed: 0,PRIORITY_PO_FLAG,COUNT
0,False,119279
1,True,21025


In [23]:
site_summary = df_filtered.groupby('SITE DESCRIPTION')['PO NO'].nunique().reset_index(name='UNIQUE_PO_COUNT').sort_values(by='UNIQUE_PO_COUNT', ascending=False)
site_summary

Unnamed: 0,SITE DESCRIPTION,UNIQUE_PO_COUNT
46,Spectra Norcross,13757
50,Spectra Tampa,11390
43,Spectra Longwood,10809
35,Spectra Coppell,8120
38,Spectra Houston,7220
19,Floor Sol Union,7128
51,Spectra Tempe,6692
10,Diverzify Itasca,6491
36,Spectra Denver,4343
42,Spectra Lockland,3939


In [24]:
supplier_freight_summary = df_filtered.groupby('SUPPLIER NAME', as_index=False)['PROJECT_FREIGHT_COST'].mean().sort_values(by='PROJECT_FREIGHT_COST', ascending=False)
supplier_freight_summary.rename(columns={'PROJECT_FREIGHT_COST': 'TOTAL_PROJECT_FREIGHT_COST'}, inplace=True)
supplier_freight_summary

Unnamed: 0,SUPPLIER NAME,TOTAL_PROJECT_FREIGHT_COST
1132,"Crossley Axminster, Inc.",49368.973770
2212,"InterfaceServices, Inc.",23599.888667
4605,United Office Systems Pvt Ltd,20803.835556
333,Arto Brick California Pavers,13659.817500
4172,Stone Tile International Inc,12421.300000
...,...,...
1391,Divine Flooring LP,-2.040000
112,"ASM Modular Systems, Inc",-5.895765
1545,ERC DELIVERY SERVICE,-41.489362
1206,DNU Bedrosians Tile & Stone,-80.870000


In [25]:
site_freight_summary = df_filtered.groupby('SITE DESCRIPTION', as_index=False)['PROJECT_FREIGHT_COST'].mean().sort_values(by='PROJECT_FREIGHT_COST', ascending=False)
site_freight_summary.rename(columns={'PROJECT_FREIGHT_COST': 'TOTAL_PROJECT_FREIGHT_COST'}, inplace=True)
site_freight_summary

Unnamed: 0,SITE DESCRIPTION,TOTAL_PROJECT_FREIGHT_COST
23,ProSpectra Ceritos,1639.447054
32,Select Prefab Solut,1117.708263
21,Kiefer USA,784.384966
10,Diverzify Itasca,562.621404
24,ProSpectra Las Vegas,472.482885
2,Continental Floors,418.340857
33,Spectra Austin,392.937204
46,Spectra Norcross,348.807706
47,Spectra Pelham,323.813617
34,Spectra Columbus,323.612067


In [26]:
# Merge site_freight_summary and site_summary on 'SITE DESCRIPTION'
merged_site_summary = site_freight_summary.merge(site_summary, on='SITE DESCRIPTION', how='inner')

# Display the first few rows of the merged DataFrame
merged_site_summary.head()

Unnamed: 0,SITE DESCRIPTION,TOTAL_PROJECT_FREIGHT_COST,UNIQUE_PO_COUNT
0,ProSpectra Ceritos,1639.447054,460
1,Select Prefab Solut,1117.708263,641
2,Kiefer USA,784.384966,1262
3,Diverzify Itasca,562.621404,6491
4,ProSpectra Las Vegas,472.482885,132


In [None]:
# Filter for rows where ACCOUNT is 2008
account_2008_df = merged_df[merged_df['ACCOUNT'] == 2008]

# Group by 'PO NO' and check if all rows in each group have 'Priority' == 'Yes'
priority_2008_flag = df_filtered[df_filtered['ACCOUNT'] == 2008].groupby('PO NO')['Priority'].apply(lambda x: (x == 'Yes').all()).reset_index(name='ALL_PRIORITY_2008')

# Merge the result back to the filtered DataFrame
df_filtered = df_filtered.merge(priority_2008_flag, on='PO NO', how='left')

# Fill NaN values with False for the new column
df_filtered['ALL_PRIORITY_2008'] = df_filtered['ALL_PRIORITY_2008'].fillna(False)

# Flag the PO NOs where all rows with ACCOUNT = 2008 have 'Priority' == 'Yes'
df_filtered['PRIORITY_PO_FLAG_2008'] = df_filtered['ALL_PRIORITY_2008']

In [None]:
df_filtered.shape

(441707, 37)

In [None]:
negative_freight_summary = df_filtered.pivot_table(
    index='PRIORITY_PO_FLAG_2008',
    values='PO NO',
    aggfunc=lambda X: len(X.unique()),
).reset_index()

negative_freight_summary.rename(columns={'PO NO': 'COUNT'}, inplace=True)
negative_freight_summary

Unnamed: 0,PRIORITY_PO_FLAG_2008,COUNT
0,False,126444
1,True,13860


In [None]:
# Stage 1: Filter where PO_HAS_PROJECT_FREIGHT is True
stage1_df = df_filtered[df_filtered['PO_HAS_PROJECT_FREIGHT'] == True]

# Stage 2: Filter where NEGATIVE_FREIGHT_PERCENT is False
stage2_df = stage1_df[stage1_df['NEGATIVE_FREIGHT_PERCENT'] == False]

# Stage 3: Filter where PRIORITY_PO_FLAG_2008 is True
final_filtered_df = stage2_df[stage2_df['PRIORITY_PO_FLAG_2008'] == True]

# Display the resulting DataFrame
final_filtered_df['SITE'].nunique()

45

In [60]:
# Filter rows where Account equals 5504
account_5504_df = final_filtered_df[final_filtered_df['ACCOUNT'] == 5504]

# Group by SITE and sum the INVOICE LINE TOTAL
po_zip_account_5504_summary = account_5504_df.groupby('SHIP TO ZIP', as_index=False)['PO NO'].nunique().sort_values(
    by='PO NO', ascending=False).reset_index(drop=True)

# Rename the column for clarity
po_zip_account_5504_summary.rename(columns={'INVOICE LINE TOTAL': 'TOTAL_INVOICE_LINE_5504'}, inplace=True)

# Display the result
po_zip_account_5504_summary

Unnamed: 0,SHIP TO ZIP,PO NO
0,28079,29
1,30071,23
2,45215,23
3,33605,22
4,78247,21
5,32750,20
6,77040,16
7,32256,15
8,17111,14
9,75019,14


In [59]:
# Filter rows where Account equals 5504
account_5504_df = final_filtered_df[final_filtered_df['ACCOUNT'] == 5504]

# Group by SITE and sum the INVOICE LINE TOTAL
zip_account_5504_summary = account_5504_df.groupby('SHIP TO ZIP', as_index=False)['INVOICE LINE TOTAL'].sum().sort_values(
    by='INVOICE LINE TOTAL', ascending=False).reset_index(drop=True)

# Rename the column for clarity
zip_account_5504_summary.rename(columns={'INVOICE LINE TOTAL': 'TOTAL_INVOICE_LINE_5504'}, inplace=True)

# Display the result
zip_account_5504_summary

Unnamed: 0,SHIP TO ZIP,TOTAL_INVOICE_LINE_5504
0,30071,14754.98
1,45215,12129.37
2,77040,12112.98
3,32750,11490.84
4,84120,11230.66
5,75019,10602.7
6,40228,8170.56
7,78247,7693.85
8,28079,6702.73
9,33605,5176.51


In [62]:
# Perform the join on 'SITE DESCRIPTION'
merged_site_5504_summary = zip_account_5504_summary.merge(po_zip_account_5504_summary, on='SHIP TO ZIP', how='inner').sort_values(
    by='TOTAL_INVOICE_LINE_5504', ascending=False).reset_index(drop=True)

# Display the first few rows of the merged DataFrame
merged_site_5504_summary.head()

Unnamed: 0,SHIP TO ZIP,TOTAL_INVOICE_LINE_5504,PO NO
0,30071,14754.98,23
1,45215,12129.37,23
2,77040,12112.98,16
3,32750,11490.84,20
4,84120,11230.66,12


In [63]:
merged_site_5504_summary.to_csv("../data/output/merged_zip_5504_summary.csv", index=False, float_format='%.6f')


In [51]:
# Filter rows where Account equals 5504
account_5504_df = final_filtered_df[df_filtered['ACCOUNT'] == 5504]

# Group by SITE and sum the INVOICE LINE TOTAL
site_account_5504_summary = account_5504_df.groupby('SITE DESCRIPTION', as_index=False)['INVOICE LINE TOTAL'].sum().sort_values(
    by='INVOICE LINE TOTAL', ascending=False).reset_index(drop=True)

# Rename the column for clarity
site_account_5504_summary.rename(columns={'INVOICE LINE TOTAL': 'TOTAL_INVOICE_LINE_5504'}, inplace=True)

# Display the result
site_account_5504_summary

Unnamed: 0,SITE DESCRIPTION,TOTAL_INVOICE_LINE_5504
0,Spectra Longwood,380312.66
1,Spectra Tampa,274153.9
2,Diverzify Itasca,258569.35
3,Spectra Houston,249545.65
4,Spectra Coppell,242230.56
5,Floor Sol Union,205914.06
6,Spectra Norcross,181475.65
7,Spectra Tempe,176385.77
8,Spectra W Valley Cty,167200.19
9,Spectra Jacksonville,117103.73


In [53]:
# Filter rows where Account equals 5504
account_5504_df = final_filtered_df[final_filtered_df['ACCOUNT'] == 5504]

# Group by SITE and sum the INVOICE LINE TOTAL
site_po_account_5504_summary = account_5504_df.groupby('SITE DESCRIPTION', as_index=False)['PO NO'].count().sort_values(
    by='PO NO', ascending=False).reset_index(drop=True)

# Rename the column for clarity
site_po_account_5504_summary.rename(columns={'PO NO': 'COUNT_PO_5504'}, inplace=True)

# Display the result
site_po_account_5504_summary

Unnamed: 0,SITE DESCRIPTION,COUNT_PO_5504
0,Floor Sol Union,1438
1,Spectra Longwood,1159
2,Spectra Tampa,957
3,Spectra Houston,846
4,Spectra Norcross,788
5,Spectra W Valley Cty,595
6,Diverzify Itasca,528
7,Contract Carpet Sol,475
8,Spectra Tempe,472
9,Spectra Coppell,434


In [55]:
# Perform the join on 'SITE DESCRIPTION'
merged_site_5504_summary = site_po_account_5504_summary.merge(site_account_5504_summary, on='SITE DESCRIPTION', how='inner').sort_values(
    by='TOTAL_INVOICE_LINE_5504', ascending=False).reset_index(drop=True)

# Display the first few rows of the merged DataFrame
merged_site_5504_summary.head()

Unnamed: 0,SITE DESCRIPTION,COUNT_PO_5504,TOTAL_INVOICE_LINE_5504
0,Spectra Longwood,1159,380312.66
1,Spectra Tampa,957,274153.9
2,Diverzify Itasca,528,258569.35
3,Spectra Houston,846,249545.65
4,Spectra Coppell,434,242230.56


In [58]:
merged_site_5504_summary.to_csv("../data/output/merged_site_5504_summary.csv", index=False, float_format='%.6f')
print(f"\n✅ Merged site 5504 summary exported to: ../data/output/merged_site_5504_summary.csv")


✅ Merged site 5504 summary exported to: ../data/output/merged_site_5504_summary.csv


In [56]:
final_filtered_df.columns

Index(['SITE', 'SITE DESCRIPTION', 'SUPPLIER NO', 'SUPPLIER NAME', 'PO NO',
       'ACCOUNT', 'ACCOUNT DESCRIPTION', 'SHIP TO ZIP', 'PART NO',
       'PART DESCRIPTION', 'PO PURCH QTY', 'PO INV QTY', 'INVOICED LINE QTY',
       'PURCH UOM', 'INV UOM', 'INVOICE LINE TOTAL', 'PO PRICE',
       'TOTAL_PO_VALUE', 'PO_HAS_PROJECT_FREIGHT', 'PROJECT_FREIGHT_COST',
       'PROJECT_FREIGHT_PERCENT', 'PRODUCT_ONLY_PO_VALUE',
       'FREIGHT_≥90%_OF_PO', 'FREIGHT_GT_PRODUCT', 'NEGATIVE_FREIGHT_PERCENT',
       'PO_HAS_MULTIPLE_PROJECT_FREIGHT_LINES', 'COMM 1', 'Commodity Group',
       'Description', 'Old/New', 'Priority', 'IS_MANUAL', 'PRIORITY_PO_FLAG',
       'QTY_CONSISTENCY_FLAG', 'UOM_COMPARISON_STATUS', 'ALL_PRIORITY_2008',
       'PRIORITY_PO_FLAG_2008'],
      dtype='object')

In [57]:
# Filter rows where Account equals 5504
account_5504_df = final_filtered_df[final_filtered_df['ACCOUNT'] == 5504]

# Group by SITE and sum the INVOICE LINE TOTAL
site_po_account_5504_summary = account_5504_df.groupby('SITE DESCRIPTION', as_index=False)['PROJECT_FREIGHT_PERCENT'].mean().sort_values(
    by='PROJECT_FREIGHT_PERCENT', ascending=False).reset_index(drop=True)

# Rename the column for clarity
site_po_account_5504_summary.rename(columns={'PO NO': 'COUNT_PO_5504'}, inplace=True)

# Display the result
site_po_account_5504_summary

Unnamed: 0,SITE DESCRIPTION,PROJECT_FREIGHT_PERCENT
0,Floor Sol New Hanovr,7272929000000.0
1,ProSpectra Union Cty,0.1581943
2,Div National Accts,0.1493186
3,RD Weis Hawthorn,0.1423082
4,Spectra Sterling,0.1344015
5,Pavillion Floors,0.1339169
6,ProSpectra Ceritos,0.1335099
7,Spectra Tigard,0.1317778
8,Spectra Austin,0.1300755
9,Diverzify St. Louis,0.1279924


In [16]:
# === Filter for specific sites ===
# Filter the DataFrame for specific sites
df_sites = df_filtered[df_filtered['SITE'].isin(['SPJ','SPW','SPT'])]
df_sites.shape

(75724, 37)

In [17]:
# Stage 1: Filter where PO_HAS_PROJECT_FREIGHT is True
stage1_df = df_sites[df_sites['PO_HAS_PROJECT_FREIGHT'] == True]

# Stage 2: Filter where NEGATIVE_FREIGHT_PERCENT is False
stage2_df = stage1_df[stage1_df['NEGATIVE_FREIGHT_PERCENT'] == False]

# Stage 3: Filter where PRIORITY_PO_FLAG_2008 is True
final_filtered_df = stage2_df[stage2_df['PRIORITY_PO_FLAG_2008'] == True]

# Display the resulting DataFrame
final_filtered_df.head()

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,PO NO,ACCOUNT,ACCOUNT DESCRIPTION,SHIP TO ZIP,PART NO,PART DESCRIPTION,...,Commodity Group,Description,Old/New,Priority,IS_MANUAL,PRIORITY_PO_FLAG,QTY_CONSISTENCY_FLAG,UOM_COMPARISON_STATUS,ALL_PRIORITY_2008,PRIORITY_PO_FLAG_2008
246432,SPJ,Spectra Jacksonville,103423,"Shaw Industries, Inc.",40170,5504,PROJECT Freight,32256.0,,,...,,,,,False,True,False,Missing INV UOM,True,True
246433,SPJ,Spectra Jacksonville,103423,"Shaw Industries, Inc.",40170,5205,PROJECT PO Variance M181 & M182,,,,...,,,,,False,True,False,Missing INV UOM,True,True
246434,SPJ,Spectra Jacksonville,103423,"Shaw Industries, Inc.",40170,2008,Received Not Yet Invoiced,32256.0,204991-002,Shaw Contract Correspond (5T353) Multi Level P...,...,1CPT,Carpet Tile,New Commodity,Yes,False,True,False,Mismatch,True,True
246439,SPJ,Spectra Jacksonville,121550,Mohawk Industries,44286,5504,PROJECT Freight,32256.0,,,...,,,,,False,True,False,Missing INV UOM,True,True
246440,SPJ,Spectra Jacksonville,121550,Mohawk Industries,44286,2008,Received Not Yet Invoiced,32256.0,1000003305,Mohawk-Juniperus II (C2059)-6ft 6in-2.0 MM-Tem...,...,20,Sheet Vinyl,Old Commodity,Yes,False,True,False,Match,True,True


In [18]:
# Export enriched dataset
output_path = "../data/output/Freight_Analysis_Enriched_Sample_Sites_Any_v4.csv"
final_filtered_df.to_csv(output_path, index=False)
print(f"\n✅ Enriched dataset exported to: {output_path}")


✅ Enriched dataset exported to: ../data/output/Freight_Analysis_Enriched_Sample_Sites_Any_v4.csv
