In [78]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [79]:
# Load the modelling input used for the freight model and preview its first rows.
model_input_path = '../../data/output/20250414_modelling_input_v_20250414_195626.csv'
model_input = pd.read_csv(filepath_or_buffer=model_input_path)
model_input.head()

Unnamed: 0,project_id,project_name,po_no,account,account_description,siteid,site,supplierid,suppliername,partnumber,...,all_priority_products,any_priority_products,match_commodity,match_supplier,new_commodity_description,new_commodity_group,all_accounts_2008_classified,all_2008_accounts_converted,multiple_commodities,freight_per_invoice
0,2311123624,Rick Sidor Residence,47568,2008,Received Not Yet Invoiced,DIT,Diverzify Itasca,890,All Surfaces,1000008205,...,True,True,Commodity Found,No supplier found,LVT,1VNL,True,True,False,54.55
1,230485284,Epic Delphi Conference Room Refresh,40635,2008,Received Not Yet Invoiced,DIT,Diverzify Itasca,108164,"Mannington Mills, Inc.",1000006036,...,True,True,Commodity Found,Supplier registered,LVT,1VNL,True,True,False,49.55
2,2310119387,Round Barn Manor,49453,2008,Received Not Yet Invoiced,DIT,Diverzify Itasca,890,All Surfaces,1000008746,...,True,True,Commodity Found,No supplier found,LVP,1VNL,True,True,False,159.86
3,2401130627,Hotel Zachary - Tenant 9 Elevator F,52075,2008,Received Not Yet Invoiced,DIT,Diverzify Itasca,103423,"Shaw Industries, Inc.",1000009308,...,True,True,Commodity Found,Supplier registered,LVP,1VNL,True,True,False,86.14
4,2402136755,Arrow Electronics,64012,2008,Received Not Yet Invoiced,DIT,Diverzify Itasca,103423,"Shaw Industries, Inc.",1000012638,...,True,True,Commodity Found,Supplier registered,LVT,1VNL,True,True,False,88.5


In [80]:
# Load the freight model's results export; two rows are enough to eyeball the columns.
# NOTE(review): this export is stamped 20250413 while the modelling input file is
# stamped 20250414 - confirm both files belong to the same run.
model_output_path = '../../app/downloads/freight_dual_results_20250413_172552.csv'
model_output = pd.read_csv(filepath_or_buffer=model_output_path)
model_output.head(2)

Unnamed: 0,project_id,project_name,po_no,account,account_description,siteid,site,supplierid,suppliername,partnumber,...,match_supplier,est_estimated_area_cost,est_estimated_cwt_cost,est_freight_class_area,est_freight_class_lbs,est_lbs,est_rate_area,est_rate_cwt,est_sqyd,est_uom
0,2408170517,Biller Genie,203858,2008,Received Not Yet Invoiced,SPW,Spectra Longwood,107776,"J.J. Haines & Company, LLC dba UCX Flooring",126732-041,...,Supplier registered,Not applicable,64.85,,L5C,255.6,Not applicable,25.37,20.0,SQFT
1,2502211895,COUNTRY BOY TRAILERS,193967,2008,Received Not Yet Invoiced,SPN,Spectra Norcross,107371,Fishman Flooring Solutions,211021-032,...,No supplier found,Not applicable,15.64,,L5C,127.8,Not applicable,12.24,10.0,SQFT


In [81]:

# Clean the modelled cost columns of `model_output` (loaded in an earlier cell)
# and total them per (siteid, invoice_id).

def _coerce_cost(values):
    """Return `values` as a numeric Series; entries that cannot be parsed become NaN.

    Values pandas can parse directly (real numbers, plain numeric strings,
    scientific notation such as '1e-05') are converted as they are. Only the
    entries that fail fall back to extracting the first decimal number found in
    their text, so a number wrapped in other characters still yields a value
    while free text such as 'Not applicable' ends up as NaN.
    """
    parsed = pd.to_numeric(values, errors='coerce')
    # Fallback for text-wrapped numbers. Running this regex over every value (as
    # a string) would truncate floats rendered in scientific notation:
    # '1e-05' would be read as 1.
    # NOTE(review): thousands separators are not handled here - '1,234.56'
    # extracts as 1. Confirm the export never formats costs that way.
    scraped = pd.to_numeric(
        values.astype(str).str.extract(r'([-]?[0-9]*\.?[0-9]+)')[0],
        errors='coerce'
    )
    return parsed.fillna(scraped)


# --- Clean numeric fields that may contain errors or text ---
for cost_column in ('est_estimated_area_cost', 'est_estimated_cwt_cost'):
    model_output[cost_column] = _coerce_cost(model_output[cost_column])

# --- Group by siteid + invoice_id and aggregate ---
# 'sum' skips NaN, so an invoice without any usable estimate totals 0.0.
model_output_freight = model_output.groupby(['siteid','invoice_id']).agg(
    total_estimated_area_cost=('est_estimated_area_cost', 'sum'),
    total_estimated_cwt_cost=('est_estimated_cwt_cost', 'sum'),
    # Distinct non-null values per invoice, kept as plain Python lists.
    unique_commodity_group_output=('est_commodity_group', lambda x: x.dropna().unique().tolist()),
    unique_commodity_description_output=('new_commodity_description', lambda x: x.dropna().unique().tolist())
).reset_index()

# View results
model_output_freight.head()


Unnamed: 0,siteid,invoice_id,total_estimated_area_cost,total_estimated_cwt_cost,unique_commodity_group_output,unique_commodity_description_output
0,DIT,443670,0.0,204.88,[1VNL],[LVT]
1,DIT,454017,0.0,58.82,[1VNL],[LVT]
2,DIT,457800,0.0,485.12,[1VNL],[LVP]
3,DIT,466275,0.0,37.83,[1VNL],[LVP]
4,DIT,559703,0.0,27.88,[1VNL],[LVT]


In [82]:
# Summarise the modelling input per (siteid, invoice_id): the freight price and
# the distinct commodity groups / descriptions found on the invoice.
invoice_keys = ['siteid', 'invoice_id']
input_aggregations = {
    # Taken from the first row of each invoice.
    # NOTE(review): presumably the value repeats on every line of an invoice - verify.
    'freight_price': ('freight_per_invoice', 'first'),
    'unique_commodity_group_input': (
        'new_commodity_group',
        lambda values: values.dropna().unique().tolist(),
    ),
    'unique_commodity_description_input': (
        'new_commodity_description',
        lambda values: values.dropna().unique().tolist(),
    ),
}
model_input_freight = (
    model_input
    .groupby(invoice_keys)
    .agg(**input_aggregations)
    .reset_index()
)
model_input_freight

Unnamed: 0,siteid,invoice_id,freight_price,unique_commodity_group_input,unique_commodity_description_input
0,DIT,443670,54.55,[1VNL],[LVT]
1,DIT,454017,49.55,[1VNL],[LVT]
2,DIT,457800,159.86,[1VNL],[LVP]
3,DIT,466275,86.14,[1VNL],[LVP]
4,DIT,559703,88.50,[1VNL],[LVT]
...,...,...,...,...,...
2035,SPW,1743395,8.39,[1VNL],[VCT]
2036,SPW,1743617,315.87,[1VNL],[LVT]
2037,SPW,1745209,155.59,[1VNL],[VCT]
2038,SPW,1745853,5.99,[1VNL],[VCT]


In [83]:
# Left-join the modelled totals onto the input summary, keyed by site and invoice,
# so every input invoice is kept even when the model output has no row for it.
merged_summary = model_input_freight.merge(
    model_output_freight,
    how='left',
    on=['siteid', 'invoice_id'],
)

# Force the three money columns to numeric; anything unparseable becomes NaN.
numerical_columns = ['freight_price', 'total_estimated_area_cost', 'total_estimated_cwt_cost']
for money_column in numerical_columns:
    merged_summary[money_column] = pd.to_numeric(merged_summary[money_column], errors='coerce')

# Show the five invoices with the highest actual freight price.
merged_summary.sort_values(by='freight_price', ascending=False).head()

Unnamed: 0,siteid,invoice_id,freight_price,unique_commodity_group_input,unique_commodity_description_input,total_estimated_area_cost,total_estimated_cwt_cost,unique_commodity_group_output,unique_commodity_description_output
979,SPT,601562,10836.22,[1VNL],[VCT],0.0,808.62,[1VNL],[VCT]
774,SPN,1294588,9749.59,[1VNL],"[LVP, VCT]",0.0,5528.99,[1VNL],"[LVP, VCT]"
599,SPN,723864,8497.03,[1CBL],[Carpet Roll],483.7,617.15,[1CBL],[Carpet Roll]
1971,SPW,1643009,8198.64,[1VNL],[LVT],0.0,2954.7,[1VNL],[LVT]
1743,SPW,1027287,7333.09,[1VNL],[LVT],0.0,6676.23,[1VNL],[LVT]


In [89]:
# Pick the modelled cost that applies to each invoice from its input commodity groups.
def _invoice_total_cost(invoice):
    """Return the modelled cost for one row of merged_summary.

    Invoices containing '1VNL' use the CWT-based total, invoices containing
    '1CBL' use the area-based total, and anything else gets 0. '1VNL' is
    checked first, so an invoice holding both groups uses the CWT total only.
    """
    groups = invoice['unique_commodity_group_input']
    if '1VNL' in groups:
        return invoice['total_estimated_cwt_cost']
    if '1CBL' in groups:
        return invoice['total_estimated_area_cost']
    return 0


merged_summary['total_cost'] = merged_summary.apply(_invoice_total_cost, axis=1)
merged_summary.sort_values(by='freight_price', ascending=False).head()

Unnamed: 0,siteid,invoice_id,freight_price,unique_commodity_group_input,unique_commodity_description_input,total_estimated_area_cost,total_estimated_cwt_cost,unique_commodity_group_output,unique_commodity_description_output,commodity_group_key,group_key,total_cost
979,SPT,601562,10836.22,[1VNL],[VCT],0.0,808.62,[1VNL],[VCT],"(1VNL,)","(1VNL,)",808.62
774,SPN,1294588,9749.59,[1VNL],"[LVP, VCT]",0.0,5528.99,[1VNL],"[LVP, VCT]","(1VNL,)","(1VNL,)",5528.99
599,SPN,723864,8497.03,[1CBL],[Carpet Roll],483.7,617.15,[1CBL],[Carpet Roll],"(1CBL,)","(1CBL,)",483.7
1971,SPW,1643009,8198.64,[1VNL],[LVT],0.0,2954.7,[1VNL],[LVT],"(1VNL,)","(1VNL,)",2954.7
1743,SPW,1027287,7333.09,[1VNL],[LVT],0.0,6676.23,[1VNL],[LVT],"(1VNL,)","(1VNL,)",6676.23


In [91]:
# Persist the invoice-level comparison table, then preview its first 20 rows.
# NOTE(review): the saved output of this cell shows `commodity_group_key` and
# `group_key`, which are only created in cells further down - the notebook was
# run out of order, so a top-to-bottom run writes a file without those columns.
all_data_csv = '../../data/output/all_data.csv'
merged_summary.to_csv(path_or_buf=all_data_csv, index=False)
merged_summary.head(20)

Unnamed: 0,siteid,invoice_id,freight_price,unique_commodity_group_input,unique_commodity_description_input,total_estimated_area_cost,total_estimated_cwt_cost,unique_commodity_group_output,unique_commodity_description_output,commodity_group_key,group_key,total_cost
0,DIT,443670,54.55,[1VNL],[LVT],0.0,204.88,[1VNL],[LVT],"(1VNL,)","(1VNL,)",204.88
1,DIT,454017,49.55,[1VNL],[LVT],0.0,58.82,[1VNL],[LVT],"(1VNL,)","(1VNL,)",58.82
2,DIT,457800,159.86,[1VNL],[LVP],0.0,485.12,[1VNL],[LVP],"(1VNL,)","(1VNL,)",485.12
3,DIT,466275,86.14,[1VNL],[LVP],0.0,37.83,[1VNL],[LVP],"(1VNL,)","(1VNL,)",37.83
4,DIT,559703,88.5,[1VNL],[LVT],0.0,27.88,[1VNL],[LVT],"(1VNL,)","(1VNL,)",27.88
5,DIT,559715,331.28,[1VNL],[LVT],0.0,301.11,[1VNL],[LVT],"(1VNL,)","(1VNL,)",301.11
6,DIT,562903,150.0,[1VNL],[LVP],0.0,30.26,[1VNL],[LVP],"(1VNL,)","(1VNL,)",30.26
7,DIT,584653,34.85,[1CBL],[Carpet Roll],14.04,63.13,[1CBL],[Carpet Roll],"(1CBL,)","(1CBL,)",14.04
8,DIT,592040,1043.0,[1VNL],[VCT],0.0,604.21,[1VNL],[VCT],"(1VNL,)","(1VNL,)",604.21
9,DIT,607962,88.5,[1CBL],[Carpet Roll],21.93,98.64,[1CBL],[Carpet Roll],"(1CBL,)","(1CBL,)",21.93


In [84]:
# Step 1: build a hashable key from each invoice's list of commodity groups
def _as_group_key(groups):
    """Return a sorted, de-duplicated tuple for a list; wrap any other value as a 1-tuple."""
    if not isinstance(groups, list):
        return (groups,)
    return tuple(sorted(set(groups)))


merged_summary['commodity_group_key'] = (
    merged_summary['unique_commodity_group_input'].apply(_as_group_key)
)

# Step 2: total the actual and modelled freight per site and commodity-group key
grouped_summary = (
    merged_summary
    .groupby(['siteid', 'commodity_group_key'])
    .agg(
        total_freight_price=('freight_price', 'sum'),
        total_area_cost=('total_estimated_area_cost', 'sum'),
        total_cwt_cost=('total_estimated_cwt_cost', 'sum'),
    )
    .reset_index()
)

# Step 3: choose which modelled total counts for each group
def _group_total_cost(summary_row):
    """Return the CWT total for keys containing '1VNL', the area total for '1CBL', else 0."""
    key = summary_row['commodity_group_key']
    if '1VNL' in key:
        return summary_row['total_cwt_cost']
    if '1CBL' in key:
        return summary_row['total_area_cost']
    return 0


grouped_summary['total_cost'] = grouped_summary.apply(_group_total_cost, axis=1)

# Preview the per-site, per-group totals
grouped_summary.head()


Unnamed: 0,siteid,commodity_group_key,total_freight_price,total_area_cost,total_cwt_cost,total_cost
0,DIT,"(1CBL,)",16381.65,6153.43,26462.24,6153.43
1,DIT,"(1VNL,)",31088.61,0.0,28166.56,28166.56
2,SPJ,"(1CBL,)",7960.92,5337.24,26311.86,5337.24
3,SPJ,"(1VNL,)",74343.86,0.0,88297.28,88297.28
4,SPN,"(1CBL,)",18486.17,7215.23,9963.44,7215.23


In [85]:
# Persist the per-site, per-commodity-group totals, then preview up to 20 rows.
grouped_summary_csv = '../../data/output/freight_grouped_summary_20250413_172552.csv'
grouped_summary.to_csv(path_or_buf=grouped_summary_csv, index=False)
grouped_summary.head(20)

Unnamed: 0,siteid,commodity_group_key,total_freight_price,total_area_cost,total_cwt_cost,total_cost
0,DIT,"(1CBL,)",16381.65,6153.43,26462.24,6153.43
1,DIT,"(1VNL,)",31088.61,0.0,28166.56,28166.56
2,SPJ,"(1CBL,)",7960.92,5337.24,26311.86,5337.24
3,SPJ,"(1VNL,)",74343.86,0.0,88297.28,88297.28
4,SPN,"(1CBL,)",18486.17,7215.23,9963.44,7215.23
5,SPN,"(1VNL,)",100017.36,0.0,72765.74,72765.74
6,SPT,"(1CBL,)",8846.79,3331.32,10987.2,3331.32
7,SPT,"(1VNL,)",158040.27,0.0,159247.05,159247.05
8,SPW,"(1CBL,)",23550.08,16965.65,40533.57,16965.65
9,SPW,"(1VNL,)",176976.52,0.0,189916.21,189916.21


In [86]:
# Write one CSV per distinct combination of input commodity groups.

# Hashable key per invoice: the sorted, de-duplicated tuple of its commodity
# groups (a non-list value is wrapped as a 1-tuple).
merged_summary['group_key'] = merged_summary['unique_commodity_group_input'].apply(
    lambda x: tuple(sorted(set(x))) if isinstance(x, list) else (x,)
)

# Characters kept verbatim in file names; every other character becomes "_".
# Defined once, outside the loop, because it never changes.
allowed_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"

# Iterate through unique tuple values
for group in merged_summary['group_key'].unique():
    # The key is already sorted, e.g. ('LVT', 'VCT') -> 'LVT_VCT'
    group_str = "_".join(map(str, group))

    # Clean filename manually
    group_str_clean = "".join(c if c in allowed_chars else "_" for c in group_str)

    # Compare element by element. `series == group` would leave it to pandas to
    # decide whether the tuple is a single value or a sequence to align with the
    # rows; mapping makes the per-row equality explicit.
    in_group = merged_summary['group_key'].map(lambda key: key == group)
    filtered_df = merged_summary[in_group]

    # Save
    filename = f'../../data/output/merged_summary_{group_str_clean}.csv'
    filtered_df.to_csv(filename, index=False)


In [87]:
# Export the full invoice-level table, with whatever columns have been added to
# it so far, as an Excel workbook.
merged_summary_xlsx = '../../data/output/merged_summary3.xlsx'
merged_summary.to_excel(merged_summary_xlsx, index=False)