In [1]:
import pandas as pd
import sys
# Make the project-local helper module importable: '../../utils' is two
# folders up from the notebook's working directory, then into utils.
sys.path.append('../../utils')

# Pipeline helpers from the project-local data_cleaning_utils module.
from data_cleaning_utils import (
    data_cleaning,
    enrich_invoice_flags,
    uom_cleaning,
    flag_fully_converted_invoices,
    add_freight_per_invoice,
    filter_valid_invoices,  # NOTE(review): no call visible in this notebook — confirm before removing
    increase_sample_size,  # NOTE(review): no active call visible in this notebook — confirm before removing
    resampling,
)


In [2]:
# Load the raw freight cost analysis CSV; the file is decoded as latin1.
# NOTE(review): the stderr text saved under this cell is only the tail of a
# warning (the `exec(code_obj, ...)` frame) — presumably a pandas DtypeWarning
# about mixed-type columns. Confirm, and consider an explicit `dtype=` mapping
# once the expected column types are known.
df = pd.read_csv('../../data/input/Freight_Cost_Analysis_CY2024-03.25.csv',encoding="latin1")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Location of the standardized conversion table; passed to
# flag_fully_converted_invoices in a later cell.
conversion_csv_path = "../../data/input/freight_model/conversion_table_standardized.csv"


In [4]:
# Run the project's data_cleaning helper on the freshly loaded frame.
# base_path points at the input data folder (presumably where the helper
# finds its lookup files — verify against data_cleaning_utils).
df = data_cleaning(df,base_path='../../data/input')
# Preview the first two rows of the cleaned frame.
df.head(2)


INFO:root:🔧 Running data_cleaning...
INFO:root:✅ data_cleaning complete.


Unnamed: 0,site,site_description,supplier_no,supplier_name,invoice_id,invoice_no,date_posted,project_id,project_name,account,...,old/new,priority,match_commodity,match_supplier,is_classified,invoice_has_classified,classification,new_commodity_description,new_commodity_group,conversion_code
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,,Commodity Not Found,Supplier registered,False,False,Unclassified,,,nan_nan_nan
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,New Commodity,Yes,Commodity Found,Supplier registered,False,False,Unclassified,LVP,1VNL,LVP_1VNL_SF


In [5]:
# Fix units of measure via the project's uom_cleaning helper.
# In the saved preview the second row's conversion_code goes from
# LVP_1VNL_SF to LVP_1VNL_SQFT and an all_accounts_2008_uom_classified
# column appears after this step.
df = uom_cleaning(df)
# Preview the first two rows after the unit-of-measure fix.
df.head(2)

INFO:root:✅ fixing unit of measure.


Unnamed: 0,site,site_description,supplier_no,supplier_name,invoice_id,invoice_no,date_posted,project_id,project_name,account,...,description,old/new,priority,match_commodity,match_supplier,classification,new_commodity_description,new_commodity_group,conversion_code,all_accounts_2008_uom_classified
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,,,Commodity Not Found,Supplier registered,No UOM,,,nan_nan_nan,True
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,Vinyl,New Commodity,Yes,Commodity Found,Supplier registered,Classified,LVP,1VNL,LVP_1VNL_SQFT,True


In [6]:
# Flag invoices using the conversion table at conversion_csv_path.
# The saved preview shows a new all_2008_accounts_converted column, and the
# helper prints how many invoices have valid conversion codes on all of
# their account == 2008 rows.
df = flag_fully_converted_invoices(df, conversion_csv_path)
# Preview the first two rows with the new flag.
df.head(2)

INFO:root:✅ Flagging fully converted invoices.


✅ 8590 invoices have all account == 2008 rows with valid conversion codes


Unnamed: 0,site,site_description,supplier_no,supplier_name,invoice_id,invoice_no,date_posted,project_id,project_name,account,...,old/new,priority,match_commodity,match_supplier,classification,new_commodity_description,new_commodity_group,conversion_code,all_accounts_2008_uom_classified,all_2008_accounts_converted
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,,Commodity Not Found,Supplier registered,No UOM,,,nan_nan_nan,True,True
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,New Commodity,Yes,Commodity Found,Supplier registered,Classified,LVP,1VNL,LVP_1VNL_SQFT,True,True


In [7]:
# Add invoice-level flags via the project's enrich_invoice_flags helper.
# The saved preview shows these new columns: has_freight_line,
# multiple_freight_lines, multiple_parts, multiple_commodities,
# all__invoice_priority_products_(2008) and any__invoice_priority_products_(2008).
# The helper also prints per-flag invoice counts.
df = enrich_invoice_flags(df)
# Preview the first two rows with the new flags.
df.head(2)


INFO:root:✅ Enriching invoice data.


Number of invoices with at least one freight line: 55942
Number of invoices with multiple freight lines: 197
Number of invoices with multiple distinct parts: 47577
Number of invoices with multiple distinct commodities: 16091
Number of invoices where all ACCOUNT == 2008 have Priority == 'Yes': 17084
Number of invoices where at least one ACCOUNT == 2008 has Priority == 'Yes': 23536


Unnamed: 0,site,site_description,supplier_no,supplier_name,invoice_id,invoice_no,date_posted,project_id,project_name,account,...,new_commodity_group,conversion_code,all_accounts_2008_uom_classified,all_2008_accounts_converted,has_freight_line,multiple_freight_lines,multiple_parts,multiple_commodities,all__invoice_priority_products_(2008),any__invoice_priority_products_(2008)
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,,nan_nan_nan,True,True,True,False,False,False,True,True
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,1VNL,LVP_1VNL_SQFT,True,True,True,False,False,False,True,True


In [8]:
# Compute freight per invoice via the project helper; the saved preview
# shows a new freight_per_invoice column after this step.
df = add_freight_per_invoice(df)
# Preview the first two rows with the freight column.
df.head(2)


INFO:root:✅ Calculating freight per invoice.
INFO:root:✅ Completed adding freight to invoice.


Unnamed: 0,site,site_description,supplier_no,supplier_name,invoice_id,invoice_no,date_posted,project_id,project_name,account,...,conversion_code,all_accounts_2008_uom_classified,all_2008_accounts_converted,has_freight_line,multiple_freight_lines,multiple_parts,multiple_commodities,all__invoice_priority_products_(2008),any__invoice_priority_products_(2008),freight_per_invoice
0,DIT,Diverzify Itasca,121550,Mohawk Industries,731977,C2665159,25-Jun-24,2312127222,Blue Chip Casino Deli,5504,...,nan_nan_nan,True,True,True,False,False,False,True,True,19.07
1,SPN,Spectra Norcross,103277,William M. Bird,531030,656881,01-Mar-24,2401132763,FAROPOINT LOBBY RENOVATION,2008,...,LVP_1VNL_SQFT,True,True,True,False,False,False,True,True,5.79


In [None]:
# NOTE(review): a call to increase_sample_size(df) was commented out in this
# cell; only resampling(df) runs — confirm that is the intended step.
# NOTE(review): the saved notebook shows no execution count or output for
# this cell, while the summary cell after it groups by baseline_sample —
# presumably a column produced here. Re-run from a fresh kernel to verify.
df = resampling(df)
# Preview the first two rows after resampling.
df.head(2)

In [6]:
# Count the distinct invoice ids in each baseline_sample group.
# dropna=False keeps a missing invoice id as one distinct value, the same
# way len(Series.unique()) counts it.
summary = (
    df.groupby('baseline_sample')['invoice_id']
    .nunique(dropna=False)
    .reset_index()
)
# Show the (at most two) group rows.
summary.head(2)

Unnamed: 0,baseline_sample,invoice_id
0,False,130198
1,True,22415
