In [1]:
import pandas as pd
import sys
sys.path.append('../../utils')  # make ../../utils importable; the relative path is resolved against the kernel's working directory

# Project helpers used by the cleaning pipeline further down.
from data_cleaning_utils import (
    data_cleaning,
    enrich_invoice_flags,
    uom_cleaning,
    flag_fully_converted_invoices,
    add_freight_per_invoice,
    filter_valid_invoices,  # NOTE(review): not called in the visible cells — confirm it is still needed
    increase_sample_size,
)


In [2]:
# Load the raw CY2024 freight-cost extract, decoding it as latin-1.
# low_memory=False has pandas infer each column's dtype from the full file
# rather than chunk by chunk, so a column cannot end up with mixed types
# (the usual trigger for a DtypeWarning on large CSVs).
df = pd.read_csv(
    '../../data/input/Freight_Cost_Analysis_CY2024-03.25.csv',
    encoding="latin1",
    low_memory=False,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Conversion-table CSV passed to flag_fully_converted_invoices in the
# cleaning pipeline; the path is relative to the working directory.
conversion_csv_path = "../../data/input/freight_model/conversion_table_standardized.csv"


In [4]:
# Run the cleaning / enrichment helpers in sequence. Each call's result is
# rebound to `df`, so re-running this cell without reloading the CSV feeds
# already-processed data back into data_cleaning.
df = data_cleaning(df,base_path='../../data/input')
df = uom_cleaning(df)
df = flag_fully_converted_invoices(df, conversion_csv_path)
df = enrich_invoice_flags(df)
df = add_freight_per_invoice(df)
# NOTE(review): the recorded output of this cell ends with
# FileNotFoundError: 'data/downloads/invalid.csv' right after the
# "increasing freight per invoice" log line, so this step presumably failed
# and `df` kept the result of add_freight_per_invoice. The path in the error
# is relative, so it likely depends on the working directory — confirm and
# fix inside data_cleaning_utils.
df = increase_sample_size(df)
df.head(2)  # preview the first two rows

INFO:root:🔧 Running data_cleaning...
INFO:root:✅ data_cleaning complete.
INFO:root:✅ fixing unit of measure.
INFO:root:✅ Flagging fully converted invoices.
INFO:root:✅ Enriching invoice data.


✅ 8590 invoices have all account == 2008 rows with valid conversion codes
Number of invoices with at least one freight line: 55942
Number of invoices with multiple freight lines: 197
Number of invoices with multiple distinct parts: 47577
Number of invoices with multiple distinct commodities: 16091
Number of invoices where all ACCOUNT == 2008 have Priority == 'Yes': 17084


INFO:root:✅ Calculating freight per invoice.


Number of invoices where at least one ACCOUNT == 2008 has Priority == 'Yes': 23536


INFO:root:✅ Completed adding freight to invoice.
INFO:root:✅ increasing freight per invoice.


FileNotFoundError: [Errno 2] No such file or directory: 'data/downloads/invalid.csv'

In [None]:
# Write the cleaned frame to disk as CSV, leaving out the row index.
cleaned_csv_path = '../../data/output/cleaned_freight_data.csv'
df.to_csv(path_or_buf=cleaned_csv_path, index=False)

### Begin Funnel: invoice counts after each successive filter

In [116]:
# Count the distinct invoice IDs in the frame (missing IDs are not counted).
distinct_invoices = df['invoice_id'].nunique()
print(f"Distinct number of invoices: {distinct_invoices}")

Distinct number of invoices: 188087


In [117]:
# Distinct invoices that have at least one row carrying a project ID.
rows_with_project = df['project_id'].notna()
unique_invoices_with_project_id = df.loc[rows_with_project, 'invoice_id'].nunique()
print(f"Unique invoices with project ID: {unique_invoices_with_project_id}")

Unique invoices with project ID: 161241


In [118]:
# Distinct invoices that have at least one row with both a project ID and a
# PO number.
rows_with_project_and_po = df['project_id'].notna() & df['po_no'].notna()
invoices_with_po = df.loc[rows_with_project_and_po, 'invoice_id'].nunique()
print(f"Invoices with project ID and PO numbers: {invoices_with_po}")

Invoices with project ID and PO numbers: 152548


In [119]:
# Keep every row of any invoice that has at least one row with a PO number.
# This rebinds `df`, so all later cells see only the PO-bearing invoices.
# NOTE(review): this filter does not require project_id, unlike the
# "project ID and PO" count computed just before — confirm that is intended.
po_invoice_ids = df.loc[df['po_no'].notna(), 'invoice_id']
df = df[df['invoice_id'].isin(po_invoice_ids)]
df.shape

(406310, 48)

In [120]:
# Show the column labels of the filtered frame for reference.
df.columns

Index(['site', 'site_description', 'supplier_no', 'supplier_name',
       'invoice_id', 'invoice_no', 'date_posted', 'project_id', 'project_name',
       'account', 'account_description', 'planned_delivery_date',
       'ship_to_zip', 'po_no', 'po_line_no', 'po_rel_no', 'receipt_no',
       'part_no', 'part_description', 'comm_1', 'comm_2', 'po_purch_qty',
       'purch_uom', 'po_inv_qty', 'inv_uom', 'invoiced_line_qty',
       'invoice_line_total', 'po_price', 'commodity_group', 'description',
       'old/new', 'priority', 'match_commodity', 'match_supplier',
       'classification', 'new_commodity_description', 'new_commodity_group',
       'conversion_code', 'all_accounts_2008_uom_classified',
       'all_2008_accounts_converted', 'has_freight_line',
       'multiple_freight_lines', 'multiple_parts', 'multiple_commodities',
       'all__invoice_priority_products_(2008)',
       'any__invoice_priority_products_(2008)', 'freight_per_invoice',
       'low_mix_priority_flag'],
      dty

In [121]:
# Distinct invoices whose rows are flagged as having a freight line.
freight_rows = df['has_freight_line'].eq(True)
unique_invoices_with_freight = df.loc[freight_rows, 'invoice_id'].nunique()
print(f"Unique invoices with freight line: {unique_invoices_with_freight}")

Unique invoices with freight line: 52316


In [122]:
# Distinct invoices flagged with a freight line but not with multiple freight
# lines.
single_freight_rows = (
    df['has_freight_line'].eq(True)
    & df['multiple_freight_lines'].eq(False)
)
unique_invoices_with_freight_and_single_line = (
    df.loc[single_freight_rows, 'invoice_id'].nunique()
)
print(f"Unique invoices with freight line and no multiple freight lines: {unique_invoices_with_freight_and_single_line}")

Unique invoices with freight line and no multiple freight lines: 52122


In [123]:
# Funnel step: freight line present, not multiple freight lines, and the
# "any priority product (2008)" flag set.
priority_freight_rows = (
    df['has_freight_line'].eq(True)
    & df['multiple_freight_lines'].eq(False)
    & df['any__invoice_priority_products_(2008)'].eq(True)
)
filtered_invoices = df.loc[priority_freight_rows, 'invoice_id'].nunique()

print(f"Unique invoices with the specified conditions: {filtered_invoices}")

Unique invoices with the specified conditions: 18511


In [124]:
# Funnel step: the previous conditions plus the low-mix priority flag.
low_mix_priority_rows = (
    df['has_freight_line'].eq(True)
    & df['multiple_freight_lines'].eq(False)
    & df['any__invoice_priority_products_(2008)'].eq(True)
    & df['low_mix_priority_flag'].eq(True)
)
unique_invoices = df.loc[low_mix_priority_rows, 'invoice_id'].nunique()

print(f"Unique invoices with the specified conditions: {unique_invoices}")

Unique invoices with the specified conditions: 17976


In [None]:
# How much is booked to account 5504 (presumably the freight account — TODO
# confirm) on the invoices that pass every funnel condition so far?
# `df` was already restricted to invoices with a PO number in an earlier cell.

# Rows satisfying all funnel flags; used only to collect qualifying invoice IDs.
qualifying_rows = (
    (df['has_freight_line'] == True) &
    (df['multiple_freight_lines'] == False) &
    (df['any__invoice_priority_products_(2008)'] == True) &
    (df['low_mix_priority_flag'] == True)
)
qualifying_invoice_ids = df.loc[qualifying_rows, 'invoice_id']

# Sum the account-5504 lines across all rows of those invoices.
sum_invoice_line_total = df.loc[
    df['invoice_id'].isin(qualifying_invoice_ids) & (df['account'] == 5504),
    'invoice_line_total',
].sum()

# The message used to say "account == 2008" although the filter is on 5504.
print(f"Sum of invoice_line_total for unique invoices where account == 5504: {sum_invoice_line_total}")

Sum of invoice_line_total for unique invoices where account == 2008: 6265994.24


In [130]:
# Funnel step: all previous flag conditions, restricted to rows whose invoice
# unit of measure is SQYD or SQFT.
area_uoms = ['SQYD', 'SQFT']
area_uom_rows = (
    df['has_freight_line'].eq(True)
    & df['multiple_freight_lines'].eq(False)
    & df['any__invoice_priority_products_(2008)'].eq(True)
    & df['low_mix_priority_flag'].eq(True)
    & df['inv_uom'].isin(area_uoms)
)
unique_invoices_with_uom = df.loc[area_uom_rows, 'invoice_id'].nunique()

print(f"Unique invoices with specified conditions and inv_uom as SQYD or SQFT: {unique_invoices_with_uom}")

Unique invoices with specified conditions and inv_uom as SQYD or SQFT: 16372


In [None]:
# Total of invoice_line_total on account-5504 rows, limited to invoices that
# have at least one row meeting every funnel flag and an inv_uom of SQYD/SQFT.
# NOTE(review): the name looks like a truncated "sum_invoice_line_total"; it
# is kept as-is because other cells may refer to it.
uom_qualifying_rows = (
    df['has_freight_line'].eq(True)
    & df['multiple_freight_lines'].eq(False)
    & df['any__invoice_priority_products_(2008)'].eq(True)
    & df['low_mix_priority_flag'].eq(True)
    & df['inv_uom'].isin(['SQYD', 'SQFT'])
)
uom_qualifying_invoice_ids = df.loc[uom_qualifying_rows, 'invoice_id']

account_5504_rows = (
    df['invoice_id'].isin(uom_qualifying_invoice_ids)
    & df['account'].eq(5504)
)
um_invoice_line_total = df.loc[account_5504_rows, 'invoice_line_total'].sum()

um_invoice_line_total

5718669.33