# In [2]:
import pandas as pd

# === Load Files ===
# Main PO, invoice, freight dataset.
# 'SHIP TO ZIP' is read as text: a numeric parse would strip leading zeros
# (e.g. 02134 -> 2134), and the 5-digit ZIP extraction later in this script
# would then find no match for those rows.
df = pd.read_csv(
    "Freight_Cost_Analysis_CY2024-03.25.csv",
    encoding="latin1",
    low_memory=False,
    dtype={"SHIP TO ZIP": str},
)

# Load account exclusions. Blank cells in the 'Account' column are dropped
# before the integer cast, which would otherwise raise on NaN. Non-blank
# values that cannot be cast to int still raise, so a malformed exclusion
# list is not silently accepted.
excluded_accounts_df = pd.read_excel("accounts_to_exclude.xlsx")
excluded_accounts = excluded_accounts_df['Account'].dropna().astype(int).tolist()

# === Filter Out Unwanted Account Lines ===
# Compare on a numeric view of ACCOUNT so the integer exclusion list still
# matches when the CSV column was parsed as text (e.g. "6100"). Values that
# are not numeric become NaN in that view, never match, and are therefore
# kept. The ACCOUNT column itself is left unchanged.
account_numbers = pd.to_numeric(df['ACCOUNT'], errors='coerce')
df = df[~account_numbers.isin(excluded_accounts)]

# === Step 1: Freight Spend Profiling ===
# Freight lines are the rows whose ACCOUNT DESCRIPTION, once upper-cased,
# contains the text 'PROJECT FREIGHT'.
account_desc_upper = df['ACCOUNT DESCRIPTION'].astype(str).str.upper()
freight_lines = df[account_desc_upper.str.contains('PROJECT FREIGHT', na=False)]

# Sum INVOICE LINE TOTAL per PO: once over every line, once over freight
# lines only.
total_po_value = df.groupby('PO NO', as_index=False).agg(
    TOTAL_PO_VALUE=('INVOICE LINE TOTAL', 'sum')
)
freight_cost = freight_lines.groupby('PO NO', as_index=False).agg(
    PROJECT_FREIGHT_COST=('INVOICE LINE TOTAL', 'sum')
)

# Attach both PO-level totals to every line. POs without any freight line
# come back from the left merge with NaN freight, which is treated as 0.
df = df.merge(total_po_value, on='PO NO', how='left')
df = df.merge(freight_cost, on='PO NO', how='left')
df['PROJECT_FREIGHT_COST'] = df['PROJECT_FREIGHT_COST'].fillna(0)

# Ratios of freight to the whole PO and to the non-freight remainder.
# NOTE(review): a zero denominator yields inf/NaN here; nothing guards it.
df['PROJECT_FREIGHT_PERCENT'] = df['PROJECT_FREIGHT_COST'].div(df['TOTAL_PO_VALUE'])
df['PRODUCT_ONLY_PO_VALUE'] = df['TOTAL_PO_VALUE'].sub(df['PROJECT_FREIGHT_COST'])
df['FREIGHT_PERCENT_EXCL_PRODUCT'] = df['PROJECT_FREIGHT_COST'].div(df['PRODUCT_ONLY_PO_VALUE'])

# Boolean flags derived from the values above.
df['FREIGHT_≥90%_OF_PO'] = df['PROJECT_FREIGHT_PERCENT'].ge(0.9)
df['FREIGHT_GT_PRODUCT'] = df['PROJECT_FREIGHT_COST'].gt(df['PRODUCT_ONLY_PO_VALUE'])
df['NEGATIVE_FREIGHT_PERCENT'] = df['PROJECT_FREIGHT_PERCENT'].lt(0)

# === Step 2: ZIP & Supplier Analysis ===
# Reduce SHIP TO ZIP to the first five consecutive digits found in the value
# (so a ZIP+4 such as "12345-6789" becomes "12345"); values without five
# consecutive digits become NaN. expand=False makes str.extract return a
# Series for the single capture group instead of a one-column DataFrame.
df['SHIP TO ZIP'] = df['SHIP TO ZIP'].astype(str).str.extract(r'(\d{5})', expand=False)

# Only lines belonging to POs with a positive project-freight cost.
freight_df = df[df['PROJECT_FREIGHT_COST'] > 0]

# PROJECT_FREIGHT_PERCENT is a PO-level value repeated on every line of the
# PO. Keep one row per (ZIP, supplier, PO) before averaging so that a PO with
# many invoice lines does not outweigh a PO with few in AVG_FREIGHT_PERCENT.
po_level_freight = freight_df.drop_duplicates(subset=['SHIP TO ZIP', 'SUPPLIER NO', 'PO NO'])
zip_supplier_summary = po_level_freight.groupby(['SHIP TO ZIP', 'SUPPLIER NO']).agg(
    AVG_FREIGHT_PERCENT=('PROJECT_FREIGHT_PERCENT', 'mean'),
    PO_COUNT=('PO NO', 'nunique')
).reset_index()
zip_supplier_summary['HIGH_FREIGHT_FLAG'] = zip_supplier_summary['AVG_FREIGHT_PERCENT'] > 0.5

# Left merge: lines whose (ZIP, supplier) pair is absent from the summary
# keep a missing HIGH_FREIGHT_FLAG rather than False.
df = df.merge(zip_supplier_summary[['SHIP TO ZIP', 'SUPPLIER NO', 'HIGH_FREIGHT_FLAG']],
              on=['SHIP TO ZIP', 'SUPPLIER NO'], how='left')

# === Step 3: Product Analysis ===
labor_keywords = ['LABOR', 'INSTALL', 'SERVICE', 'WAGE', 'CONTRACT', 'EMPLOYEE']


def _mentions_labor(description):
    """Return True when any labor keyword occurs as a substring of *description*."""
    return any(keyword in description for keyword in labor_keywords)


# Upper-cased text copy of the description; the keyword check runs on it.
df['PART DESCRIPTION CLEAN'] = df['PART DESCRIPTION'].astype(str).str.upper()
df['IS_LABOR'] = df['PART DESCRIPTION CLEAN'].map(_mentions_labor)

# Count lines per original PART DESCRIPTION, skipping labor lines and lines
# with a missing description.
is_excluded_from_count = df['IS_LABOR'] | df['PART DESCRIPTION'].isna()
product_counts = (
    df[~is_excluded_from_count]
    .groupby('PART DESCRIPTION')
    .size()
    .rename('PRODUCT_ORDER_COUNT')
    .reset_index()
)

# Dense rank: the highest count gets rank 1 and tied counts share a rank.
dense_rank = product_counts['PRODUCT_ORDER_COUNT'].rank(method='dense', ascending=False)
product_counts['PRODUCT_ORDER_RANK'] = dense_rank.astype(int)

# Lines excluded from the count get NaN count/rank from the left merge.
df = df.merge(product_counts, on='PART DESCRIPTION', how='left')

# === Step 4: UOM Consistency ===
def compare_uom(row):
    """Classify how a row's purchasing UOM relates to its inventory UOM.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row) that
            provides the 'INV UOM' and 'PURCH UOM' entries.

    Returns:
        'Missing INV UOM' when 'INV UOM' is null according to ``pd.isna``;
        otherwise 'Match' when 'PURCH UOM' equals 'INV UOM', else 'Mismatch'.
        A null 'PURCH UOM' next to a present 'INV UOM' is therefore reported
        as 'Mismatch'.
    """
    inv_uom = row['INV UOM']
    # Guard first: 'PURCH UOM' is only consulted when 'INV UOM' is present.
    if pd.isna(inv_uom):
        return 'Missing INV UOM'
    return 'Match' if row['PURCH UOM'] == inv_uom else 'Mismatch'
# compare_uom reads only 'PURCH UOM' and 'INV UOM', so the row-wise apply is
# run over just those two columns instead of the whole frame.
uom_pairs = df[['PURCH UOM', 'INV UOM']]
df['UOM_COMPARISON_STATUS'] = uom_pairs.apply(compare_uom, axis=1)

# === Step 5: Key Metrics Summary (Optional) ===
# One row per distinct (PO, freight cost, product-only value) combination;
# rows with any of the three fields missing are dropped.
po_level_columns = ['PO NO', 'PROJECT_FREIGHT_COST', 'PRODUCT_ONLY_PO_VALUE']
po_summary = df[po_level_columns].drop_duplicates().dropna()

# Freight as a fraction of the non-freight PO value.
freight_to_product = po_summary['PROJECT_FREIGHT_COST'].div(po_summary['PRODUCT_ONLY_PO_VALUE'])
po_summary['FREIGHT_PERCENT_EXCL_PRODUCT'] = freight_to_product

# The mean of a boolean mask is the share of rows that satisfy it.
pct_le_10 = freight_to_product.le(0.10).mean() * 100
pct_gt_50 = freight_to_product.gt(0.50).mean() * 100
print(f"% of POs with freight ≤ 10% of product spend: {pct_le_10:.2f}%")
print(f"% of POs with freight > 50% of product spend: {pct_gt_50:.2f}%")

# === Step 6: Composite Key Consistency ===
# Build "PO NO|INVOICE ID|INVOICE NO" as text for every line.
po_text = df['PO NO'].astype(str)
invoice_id_text = df['INVOICE ID'].astype(str)
invoice_no_text = df['INVOICE NO'].astype(str)
df['PO_INVOICE_COMPOSITE_KEY'] = po_text + '|' + invoice_id_text + '|' + invoice_no_text

# Number of distinct composite keys seen under each PO.
composite_counts = df.groupby('PO NO')['PO_INVOICE_COMPOSITE_KEY'].nunique().reset_index()

# A PO is 'Consistent' only when all of its lines share a single composite key.
has_single_key = composite_counts['PO_INVOICE_COMPOSITE_KEY'].eq(1)
composite_counts['PO_COMPOSITE_KEY_CONSISTENCY'] = has_single_key.map(
    {True: 'Consistent', False: 'Inconsistent'}
)

consistency_by_po = composite_counts[['PO NO', 'PO_COMPOSITE_KEY_CONSISTENCY']]
df = df.merge(consistency_by_po, on='PO NO', how='left')

# === Export Final Enriched Dataset ===
# Write the enriched frame without its index column, then confirm on stdout.
enriched_output_path = "Freight_Analysis_Enriched_Output.csv"
df.to_csv(enriched_output_path, index=False)
print("✅ Exported: Freight_Analysis_Enriched_Output.csv")


# Output recorded from the notebook run:
# % of POs with freight ≤ 10% of product spend: 88.20%
# % of POs with freight > 50% of product spend: 2.83%
# ✅ Exported: Freight_Analysis_Enriched_Output.csv
