In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:


# Read the freight-cost CSV into `df`. The file is decoded as latin1 and
# low_memory=False is passed through to the pandas CSV reader.
file_path = "../../data/input/Freight_Cost_Analysis_CY2024-03.25.csv"
df = pd.read_csv(
    file_path,
    low_memory=False,
    encoding="latin1",
)
df.head(2)

In [None]:

# === Load Commodity Groups ===
# Read the 'Commodity Groups' sheet of the IFS Cloud workbook as the lookup table.
commodity_df = pd.read_excel(
    '../../data/input/IFS Cloud Commodity Groups.xlsx',
    sheet_name='Commodity Groups',
)

# Build the join key 'COMM 1' as text on both sides:
#   - lookup: a string copy of 'Commodity Group'
#   - main data: the existing 'COMM 1' column cast to string
commodity_df['COMM 1'] = commodity_df['Commodity Group'].astype(str)
df['COMM 1'] = df['COMM 1'].astype(str)

# Left join keeps every row of df; lookup columns are NaN where no key matches.
# NOTE(review): repeated 'COMM 1' values in the lookup would duplicate rows of df — confirm the sheet is unique per group.
merged_df = pd.merge(df, commodity_df, how='left', on='COMM 1')

# A non-null 'Commodity Group' after the join means the lookup found a match.
merged_df['Match Commodity'] = merged_df['Commodity Group'].notna().map(
    {True: 'Commodity Found', False: 'Commodity Not Found'}
)
merged_df.head(2)

In [None]:
# Standardise the unit-of-measure abbreviations in 'INV UOM' (SF -> SQFT, SY -> SQYD).
# The values are trimmed and upper-cased BEFORE the replacement because
# Series.replace with a dict only matches whole values exactly: without this,
# entries such as 'sf' or 'SF ' would never be translated and would later be
# labelled 'Unclassified' instead of 'Classified'.
merged_df['INV UOM'] = (
    merged_df['INV UOM']
    .str.strip()
    .str.upper()
    .replace({'SF': 'SQFT', 'SY': 'SQYD'})
)
merged_df.head(2)

In [None]:

# === Load Manufacturers ===
# Read the 'Sheet1' sheet of the manufacturer workbook as the supplier lookup table.
manufacturer_df = pd.read_excel(
    '../../data/input/Manufacturer List.xlsx',
    sheet_name='Sheet1',
)

# Add 'SUPPLIER NO' as a string copy of 'Supplier No'; it is the key used to join
# this lookup onto the main data.
manufacturer_df['SUPPLIER NO'] = manufacturer_df['Supplier No'].astype(str)
manufacturer_df.head(2)


In [None]:

# Cast the main data's 'SUPPLIER NO' to string so it compares like-for-like with
# the string key built on manufacturer_df.
merged_df['SUPPLIER NO'] = merged_df['SUPPLIER NO'].astype(str)

# Left join on 'SUPPLIER NO', bringing in only the lookup's 'Supplier Name' column.
# NOTE(review): repeated 'SUPPLIER NO' values in the lookup would duplicate invoice lines — confirm the list is unique per supplier.
supplier_lookup = manufacturer_df[['SUPPLIER NO', 'Supplier Name']]
merged_df2 = pd.merge(merged_df, supplier_lookup, how='left', on='SUPPLIER NO')

# A non-null 'Supplier Name' after the join means the supplier is in the lookup.
merged_df2['Match Supplier'] = merged_df2['Supplier Name'].notna().map(
    {True: 'Supplier registered', False: 'No supplier found'}
)

# Preview the joined result
merged_df2.head(2)


In [None]:
# data cleaning function to standardise the description conversion
# This function will classify the commodity based on the description
def classify_commodity(row):
    """Derive the standardised commodity description for one row.

    Reads row['Description'] (and row['COMM 2'] for vinyl items only):

    - description contains 'vinyl' (any case): the alphabetic characters of
      'COMM 2', joined together;
    - description contains 'carpet' (any case): 'Carpet Tiles' when the
      lower-cased description is exactly 'carpet tile', otherwise 'Carpet Roll';
    - anything else: the description converted to a string, otherwise unchanged.
    """
    description = str(row['Description'])
    lowered = description.lower()

    if 'vinyl' in lowered:
        comm2 = str(row['COMM 2'])
        return ''.join(ch for ch in comm2 if ch.isalpha())

    if 'carpet' not in lowered:
        return description  # default: keep the original description text

    # Every carpet description is a roll except the exact 'carpet tile' label.
    return 'Carpet Tiles' if lowered == 'carpet tile' else 'Carpet Roll'

# Run classify_commodity on every row (axis=1) and store the result in its own
# column, so the original 'Description' column is left as it was.
merged_df2['new commodity description'] = merged_df2.apply(
    classify_commodity,
    axis=1,
)
# Preview the result
merged_df2.head(2)

In [None]:
# This function will classify the commodity from old codes to new codes
def map_commodity_group(x):
    """Translate a legacy commodity group code to its new code.

    Mapping (exact match on the whole code):
        10  -> '1CBL'
        100 -> '1CPT'
        40  -> '1VNL'

    Any other value, including a missing value, is returned unchanged.

    The code is compared as text after trimming surrounding whitespace.
    Whole-number floats are rendered without the trailing '.0' (10.0 -> '10')
    so a column that pandas stored as float still maps correctly.

    An exact match is required on purpose: a substring test such as
    `'10' in code` also fires for '100', which made the 100 -> '1CPT' rule
    unreachable and mis-mapped every other code containing '10'.
    """
    legacy_to_new = {'10': '1CBL', '100': '1CPT', '40': '1VNL'}

    if isinstance(x, float) and x.is_integer():
        # NaN and infinities are not "integer" floats, so they fall to the else branch.
        key = str(int(x))
    else:
        key = str(x).strip()

    # Unknown codes keep their original value (and original type).
    return legacy_to_new.get(key, x)

# Translate each 'Commodity Group' value element-wise and store the result in a
# separate 'new commodity group' column.
merged_df2['new commodity group'] = merged_df2['Commodity Group'].map(map_commodity_group)


In [None]:

# Preview the first two rows of merged_df2
merged_df2.head(2)

In [None]:
# List the distinct values present in 'new commodity description'
merged_df2['new commodity description'].unique()

In [None]:
# From here on `df` is the working dataset. It is bound to the same object as
# merged_df2 (no copy is made), so the columns added below appear under both names.
df = merged_df2

# Trim whitespace and upper-case 'INV UOM' so the comparisons below do not depend
# on case or padding.
df['INV UOM'] = df['INV UOM'].str.strip().str.upper()

# Label each row by its unit of measure:
#   'Classified'   - 'SQFT' or 'SQYD'
#   'No UOM'       - missing or empty string
#   'Unclassified' - anything else
df['Classification'] = 'Unclassified'
df.loc[df['INV UOM'].isna() | df['INV UOM'].eq(''), 'Classification'] = 'No UOM'
df.loc[df['INV UOM'].isin(['SQFT', 'SQYD']), 'Classification'] = 'Classified'

# Build 'conversion_code' as
#   <new commodity description, spaces replaced by '_'>_<new commodity group>_<INV UOM>
# Each part is cast to string before concatenation.
df['conversion_code'] = (
    df['new commodity description'].str.replace(' ', '_', regex=True).astype(str)
    + '_'
    + df['new commodity group'].astype(str)
    + '_'
    + df['INV UOM'].astype(str)
)
df.head(2)

In [None]:
# Which invoices have every ACCOUNT == 2008 line labelled 'Classified'?
# Step 1: For each invoice, test whether all of its ACCOUNT == 2008 rows are 'Classified'
classified_invoice_ids = (
    df.loc[df['ACCOUNT'] == 2008]
    .groupby('INVOICE ID')['Classification']
    .apply(lambda labels: labels.eq('Classified').all())
)

# Step 2: Keep the invoice IDs for which that test is True
fully_classified_ids = classified_invoice_ids.loc[classified_invoice_ids].index

# Step 3: Set the flag on every row of those invoices (not only their 2008 rows)
df['All Accounts 2008 Classified'] = df['INVOICE ID'].isin(fully_classified_ids)

# Step 4: Count the distinct invoices that carry the flag
count_all_classified_invoices = df.loc[df['All Accounts 2008 Classified'], 'INVOICE ID'].nunique()

print(f"Number of invoices where all ACCOUNT == 2008 are classified: {count_all_classified_invoices}")


In [None]:
# Which invoices have a known conversion code on every ACCOUNT == 2008 line?
# Step 1: Load the rates table that lists the known conversion codes
rates_df = pd.read_csv('../../app/conversion_table_standardized.csv')  # adjust path and filename

# Step 2: Compare codes as text on both sides
rates_df['conversion_code'] = rates_df['conversion_code'].astype(str)
df['conversion_code'] = df['conversion_code'].astype(str)

# Step 3: Collect the known codes into a set for membership tests
valid_codes = set(rates_df['conversion_code'])

# Step 4: Work on an independent copy of the ACCOUNT == 2008 rows
df_2008 = df.loc[df['ACCOUNT'] == 2008].copy()

# Step 5: Per invoice, test whether every 2008-row code is in the known set
invoice_validity = (
    df_2008
    .groupby('INVOICE ID')['conversion_code']
    .apply(lambda codes: codes.isin(valid_codes).all())
)

# Step 6: Keep the invoice IDs for which that test is True
fully_valid_invoice_ids = invoice_validity.loc[invoice_validity].index

# Step 7: Set the flag on every row of those invoices in the main df
df['All 2008 Accounts Converted'] = df['INVOICE ID'].isin(fully_valid_invoice_ids)

# Step 8: Count the distinct invoices that carry the flag
count_all_valid_invoices = df.loc[df['All 2008 Accounts Converted'], 'INVOICE ID'].nunique()

print(f"Number of invoices where all ACCOUNT == 2008 have valid conversion codes: {count_all_valid_invoices}")


In [None]:
# Step 1: Collect the INVOICE IDs that appear on at least one ACCOUNT == 5504 row
freight_invoice_ids = df.loc[df['ACCOUNT'] == 5504, 'INVOICE ID'].unique()

# Step 2: Set the flag on every row whose INVOICE ID is in that collection
df['Has Freight Line'] = df['INVOICE ID'].isin(freight_invoice_ids)

# Step 3: Count the distinct invoices that carry the flag
count_freight_invoices = df.loc[df['Has Freight Line'], 'INVOICE ID'].nunique()

print(f"Number of invoices with at least one freight line: {count_freight_invoices}")

In [None]:
# 
# Count the ACCOUNT == 5504 rows on each invoice
freight_count = df[df['ACCOUNT'] == 5504].groupby('INVOICE ID').size()

# Flag every row whose invoice has more than one ACCOUNT == 5504 row.
# isin() produces a plain boolean column directly (False for invoices with no
# 5504 row), the same way the 'Has Freight Line' flag is built. The previous
# map(...).fillna(False) produced an object-dtype column whenever some invoices
# were absent from freight_count and depended on fillna's implicit downcasting.
df['Multiple Freight Lines'] = df['INVOICE ID'].isin(freight_count[freight_count > 1].index)

# Count the distinct invoices that carry the flag
count_multiple_freight_invoices = df[df['Multiple Freight Lines']]['INVOICE ID'].nunique()

print(f"Number of invoices with multiple freight lines: {count_multiple_freight_invoices}")

# Display the updated DataFrame
df.head(2)

In [None]:
# Step 1: Restrict to the ACCOUNT == 2008 rows
df_2008 = df[df['ACCOUNT'] == 2008]

# Step 2: Count the distinct 'PART NO' values on each invoice
component_count = df_2008.groupby('INVOICE ID')['PART NO'].nunique()

# Step 3: Flag every row whose invoice has more than one distinct 'PART NO'.
# isin() produces a plain boolean column directly (False for invoices with no
# ACCOUNT == 2008 row). The previous map(...).fillna(False) produced an
# object-dtype column whenever some invoices were absent from component_count
# and depended on fillna's implicit downcasting.
df['Multiple Parts'] = df['INVOICE ID'].isin(component_count[component_count > 1].index)

# Step 4: Count the distinct invoices that carry the flag
count_multiple_parts_invoices = df[df['Multiple Parts']]['INVOICE ID'].nunique()

# Step 5: Print the result for quick validation
print(f"Number of invoices with multiple distinct parts: {count_multiple_parts_invoices}")

# Step 6: Preview the updated DataFrame
df.head(2)


In [None]:
# Step 1: Restrict to the ACCOUNT == 2008 rows
df_2008 = df.loc[df['ACCOUNT'] == 2008]

# Step 2: Per invoice, test whether every one of those rows has Priority == 'Yes'
priority_flag = (
    df_2008
    .groupby('INVOICE ID')['Priority']
    .apply(lambda flags: flags.eq('Yes').all())
)

# Step 3: Keep the invoice IDs for which that test is True
priority_invoice_ids = priority_flag.loc[priority_flag].index

# Step 4: Set the flag on every row of those invoices in the full dataframe
df['All Priority Products (2008)'] = df['INVOICE ID'].isin(priority_invoice_ids)

# Step 5: Count the distinct invoices that carry the flag
count_priority_invoices = df.loc[df['All Priority Products (2008)'], 'INVOICE ID'].nunique()

# Step 6: Print the result for quick validation
print(f"Number of invoices where all ACCOUNT == 2008 have Priority == 'Yes': {count_priority_invoices}")


In [None]:
# Step 1: Per invoice, test whether at least one ACCOUNT == 2008 row has Priority == 'Yes'.
# This reuses df_2008 (the ACCOUNT == 2008 rows) as defined by an earlier cell.
priority_flag_any = (
    df_2008
    .groupby('INVOICE ID')['Priority']
    .apply(lambda flags: flags.eq('Yes').any())
)

# Step 2: Keep the invoice IDs for which that test is True
priority_invoice_ids_any = priority_flag_any.loc[priority_flag_any].index

# Step 3: Set the flag on every row of those invoices in the full dataframe
df['Any Priority Product (2008)'] = df['INVOICE ID'].isin(priority_invoice_ids_any)

# Step 4: Count the distinct invoices that carry the flag
count_any_priority_invoices = df.loc[df['Any Priority Product (2008)'], 'INVOICE ID'].nunique()

# Step 5: Print the result for quick validation
print(f"Number of invoices where at least one ACCOUNT == 2008 has Priority == 'Yes': {count_any_priority_invoices}")

In [None]:

# Columns to select from df, in the order they are selected.
template_columns = [
    'PROJECT ID', 'PROJECT NAME', 'PO NO',
    'ACCOUNT', 'ACCOUNT DESCRIPTION',
    'SITE', 'SITE DESCRIPTION',
    'SUPPLIER NO', 'SUPPLIER NAME',
    'PART NO', 'PART DESCRIPTION',
    'INVOICED LINE QTY', 'INVOICE ID', 'INVOICE NO',
    'INV UOM', 'COMM 1', 'COMM 2',
    'Commodity Group', 'Description', 'Old/New', 'Priority',
    'Classification', 'conversion_code', 'INVOICE LINE TOTAL',
    'Has Freight Line', 'Multiple Freight Lines', 'Multiple Parts',
    'All Priority Products (2008)', 'Any Priority Product (2008)',
    'Match Commodity', 'Match Supplier',
    'new commodity description', 'new commodity group',
    'All Accounts 2008 Classified', 'All 2008 Accounts Converted',
]

In [None]:
# Source column name -> output column name used when renaming the selected columns.
column_mapping = {
    'PROJECT ID':                   'project_id',
    'PROJECT NAME':                 'project_name',
    'PO NO':                        'po_no',
    'INVOICE ID':                   'invoice_id',
    'INVOICE NO':                   'invoice_no',
    'ACCOUNT':                      'account',
    'ACCOUNT DESCRIPTION':          'account_description',
    'SITE':                         'siteid',
    'SITE DESCRIPTION':             'site',
    'SUPPLIER NO':                  'supplierid',
    'SUPPLIER NAME':                'suppliername',
    'INVOICED LINE QTY':            'quantity',
    'PART NO':                      'partnumber',
    'PART DESCRIPTION':             'partdescription',
    'COMM 1':                       'comm1',
    'COMM 2':                       'comm2',
    'Commodity Group':              'commodity_group',
    'Description':                  'commoditydescription',
    'INV UOM':                      'uom',
    'Priority':                     'priority',
    'Classification':               'classification',
    'conversion_code':              'conversion_code',
    'Old/New':                      'old_new',
    'Has Freight Line':             'freight_invoice',
    'INVOICE LINE TOTAL':           'invoice_line_total',
    'Multiple Freight Lines':       'multiple_freight_lines',
    'Multiple Parts':               'multiple_parts',
    'All Priority Products (2008)': 'all_priority_products',
    'Any Priority Product (2008)':  'any_priority_products',
    'Match Commodity':              'match_commodity',
    'Match Supplier':               'match_supplier',
    'new commodity description':    'new_commodity_description',
    'new commodity group':          'new_commodity_group',
    'All Accounts 2008 Classified': 'all_accounts_2008_classified',
    'All 2008 Accounts Converted':  'all_2008_accounts_converted',
}

# Select the template columns (in template order), then rename them via column_mapping
mapped_df = df.loc[:, template_columns].rename(columns=column_mapping)

# Preview the first rows of the renamed frame
mapped_df.head()

In [None]:
# Distinct invoice count per (account, account_description) pair, largest first
account_summary = (
    mapped_df
    .groupby(['account', 'account_description'])['invoice_id']
    .nunique()
    .reset_index(name='Unique Invoice Count')
    .sort_values('Unique Invoice Count', ascending=False)
)

# Show the top 20 rows of the summary
account_summary.head(20)

In [None]:
# Keep only the rows whose invoice is flagged as having a freight line
project_freight_df = mapped_df.loc[mapped_df['freight_invoice'].eq(True)]

# Distinct invoice count per (account, account_description) pair within that
# subset, largest first. This rebinds account_summary.
account_summary = (
    project_freight_df
    .groupby(['account', 'account_description'])['invoice_id']
    .nunique()
    .reset_index(name='Unique Invoice Count')
    .sort_values('Unique Invoice Count', ascending=False)
)

# Show the top 20 rows of the summary
account_summary.head(20)

In [None]:
# Number of distinct invoice IDs in mapped_df at this point
unique_invoices = mapped_df.loc[:, 'invoice_id'].nunique()
print(f"Unique Invoices in mapped_df: {unique_invoices}")

In [None]:
# Keep only the records that have a project ID (drop rows where 'project_id' is missing)
mapped_df = mapped_df.dropna(subset=['project_id'])
mapped_df.head(2)

In [None]:
# Number of distinct invoice IDs in mapped_df at this point
unique_invoices = mapped_df.loc[:, 'invoice_id'].nunique()
print(f"Unique Invoices in mapped_df: {unique_invoices}")

In [None]:
# Keep only the rows that have a PO number, then count their distinct invoices
filtered_df = mapped_df.dropna(subset=['po_no'])
unique_invoice_ids_with_po_no = filtered_df.loc[:, 'invoice_id'].nunique()
print(f"Unique Invoice IDs with PO NO: {unique_invoice_ids_with_po_no}")

In [None]:
# From the rows that have a PO number, keep those where 'freight_invoice' is True
freight_filtered_df = filtered_df.loc[filtered_df['freight_invoice'].eq(True)]

# Count the distinct invoice IDs that remain
unique_priority_invoice_ids = freight_filtered_df.loc[:, 'invoice_id'].nunique()

# Report the count
print(f"Unique Invoice IDs with PO NO and Freight Price = True: {unique_priority_invoice_ids}")

In [None]:
# From the PO + freight rows, keep those where 'any_priority_products' is True
priority_filtered_df = freight_filtered_df.loc[freight_filtered_df['any_priority_products'].eq(True)]

# Count the distinct invoice IDs that remain
unique_priority_invoice_ids = priority_filtered_df.loc[:, 'invoice_id'].nunique()

# Report the count
print(f"Unique Invoice IDs with PO NO ,Freight and Any Priority Products = True: {unique_priority_invoice_ids}")

In [None]:
# From the PO + freight rows, keep those where 'all_priority_products' is True
all_priority_filtered_df = freight_filtered_df.loc[freight_filtered_df['all_priority_products'].eq(True)]

# Count the distinct invoice IDs that remain
unique_priority_invoice_ids = all_priority_filtered_df.loc[:, 'invoice_id'].nunique()

# Report the count
print(f"Unique Invoice IDs with PO NO ,Freight and All Priority Products = True: {unique_priority_invoice_ids}")

In [None]:
# From the PO + freight + all-priority rows, keep those whose invoice has every
# ACCOUNT == 2008 line classified. In mapped_df that flag is named
# 'all_accounts_2008_classified' (renamed from 'All Accounts 2008 Classified' by
# column_mapping); mapped_df has no 'all_classified' column, so indexing by that
# name raised KeyError.
priority_filtered_df = all_priority_filtered_df[
    all_priority_filtered_df['all_accounts_2008_classified'] == True
]

# Calculate the unique invoice IDs
unique_priority_invoice_ids = priority_filtered_df['invoice_id'].nunique()

# Print the result
print(f"Unique Invoice IDs with PO NO ,Freight and All Priority Products and all classified = True: {unique_priority_invoice_ids}")

In [None]:
# Put the current local time (to the second) into the output file name
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f'../../data/output/enhanced_data_{timestamp}.csv'

# Write mapped_df to CSV without the index column
mapped_df.to_csv(output_path, index=False)
# Display the first few rows of the filtered DataFrame  