In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [3]:


# Load your dataset
file_path = "../../data/input/Freight_Cost_Analysis_CY2024-03.25.csv"
df = pd.read_csv(file_path, encoding="latin1", low_memory=False)
df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PART DESCRIPTION,COMM 1,COMM 2,PO PURCH QTY,PURCH UOM,PO INV QTY,INV UOM,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,,,,,,-600.0,
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,PROTECT ALL PRE-NOTCHED Z-BAR INSIDE CORNER AL...,1ACC,1ACC,4.0,EA,4.0,EA,4.0,18.4,18.4


In [4]:

#  === Load Commodity Groups ===
# Load the commodity groups from the Excel file
commodity_df = pd.read_excel('../../data/input/IFS Cloud Commodity Groups.xlsx', sheet_name='Commodity Groups')
commodity_df.head()


# Convert 'Commodity Group' to string and create a new column 'COMM 1'
commodity_df['COMM 1'] = commodity_df['Commodity Group'].astype(str)

# Convert 'Commodity Group' to string in the main DataFrame
df['COMM 1'] = df['COMM 1'].astype(str)

# Perform the join on the 'COMM 1' column
merged_df = df.merge(commodity_df, on='COMM 1', how='left')
# Flag matched and unmatched rows clearly
merged_df['Match Commodity'] = merged_df['Commodity Group'].apply(
    lambda x: 'Commodity Found' if pd.notna(x) else 'Commodity Not Found'
)
merged_df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PO INV QTY,INV UOM,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE,Commodity Group,Description,Old/New,Priority,Match Commodity
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,-600.0,,,,,,Commodity Not Found
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,4.0,EA,4.0,18.4,18.4,1ACC,Accessories,New Commodity,No,Commodity Found


In [5]:
# Replace values in the 'uom' column
merged_df['INV UOM'] = merged_df['INV UOM'].replace({'SF': 'SQFT', 'SY': 'SQYD'})
merged_df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PO INV QTY,INV UOM,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE,Commodity Group,Description,Old/New,Priority,Match Commodity
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,-600.0,,,,,,Commodity Not Found
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,4.0,EA,4.0,18.4,18.4,1ACC,Accessories,New Commodity,No,Commodity Found


In [6]:

#  === Load Manufacturers ===
# Load the manufacturers from the Excel file
manufacturer_df = pd.read_excel('../../data/input/Manufacturer List.xlsx', sheet_name='Sheet1')
manufacturer_df.head()

# Convert 'Commodity Group' to string and create a new column 'COMM 1'
manufacturer_df['SUPPLIER NO'] = manufacturer_df['Supplier No'].astype(str)
manufacturer_df.head(2)


Unnamed: 0,Supplier No,Supplier Name,SUPPLIER NO
0,104471,Adleta Corporation,104471
1,128340,"AHF, LLC dba AHF Products",128340


In [7]:

# Convert 'Commodity Group' to string in the main DataFrame
merged_df['SUPPLIER NO'] = merged_df['SUPPLIER NO'].astype(str)
# Perform the join on the 'COMM 1' column
merged_df2 = merged_df.merge(manufacturer_df[['SUPPLIER NO','Supplier Name']], on='SUPPLIER NO', how='left')

merged_df2['Match Supplier'] = merged_df2['Supplier Name'].apply(
    lambda x: 'Supplier found' if pd.notna(x) else 'No supplier found'
)

# Display the updated DataFrame
merged_df2.head(2)


Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,INVOICED LINE QTY,INVOICE LINE TOTAL,PO PRICE,Commodity Group,Description,Old/New,Priority,Match Commodity,Supplier Name,Match Supplier
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,-600.0,,,,,,Commodity Not Found,,No supplier found
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,4.0,18.4,18.4,1ACC,Accessories,New Commodity,No,Commodity Found,,No supplier found


In [8]:
df = merged_df2
# Normalize the 'INV UOM' column to handle case sensitivity and strip spaces
df['INV UOM'] = df['INV UOM'].str.strip().str.upper()

# Classify rows based on 'INV UOM' values
df['Classification'] = df.apply(
    lambda row: 'Classified' if row['INV UOM'] in ['SQFT', 'SQYD']
    else ('No UOM' if pd.isna(row['INV UOM']) or row['INV UOM'] == '' else 'Unclassified'),
    axis=1
)
# Create a new column 'conversion_code' based on the 'Description' + 'Comodity Group' + 'INV UOM' column
df['conversion_code'] = df['Description'].str.replace(' ', '_', regex=True).astype(str) + '_' + df['Commodity Group'].astype(str) + '_' + df['INV UOM'].astype(str)
df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,PO PRICE,Commodity Group,Description,Old/New,Priority,Match Commodity,Supplier Name,Match Supplier,Classification,conversion_code
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,,,Commodity Not Found,,No supplier found,No UOM,nan_nan_nan
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,18.4,1ACC,Accessories,New Commodity,No,Commodity Found,,No supplier found,Unclassified,Accessories_1ACC_EA


In [9]:
# Group by INVOICE ID and count the number of rows where ACCOUNT == 5504
freight_count = df[df['ACCOUNT'] == 5504].groupby('INVOICE ID').size()

# Flag invoices with more than 1 Project Freight line item
df['Multiple Freight Lines'] = df['INVOICE ID'].map(freight_count > 1).fillna(False)

# Display the updated DataFrame
df.head(2)

  df['Multiple Freight Lines'] = df['INVOICE ID'].map(freight_count > 1).fillna(False)


Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,Commodity Group,Description,Old/New,Priority,Match Commodity,Supplier Name,Match Supplier,Classification,conversion_code,Multiple Freight Lines
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,,Commodity Not Found,,No supplier found,No UOM,nan_nan_nan,False
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,1ACC,Accessories,New Commodity,No,Commodity Found,,No supplier found,Unclassified,Accessories_1ACC_EA,False


In [10]:
# Group by INVOICE ID and count the number of rows where ACCOUNT == 5504
component_count = df[df['ACCOUNT'] == 2008].groupby('INVOICE ID').size()

# Flag invoices with more than 1 Project Freight line item
df['Multiple Parts'] = df['INVOICE ID'].map(component_count > 1).fillna(False)

# Display the updated DataFrame
df.head(2)

  df['Multiple Parts'] = df['INVOICE ID'].map(component_count > 1).fillna(False)


Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,Description,Old/New,Priority,Match Commodity,Supplier Name,Match Supplier,Classification,conversion_code,Multiple Freight Lines,Multiple Parts
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,,,,Commodity Not Found,,No supplier found,No UOM,nan_nan_nan,False,False
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,Accessories,New Commodity,No,Commodity Found,,No supplier found,Unclassified,Accessories_1ACC_EA,False,True


In [11]:
# Check if all products in each invoice with ACCOUNT == 2008 have Priority == 'Yes'
df['All Priority Products (2008)'] = df[df['ACCOUNT'] == 2008].groupby('INVOICE ID')['Priority'].transform(lambda x: all(x == 'Yes'))

# Display the updated DataFrame
df[['INVOICE ID', 'Priority', 'All Priority Products (2008)']].head()

Unnamed: 0,INVOICE ID,Priority,All Priority Products (2008)
0,433731,,
1,433340,No,False
2,433340,No,False
3,433340,,
4,433340,,


In [12]:
# Check if all products in each invoice with ACCOUNT == 2008 have Priority == 'Yes'
df['Any Priority Products (2008)'] = df[df['ACCOUNT'] == 2008].groupby('INVOICE ID')['Priority'].transform(lambda x: any(x == 'Yes'))

# Display the updated DataFrame
df[['INVOICE ID', 'Priority', 'Any Priority Products (2008)']].head()

Unnamed: 0,INVOICE ID,Priority,Any Priority Products (2008)
0,433731,,
1,433340,No,True
2,433340,No,True
3,433340,,
4,433340,,


In [13]:
# Filter rows where ACCOUNT is 5504
freight_invoices = df[df['ACCOUNT'] == 5504]

# Check if INVOICE ID exists in the filtered DataFrame
df['Freight Invoice'] = df['INVOICE ID'].isin(freight_invoices['INVOICE ID'])

# Display the result
df[['INVOICE ID', 'Freight Invoice']]
df.head(2)

Unnamed: 0,SITE,SITE DESCRIPTION,SUPPLIER NO,SUPPLIER NAME,INVOICE ID,INVOICE NO,DATE POSTED,PROJECT ID,PROJECT NAME,ACCOUNT,...,Match Commodity,Supplier Name,Match Supplier,Classification,conversion_code,Multiple Freight Lines,Multiple Parts,All Priority Products (2008),Any Priority Products (2008),Freight Invoice
0,BNB,Beckers New Brighton,102548,Lonseal Flooring,433731,0007795-CM,3-Jan-24,2311121922,REGIONS HOSPITAL 4TH MRI,5400,...,Commodity Not Found,,No supplier found,No UOM,nan_nan_nan,False,False,,,False
1,BNB,Beckers New Brighton,104716,Hank's Specialties,433340,173373,3-Jan-24,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,2008,...,Commodity Found,,No supplier found,Unclassified,Accessories_1ACC_EA,False,True,False,True,True


In [14]:
df.columns

Index(['SITE', 'SITE DESCRIPTION', 'SUPPLIER NO', 'SUPPLIER NAME',
       'INVOICE ID', 'INVOICE NO', 'DATE POSTED', 'PROJECT ID', 'PROJECT NAME',
       'ACCOUNT', 'ACCOUNT DESCRIPTION', 'PLANNED DELIVERY DATE',
       'SHIP TO ZIP', 'PO NO', 'PO LINE NO', 'PO REL NO', 'RECEIPT NO',
       'PART NO', 'PART DESCRIPTION', 'COMM 1', 'COMM 2', 'PO PURCH QTY',
       'PURCH UOM', 'PO INV QTY', 'INV UOM', 'INVOICED LINE QTY',
       'INVOICE LINE TOTAL', 'PO PRICE', 'Commodity Group', 'Description',
       'Old/New', 'Priority', 'Match Commodity', 'Supplier Name',
       'Match Supplier', 'Classification', 'conversion_code',
       'Multiple Freight Lines', 'Multiple Parts',
       'All Priority Products (2008)', 'Any Priority Products (2008)',
       'Freight Invoice'],
      dtype='object')

In [15]:

template_columns = ['PROJECT ID','PROJECT NAME','PO NO','ACCOUNT', 'ACCOUNT DESCRIPTION',
                     'SITE','SITE DESCRIPTION',
                     'SUPPLIER NO', 'SUPPLIER NAME', 
                     'PART NO', 'PART DESCRIPTION',
                    'INVOICED LINE QTY','INVOICE ID',
                    'INVOICE NO','INV UOM','COMM 1','COMM 2',
                    'Commodity Group', 'Description',
                    'Old/New', 'Priority', 'Classification',
                    'conversion_code','INVOICE LINE TOTAL',
                    'Freight Invoice','Multiple Freight Lines',
                    'Multiple Parts','All Priority Products (2008)',
                    'Any Priority Products (2008)','Match Commodity','Match Supplier']

In [16]:
# Define the mapping of template columns to the desired column names
column_mapping = {
    
    'PROJECT ID':'project_id',
    'PROJECT NAME':'project_name',
    'PO NO': 'po_no',
    'INVOICE ID': 'invoice_id',
    'INVOICE NO': 'invoice_no',
    'ACCOUNT':'account', 
    'ACCOUNT DESCRIPTION':'account_description',
    'SITE': 'siteid',
    'SITE DESCRIPTION': 'site',
    'SUPPLIER NO': 'supplierid',
    'SUPPLIER NAME': 'suppliername',
    'INVOICED LINE QTY': 'quantity',
    'PART NO': 'partnumber',
    'PART DESCRIPTION':'partdescription',
    'COMM 1': 'comm1',
    'COMM 2': 'comm2',
    'Commodity Group': 'commodity_group',
    'Description': 'commoditydescription',
    'INV UOM': 'uom',
    'Priority':'priority', 
    'Classification': 'classification',
    'conversion_code': 'conversion_code',
    'Old/New': 'old_new',
    'Freight Invoice':'freight_invoice',
    'INVOICE LINE TOTAL': 'invoice_line_total',
    'Multiple Freight Lines':'multiple_freight_lines',
    'Multiple Parts':'multiple_parts',
    'All Priority Products (2008)':'all_priority_products',
    'Any Priority Products (2008)':'any_priority_products',
    'Match Commodity':'match_commodity',
    'Match Supplier':'match_supplier'
}

# Rename the columns in the DataFrame
mapped_df = df[template_columns].rename(columns=column_mapping)

# Display the first few rows of the mapped DataFrame
mapped_df.head()

Unnamed: 0,project_id,project_name,po_no,account,account_description,siteid,site,supplierid,suppliername,partnumber,...,classification,conversion_code,invoice_line_total,freight_invoice,multiple_freight_lines,multiple_parts,all_priority_products,any_priority_products,match_commodity,match_supplier
0,2311121922,REGIONS HOSPITAL 4TH MRI,,5400,PROJECT Sub-Contract Labor,BNB,Beckers New Brighton,102548,Lonseal Flooring,,...,No UOM,nan_nan_nan,-600.0,False,False,False,,,Commodity Not Found,No supplier found
1,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,48180.0,2008,Received Not Yet Invoiced,BNB,Beckers New Brighton,104716,Hank's Specialties,1000007968.0,...,Unclassified,Accessories_1ACC_EA,18.4,True,False,True,False,True,Commodity Found,No supplier found
2,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,48180.0,2008,Received Not Yet Invoiced,BNB,Beckers New Brighton,104716,Hank's Specialties,1000008162.0,...,Unclassified,Transitions_1TRAN_EA,45.36,True,False,True,False,True,Commodity Found,No supplier found
3,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,48180.0,5504,PROJECT Freight,BNB,Beckers New Brighton,104716,Hank's Specialties,,...,No UOM,nan_nan_nan,50.0,True,False,True,,,Commodity Not Found,No supplier found
4,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,48180.0,2015,Sales Tax Payable,BNB,Beckers New Brighton,104716,Hank's Specialties,,...,No UOM,nan_nan_nan,151.42,True,False,True,,,Commodity Not Found,No supplier found


In [17]:
# Fill missing values in 'all_priority_products' and 'any_priority_products' using 'match_commodity'
mapped_df['all_priority_products'] = mapped_df['all_priority_products'].fillna(mapped_df['match_commodity'])
mapped_df['any_priority_products'] = mapped_df['any_priority_products'].fillna(mapped_df['match_commodity'])
mapped_df.head(2)

Unnamed: 0,project_id,project_name,po_no,account,account_description,siteid,site,supplierid,suppliername,partnumber,...,classification,conversion_code,invoice_line_total,freight_invoice,multiple_freight_lines,multiple_parts,all_priority_products,any_priority_products,match_commodity,match_supplier
0,2311121922,REGIONS HOSPITAL 4TH MRI,,5400,PROJECT Sub-Contract Labor,BNB,Beckers New Brighton,102548,Lonseal Flooring,,...,No UOM,nan_nan_nan,-600.0,False,False,False,Commodity Not Found,Commodity Not Found,Commodity Not Found,No supplier found
1,2312127706,UOFM MOLECULAR & CELLULAR BIOLOGY P,48180.0,2008,Received Not Yet Invoiced,BNB,Beckers New Brighton,104716,Hank's Specialties,1000007968.0,...,Unclassified,Accessories_1ACC_EA,18.4,True,False,True,False,True,Commodity Found,No supplier found


In [18]:
# Group by ACCOUNT and ACCOUNT DESCRIPTION, and count unique INVOICE IDs
account_summary = mapped_df.groupby(['account', 'account_description'])['invoice_id'].nunique()

# Convert the result to a DataFrame for better readability
account_summary = account_summary.reset_index(name='Unique Invoice Count').sort_values('Unique Invoice Count', ascending=False)

# Display the summary
account_summary.head(20)

Unnamed: 0,account,account_description,Unique Invoice Count
23,2008,Received Not Yet Invoiced,152613
72,5504,PROJECT Freight,55942
66,5205,PROJECT PO Variance M181 & M182,24488
63,5200,PROJECT Supplies and Materials,7403
28,2015,Sales Tax Payable,7069
74,5599,PROJECT Use Tax for Jobs,6036
97,6108,Warehouse Trash,4744
69,5500,PROJECT Other Direct Costs,4433
108,6207,Delivery Other,3317
88,5999,Purchase Discounts,2079


In [19]:
# Group by ACCOUNT and ACCOUNT DESCRIPTION, and count unique INVOICE IDs
project_freight_df = mapped_df[mapped_df['freight_invoice'] == True]

account_summary = project_freight_df.groupby(['account', 'account_description'])['invoice_id'].nunique()

# Convert the result to a DataFrame for better readability
account_summary = account_summary.reset_index(name='Unique Invoice Count').sort_values('Unique Invoice Count', ascending=False)

# Display the summary
account_summary.head(20)

Unnamed: 0,account,account_description,Unique Invoice Count
20,5504,PROJECT Freight,55942
6,2008,Received Not Yet Invoiced,50702
15,5205,PROJECT PO Variance M181 & M182,13975
22,5599,PROJECT Use Tax for Jobs,2479
9,2015,Sales Tax Payable,2205
12,5200,PROJECT Supplies and Materials,1702
18,5500,PROJECT Other Direct Costs,908
25,5999,Purchase Discounts,777
4,1420,CARE Stewardship Fee,229
7,2009,2008  Received Not Yet Invoiced  WinBid,198


In [20]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

mapped_df.to_csv(f'../../data/output/enhanced_data_{timestamp}.csv', index=False)
# Display the first few rows of the filtered DataFrame  

### Sample Size 

In [21]:
# Total sample size 
mapped_df.shape

(441707, 31)

In [22]:
#Unique invoice 
mapped_df['invoice_id'].nunique()

188087

In [23]:
#Unique PO with project ID
unique_invoices_with_project_id = mapped_df[mapped_df['project_id'].notnull()]['invoice_id'].nunique()
unique_invoices_with_project_id

161241

In [24]:
#Unique invoice with PO
unique_invoices_with_po_no = mapped_df[mapped_df['po_no'].notnull()]['invoice_id'].nunique()
unique_invoices_with_po_no

159202

In [25]:
#Unique invoice with project ID and Purchase Order No 
unique_invoices_with_project_and_po = mapped_df[mapped_df['project_id'].notnull() & mapped_df['po_no'].notnull()]['invoice_id'].nunique()
unique_invoices_with_project_and_po

152548

In [26]:
# Unique invoice with project ID and Purchace order and freight invoice
unique_invoices_with_freight_and_po = mapped_df[
    (mapped_df['project_id'].notnull()) & 
    (mapped_df['po_no'].notnull()) &
    (mapped_df['freight_invoice'] == True)
     ]['invoice_id'].nunique()
unique_invoices_with_freight_and_po

51927

In [27]:
# Unique invoice with project ID and Purchace order and freight invoice and priority products
unique_invoices_with_po_freight_and_priority_any = mapped_df[
    (mapped_df['project_id'].notnull()) & 
    (mapped_df['po_no'].notnull()) &
    (mapped_df['freight_invoice'] == True) &
    (mapped_df['any_priority_products'] == True)
]['invoice_id'].nunique()
unique_invoices_with_po_freight_and_priority_any

18523

In [28]:
# Unique invoice with project ID and Purchace order and freight invoice and priority products
unique_invoices_with_po_freight_and_priority_all = mapped_df[
    (mapped_df['project_id'].notnull()) & 
    (mapped_df['po_no'].notnull()) &
    (mapped_df['freight_invoice'] == True) &
    (mapped_df['all_priority_products'] == True)
]['invoice_id'].nunique()
unique_invoices_with_po_freight_and_priority_all

13378

## constraints flags 

In [29]:
# Group by INVOICE ID and check if all rows in the 'Classification' column are 'Classified'
mapped_df['classification_all_priority'] = mapped_df.groupby('invoice_id')['classification'].transform(lambda x: all(x == 'Classified'))

# Display the updated DataFrame
mapped_df[['invoice_id', 'classification', 'classification_all_priority']].head()

Unnamed: 0,invoice_id,classification,classification_all_priority
0,433731,No UOM,False
1,433340,Unclassified,False
2,433340,Unclassified,False
3,433340,No UOM,False
4,433340,No UOM,False


In [35]:
classification_all_priority_count = mapped_df['classification_all_priority'].sum()
classification_all_priority_count

5774

In [34]:
count = mapped_df['classification_all_priority']==True
count.count()

441707