# This notebook shows the steps in building a splitting of purchase data pipeline

In [1]:
import pandas as pd
import numpy as np

In [2]:
%%time
df_raw = pd.read_excel("./data/split_purchase_data.xlsx")

CPU times: total: 234 ms
Wall time: 556 ms


In [3]:
df_raw

Unnamed: 0,Purchasing Department,Purchase Order,Purchase Order Date,Item,Item Category,Vendor,Total Value
0,Dept001,PO_10001001,2022-01-12,Equipment AA,Equipment,V0001,2000
1,Dept001,PO_10001002,2022-01-15,Equipment AA,Equipment,V0001,4000
2,Dept001,PO_10001003,2022-01-14,Equipment AB,Equipment,V0002,3000
3,Dept001,PO_10001004,2022-02-14,Equipment AB,Equipment,V0003,6000
4,Dept001,PO_10001005,2022-02-15,Equipment AB,Equipment,V0004,8000
5,Dept002,PO_10001006,2022-01-16,Equipment AA,Equipment,V0001,4000
6,Dept002,PO_10001007,2022-01-17,Equipment AB,Equipment,V0002,3000
7,Dept002,PO_10001008,2022-01-18,Equipment AB,Equipment,V0002,6000


In [4]:
df_raw["Purchase Order Date"] = pd.to_datetime(
    df_raw["Purchase Order Date"]
)
df_raw["Purchase Order"] = df_raw["Purchase Order"].astype(
    str
)

In [5]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Purchasing Department  8 non-null      object        
 1   Purchase Order         8 non-null      object        
 2   Purchase Order Date    8 non-null      datetime64[ns]
 3   Item                   8 non-null      object        
 4   Item Category          8 non-null      object        
 5   Vendor                 8 non-null      object        
 6   Total Value            8 non-null      int64         
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 576.0+ bytes


# Calculation of Split Purchase based on Item

In [7]:
item_field = "Item"

In [8]:
df_dept_item_sorted = df_raw.sort_values(
    ["Purchasing Department", item_field, "Purchase Order Date"]
)

In [9]:
df_dept_item_sorted["Prev PR Date"] = df_dept_item_sorted[
    "Purchase Order Date"
].shift()

In [10]:
df_dept_item_sorted["PreGroup"] = (
    df_dept_item_sorted[
        ["Purchasing Department", item_field, "Purchase Order Date"]
    ]
    .groupby(["Purchasing Department", item_field])
    .cumcount()
)

In [11]:
df_dept_item_sorted

Unnamed: 0,Purchasing Department,Purchase Order,Purchase Order Date,Item,Item Category,Vendor,Total Value,Prev PR Date,PreGroup
0,Dept001,PO_10001001,2022-01-12,Equipment AA,Equipment,V0001,2000,NaT,0
1,Dept001,PO_10001002,2022-01-15,Equipment AA,Equipment,V0001,4000,2022-01-12,1
2,Dept001,PO_10001003,2022-01-14,Equipment AB,Equipment,V0002,3000,2022-01-15,0
3,Dept001,PO_10001004,2022-02-14,Equipment AB,Equipment,V0003,6000,2022-01-14,1
4,Dept001,PO_10001005,2022-02-15,Equipment AB,Equipment,V0004,8000,2022-02-14,2
5,Dept002,PO_10001006,2022-01-16,Equipment AA,Equipment,V0001,4000,2022-02-15,0
6,Dept002,PO_10001007,2022-01-17,Equipment AB,Equipment,V0002,3000,2022-01-16,0
7,Dept002,PO_10001008,2022-01-18,Equipment AB,Equipment,V0002,6000,2022-01-17,1


In [12]:
mask = df_dept_item_sorted["PreGroup"] == 0
df_dept_item_sorted.loc[mask, "Prev PR Date"] = pd.NaT

In [13]:
# Calculating number of days difference between adjacent records
df_dept_item_sorted["Date_Diff"] = (
    df_dept_item_sorted["Purchase Order Date"]
    - df_dept_item_sorted["Prev PR Date"]
).dt.days

In [14]:
# Set Threshold
n = 5
df_dept_item_sorted["Flag Out of Range"] = (
    1 - (df_dept_item_sorted["Date_Diff"] <= n)
).astype(int)

In [15]:
df_dept_item_sorted["Grouping"] = df_dept_item_sorted["Flag Out of Range"].cumsum()

In [18]:
df_dept_item_sorted.columns

Index(['Purchasing Department', 'Purchase Order', 'Purchase Order Date',
       'Item', 'Item Category', 'Vendor', 'Total Value', 'Prev PR Date',
       'PreGroup', 'Date_Diff', 'Flag Out of Range', 'Grouping'],
      dtype='object')

In [19]:
columns = ['Purchasing Department', 'Item', 'Purchase Order', 'Purchase Order Date',
           'Prev PR Date', 'Date_Diff', 'Grouping']
df_dept_item_sorted[columns]

Unnamed: 0,Purchasing Department,Item,Purchase Order,Purchase Order Date,Prev PR Date,Date_Diff,Grouping
0,Dept001,Equipment AA,PO_10001001,2022-01-12,NaT,,1
1,Dept001,Equipment AA,PO_10001002,2022-01-15,2022-01-12,3.0,1
2,Dept001,Equipment AB,PO_10001003,2022-01-14,NaT,,2
3,Dept001,Equipment AB,PO_10001004,2022-02-14,2022-01-14,31.0,3
4,Dept001,Equipment AB,PO_10001005,2022-02-15,2022-02-14,1.0,3
5,Dept002,Equipment AA,PO_10001006,2022-01-16,NaT,,4
6,Dept002,Equipment AB,PO_10001007,2022-01-17,NaT,,5
7,Dept002,Equipment AB,PO_10001008,2022-01-18,2022-01-17,1.0,5


In [17]:
df_pr_count = (
    df_dept_item_sorted[["Grouping", item_field, "Purchase Order"]]
    .groupby(["Grouping", item_field])
    .nunique()
    .reset_index()
)

In [18]:
df_pr_count.columns = ["Grouping", item_field, "Count PR"]

In [19]:
# Identifies the groups that are valid split purchase cases
df_split_purchase_group = df_pr_count[df_pr_count["Count PR"] >= 2]

In [20]:
# Joining of Split Purchase Groups to initial details
df_split_purchase_details = df_split_purchase_group.merge(
    df_dept_item_sorted, how="inner", on=["Grouping", item_field]
)

In [21]:
df_split_purchase_details.head()

Unnamed: 0,Grouping,Item,Count PR,Purchasing Department,Purchase Order,Purchase Order Date,Item Category,Vendor,Total Value,Prev PR Date,PreGroup,Date_Diff,Flag Out of Range
0,1,Equipment AA,2,Dept001,PO_10001001,2022-01-12,Equipment,V0001,2000,NaT,0,,1
1,1,Equipment AA,2,Dept001,PO_10001002,2022-01-15,Equipment,V0001,4000,2022-01-12,1,3.0,0
2,3,Equipment AB,2,Dept001,PO_10001004,2022-02-14,Equipment,V0003,6000,2022-01-14,1,31.0,1
3,3,Equipment AB,2,Dept001,PO_10001005,2022-02-15,Equipment,V0004,8000,2022-02-14,2,1.0,0


In [22]:
df_split_purchase_details["Date_Diff"] = np.where(
    df_split_purchase_details["Flag Out of Range"] == 1,
    pd.NaT,
    df_split_purchase_details["Date_Diff"],
)

In [23]:
df_split_purchase_details.sort_values(
    ["Grouping", "Purchase Order"], inplace=True
)

In [24]:
df_split_purchase_desc = (
    df_split_purchase_details.groupby(["Grouping", item_field])
    .agg({"Purchase Order": "|".join})
    .reset_index()
)

In [25]:
df_split_purchase_desc.columns = ["Grouping", item_field, "PR with similar items"]

In [26]:
df_split_purchase_comb = df_split_purchase_details.merge(
    df_split_purchase_desc, how="inner", on=["Grouping", item_field]
)

In [27]:
df_split_purchase_comb

Unnamed: 0,Grouping,Item,Count PR,Purchasing Department,Purchase Order,Purchase Order Date,Item Category,Vendor,Total Value,Prev PR Date,PreGroup,Date_Diff,Flag Out of Range,PR with similar items
0,1,Equipment AA,2,Dept001,PO_10001001,2022-01-12,Equipment,V0001,2000,NaT,0,NaT,1,PO_10001001|PO_10001002
1,1,Equipment AA,2,Dept001,PO_10001002,2022-01-15,Equipment,V0001,4000,2022-01-12,1,3.0,0,PO_10001001|PO_10001002
2,3,Equipment AB,2,Dept001,PO_10001004,2022-02-14,Equipment,V0003,6000,2022-01-14,1,NaT,1,PO_10001004|PO_10001005
3,3,Equipment AB,2,Dept001,PO_10001005,2022-02-15,Equipment,V0004,8000,2022-02-14,2,1.0,0,PO_10001004|PO_10001005


### Derive futher prioritization based on rules

- Was the threshold avoided due to the split ?
- Are the splits belonging to the same vendor ?
- Are the dates diff within one day ?

In [28]:
df_priority_score = (
    df_split_purchase_comb.groupby(["Grouping"])
    .agg(
        {
            "Total Value": ["min", "sum"],
            "Date_Diff": "min",
            "Vendor": ["nunique", "count"],
        }
    )
    .reset_index()
)

In [29]:
df_priority_score.columns = [
    "Grouping",
    "Min_Value",
    "Max_Value",
    "Min_Date_Diff",
    "Unique_Vendor",
    "Count_Vendor",
]

In [30]:
value_threshold = 5_000
df_priority_score["Flag_Threshold_Crossed"] = (
    (df_priority_score["Min_Value"] < value_threshold)
    & (df_priority_score["Max_Value"] >= value_threshold)
).astype(int)

In [31]:
df_priority_score["Flag_Same_Vendor"] = (
    (df_priority_score["Unique_Vendor"] == df_priority_score["Count_Vendor"])
).astype(int)

In [32]:
days_threshold = 1
df_priority_score["Flag_Min_Days"] = (
    df_priority_score["Min_Date_Diff"] <= days_threshold
).astype(int)

In [33]:
df_priority_score["Score"] = (
    df_priority_score["Flag_Threshold_Crossed"]
    + df_priority_score["Flag_Same_Vendor"]
    + df_priority_score["Flag_Min_Days"]
)

In [34]:
df_priority_score

Unnamed: 0,Grouping,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,1,2000,6000,3.0,1,2,1,0,0,1
1,3,6000,14000,1.0,2,2,0,1,1,2


In [35]:
df_sop_item = df_split_purchase_comb.merge(df_priority_score, how="left", on="Grouping")

In [36]:
df_sop_item.insert(0, "Item Type", f"1_{item_field}")

In [37]:
df_sop_item.head()

Unnamed: 0,Item Type,Grouping,Item,Count PR,Purchasing Department,Purchase Order,Purchase Order Date,Item Category,Vendor,Total Value,...,PR with similar items,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,1_Item,1,Equipment AA,2,Dept001,PO_10001001,2022-01-12,Equipment,V0001,2000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,0,0,1
1,1_Item,1,Equipment AA,2,Dept001,PO_10001002,2022-01-15,Equipment,V0001,4000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,0,0,1
2,1_Item,3,Equipment AB,2,Dept001,PO_10001004,2022-02-14,Equipment,V0003,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
3,1_Item,3,Equipment AB,2,Dept001,PO_10001005,2022-02-15,Equipment,V0004,8000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2


## Performing the calculations for Split Purchase based on Item Categories

In [38]:
item_field = "Item Category"

In [39]:
df_dept_item_sorted = df_raw.sort_values(
    ["Purchasing Department", item_field, "Purchase Order Date"]
)

In [40]:
df_dept_item_sorted["Prev PR Date"] = df_dept_item_sorted[
    "Purchase Order Date"
].shift()

In [41]:
df_dept_item_sorted["PreGroup"] = (
    df_dept_item_sorted[
        ["Purchasing Department", item_field, "Purchase Order Date"]
    ]
    .groupby(["Purchasing Department", item_field])
    .cumcount()
)

In [42]:
df_dept_item_sorted.head()

Unnamed: 0,Purchasing Department,Purchase Order,Purchase Order Date,Item,Item Category,Vendor,Total Value,Prev PR Date,PreGroup
0,Dept001,PO_10001001,2022-01-12,Equipment AA,Equipment,V0001,2000,NaT,0
2,Dept001,PO_10001003,2022-01-14,Equipment AB,Equipment,V0002,3000,2022-01-12,1
1,Dept001,PO_10001002,2022-01-15,Equipment AA,Equipment,V0001,4000,2022-01-14,2
3,Dept001,PO_10001004,2022-02-14,Equipment AB,Equipment,V0003,6000,2022-01-15,3
4,Dept001,PO_10001005,2022-02-15,Equipment AB,Equipment,V0004,8000,2022-02-14,4


In [43]:
mask = df_dept_item_sorted["PreGroup"] == 0
df_dept_item_sorted.loc[mask, "Prev PR Date"] = pd.NaT

In [44]:
# Calculating number of days difference between adjacent records
df_dept_item_sorted["Date_Diff"] = (
    df_dept_item_sorted["Purchase Order Date"]
    - df_dept_item_sorted["Prev PR Date"]
).dt.days

In [45]:
# Set Threshold
n = 5
df_dept_item_sorted["Flag Out of Range"] = (
    1 - (df_dept_item_sorted["Date_Diff"] <= n)
).astype(int)

In [46]:
df_dept_item_sorted["Grouping"] = df_dept_item_sorted["Flag Out of Range"].cumsum()

In [47]:
df_dept_item_sorted.head()

Unnamed: 0,Purchasing Department,Purchase Order,Purchase Order Date,Item,Item Category,Vendor,Total Value,Prev PR Date,PreGroup,Date_Diff,Flag Out of Range,Grouping
0,Dept001,PO_10001001,2022-01-12,Equipment AA,Equipment,V0001,2000,NaT,0,,1,1
2,Dept001,PO_10001003,2022-01-14,Equipment AB,Equipment,V0002,3000,2022-01-12,1,2.0,0,1
1,Dept001,PO_10001002,2022-01-15,Equipment AA,Equipment,V0001,4000,2022-01-14,2,1.0,0,1
3,Dept001,PO_10001004,2022-02-14,Equipment AB,Equipment,V0003,6000,2022-01-15,3,30.0,1,2
4,Dept001,PO_10001005,2022-02-15,Equipment AB,Equipment,V0004,8000,2022-02-14,4,1.0,0,2


In [48]:
df_pr_count = (
    df_dept_item_sorted[["Grouping", item_field, "Purchase Order"]]
    .groupby(["Grouping", item_field])
    .nunique()
    .reset_index()
)

In [49]:
df_pr_count.columns = ["Grouping", item_field, "Count PR"]

In [50]:
# Identifies the groups that are valid split purchase cases
df_split_purchase_group = df_pr_count[df_pr_count["Count PR"] >= 2]

In [51]:
# Joining of Split Purchase Groups to initial details
df_split_purchase_details = df_split_purchase_group.merge(
    df_dept_item_sorted, how="inner", on=["Grouping", item_field]
)

In [52]:
df_split_purchase_details.head()

Unnamed: 0,Grouping,Item Category,Count PR,Purchasing Department,Purchase Order,Purchase Order Date,Item,Vendor,Total Value,Prev PR Date,PreGroup,Date_Diff,Flag Out of Range
0,1,Equipment,3,Dept001,PO_10001001,2022-01-12,Equipment AA,V0001,2000,NaT,0,,1
1,1,Equipment,3,Dept001,PO_10001003,2022-01-14,Equipment AB,V0002,3000,2022-01-12,1,2.0,0
2,1,Equipment,3,Dept001,PO_10001002,2022-01-15,Equipment AA,V0001,4000,2022-01-14,2,1.0,0
3,2,Equipment,2,Dept001,PO_10001004,2022-02-14,Equipment AB,V0003,6000,2022-01-15,3,30.0,1
4,2,Equipment,2,Dept001,PO_10001005,2022-02-15,Equipment AB,V0004,8000,2022-02-14,4,1.0,0


In [53]:
df_split_purchase_details["Date_Diff"] = np.where(
    df_split_purchase_details["Flag Out of Range"] == 1,
    pd.NaT,
    df_split_purchase_details["Date_Diff"],
)

In [54]:
df_split_purchase_details.sort_values(
    ["Grouping", "Purchase Order"], inplace=True
)

In [55]:
df_split_purchase_desc = (
    df_split_purchase_details.groupby(["Grouping", item_field])
    .agg({"Purchase Order": "|".join})
    .reset_index()
)

In [56]:
df_split_purchase_desc.columns = ["Grouping", item_field, "PR with similar items"]

In [57]:
df_split_purchase_comb = df_split_purchase_details.merge(
    df_split_purchase_desc, how="inner", on=["Grouping", item_field]
)

In [58]:
df_split_purchase_comb

Unnamed: 0,Grouping,Item Category,Count PR,Purchasing Department,Purchase Order,Purchase Order Date,Item,Vendor,Total Value,Prev PR Date,PreGroup,Date_Diff,Flag Out of Range,PR with similar items
0,1,Equipment,3,Dept001,PO_10001001,2022-01-12,Equipment AA,V0001,2000,NaT,0,NaT,1,PO_10001001|PO_10001002|PO_10001003
1,1,Equipment,3,Dept001,PO_10001002,2022-01-15,Equipment AA,V0001,4000,2022-01-14,2,1.0,0,PO_10001001|PO_10001002|PO_10001003
2,1,Equipment,3,Dept001,PO_10001003,2022-01-14,Equipment AB,V0002,3000,2022-01-12,1,2.0,0,PO_10001001|PO_10001002|PO_10001003
3,2,Equipment,2,Dept001,PO_10001004,2022-02-14,Equipment AB,V0003,6000,2022-01-15,3,NaT,1,PO_10001004|PO_10001005
4,2,Equipment,2,Dept001,PO_10001005,2022-02-15,Equipment AB,V0004,8000,2022-02-14,4,1.0,0,PO_10001004|PO_10001005
5,3,Equipment,2,Dept002,PO_10001006,2022-01-16,Equipment AA,V0001,4000,NaT,0,NaT,1,PO_10001006|PO_10001007
6,3,Equipment,2,Dept002,PO_10001007,2022-01-17,Equipment AB,V0002,3000,2022-01-16,1,1.0,0,PO_10001006|PO_10001007


### Derive futher prioritization based on rules

- Was the threshold avoided due to the split ?
- Are the splits belonging to the same vendor ?
- Are the dates diff within one day ?

In [59]:
df_priority_score = (
    df_split_purchase_comb.groupby(["Grouping"])
    .agg(
        {
            "Total Value": ["min", "sum"],
            "Date_Diff": "min",
            "Vendor": ["nunique", "count"],
        }
    )
    .reset_index()
)

In [60]:
df_priority_score.columns = [
    "Grouping",
    "Min_Value",
    "Max_Value",
    "Min_Date_Diff",
    "Unique_Vendor",
    "Count_Vendor",
]

In [61]:
value_threshold = 5_000
df_priority_score["Flag_Threshold_Crossed"] = (
    (df_priority_score["Min_Value"] < value_threshold)
    & (df_priority_score["Max_Value"] >= value_threshold)
).astype(int)

In [62]:
df_priority_score["Flag_Same_Vendor"] = (
    (df_priority_score["Unique_Vendor"] == df_priority_score["Count_Vendor"])
).astype(int)

In [63]:
days_threshold = 1
df_priority_score["Flag_Min_Days"] = (
    df_priority_score["Min_Date_Diff"] <= days_threshold
).astype(int)

In [64]:
df_priority_score["Score"] = (
    df_priority_score["Flag_Threshold_Crossed"]
    + df_priority_score["Flag_Same_Vendor"]
    + df_priority_score["Flag_Min_Days"]
)

In [65]:
df_priority_score

Unnamed: 0,Grouping,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,1,2000,9000,1.0,2,3,1,0,1,2
1,2,6000,14000,1.0,2,2,0,1,1,2
2,3,3000,7000,1.0,2,2,1,1,1,3


In [66]:
df_sop_itemcat = df_split_purchase_comb.merge(
    df_priority_score, how="left", on="Grouping"
)

In [67]:
df_sop_itemcat.insert(0, "Item Type", f"2_{item_field}")

In [68]:
df_sop_itemcat.head()

Unnamed: 0,Item Type,Grouping,Item Category,Count PR,Purchasing Department,Purchase Order,Purchase Order Date,Item,Vendor,Total Value,...,PR with similar items,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,2_Item Category,1,Equipment,3,Dept001,PO_10001001,2022-01-12,Equipment AA,V0001,2000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
1,2_Item Category,1,Equipment,3,Dept001,PO_10001002,2022-01-15,Equipment AA,V0001,4000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
2,2_Item Category,1,Equipment,3,Dept001,PO_10001003,2022-01-14,Equipment AB,V0002,3000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
3,2_Item Category,2,Equipment,2,Dept001,PO_10001004,2022-02-14,Equipment AB,V0003,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
4,2_Item Category,2,Equipment,2,Dept001,PO_10001005,2022-02-15,Equipment AB,V0004,8000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2


## Combine Details

In [69]:
df_sop_comb = pd.concat([df_sop_item, df_sop_itemcat])

In [70]:
df_sop_comb

Unnamed: 0,Item Type,Grouping,Item,Count PR,Purchasing Department,Purchase Order,Purchase Order Date,Item Category,Vendor,Total Value,...,PR with similar items,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,1_Item,1,Equipment AA,2,Dept001,PO_10001001,2022-01-12,Equipment,V0001,2000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,0,0,1
1,1_Item,1,Equipment AA,2,Dept001,PO_10001002,2022-01-15,Equipment,V0001,4000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,0,0,1
2,1_Item,3,Equipment AB,2,Dept001,PO_10001004,2022-02-14,Equipment,V0003,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
3,1_Item,3,Equipment AB,2,Dept001,PO_10001005,2022-02-15,Equipment,V0004,8000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
0,2_Item Category,1,Equipment AA,3,Dept001,PO_10001001,2022-01-12,Equipment,V0001,2000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
1,2_Item Category,1,Equipment AA,3,Dept001,PO_10001002,2022-01-15,Equipment,V0001,4000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
2,2_Item Category,1,Equipment AB,3,Dept001,PO_10001003,2022-01-14,Equipment,V0002,3000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
3,2_Item Category,2,Equipment AB,2,Dept001,PO_10001004,2022-02-14,Equipment,V0003,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
4,2_Item Category,2,Equipment AB,2,Dept001,PO_10001005,2022-02-15,Equipment,V0004,8000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
5,2_Item Category,3,Equipment AA,2,Dept002,PO_10001006,2022-01-16,Equipment,V0001,4000,...,PO_10001006|PO_10001007,3000,7000,1.0,2,2,1,1,1,3


In [71]:
# With the different methods, there will be Groupings with the exact same PR combination and items flagged, we can remove those
df_sop_comb.sort_values(["Grouping", "Item"], inplace=True)

In [72]:
df_sop_grouping_check = (
    df_sop_comb.groupby(["Grouping", "Item Type", "PR with similar items", "Score"])
    .agg({"Item": "|".join})
    .reset_index()
)

In [73]:
df_sop_grouping_check.columns = [
    "Grouping",
    "Item Type",
    "PR with similar items",
    "Score",
    "Items in Group",
]

In [74]:
df_sop_grouping_check.sort_values(
    ["PR with similar items", "Items in Group", "Item Type", "Score"],
    ascending=[True, True, True, False],
    inplace=True,
)

In [75]:
# Identify Groups with the same PRs and Items
df_sop_grouping_check[
    df_sop_grouping_check.duplicated(
        subset=["PR with similar items", "Items in Group"], keep=False
    )
]

Unnamed: 0,Grouping,Item Type,PR with similar items,Score,Items in Group
3,3,1_Item,PO_10001004|PO_10001005,2,Equipment AB|Equipment AB
2,2,2_Item Category,PO_10001004|PO_10001005,2,Equipment AB|Equipment AB


In [76]:
df_sop_grouping_unique = df_sop_grouping_check.drop_duplicates(
    subset=["PR with similar items", "Items in Group"], keep="first"
)
valid_groupings = set(df_sop_grouping_unique["Grouping"].tolist())

In [77]:
df_sop_comb_final = df_sop_comb[df_sop_comb["Grouping"].isin(valid_groupings)]

In [78]:
df_sop_comb_final

Unnamed: 0,Item Type,Grouping,Item,Count PR,Purchasing Department,Purchase Order,Purchase Order Date,Item Category,Vendor,Total Value,...,PR with similar items,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,1_Item,1,Equipment AA,2,Dept001,PO_10001001,2022-01-12,Equipment,V0001,2000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,0,0,1
1,1_Item,1,Equipment AA,2,Dept001,PO_10001002,2022-01-15,Equipment,V0001,4000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,0,0,1
0,2_Item Category,1,Equipment AA,3,Dept001,PO_10001001,2022-01-12,Equipment,V0001,2000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
1,2_Item Category,1,Equipment AA,3,Dept001,PO_10001002,2022-01-15,Equipment,V0001,4000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
2,2_Item Category,1,Equipment AB,3,Dept001,PO_10001003,2022-01-14,Equipment,V0002,3000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
5,2_Item Category,3,Equipment AA,2,Dept002,PO_10001006,2022-01-16,Equipment,V0001,4000,...,PO_10001006|PO_10001007,3000,7000,1.0,2,2,1,1,1,3
2,1_Item,3,Equipment AB,2,Dept001,PO_10001004,2022-02-14,Equipment,V0003,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
3,1_Item,3,Equipment AB,2,Dept001,PO_10001005,2022-02-15,Equipment,V0004,8000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
6,2_Item Category,3,Equipment AB,2,Dept002,PO_10001007,2022-01-17,Equipment,V0002,3000,...,PO_10001006|PO_10001007,3000,7000,1.0,2,2,1,1,1,3
