# This notebook shows the steps in building a splitting of purchase data pipeline

In [270]:
import pandas as pd
import numpy as np

In [271]:
%%time
df_raw = pd.read_excel("./data/split_purchase_data.xlsx")

CPU times: total: 15.6 ms
Wall time: 21.7 ms


In [272]:
df_raw

Unnamed: 0,Purchasing Department,Purchase Order,Purchase Order Date,Item,Item Category,Vendor,Total Value
0,Dept001,PO_10001001,2022-01-12,TV,Audio Visual System,Vendor_1,2000
1,Dept001,PO_10001002,2022-01-15,TV,Audio Visual System,Vendor_1,4000
2,Dept001,PO_10001003,2022-01-14,Stereo System,Audio Visual System,Vendor_2,3000
3,Dept001,PO_10001004,2022-02-14,Stereo System,Audio Visual System,Vendor_3,6000
4,Dept001,PO_10001005,2022-02-15,Stereo System,Audio Visual System,Vendor_4,8000
5,Dept002,PO_10001006,2022-01-16,TV,Audio Visual System,Vendor_1,4000
6,Dept002,PO_10001007,2022-01-17,Stereo System,Audio Visual System,Vendor_2,3000
7,Dept002,PO_10001008,2022-01-18,Stereo System,Audio Visual System,Vendor_2,6000


In [273]:
df_raw["Purchase Order Date"] = pd.to_datetime(
    df_raw["Purchase Order Date"]
)
df_raw["Purchase Order"] = df_raw["Purchase Order"].astype(
    str
)

In [274]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Purchasing Department  8 non-null      object        
 1   Purchase Order         8 non-null      object        
 2   Purchase Order Date    8 non-null      datetime64[ns]
 3   Item                   8 non-null      object        
 4   Item Category          8 non-null      object        
 5   Vendor                 8 non-null      object        
 6   Total Value            8 non-null      int64         
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 576.0+ bytes


# Calculation of Split Purchase based on Item

In [275]:
item_field = "Item"

df_dept_item_sorted = df_raw.sort_values(
    ["Purchasing Department", item_field, "Purchase Order Date"]
)

df_dept_item_sorted["Prev PO Date"] = df_dept_item_sorted[
    "Purchase Order Date"
].shift()

df_dept_item_sorted["PreGroup"] = (
    df_dept_item_sorted[
        ["Purchasing Department", item_field, "Purchase Order Date"]
    ]
    .groupby(["Purchasing Department", item_field])
    .cumcount()
)

In [276]:
df_dept_item_sorted

Unnamed: 0,Purchasing Department,Purchase Order,Purchase Order Date,Item,Item Category,Vendor,Total Value,Prev PO Date,PreGroup
2,Dept001,PO_10001003,2022-01-14,Stereo System,Audio Visual System,Vendor_2,3000,NaT,0
3,Dept001,PO_10001004,2022-02-14,Stereo System,Audio Visual System,Vendor_3,6000,2022-01-14,1
4,Dept001,PO_10001005,2022-02-15,Stereo System,Audio Visual System,Vendor_4,8000,2022-02-14,2
0,Dept001,PO_10001001,2022-01-12,TV,Audio Visual System,Vendor_1,2000,2022-02-15,0
1,Dept001,PO_10001002,2022-01-15,TV,Audio Visual System,Vendor_1,4000,2022-01-12,1
6,Dept002,PO_10001007,2022-01-17,Stereo System,Audio Visual System,Vendor_2,3000,2022-01-15,0
7,Dept002,PO_10001008,2022-01-18,Stereo System,Audio Visual System,Vendor_2,6000,2022-01-17,1
5,Dept002,PO_10001006,2022-01-16,TV,Audio Visual System,Vendor_1,4000,2022-01-18,0


In [277]:
mask = df_dept_item_sorted["PreGroup"] == 0
df_dept_item_sorted.loc[mask, "Prev PO Date"] = pd.NaT

# Calculating number of days difference between adjacent records
df_dept_item_sorted["Date_Diff"] = (
    df_dept_item_sorted["Purchase Order Date"]
    - df_dept_item_sorted["Prev PO Date"]
).dt.days

# Set Threshold
n = 5
df_dept_item_sorted["Flag Out of Range"] = (
    1 - (df_dept_item_sorted["Date_Diff"] <= n)
).astype(int)

df_dept_item_sorted["Grouping"] = df_dept_item_sorted["Flag Out of Range"].cumsum()

In [278]:
df_dept_item_sorted.columns

Index(['Purchasing Department', 'Purchase Order', 'Purchase Order Date',
       'Item', 'Item Category', 'Vendor', 'Total Value', 'Prev PO Date',
       'PreGroup', 'Date_Diff', 'Flag Out of Range', 'Grouping'],
      dtype='object')

In [279]:
columns = ['Purchasing Department', 'Item', 'Purchase Order', 'Purchase Order Date',
           'Prev PO Date', 'Date_Diff', 'Grouping']
df_dept_item_sorted[columns]

Unnamed: 0,Purchasing Department,Item,Purchase Order,Purchase Order Date,Prev PO Date,Date_Diff,Grouping
2,Dept001,Stereo System,PO_10001003,2022-01-14,NaT,,1
3,Dept001,Stereo System,PO_10001004,2022-02-14,2022-01-14,31.0,2
4,Dept001,Stereo System,PO_10001005,2022-02-15,2022-02-14,1.0,2
0,Dept001,TV,PO_10001001,2022-01-12,NaT,,3
1,Dept001,TV,PO_10001002,2022-01-15,2022-01-12,3.0,3
6,Dept002,Stereo System,PO_10001007,2022-01-17,NaT,,4
7,Dept002,Stereo System,PO_10001008,2022-01-18,2022-01-17,1.0,4
5,Dept002,TV,PO_10001006,2022-01-16,NaT,,5


In [280]:
df_po_count = (
    df_dept_item_sorted[["Grouping", item_field, "Purchase Order"]]
    .groupby(["Grouping", item_field])
    .nunique()
    .reset_index()
)

df_po_count.columns = ["Grouping", item_field, "Count PO"]

# Identifies the groups that are valid split purchase cases
df_split_purchase_group = df_po_count[df_po_count["Count PO"] >= 2]

# Joining of Split Purchase Groups to initial details
df_split_purchase_details = df_split_purchase_group.merge(
    df_dept_item_sorted, how="inner", on=["Grouping", item_field]
)

In [281]:
df_split_purchase_details

Unnamed: 0,Grouping,Item,Count PO,Purchasing Department,Purchase Order,Purchase Order Date,Item Category,Vendor,Total Value,Prev PO Date,PreGroup,Date_Diff,Flag Out of Range
0,2,Stereo System,2,Dept001,PO_10001004,2022-02-14,Audio Visual System,Vendor_3,6000,2022-01-14,1,31.0,1
1,2,Stereo System,2,Dept001,PO_10001005,2022-02-15,Audio Visual System,Vendor_4,8000,2022-02-14,2,1.0,0
2,3,TV,2,Dept001,PO_10001001,2022-01-12,Audio Visual System,Vendor_1,2000,NaT,0,,1
3,3,TV,2,Dept001,PO_10001002,2022-01-15,Audio Visual System,Vendor_1,4000,2022-01-12,1,3.0,0
4,4,Stereo System,2,Dept002,PO_10001007,2022-01-17,Audio Visual System,Vendor_2,3000,NaT,0,,1
5,4,Stereo System,2,Dept002,PO_10001008,2022-01-18,Audio Visual System,Vendor_2,6000,2022-01-17,1,1.0,0


In [282]:
df_split_purchase_details["Date_Diff"] = np.where(
    df_split_purchase_details["Flag Out of Range"] == 1,
    pd.NaT,
    df_split_purchase_details["Date_Diff"],
)

df_split_purchase_details.sort_values(
    ["Grouping", "Purchase Order"], inplace=True
)

df_split_purchase_desc = (
    df_split_purchase_details.groupby(["Grouping", item_field])
    .agg({"Purchase Order": "|".join})
    .reset_index()
)

df_split_purchase_desc.columns = ["Grouping", item_field, "PO with similar items"]

df_split_purchase_comb = df_split_purchase_details.merge(
    df_split_purchase_desc, how="inner", on=["Grouping", item_field]
)

In [283]:
columns = ['Purchasing Department', 'Item', 'Purchase Order', 'Purchase Order Date',
           'Prev PO Date', 'Date_Diff', 'Grouping', 'PO with similar items']
df_split_purchase_comb[columns]

Unnamed: 0,Purchasing Department,Item,Purchase Order,Purchase Order Date,Prev PO Date,Date_Diff,Grouping,PO with similar items
0,Dept001,Stereo System,PO_10001004,2022-02-14,2022-01-14,NaT,2,PO_10001004|PO_10001005
1,Dept001,Stereo System,PO_10001005,2022-02-15,2022-02-14,1.0,2,PO_10001004|PO_10001005
2,Dept001,TV,PO_10001001,2022-01-12,NaT,NaT,3,PO_10001001|PO_10001002
3,Dept001,TV,PO_10001002,2022-01-15,2022-01-12,3.0,3,PO_10001001|PO_10001002
4,Dept002,Stereo System,PO_10001007,2022-01-17,NaT,NaT,4,PO_10001007|PO_10001008
5,Dept002,Stereo System,PO_10001008,2022-01-18,2022-01-17,1.0,4,PO_10001007|PO_10001008


### Derive futher prioritization based on rules

- Was the threshold of 5,000 avoided due to the split ?
- Are the splits belonging to the same vendor ?
- Are the dates diff within two days ?

In [284]:
df_priority_score = (
    df_split_purchase_comb.groupby(["Grouping"])
    .agg(
        {
            "Total Value": ["min", "sum"],
            "Date_Diff": "min",
            "Vendor": ["nunique", "count"],
        }
    )
    .reset_index()
)

df_priority_score.columns = [
    "Grouping",
    "Min_Value",
    "Max_Value",
    "Min_Date_Diff",
    "Unique_Vendor",
    "Count_Vendor",
]

value_threshold = 5_000
df_priority_score["Flag_Threshold_Crossed"] = (
    (df_priority_score["Min_Value"] < value_threshold)
    & (df_priority_score["Max_Value"] >= value_threshold)
).astype(int)

df_priority_score["Flag_Same_Vendor"] = (
    (df_priority_score["Unique_Vendor"] < df_priority_score["Count_Vendor"])
).astype(int)

days_threshold = 2
df_priority_score["Flag_Min_Days"] = (
    df_priority_score["Min_Date_Diff"] <= days_threshold
).astype(int)

df_priority_score["Score"] = (
    df_priority_score["Flag_Threshold_Crossed"]
    + df_priority_score["Flag_Same_Vendor"]
    + df_priority_score["Flag_Min_Days"]
)

In [290]:
df_priority_score

Unnamed: 0,Grouping,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,2,6000,14000,1.0,2,2,0,0,1,1
1,3,2000,6000,3.0,1,2,1,1,0,2
2,4,3000,9000,1.0,1,2,1,1,1,3


In [291]:
df_sop_item = df_split_purchase_comb.merge(df_priority_score, how="left", on="Grouping")

In [292]:
df_sop_item.insert(0, "Item Type", f"1_{item_field}")

In [293]:
df_sop_item.columns

Index(['Item Type', 'Grouping', 'Item', 'Count PO', 'Purchasing Department',
       'Purchase Order', 'Purchase Order Date', 'Item Category', 'Vendor',
       'Total Value', 'Prev PO Date', 'PreGroup', 'Date_Diff',
       'Flag Out of Range', 'PO with similar items', 'Min_Value', 'Max_Value',
       'Min_Date_Diff', 'Unique_Vendor', 'Count_Vendor',
       'Flag_Threshold_Crossed', 'Flag_Same_Vendor', 'Flag_Min_Days', 'Score'],
      dtype='object')

In [294]:
df_sop_item

Unnamed: 0,Item Type,Grouping,Item,Count PO,Purchasing Department,Purchase Order,Purchase Order Date,Item Category,Vendor,Total Value,...,PO with similar items,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,1_Item,2,Stereo System,2,Dept001,PO_10001004,2022-02-14,Audio Visual System,Vendor_3,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,0,1,1
1,1_Item,2,Stereo System,2,Dept001,PO_10001005,2022-02-15,Audio Visual System,Vendor_4,8000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,0,1,1
2,1_Item,3,TV,2,Dept001,PO_10001001,2022-01-12,Audio Visual System,Vendor_1,2000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,1,0,2
3,1_Item,3,TV,2,Dept001,PO_10001002,2022-01-15,Audio Visual System,Vendor_1,4000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,1,0,2
4,1_Item,4,Stereo System,2,Dept002,PO_10001007,2022-01-17,Audio Visual System,Vendor_2,3000,...,PO_10001007|PO_10001008,3000,9000,1.0,1,2,1,1,1,3
5,1_Item,4,Stereo System,2,Dept002,PO_10001008,2022-01-18,Audio Visual System,Vendor_2,6000,...,PO_10001007|PO_10001008,3000,9000,1.0,1,2,1,1,1,3


## Performing the calculations for Split Purchase based on Item Categories

In [295]:
item_field = "Item Category"

In [296]:
df_dept_item_sorted = df_raw.sort_values(
    ["Purchasing Department", item_field, "Purchase Order Date"]
)

In [297]:
df_dept_item_sorted["Prev PO Date"] = df_dept_item_sorted[
    "Purchase Order Date"
].shift()

In [298]:
df_dept_item_sorted["PreGroup"] = (
    df_dept_item_sorted[
        ["Purchasing Department", item_field, "Purchase Order Date"]
    ]
    .groupby(["Purchasing Department", item_field])
    .cumcount()
)

In [299]:
df_dept_item_sorted.head()

Unnamed: 0,Purchasing Department,Purchase Order,Purchase Order Date,Item,Item Category,Vendor,Total Value,Prev PO Date,PreGroup
0,Dept001,PO_10001001,2022-01-12,TV,Audio Visual System,Vendor_1,2000,NaT,0
2,Dept001,PO_10001003,2022-01-14,Stereo System,Audio Visual System,Vendor_2,3000,2022-01-12,1
1,Dept001,PO_10001002,2022-01-15,TV,Audio Visual System,Vendor_1,4000,2022-01-14,2
3,Dept001,PO_10001004,2022-02-14,Stereo System,Audio Visual System,Vendor_3,6000,2022-01-15,3
4,Dept001,PO_10001005,2022-02-15,Stereo System,Audio Visual System,Vendor_4,8000,2022-02-14,4


In [300]:
mask = df_dept_item_sorted["PreGroup"] == 0
df_dept_item_sorted.loc[mask, "Prev PO Date"] = pd.NaT

In [301]:
# Calculating number of days difference between adjacent records
df_dept_item_sorted["Date_Diff"] = (
    df_dept_item_sorted["Purchase Order Date"]
    - df_dept_item_sorted["Prev PO Date"]
).dt.days

In [302]:
# Set Threshold
n = 5
df_dept_item_sorted["Flag Out of Range"] = (
    1 - (df_dept_item_sorted["Date_Diff"] <= n)
).astype(int)

In [303]:
df_dept_item_sorted["Grouping"] = df_dept_item_sorted["Flag Out of Range"].cumsum()

In [304]:
df_dept_item_sorted.head()

Unnamed: 0,Purchasing Department,Purchase Order,Purchase Order Date,Item,Item Category,Vendor,Total Value,Prev PO Date,PreGroup,Date_Diff,Flag Out of Range,Grouping
0,Dept001,PO_10001001,2022-01-12,TV,Audio Visual System,Vendor_1,2000,NaT,0,,1,1
2,Dept001,PO_10001003,2022-01-14,Stereo System,Audio Visual System,Vendor_2,3000,2022-01-12,1,2.0,0,1
1,Dept001,PO_10001002,2022-01-15,TV,Audio Visual System,Vendor_1,4000,2022-01-14,2,1.0,0,1
3,Dept001,PO_10001004,2022-02-14,Stereo System,Audio Visual System,Vendor_3,6000,2022-01-15,3,30.0,1,2
4,Dept001,PO_10001005,2022-02-15,Stereo System,Audio Visual System,Vendor_4,8000,2022-02-14,4,1.0,0,2


In [305]:
df_po_count = (
    df_dept_item_sorted[["Grouping", item_field, "Purchase Order"]]
    .groupby(["Grouping", item_field])
    .nunique()
    .reset_index()
)

In [306]:
df_po_count.columns = ["Grouping", item_field, "Count PO"]

In [307]:
# Identifies the groups that are valid split purchase cases
df_split_purchase_group = df_po_count[df_po_count["Count PO"] >= 2]

In [308]:
# Joining of Split Purchase Groups to initial details
df_split_purchase_details = df_split_purchase_group.merge(
    df_dept_item_sorted, how="inner", on=["Grouping", item_field]
)

In [309]:
df_split_purchase_details.head()

Unnamed: 0,Grouping,Item Category,Count PO,Purchasing Department,Purchase Order,Purchase Order Date,Item,Vendor,Total Value,Prev PO Date,PreGroup,Date_Diff,Flag Out of Range
0,1,Audio Visual System,3,Dept001,PO_10001001,2022-01-12,TV,Vendor_1,2000,NaT,0,,1
1,1,Audio Visual System,3,Dept001,PO_10001003,2022-01-14,Stereo System,Vendor_2,3000,2022-01-12,1,2.0,0
2,1,Audio Visual System,3,Dept001,PO_10001002,2022-01-15,TV,Vendor_1,4000,2022-01-14,2,1.0,0
3,2,Audio Visual System,2,Dept001,PO_10001004,2022-02-14,Stereo System,Vendor_3,6000,2022-01-15,3,30.0,1
4,2,Audio Visual System,2,Dept001,PO_10001005,2022-02-15,Stereo System,Vendor_4,8000,2022-02-14,4,1.0,0


In [310]:
df_split_purchase_details["Date_Diff"] = np.where(
    df_split_purchase_details["Flag Out of Range"] == 1,
    pd.NaT,
    df_split_purchase_details["Date_Diff"],
)

In [311]:
df_split_purchase_details.sort_values(
    ["Grouping", "Purchase Order"], inplace=True
)

In [312]:
df_split_purchase_desc = (
    df_split_purchase_details.groupby(["Grouping", item_field])
    .agg({"Purchase Order": "|".join})
    .reset_index()
)

In [313]:
df_split_purchase_desc.columns = ["Grouping", item_field, "PO with similar items"]

In [314]:
df_split_purchase_comb = df_split_purchase_details.merge(
    df_split_purchase_desc, how="inner", on=["Grouping", item_field]
)

In [315]:
df_split_purchase_comb

Unnamed: 0,Grouping,Item Category,Count PO,Purchasing Department,Purchase Order,Purchase Order Date,Item,Vendor,Total Value,Prev PO Date,PreGroup,Date_Diff,Flag Out of Range,PO with similar items
0,1,Audio Visual System,3,Dept001,PO_10001001,2022-01-12,TV,Vendor_1,2000,NaT,0,NaT,1,PO_10001001|PO_10001002|PO_10001003
1,1,Audio Visual System,3,Dept001,PO_10001002,2022-01-15,TV,Vendor_1,4000,2022-01-14,2,1.0,0,PO_10001001|PO_10001002|PO_10001003
2,1,Audio Visual System,3,Dept001,PO_10001003,2022-01-14,Stereo System,Vendor_2,3000,2022-01-12,1,2.0,0,PO_10001001|PO_10001002|PO_10001003
3,2,Audio Visual System,2,Dept001,PO_10001004,2022-02-14,Stereo System,Vendor_3,6000,2022-01-15,3,NaT,1,PO_10001004|PO_10001005
4,2,Audio Visual System,2,Dept001,PO_10001005,2022-02-15,Stereo System,Vendor_4,8000,2022-02-14,4,1.0,0,PO_10001004|PO_10001005
5,3,Audio Visual System,3,Dept002,PO_10001006,2022-01-16,TV,Vendor_1,4000,NaT,0,NaT,1,PO_10001006|PO_10001007|PO_10001008
6,3,Audio Visual System,3,Dept002,PO_10001007,2022-01-17,Stereo System,Vendor_2,3000,2022-01-16,1,1.0,0,PO_10001006|PO_10001007|PO_10001008
7,3,Audio Visual System,3,Dept002,PO_10001008,2022-01-18,Stereo System,Vendor_2,6000,2022-01-17,2,1.0,0,PO_10001006|PO_10001007|PO_10001008


### Derive futher prioritization based on rules

- Was the threshold avoided due to the split ?
- Are the splits belonging to the same vendor ?
- Are the dates diff within one day ?

In [316]:
df_priority_score = (
    df_split_purchase_comb.groupby(["Grouping"])
    .agg(
        {
            "Total Value": ["min", "sum"],
            "Date_Diff": "min",
            "Vendor": ["nunique", "count"],
        }
    )
    .reset_index()
)

In [317]:
df_priority_score.columns = [
    "Grouping",
    "Min_Value",
    "Max_Value",
    "Min_Date_Diff",
    "Unique_Vendor",
    "Count_Vendor",
]

In [318]:
value_threshold = 5_000
df_priority_score["Flag_Threshold_Crossed"] = (
    (df_priority_score["Min_Value"] < value_threshold)
    & (df_priority_score["Max_Value"] >= value_threshold)
).astype(int)

In [319]:
df_priority_score["Flag_Same_Vendor"] = (
    (df_priority_score["Unique_Vendor"] == df_priority_score["Count_Vendor"])
).astype(int)

In [320]:
days_threshold = 1
df_priority_score["Flag_Min_Days"] = (
    df_priority_score["Min_Date_Diff"] <= days_threshold
).astype(int)

In [321]:
df_priority_score["Score"] = (
    df_priority_score["Flag_Threshold_Crossed"]
    + df_priority_score["Flag_Same_Vendor"]
    + df_priority_score["Flag_Min_Days"]
)

In [322]:
df_priority_score

Unnamed: 0,Grouping,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,1,2000,9000,1.0,2,3,1,0,1,2
1,2,6000,14000,1.0,2,2,0,1,1,2
2,3,3000,13000,1.0,2,3,1,0,1,2


In [323]:
df_sop_itemcat = df_split_purchase_comb.merge(
    df_priority_score, how="left", on="Grouping"
)

In [324]:
df_sop_itemcat.insert(0, "Item Type", f"2_{item_field}")

In [325]:
df_sop_itemcat.head()

Unnamed: 0,Item Type,Grouping,Item Category,Count PO,Purchasing Department,Purchase Order,Purchase Order Date,Item,Vendor,Total Value,...,PO with similar items,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,2_Item Category,1,Audio Visual System,3,Dept001,PO_10001001,2022-01-12,TV,Vendor_1,2000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
1,2_Item Category,1,Audio Visual System,3,Dept001,PO_10001002,2022-01-15,TV,Vendor_1,4000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
2,2_Item Category,1,Audio Visual System,3,Dept001,PO_10001003,2022-01-14,Stereo System,Vendor_2,3000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
3,2_Item Category,2,Audio Visual System,2,Dept001,PO_10001004,2022-02-14,Stereo System,Vendor_3,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
4,2_Item Category,2,Audio Visual System,2,Dept001,PO_10001005,2022-02-15,Stereo System,Vendor_4,8000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2


## Combine Details

In [326]:
df_sop_comb = pd.concat([df_sop_item, df_sop_itemcat])

In [327]:
df_sop_comb

Unnamed: 0,Item Type,Grouping,Item,Count PO,Purchasing Department,Purchase Order,Purchase Order Date,Item Category,Vendor,Total Value,...,PO with similar items,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
0,1_Item,2,Stereo System,2,Dept001,PO_10001004,2022-02-14,Audio Visual System,Vendor_3,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,0,1,1
1,1_Item,2,Stereo System,2,Dept001,PO_10001005,2022-02-15,Audio Visual System,Vendor_4,8000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,0,1,1
2,1_Item,3,TV,2,Dept001,PO_10001001,2022-01-12,Audio Visual System,Vendor_1,2000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,1,0,2
3,1_Item,3,TV,2,Dept001,PO_10001002,2022-01-15,Audio Visual System,Vendor_1,4000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,1,0,2
4,1_Item,4,Stereo System,2,Dept002,PO_10001007,2022-01-17,Audio Visual System,Vendor_2,3000,...,PO_10001007|PO_10001008,3000,9000,1.0,1,2,1,1,1,3
5,1_Item,4,Stereo System,2,Dept002,PO_10001008,2022-01-18,Audio Visual System,Vendor_2,6000,...,PO_10001007|PO_10001008,3000,9000,1.0,1,2,1,1,1,3
0,2_Item Category,1,TV,3,Dept001,PO_10001001,2022-01-12,Audio Visual System,Vendor_1,2000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
1,2_Item Category,1,TV,3,Dept001,PO_10001002,2022-01-15,Audio Visual System,Vendor_1,4000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
2,2_Item Category,1,Stereo System,3,Dept001,PO_10001003,2022-01-14,Audio Visual System,Vendor_2,3000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
3,2_Item Category,2,Stereo System,2,Dept001,PO_10001004,2022-02-14,Audio Visual System,Vendor_3,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2


In [328]:
# With the different methods, there will be Groupings with the exact same PR combination and items flagged, we can remove those
df_sop_comb.sort_values(["Grouping", "Item"], inplace=True)

In [329]:
df_sop_grouping_check = (
    df_sop_comb.groupby(["Grouping", "Item Type", "PO with similar items", "Score"])
    .agg({"Item": "|".join})
    .reset_index()
)

In [330]:
df_sop_grouping_check.columns = [
    "Grouping",
    "Item Type",
    "PO with similar items",
    "Score",
    "Items in Group",
]

In [331]:
df_sop_grouping_check.sort_values(
    ["PO with similar items", "Items in Group", "Item Type", "Score"],
    ascending=[True, True, True, False],
    inplace=True,
)

In [332]:
# Identify Groups with the same PRs and Items
df_sop_grouping_check[
    df_sop_grouping_check.duplicated(
        subset=["PO with similar items", "Items in Group"], keep=False
    )
]

Unnamed: 0,Grouping,Item Type,PO with similar items,Score,Items in Group
1,2,1_Item,PO_10001004|PO_10001005,1,Stereo System|Stereo System
2,2,2_Item Category,PO_10001004|PO_10001005,2,Stereo System|Stereo System


In [333]:
df_sop_grouping_unique = df_sop_grouping_check.drop_duplicates(
    subset=["PO with similar items", "Items in Group"], keep="first"
)
valid_groupings = set(df_sop_grouping_unique["Grouping"].tolist())

In [334]:
df_sop_comb_final = df_sop_comb[df_sop_comb["Grouping"].isin(valid_groupings)]

In [335]:
df_sop_comb_final

Unnamed: 0,Item Type,Grouping,Item,Count PO,Purchasing Department,Purchase Order,Purchase Order Date,Item Category,Vendor,Total Value,...,PO with similar items,Min_Value,Max_Value,Min_Date_Diff,Unique_Vendor,Count_Vendor,Flag_Threshold_Crossed,Flag_Same_Vendor,Flag_Min_Days,Score
2,2_Item Category,1,Stereo System,3,Dept001,PO_10001003,2022-01-14,Audio Visual System,Vendor_2,3000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
0,2_Item Category,1,TV,3,Dept001,PO_10001001,2022-01-12,Audio Visual System,Vendor_1,2000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
1,2_Item Category,1,TV,3,Dept001,PO_10001002,2022-01-15,Audio Visual System,Vendor_1,4000,...,PO_10001001|PO_10001002|PO_10001003,2000,9000,1.0,2,3,1,0,1,2
0,1_Item,2,Stereo System,2,Dept001,PO_10001004,2022-02-14,Audio Visual System,Vendor_3,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,0,1,1
1,1_Item,2,Stereo System,2,Dept001,PO_10001005,2022-02-15,Audio Visual System,Vendor_4,8000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,0,1,1
3,2_Item Category,2,Stereo System,2,Dept001,PO_10001004,2022-02-14,Audio Visual System,Vendor_3,6000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
4,2_Item Category,2,Stereo System,2,Dept001,PO_10001005,2022-02-15,Audio Visual System,Vendor_4,8000,...,PO_10001004|PO_10001005,6000,14000,1.0,2,2,0,1,1,2
6,2_Item Category,3,Stereo System,3,Dept002,PO_10001007,2022-01-17,Audio Visual System,Vendor_2,3000,...,PO_10001006|PO_10001007|PO_10001008,3000,13000,1.0,2,3,1,0,1,2
7,2_Item Category,3,Stereo System,3,Dept002,PO_10001008,2022-01-18,Audio Visual System,Vendor_2,6000,...,PO_10001006|PO_10001007|PO_10001008,3000,13000,1.0,2,3,1,0,1,2
2,1_Item,3,TV,2,Dept001,PO_10001001,2022-01-12,Audio Visual System,Vendor_1,2000,...,PO_10001001|PO_10001002,2000,6000,3.0,1,2,1,1,0,2
