# This notebook shows the steps in building a splitting of purchase data pipeline

In [1]:
import pandas as pd
import numpy as np

In [2]:
%%time
df_raw = pd.read_excel("./data/split_purchase_data.xlsx")

CPU times: total: 125 ms
Wall time: 773 ms


In [3]:
df_raw.head(10)

Unnamed: 0,Purchasing Department,Purchase Order,Purchase Order Date,Item,Item Category,Vendor,Total Value
0,Dept001,PO_10001001,2022-01-12,Equipment AA,Equipment,V0001,2000
1,Dept001,PO_10001002,2022-01-15,Equipment AA,Equipment,V0001,4000
2,Dept001,PO_10001003,2022-01-14,Equipment AB,Equipment,V0002,3000
3,Dept001,PO_10001004,2022-02-14,Equipment AB,Equipment,V0003,6000
4,Dept001,PO_10001005,2022-02-15,Equipment AB,Equipment,V0004,8000
5,Dept002,PO_10001006,2022-01-16,Equipment AA,Equipment,V0001,4000
6,Dept002,PO_10001007,2022-01-17,Equipment AB,Equipment,V0002,3000


In [4]:
df_raw["Purchase Requisition Date"] = pd.to_datetime(
    df_raw["Purchase Requisition Date"]
)
df_raw["Purchase Requisition Number"] = df_raw["Purchase Requisition Number"].astype(
    str
)

KeyError: 'Purchase Requisition Date'

In [None]:
df_raw.info()

# Calculation of Split Purchase based on Item

In [None]:
item_field = "Item"

In [None]:
df_dept_item_sorted = df_raw.sort_values(
    ["Purchasing Department", item_field, "Purchase Requisition Date"]
)

In [None]:
df_dept_item_sorted["Prev PR Date"] = df_dept_item_sorted[
    "Purchase Requisition Date"
].shift()

In [None]:
df_dept_item_sorted["PreGroup"] = (
    df_dept_item_sorted[
        ["Purchasing Department", item_field, "Purchase Requisition Date"]
    ]
    .groupby(["Purchasing Department", item_field])
    .cumcount()
)

In [None]:
df_dept_item_sorted.head()

In [None]:
mask = df_dept_item_sorted["PreGroup"] == 0
df_dept_item_sorted.loc[mask, "Prev PR Date"] = pd.NaT

In [None]:
# Calculating number of days difference between adjacent records
df_dept_item_sorted["Date_Diff"] = (
    df_dept_item_sorted["Purchase Requisition Date"]
    - df_dept_item_sorted["Prev PR Date"]
).dt.days

In [None]:
# Set Threshold
n = 5
df_dept_item_sorted["Flag Out of Range"] = (
    1 - (df_dept_item_sorted["Date_Diff"] <= n)
).astype(int)

In [None]:
df_dept_item_sorted["Grouping"] = df_dept_item_sorted["Flag Out of Range"].cumsum()

In [None]:
df_dept_item_sorted.head()

In [None]:
df_pr_count = (
    df_dept_item_sorted[["Grouping", item_field, "Purchase Requisition Number"]]
    .groupby(["Grouping", item_field])
    .nunique()
    .reset_index()
)

In [None]:
df_pr_count.columns = ["Grouping", item_field, "Count PR"]

In [None]:
# Identifies the groups that are valid split purchase cases
df_split_purchase_group = df_pr_count[df_pr_count["Count PR"] >= 2]

In [None]:
# Joining of Split Purchase Groups to initial details
df_split_purchase_details = df_split_purchase_group.merge(
    df_dept_item_sorted, how="inner", on=["Grouping", item_field]
)

In [None]:
df_split_purchase_details.head()

In [None]:
df_split_purchase_details["Date_Diff"] = np.where(
    df_split_purchase_details["Flag Out of Range"] == 1,
    pd.NaT,
    df_split_purchase_details["Date_Diff"],
)

In [None]:
df_split_purchase_details.sort_values(
    ["Grouping", "Purchase Requisition Number"], inplace=True
)

In [None]:
df_split_purchase_desc = (
    df_split_purchase_details.groupby(["Grouping", item_field])
    .agg({"Purchase Requisition Number": "|".join})
    .reset_index()
)

In [None]:
df_split_purchase_desc.columns = ["Grouping", item_field, "PR with similar items"]

In [None]:
df_split_purchase_comb = df_split_purchase_details.merge(
    df_split_purchase_desc, how="inner", on=["Grouping", item_field]
)

In [None]:
df_split_purchase_comb

### Derive futher prioritization based on rules

- Was the threshold avoided due to the split ?
- Are the splits belonging to the same vendor ?
- Are the dates diff within one day ?

In [None]:
df_priority_score = (
    df_split_purchase_comb.groupby(["Grouping"])
    .agg(
        {
            "Total Value": ["min", "sum"],
            "Date_Diff": "min",
            "Vendor": ["nunique", "count"],
        }
    )
    .reset_index()
)

In [None]:
df_priority_score.columns = [
    "Grouping",
    "Min_Value",
    "Max_Value",
    "Min_Date_Diff",
    "Unique_Vendor",
    "Count_Vendor",
]

In [None]:
value_threshold = 5_000
df_priority_score["Flag_Threshold_Crossed"] = (
    (df_priority_score["Min_Value"] < value_threshold)
    & (df_priority_score["Max_Value"] >= value_threshold)
).astype(int)

In [None]:
df_priority_score["Flag_Same_Vendor"] = (
    (df_priority_score["Unique_Vendor"] == df_priority_score["Count_Vendor"])
).astype(int)

In [None]:
days_threshold = 1
df_priority_score["Flag_Min_Days"] = (
    df_priority_score["Min_Date_Diff"] <= days_threshold
).astype(int)

In [None]:
df_priority_score["Score"] = (
    df_priority_score["Flag_Threshold_Crossed"]
    + df_priority_score["Flag_Same_Vendor"]
    + df_priority_score["Flag_Min_Days"]
)

In [None]:
df_priority_score

In [None]:
df_sop_item = df_split_purchase_comb.merge(df_priority_score, how="left", on="Grouping")

In [None]:
df_sop_item.insert(0, "Item Type", f"1_{item_field}")

In [None]:
df_sop_item.head()

## Performing the calculations for Split Purchase based on Item Categories

In [None]:
item_field = "Item Category"

In [None]:
df_dept_item_sorted = df_raw.sort_values(
    ["Purchasing Department", item_field, "Purchase Requisition Date"]
)

In [None]:
df_dept_item_sorted["Prev PR Date"] = df_dept_item_sorted[
    "Purchase Requisition Date"
].shift()

In [None]:
df_dept_item_sorted["PreGroup"] = (
    df_dept_item_sorted[
        ["Purchasing Department", item_field, "Purchase Requisition Date"]
    ]
    .groupby(["Purchasing Department", item_field])
    .cumcount()
)

In [None]:
df_dept_item_sorted.head()

In [None]:
mask = df_dept_item_sorted["PreGroup"] == 0
df_dept_item_sorted.loc[mask, "Prev PR Date"] = pd.NaT

In [None]:
# Calculating number of days difference between adjacent records
df_dept_item_sorted["Date_Diff"] = (
    df_dept_item_sorted["Purchase Requisition Date"]
    - df_dept_item_sorted["Prev PR Date"]
).dt.days

In [None]:
# Set Threshold
n = 5
df_dept_item_sorted["Flag Out of Range"] = (
    1 - (df_dept_item_sorted["Date_Diff"] <= n)
).astype(int)

In [None]:
df_dept_item_sorted["Grouping"] = df_dept_item_sorted["Flag Out of Range"].cumsum()

In [None]:
df_dept_item_sorted.head()

In [None]:
df_pr_count = (
    df_dept_item_sorted[["Grouping", item_field, "Purchase Requisition Number"]]
    .groupby(["Grouping", item_field])
    .nunique()
    .reset_index()
)

In [None]:
df_pr_count.columns = ["Grouping", item_field, "Count PR"]

In [None]:
# Identifies the groups that are valid split purchase cases
df_split_purchase_group = df_pr_count[df_pr_count["Count PR"] >= 2]

In [None]:
# Joining of Split Purchase Groups to initial details
df_split_purchase_details = df_split_purchase_group.merge(
    df_dept_item_sorted, how="inner", on=["Grouping", item_field]
)

In [None]:
df_split_purchase_details.head()

In [None]:
df_split_purchase_details["Date_Diff"] = np.where(
    df_split_purchase_details["Flag Out of Range"] == 1,
    pd.NaT,
    df_split_purchase_details["Date_Diff"],
)

In [None]:
df_split_purchase_details.sort_values(
    ["Grouping", "Purchase Requisition Number"], inplace=True
)

In [None]:
df_split_purchase_desc = (
    df_split_purchase_details.groupby(["Grouping", item_field])
    .agg({"Purchase Requisition Number": "|".join})
    .reset_index()
)

In [None]:
df_split_purchase_desc.columns = ["Grouping", item_field, "PR with similar items"]

In [None]:
df_split_purchase_comb = df_split_purchase_details.merge(
    df_split_purchase_desc, how="inner", on=["Grouping", item_field]
)

In [None]:
df_split_purchase_comb

### Derive futher prioritization based on rules

- Was the threshold avoided due to the split ?
- Are the splits belonging to the same vendor ?
- Are the dates diff within one day ?

In [None]:
df_priority_score = (
    df_split_purchase_comb.groupby(["Grouping"])
    .agg(
        {
            "Total Value": ["min", "sum"],
            "Date_Diff": "min",
            "Vendor": ["nunique", "count"],
        }
    )
    .reset_index()
)

In [None]:
df_priority_score.columns = [
    "Grouping",
    "Min_Value",
    "Max_Value",
    "Min_Date_Diff",
    "Unique_Vendor",
    "Count_Vendor",
]

In [None]:
value_threshold = 5_000
df_priority_score["Flag_Threshold_Crossed"] = (
    (df_priority_score["Min_Value"] < value_threshold)
    & (df_priority_score["Max_Value"] >= value_threshold)
).astype(int)

In [None]:
df_priority_score["Flag_Same_Vendor"] = (
    (df_priority_score["Unique_Vendor"] == df_priority_score["Count_Vendor"])
).astype(int)

In [None]:
days_threshold = 1
df_priority_score["Flag_Min_Days"] = (
    df_priority_score["Min_Date_Diff"] <= days_threshold
).astype(int)

In [None]:
df_priority_score["Score"] = (
    df_priority_score["Flag_Threshold_Crossed"]
    + df_priority_score["Flag_Same_Vendor"]
    + df_priority_score["Flag_Min_Days"]
)

In [None]:
df_priority_score

In [None]:
df_sop_itemcat = df_split_purchase_comb.merge(
    df_priority_score, how="left", on="Grouping"
)

In [None]:
df_sop_itemcat.insert(0, "Item Type", f"2_{item_field}")

In [None]:
df_sop_itemcat.head()

## Combine Details

In [None]:
df_sop_comb = pd.concat([df_sop_item, df_sop_itemcat])

In [None]:
df_sop_comb

In [None]:
# With the different methods, there will be Groupings with the exact same PR combination and items flagged, we can remove those
df_sop_comb.sort_values(["Grouping", "Item"], inplace=True)

In [None]:
df_sop_grouping_check = (
    df_sop_comb.groupby(["Grouping", "Item Type", "PR with similar items", "Score"])
    .agg({"Item": "|".join})
    .reset_index()
)

In [None]:
df_sop_grouping_check.columns = [
    "Grouping",
    "Item Type",
    "PR with similar items",
    "Score",
    "Items in Group",
]

In [None]:
df_sop_grouping_check.sort_values(
    ["PR with similar items", "Items in Group", "Item Type", "Score"],
    ascending=[True, True, True, False],
    inplace=True,
)

In [None]:
# Identify Groups with the same PRs and Items
df_sop_grouping_check[
    df_sop_grouping_check.duplicated(
        subset=["PR with similar items", "Items in Group"], keep=False
    )
]

In [None]:
df_sop_grouping_unique = df_sop_grouping_check.drop_duplicates(
    subset=["PR with similar items", "Items in Group"], keep="first"
)
valid_groupings = set(df_sop_grouping_unique["Grouping"].tolist())

In [None]:
df_sop_comb_final = df_sop_comb[df_sop_comb["Grouping"].isin(valid_groupings)]

In [None]:
df_sop_comb_final