## Semantic Model Development vs. Current Fuzzy Approach
##### Goal: Use or fine-tune a semantic, transformer based model that outperforms the current fuzzy approach in matching Successor SKU's with their "gold standard" Predecessor SKU. 

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import torch
import sklearn
from sentence_transformers import SentenceTransformer, CrossEncoder
from normality import normalize
from src.data_processing import load_and_process

In [4]:
df = load_and_process("data/npl_neighbor_complete_data_for_sentencetransformer.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 633989 entries, 0 to 696728
Data columns (total 10 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   npl_prodord                        633989 non-null  object 
 1   npl_prodord_desc                   633989 non-null  object 
 2   neighbor_prodord                   633989 non-null  object 
 3   neighbor_prodord_desc              633989 non-null  object 
 4   fuzzy_score                        633989 non-null  float64
 5   row_number                         633989 non-null  int64  
 6   predecessor_prodord_ramesses       633989 non-null  object 
 7   predecessor_prodord_desc_ramesses  633989 non-null  object 
 8   manual_match                       633989 non-null  object 
 9   PREDECESSOR TYPE                   633989 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 53.2+ MB


In [6]:
df.head(10)

Unnamed: 0,npl_prodord,npl_prodord_desc,neighbor_prodord,neighbor_prodord_desc,fuzzy_score,row_number,predecessor_prodord_ramesses,predecessor_prodord_desc_ramesses,manual_match,PREDECESSOR TYPE
0,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCX01,GINGER BODY WASH 200ML,0.77,1,0MCX01,GINGER BODY WASH 200ML,YES,STAR PREDECESSOR
1,0WLY01,GINGER BURST HND&BDY WSH 200ML,0JYH01,GINGER BURST 200ML,0.75,2,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
2,0WLY01,GINGER BURST HND&BDY WSH 200ML,027T01,GINGER BODY WASH 250ML,0.73,3,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
3,0WLY01,GINGER BURST HND&BDY WSH 200ML,0F4501,GINGER HAND CLEANSER 200ML,0.64,4,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
4,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCW01,GINGER HAND CLEANSER 200ML,0.64,5,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
5,0WLY01,GINGER BURST HND&BDY WSH 200ML,829D01,GNGR EXFLTNG BODY WASH 200ML,0.62,6,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
6,0WLY01,GINGER BURST HND&BDY WSH 200ML,0GML01,GINGER BATH SOAP 200GM,0.62,7,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
7,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCT01,GINGER SOAP 200GM,0.55,8,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
8,0WLY01,GINGER BURST HND&BDY WSH 200ML,027R01,GINGER SOAP 300GM,0.51,9,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
9,0WLY01,GINGER BURST HND&BDY WSH 200ML,00TQ01,GINGER FLOAT CREAM BUBBL,0.41,10,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR


In [7]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/stsb-distilroberta-base")

In [8]:
print(type(embedder))
print(type(cross_encoder))

<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>
<class 'sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder'>


In [24]:
test_df = df[df["npl_prodord"] == "0WLY01"]

In [25]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 0 to 9
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   npl_prodord                        10 non-null     object 
 1   npl_prodord_desc                   10 non-null     object 
 2   neighbor_prodord                   10 non-null     object 
 3   neighbor_prodord_desc              10 non-null     object 
 4   fuzzy_score                        10 non-null     float64
 5   row_number                         10 non-null     int64  
 6   predecessor_prodord_ramesses       10 non-null     object 
 7   predecessor_prodord_desc_ramesses  10 non-null     object 
 8   manual_match                       10 non-null     object 
 9   PREDECESSOR TYPE                   10 non-null     object 
dtypes: float64(1), int64(1), object(8)
memory usage: 880.0+ bytes


In [55]:
def score_group(group):
    succ_desc = group["npl_prodord_desc"].iloc[0]
    neigh_desc = group["neighbor_prodord_desc"].tolist()

    succ_emb = embedder.encode(succ_desc)
    neigh_emb = embedder.encode(neigh_desc)

    raw_trans = embedder.similarity(neigh_emb, succ_emb).tolist()
    trans = [row[0] for row in raw_trans]

    pairs = [[succ_desc, nd] for nd in neigh_desc]
    raw_cross = cross_encoder.predict(pairs).tolist()
    out = group.copy()
    out["st_score"] = pd.Series(trans, index = out.index)
    out["cross_score"] = pd.Series(raw_cross, index = out.index)
    return out

In [56]:
scored_df = (
    test_df
    .groupby(["npl_prodord"])
    .apply(score_group, include_groups=False)
    .reset_index(drop=True)
)

In [57]:
test_df.head(12)

Unnamed: 0,npl_prodord,npl_prodord_desc,neighbor_prodord,neighbor_prodord_desc,fuzzy_score,row_number,predecessor_prodord_ramesses,predecessor_prodord_desc_ramesses,manual_match,PREDECESSOR TYPE
0,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCX01,GINGER BODY WASH 200ML,0.77,1,0MCX01,GINGER BODY WASH 200ML,YES,STAR PREDECESSOR
1,0WLY01,GINGER BURST HND&BDY WSH 200ML,0JYH01,GINGER BURST 200ML,0.75,2,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
2,0WLY01,GINGER BURST HND&BDY WSH 200ML,027T01,GINGER BODY WASH 250ML,0.73,3,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
3,0WLY01,GINGER BURST HND&BDY WSH 200ML,0F4501,GINGER HAND CLEANSER 200ML,0.64,4,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
4,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCW01,GINGER HAND CLEANSER 200ML,0.64,5,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
5,0WLY01,GINGER BURST HND&BDY WSH 200ML,829D01,GNGR EXFLTNG BODY WASH 200ML,0.62,6,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
6,0WLY01,GINGER BURST HND&BDY WSH 200ML,0GML01,GINGER BATH SOAP 200GM,0.62,7,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
7,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCT01,GINGER SOAP 200GM,0.55,8,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
8,0WLY01,GINGER BURST HND&BDY WSH 200ML,027R01,GINGER SOAP 300GM,0.51,9,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
9,0WLY01,GINGER BURST HND&BDY WSH 200ML,00TQ01,GINGER FLOAT CREAM BUBBL,0.41,10,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR


In [58]:
scored_df.head(10)

Unnamed: 0,npl_prodord_desc,neighbor_prodord,neighbor_prodord_desc,fuzzy_score,row_number,predecessor_prodord_ramesses,predecessor_prodord_desc_ramesses,manual_match,PREDECESSOR TYPE,st_score,cross_score
0,GINGER BURST HND&BDY WSH 200ML,0MCX01,GINGER BODY WASH 200ML,0.77,1,0MCX01,GINGER BODY WASH 200ML,YES,STAR PREDECESSOR,0.524254,0.576237
1,GINGER BURST HND&BDY WSH 200ML,0JYH01,GINGER BURST 200ML,0.75,2,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,0.775478,0.66996
2,GINGER BURST HND&BDY WSH 200ML,027T01,GINGER BODY WASH 250ML,0.73,3,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,0.504425,0.323096
3,GINGER BURST HND&BDY WSH 200ML,0F4501,GINGER HAND CLEANSER 200ML,0.64,4,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,0.540302,0.588042
4,GINGER BURST HND&BDY WSH 200ML,0MCW01,GINGER HAND CLEANSER 200ML,0.64,5,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,0.540302,0.588042
5,GINGER BURST HND&BDY WSH 200ML,829D01,GNGR EXFLTNG BODY WASH 200ML,0.62,6,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,0.289681,0.412669
6,GINGER BURST HND&BDY WSH 200ML,0GML01,GINGER BATH SOAP 200GM,0.62,7,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,0.496771,0.577286
7,GINGER BURST HND&BDY WSH 200ML,0MCT01,GINGER SOAP 200GM,0.55,8,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,0.538639,0.572165
8,GINGER BURST HND&BDY WSH 200ML,027R01,GINGER SOAP 300GM,0.51,9,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,0.52911,0.260489
9,GINGER BURST HND&BDY WSH 200ML,00TQ01,GINGER FLOAT CREAM BUBBL,0.41,10,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,0.472533,0.111617


In [60]:
# df_model = (
#     df
#     .groupby(["npl_prodord"])
#     .apply(score_group, include_groups=False)
#     .reset_index(drop=True)
# )

In [None]:
def score_percentage(data: pd.DataFrame) -> pd.DataFrame:
    if data[data["manual_match" == YES]