## Semantic Model Development vs. Current Fuzzy Approach
##### Goal: Use or fine-tune a semantic, transformer-based model that outperforms the current fuzzy approach in matching Successor SKUs with their "gold standard" Predecessor SKU.

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to the local `src` helpers are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [43]:
import pandas as pd
import numpy as np
import torch
import sklearn
from sentence_transformers import SentenceTransformer, CrossEncoder
from normality import normalize
from src.data_processing import load_and_process
from src.eval import eval_fuzzy, eval_trans

In [3]:
# Read the NPL/neighbor pair CSV through the project loader. Whatever cleaning
# `load_and_process` applies lives in src.data_processing and is not visible
# from this notebook.
pairs_csv = "data/npl_neighbor_complete_data_for_sentencetransformer.csv"
df = load_and_process(pairs_csv)

In [4]:
# Summarize the loaded frame: column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 633989 entries, 0 to 696728
Data columns (total 10 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   npl_prodord                        633989 non-null  object 
 1   npl_prodord_desc                   633989 non-null  object 
 2   neighbor_prodord                   633989 non-null  object 
 3   neighbor_prodord_desc              633989 non-null  object 
 4   fuzzy_score                        633989 non-null  float64
 5   row_number                         633989 non-null  int64  
 6   predecessor_prodord_ramesses       633989 non-null  object 
 7   predecessor_prodord_desc_ramesses  633989 non-null  object 
 8   manual_match                       633989 non-null  object 
 9   PREDECESSOR TYPE                   633989 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 53.2+ MB


In [5]:
# Preview the first rows of the pair table (up to 20).
df.head(20)

Unnamed: 0,npl_prodord,npl_prodord_desc,neighbor_prodord,neighbor_prodord_desc,fuzzy_score,row_number,predecessor_prodord_ramesses,predecessor_prodord_desc_ramesses,manual_match,PREDECESSOR TYPE
0,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCX01,GINGER BODY WASH 200ML,0.77,1,0MCX01,GINGER BODY WASH 200ML,YES,STAR PREDECESSOR
1,0WLY01,GINGER BURST HND&BDY WSH 200ML,0JYH01,GINGER BURST 200ML,0.75,2,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
2,0WLY01,GINGER BURST HND&BDY WSH 200ML,027T01,GINGER BODY WASH 250ML,0.73,3,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
3,0WLY01,GINGER BURST HND&BDY WSH 200ML,0F4501,GINGER HAND CLEANSER 200ML,0.64,4,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
4,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCW01,GINGER HAND CLEANSER 200ML,0.64,5,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
5,0WLY01,GINGER BURST HND&BDY WSH 200ML,829D01,GNGR EXFLTNG BODY WASH 200ML,0.62,6,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
6,0WLY01,GINGER BURST HND&BDY WSH 200ML,0GML01,GINGER BATH SOAP 200GM,0.62,7,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
7,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCT01,GINGER SOAP 200GM,0.55,8,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
8,0WLY01,GINGER BURST HND&BDY WSH 200ML,027R01,GINGER SOAP 300GM,0.51,9,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR
9,0WLY01,GINGER BURST HND&BDY WSH 200ML,00TQ01,GINGER FLOAT CREAM BUBBL,0.41,10,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR


In [7]:
# Instantiate the sentence-embedding model from its model id.
embedding_model_id = "Qwen/Qwen3-Embedding-0.6B"
model = SentenceTransformer(embedding_model_id)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [9]:
# Gather every distinct description string from both sides of the pair table,
# so that each distinct text only needs to be embedded once.
desc_columns = ["npl_prodord_desc", "neighbor_prodord_desc"]
all_descs = pd.concat([df[col] for col in desc_columns]).unique()

In [10]:
# Number of distinct descriptions that will be embedded.
len(all_descs)

18135

In [18]:
# Embed every distinct description once and key each vector by its text.
#
# `SentenceTransformer.encode` already splits its input into mini-batches of
# `batch_size`, so a single call replaces the hand-rolled chunking loop and
# draws one progress bar instead of one per 256-item chunk.
batch_size = 32  # per-forward-pass size; the old 256-item chunks were each run as 8 batches of this size
desc_list = all_descs.tolist()
embs = model.encode(
    desc_list,
    batch_size=batch_size,
    show_progress_bar=True,
    normalize_embeddings=True,  # unit-length vectors, so a dot product is the cosine similarity
    convert_to_numpy=True,
)
# `encode` returns one vector per input, in input order, so zip pairs each
# text with its own embedding.
# NOTE(review): mini-batch composition differs from the chunked loop, so values
# may shift at floating-point-noise level — confirm downstream scores are stable.
emb_map = dict(zip(desc_list, embs))

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [15]:
# Sanity check: expected to equal len(all_descs), i.e. one embedding per distinct description.
len(emb_map)

18135

In [16]:
# Attach each row's embeddings by looking its two description strings up in emb_map.
succ_emb_series = df["npl_prodord_desc"].map(emb_map)
pred_emb_series = df["neighbor_prodord_desc"].map(emb_map)
df["npl_prodord_desc_emb"] = succ_emb_series
df["neighbor_prodord_desc_emb"] = pred_emb_series

In [17]:
# Preview the frame with the two embedding columns added.
df.head(10)

Unnamed: 0,npl_prodord,npl_prodord_desc,neighbor_prodord,neighbor_prodord_desc,fuzzy_score,row_number,predecessor_prodord_ramesses,predecessor_prodord_desc_ramesses,manual_match,PREDECESSOR TYPE,npl_prodord_desc_emb,neighbor_prodord_desc_emb
0,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCX01,GINGER BODY WASH 200ML,0.77,1,0MCX01,GINGER BODY WASH 200ML,YES,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.03539984, -0.016253632, -0.01356889, 0.028..."
1,0WLY01,GINGER BURST HND&BDY WSH 200ML,0JYH01,GINGER BURST 200ML,0.75,2,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.023320219, -0.011378394, -0.013693338, 0.0..."
2,0WLY01,GINGER BURST HND&BDY WSH 200ML,027T01,GINGER BODY WASH 250ML,0.73,3,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.032164358, 0.00054609415, -0.014018997, 0...."
3,0WLY01,GINGER BURST HND&BDY WSH 200ML,0F4501,GINGER HAND CLEANSER 200ML,0.64,4,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.02397896, -0.014936242, -0.013384992, -0.0..."
4,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCW01,GINGER HAND CLEANSER 200ML,0.64,5,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.02397896, -0.014936242, -0.013384992, -0.0..."
5,0WLY01,GINGER BURST HND&BDY WSH 200ML,829D01,GNGR EXFLTNG BODY WASH 200ML,0.62,6,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.0033935735, -0.06285792, -0.013870217, 0.0..."
6,0WLY01,GINGER BURST HND&BDY WSH 200ML,0GML01,GINGER BATH SOAP 200GM,0.62,7,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.027340047, -0.032042507, -0.014683996, 0.0..."
7,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCT01,GINGER SOAP 200GM,0.55,8,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.012504197, -0.0057617733, -0.015041154, 0...."
8,0WLY01,GINGER BURST HND&BDY WSH 200ML,027R01,GINGER SOAP 300GM,0.51,9,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.014957268, -0.009471259, -0.0147626875, 0...."
9,0WLY01,GINGER BURST HND&BDY WSH 200ML,00TQ01,GINGER FLOAT CREAM BUBBL,0.41,10,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.021027407, 0.020168014, -0.013475669, 0.01..."


In [25]:
# Turn the two per-row embedding columns into dense 2-D arrays (one row per
# pair) and print their shapes as a quick check that they line up.
succ_array = np.stack(df["npl_prodord_desc_emb"].to_numpy())
pred_array = np.stack(df["neighbor_prodord_desc_emb"].to_numpy())
for emb_matrix in (succ_array, pred_array):
    print(emb_matrix.shape)

(633989, 1024)
(633989, 1024)


In [26]:
# Row-wise dot product of the paired embeddings. The vectors were encoded with
# normalize_embeddings=True, so this is each pair's cosine similarity.
# einsum accumulates the products directly instead of first materializing the
# full (n_rows, dim) element-wise product as a temporary array.
similarity = np.einsum("ij,ij->i", succ_array, pred_array)

In [27]:
# Store the per-row similarity as a new column alongside the existing fuzzy_score.
df["similarity_score"] = similarity

In [28]:
# Preview the frame with similarity_score next to fuzzy_score.
df.head(10)

Unnamed: 0,npl_prodord,npl_prodord_desc,neighbor_prodord,neighbor_prodord_desc,fuzzy_score,row_number,predecessor_prodord_ramesses,predecessor_prodord_desc_ramesses,manual_match,PREDECESSOR TYPE,npl_prodord_desc_emb,neighbor_prodord_desc_emb,similarity_score
0,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCX01,GINGER BODY WASH 200ML,0.77,1,0MCX01,GINGER BODY WASH 200ML,YES,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.03539984, -0.016253632, -0.01356889, 0.028...",0.823572
1,0WLY01,GINGER BURST HND&BDY WSH 200ML,0JYH01,GINGER BURST 200ML,0.75,2,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.023320219, -0.011378394, -0.013693338, 0.0...",0.950744
2,0WLY01,GINGER BURST HND&BDY WSH 200ML,027T01,GINGER BODY WASH 250ML,0.73,3,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.032164358, 0.00054609415, -0.014018997, 0....",0.813442
3,0WLY01,GINGER BURST HND&BDY WSH 200ML,0F4501,GINGER HAND CLEANSER 200ML,0.64,4,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.02397896, -0.014936242, -0.013384992, -0.0...",0.716095
4,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCW01,GINGER HAND CLEANSER 200ML,0.64,5,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.02397896, -0.014936242, -0.013384992, -0.0...",0.716095
5,0WLY01,GINGER BURST HND&BDY WSH 200ML,829D01,GNGR EXFLTNG BODY WASH 200ML,0.62,6,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.0033935735, -0.06285792, -0.013870217, 0.0...",0.706807
6,0WLY01,GINGER BURST HND&BDY WSH 200ML,0GML01,GINGER BATH SOAP 200GM,0.62,7,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.027340047, -0.032042507, -0.014683996, 0.0...",0.711266
7,0WLY01,GINGER BURST HND&BDY WSH 200ML,0MCT01,GINGER SOAP 200GM,0.55,8,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.012504197, -0.0057617733, -0.015041154, 0....",0.729506
8,0WLY01,GINGER BURST HND&BDY WSH 200ML,027R01,GINGER SOAP 300GM,0.51,9,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.014957268, -0.009471259, -0.0147626875, 0....",0.717675
9,0WLY01,GINGER BURST HND&BDY WSH 200ML,00TQ01,GINGER FLOAT CREAM BUBBL,0.41,10,0MCX01,GINGER BODY WASH 200ML,n,STAR PREDECESSOR,"[-0.044266324, -0.032603286, -0.014347708, 0.0...","[-0.021027407, 0.020168014, -0.013475669, 0.01...",0.718705


In [35]:
# Count the rows flagged as the manually confirmed match.
# Select the label explicitly rather than taking the second entry of
# value_counts(), whose position depends on how frequent each label is.
# NOTE(review): assumes "YES" is the only positive label spelling — confirm.
int(df["manual_match"].eq("YES").sum())

1811

In [41]:
# Report the baseline fuzzy-matching result (the metric is defined in src.eval; not shown here).
eval_fuzzy(df)

Fuzzy percentage: 15.24%


In [50]:
# Report the sentence-transformer result for comparison with the fuzzy baseline.
# NOTE(review): presumably reads the similarity_score column — verify in src.eval.
eval_trans(df)

Sentence Transformer percentage: 14.03%
