# Import Libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from symspellpy import SymSpell
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Lambda

# Load and Explore Dataset

In [2]:
xls = pd.ExcelFile("Product Matching Dataset.xlsx")
master_df = pd.read_excel(xls, "Master File")
dataset_df = pd.read_excel(xls, "Dataset")

In [3]:
dataset_df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5


In [4]:
master_df.head()

Unnamed: 0,sku,product_name,product_name_ar,price
0,279,ANAFRONIL 75 MG 20 TAB,انافرونيل 75 مجم اس ار 20 قرص,75.0
1,2282,LOPRECOUGH SYRUP 100 ML,لوبريكاف شراب 100 مل,28.5
2,4331,TOMEX PLUS 50 TAB,تومكس بلس 50 قرص,60.0
3,1022,TAROLIMUS 0.03% OINT. 15 GM,تاروليمس 0.03 % مرهم 15 جم,129.0
4,116,GLIPTUS PLUS 50/1000 MG 30 TAB,جليبتس بلس 50/1000 مجم 30 قرص,192.0


In [5]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83562 entries, 0 to 83561
Data columns (total 4 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sku                          83562 non-null  int64  
 1   marketplace_product_name_ar  83562 non-null  object 
 2   seller_item_name             83562 non-null  object 
 3   price                        83562 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.6+ MB


# Data Preprocessing

In [6]:
from qalsadi.lemmatizer import Lemmatizer

# Initialize the Arabic lemmatizer
lemmatizer = Lemmatizer()

def normalize_text(text):
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove Arabic diacritics (tashkeel)
    text = re.sub(r"[\u064B-\u065F]", "", text)
    
    # Remove non-Arabic, non-English characters except numbers
    text = re.sub(r'[^\u0621-\u064Aa-zA-Z0-9\s]', ' ', text)
    
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Standardize specific terms
    text = re.sub(r'قرص|\bق\b|\bك\b|اقراص|كبسوله', 'كبسول', text)
    text = re.sub(r'([\u0600-\u06FF])\1+', r'\1', text)  # Remove Arabic repetition
    text = re.sub(r'[إأآ]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ة', 'ه ', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    
    # Separate numbers that stick to Arabic/English words
    text = re.sub(r"(\d+)([a-zA-Z\u0600-\u06FF]+)", r"\1 \2", text)  # Number followed by Arabic/English
    text = re.sub(r"([a-zA-Z\u0600-\u06FF]+)(\d+)", r"\1 \2", text)  # Arabic/English followed by number
    
    # Remove standalone Arabic/English characters (but not numbers)
    text = re.sub(r"\b[^\W\d]\b", "", text)
    
    # Remove specific unwanted phrases
    text = re.sub(r'\b(?:سعر جديد|سعر|قديم|س جديد|س جدي|س ج|ركز)\b', '', text)
    text = re.sub(r'مرهم|اكريم', 'كريم', text)
    text = re.sub(r'مليجرام|\bم\b|مجم', 'مجم', text)
    text = re.sub(r'جرام|جم', 'جم', text)
    text = re.sub(r'شرائط|شريطين', 'كبسول', text)
    text = re.sub(r'امبولات|امبوله|حقن', 'امبول', text)
    text = re.sub(r'لبوس|لبوس اطفال', 'اقماع', text)
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    # Tokenize the text
    tokens = text.split()
    
    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join the lemmatized tokens back into a single string
    lemmatized_text = " ".join(lemmatized_tokens)
    
    return lemmatized_text

### Apply preprocessing

In [7]:
dataset_df["cleaned_seller_item_name"] = dataset_df["seller_item_name"].astype(str).apply(normalize_text)

In [8]:
dataset_df.sample(10)

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price,cleaned_seller_item_name
253,252,اماريل 3 مجم 30 قرص,اماريل 3 جديد,61.5,اماريل 3 جديد
30411,3373,الكابريس بلس 10/160مجم 14 قرص,الكابرس 10/160 بلاس سعر قديم,76.0,الكابرس 10 160 بلاس
27078,1457,إندرال 40 مجم 50 قرص,اندرال 40 -- 50 قرص,55.0,اندرال 40 50 كبسول
69439,692,دولفين 12.5 مجم 10 أقماع,دولفن 12.5 لبوس اخضر س ج,36.0,دولفن 12 5 اقماع خضر
44430,775,دوكسيرازول 60 مجم 14 كبسول,دوكسىرازول 60جم اقراص,72.75,دوكسيرازول 60 جم كبسول
48556,249,سيتال 250مجم/5مل شراب معلق 60 مل,سيتال شراب الجدييييييد,31.0,سيتال شراب جديد
66714,1720,توسيستوب شراب 100 مل,توسيستوب شراب س.ق,32.5,توسيستوب شراب كبسول
77325,1724,سيبروفلوكساسين-اورجانو 750 مجم 10 قرص,سيبروفلوكساسين 750مج 10قرص س.ج.,89.0,سيبروفلوكساسين 750 ماج 10 كبسول
57427,927,نيفيلوب 2.5 مجم 14 قرص,نيفيلوب 2.5مجم اقراص,46.0,نيفيلوب 2 5 مجم كبسول
29746,2629,دوكسيرازول 30 مجم 14 كبسول,دوكسيرازول 30مجم 14ك سعر جديد,44.25,دوكسيرازول 30 مجم 14


### Price Scaling

In [9]:
# Scale Price
scaler = StandardScaler()
dataset_df["scaled_price"] = scaler.fit_transform(dataset_df[["price"]])

### Check for class balancing

In [10]:
dataset_df['sku'].value_counts() / len(dataset_df) * 100

sku
1322    0.214212
252     0.214212
257     0.213015
854     0.213015
26      0.210622
          ...   
1255    0.191475
137     0.191475
146     0.191475
1469    0.191475
1603    0.191475
Name: count, Length: 500, dtype: float64

# Feature Engineering

### Label Encoding

In [11]:
# Label Encoding for SKU
label_encoder = LabelEncoder()
dataset_df["encoded_sku"] = label_encoder.fit_transform(dataset_df["sku"])

# Feature and Target
X = dataset_df[["cleaned_seller_item_name", "scaled_price"]]
Y = dataset_df["encoded_sku"]

### Tokenization and Padding

In [12]:
# Tokenization for RNN
num_words = 5000
tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X["cleaned_seller_item_name"])
sequences = tokenizer.texts_to_sequences(X["cleaned_seller_item_name"])

# Padding
max_length = max(len(seq) for seq in sequences)
x_padded = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

# Train-Test Splitting

In [13]:
# Train-Test Split for RNN
X_train, X_test, y_train, y_test, X_train_texts, X_test_texts = train_test_split(x_padded, Y, dataset_df['cleaned_seller_item_name'], stratify=Y, shuffle=True)

### Temperature Scaling Layer

In [14]:
class TemperatureScaling(tf.keras.layers.Layer):
    def __init__(self, temperature=1.0, **kwargs):
        super().__init__(**kwargs)
        self.temperature = tf.Variable(temperature, trainable=False)

    def call(self, inputs):
        return inputs / self.temperature

    def get_config(self):
        return {'temperature': self.temperature.numpy()}

# Build and Train RNN Model

In [40]:
# Build Improved RNN Model
rnn_model = Sequential([
    Embedding(input_dim=num_words, output_dim=256, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(GRU(32)),
    Dropout(0.2),
    Dense(32, activation="relu"),
    Dense(len(label_encoder.classes_), activation="softmax"),
])

rnn_model.pop()  # Remove final dense layer
rnn_model.pop()  # Remove previous temperature layer if exists

# Add new temperature scaling
rnn_model.add(Dense(len(label_encoder.classes_)))
rnn_model.add(TemperatureScaling(temperature=1.0))
rnn_model.add(tf.keras.layers.Activation('softmax'))

# rnn_model.add(Lambda(lambda x: x / 0.3))

rnn_model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.0005), metrics=["accuracy"])

# Early Stopping
early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

# Train RNN Model
rnn_model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/100
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 29ms/step - accuracy: 0.1676 - loss: 5.2483 - val_accuracy: 0.9273 - val_loss: 1.9547
Epoch 2/100
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23ms/step - accuracy: 0.9216 - loss: 1.5776 - val_accuracy: 0.9703 - val_loss: 0.5738
Epoch 3/100
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 29ms/step - accuracy: 0.9762 - loss: 0.5464 - val_accuracy: 0.9806 - val_loss: 0.2559
Epoch 4/100
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 33ms/step - accuracy: 0.9876 - loss: 0.2595 - val_accuracy: 0.9840 - val_loss: 0.1557
Epoch 5/100
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 27ms/step - accuracy: 0.9925 - loss: 0.1525 - val_accuracy: 0.9862 - val_loss: 0.1100
Epoch 6/100
[1m980/980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 27ms/step - accuracy: 0.9942 - loss: 0.0971 - val_accuracy: 0.9877 - val_loss: 0.0894
Epoch 7/10

<keras.src.callbacks.history.History at 0x19531337b30>

# Evaluate RNN Model

In [16]:
# Evaluate RNN Model
loss, accuracy = rnn_model.evaluate(X_test, y_test)
print(f"RNN Model Accuracy: {accuracy * 100:.2f}%")

[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9908 - loss: 0.0633
RNN Model Accuracy: 99.01%


# Confidence Scoring & Error Analysis

In [41]:
predictions = rnn_model.predict(X_test)
y_pred = np.argmax(predictions, axis=1)
confidence_score = np.max(predictions, axis=1)

[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step


In [42]:
# Convert predicted indices and actual labels back to SKUs
predicted_skus = label_encoder.inverse_transform(y_pred)
actual_skus = label_encoder.inverse_transform(y_test)

In [43]:
threshold = 0.6

results_df = pd.DataFrame({
    "Predicted_SKU": predicted_skus,
    "Actual_SKU": actual_skus,
    "Confidence_Score": confidence_score
})

results_df["Prediction_Message"] = results_df["Confidence_Score"].apply(
    lambda conf: "Unknown product" if conf < threshold else "Product Mapped"
)

In [20]:
results_df.sample(10)

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message
12352,250,250,0.999134,Product Mapped
13673,427,427,0.997774,Product Mapped
1517,3598,3598,0.997843,Product Mapped
3851,137,137,0.999278,Product Mapped
10661,2662,2662,0.994568,Product Mapped
16954,454,454,0.998608,Product Mapped
13053,2121,2121,0.997566,Product Mapped
16390,4700,4700,0.998867,Product Mapped
6476,6833,6833,0.896044,Product Mapped
10882,2520,2520,0.998227,Product Mapped


In [44]:
# get all rows where the confidence score is less than the threshold
low_conf = results_df[results_df["Confidence_Score"] < threshold]
low_conf.sample(10)

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message
16127,371,353,0.371692,Unknown product
10978,121,121,0.531606,Unknown product
11698,4623,860,0.407609,Unknown product
5296,1003,1003,0.571848,Unknown product
19296,775,788,0.324045,Unknown product
1411,15,15,0.431612,Unknown product
13988,4139,1337,0.381289,Unknown product
2786,1538,1538,0.480743,Unknown product
16524,95,1885,0.239681,Unknown product
2578,708,1538,0.533043,Unknown product


In [45]:
# get all rows where the Predicted SKU is not equal to the Actual SKU
results_df[results_df["Predicted_SKU"] != results_df["Actual_SKU"]]

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message
59,588,237,0.999373,Product Mapped
152,4887,297,0.419761,Unknown product
333,1073,543,0.618558,Product Mapped
338,2082,762,0.227347,Unknown product
650,4887,297,0.506725,Unknown product
...,...,...,...,...
20531,2629,775,0.850303,Product Mapped
20595,232,2068,0.839305,Product Mapped
20607,1317,3689,0.267744,Unknown product
20648,121,690,0.831892,Product Mapped


In [46]:
def confidence_score(X_test, y_test):
    probabilities = rnn_model.predict(X_test)

    confidence_scores = np.max(probabilities, axis=1)

    predicted_indices = np.argmax(probabilities, axis=1)

    predicted_labels = label_encoder.inverse_transform(predicted_indices)

    true_labels = label_encoder.inverse_transform(y_test)

    confidence_threshold = 0.9

    test_results = []
    incorrect_count = 0
    high_confidence_count = 0

    for i in range(len(X_test)):
        confidence = confidence_scores[i]
        predicted_class = predicted_labels[i]
        true_class = true_labels[i]

        
        if confidence > confidence_threshold:
            high_confidence_count += 1  
            if predicted_class != true_class:
                incorrect_count += 1  

        if confidence < confidence_threshold:
            predicted_class = "Unknown"

        test_results.append({
            'Predicted': predicted_class
        })
        
    if high_confidence_count > 0:
        error_percentage = (incorrect_count / high_confidence_count) * 100
    else:
        error_percentage = 0.0
        

    temp_df = pd.DataFrame(test_results)
    print(f"Percentage of incorrect classifications with confidence > 0.9: {error_percentage:.2f}%")
    print(temp_df.head())
    return temp_df

# String Similarity with Levenshtein Distance

In [47]:
import Levenshtein

def levenshtein_similarity(s1, s2):
    s1 = s1.strip()
    s2 = s2.strip()
    if not s1 and not s2:
        return 1.0
    # Calculate the Levenshtein distance between s1 and s2
    lev_distance = Levenshtein.distance(s1, s2)
    max_len = max(len(s1), len(s2))
    # Ensure we don't divide by zero
    if max_len == 0:
        return 1.0
    similarity = 1 - (lev_distance / max_len)
    return similarity

In [48]:
# Create a mapping from SKU to master product text
sku_to_master_text = dict(zip(master_df['sku'], master_df['product_name_ar']))

In [49]:
similarity_scores = []
# adjusted_confidence_scores = []

for seller_text, pred_sku, orig_conf in zip(X_test_texts, results_df['Predicted_SKU'], results_df['Confidence_Score']):
    # Get the corresponding master text. If not found, you can set similarity to 0 (or a small value).
    master_text = sku_to_master_text.get(pred_sku, "")
    sim = levenshtein_similarity(seller_text, master_text) if master_text else 0.0
    similarity_scores.append(sim)
    
    # Combine the original model confidence with the similarity score.
    # Here, we use a simple multiplication, but you might consider a weighted average or other method.
    adjusted_conf = orig_conf * sim
    # adjusted_confidence_scores.append(adjusted_conf)

In [50]:
# Add these new scores to your results dataframe.
results_df["Similarity_Score"] = similarity_scores

In [51]:
# Ensure that your confidence and similarity scores are in the [0, 1] range.
alpha = 0.7  # Adjust this weight based on your preferences

results_df["Weighted_Combined_Score"] = results_df.apply(
    lambda row: (alpha * row["Confidence_Score"] + (1 - alpha) * row["Similarity_Score"]), axis=1
)

In [52]:
# Convert scores to percentage form and round to two decimal digits
results_df["Confidence_Score"] = results_df["Confidence_Score"].apply(lambda x: round(x * 100, 2))
results_df["Similarity_Score"] = results_df["Similarity_Score"].apply(lambda x: round(x * 100, 2))

In [53]:
results_df["Prediction_Message"] = results_df["Weighted_Combined_Score"].apply(
    lambda conf: "Unknown product" if conf < 0.6 else "Product Mapped"
)

In [54]:
results_df["Weighted_Combined_Score"] = results_df["Weighted_Combined_Score"].apply(lambda x: round(x * 100, 2))

Here we will use Weighted_Combined_Score

In [55]:
results_df.sample(10)

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message,Similarity_Score,Weighted_Combined_Score
14128,1889,1889,99.98,Product Mapped,68.97,90.68
8038,1834,1834,99.98,Product Mapped,66.67,89.99
5754,1305,1305,99.95,Product Mapped,25.0,77.47
5655,1474,1474,99.97,Product Mapped,50.0,84.98
16752,2350,2350,99.93,Product Mapped,79.17,93.7
12356,993,993,99.99,Product Mapped,78.26,93.47
11289,513,513,99.93,Product Mapped,45.95,83.74
14940,614,614,99.97,Product Mapped,78.26,93.46
13861,3701,3701,99.97,Product Mapped,57.14,87.12
9689,977,977,99.98,Product Mapped,81.82,94.53


In [56]:
# get the percentage of incorrect predictions
incorrect_predictions = results_df[results_df["Predicted_SKU"] != results_df["Actual_SKU"]]
incorrect_percentage = len(incorrect_predictions) / len(results_df) * 100
print(f"Percentage of incorrect predictions: {np.round(incorrect_percentage, 2)}%")

Percentage of incorrect predictions: 0.92%


In [57]:
# get the percentage of unknown predictions
unknown_predictions = results_df[results_df["Weighted_Combined_Score"] < 60]
unknown_percentage = len(unknown_predictions) / len(results_df) * 100
print(f"Percentage of unknown products: {np.round(unknown_percentage, 2)}%")

Percentage of unknown products: 0.78%


In [58]:
# get the percentage of incorrect predictions with high confidence score
incorrect_high_confidence = incorrect_predictions[incorrect_predictions["Weighted_Combined_Score"] >= 90]
incorrect_high_confidence_percentage = len(incorrect_high_confidence) / len(results_df) * 100
print(f"Percentage of incorrect predictions with confidence > 90: {np.round(incorrect_high_confidence_percentage, 2)}%")

Percentage of incorrect predictions with confidence > 90: 0.01%


In [59]:
results_df[results_df["Predicted_SKU"] != results_df["Actual_SKU"]]

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message,Similarity_Score,Weighted_Combined_Score
59,588,237,99.94,Product Mapped,36.36,80.87
152,4887,297,41.98,Unknown product,39.29,41.17
333,1073,543,61.86,Unknown product,22.58,50.07
338,2082,762,22.73,Unknown product,15.38,20.53
650,4887,297,50.67,Unknown product,28.57,44.04
...,...,...,...,...,...,...
20531,2629,775,85.03,Product Mapped,44.83,72.97
20595,232,2068,83.93,Product Mapped,38.89,70.42
20607,1317,3689,26.77,Unknown product,41.67,31.24
20648,121,690,83.19,Product Mapped,52.63,74.02


In [60]:
incorrect_predictions[incorrect_predictions["Weighted_Combined_Score"] >= threshold]

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message,Similarity_Score,Weighted_Combined_Score
59,588,237,99.94,Product Mapped,36.36,80.87
152,4887,297,41.98,Unknown product,39.29,41.17
333,1073,543,61.86,Unknown product,22.58,50.07
338,2082,762,22.73,Unknown product,15.38,20.53
650,4887,297,50.67,Unknown product,28.57,44.04
...,...,...,...,...,...,...
20531,2629,775,85.03,Product Mapped,44.83,72.97
20595,232,2068,83.93,Product Mapped,38.89,70.42
20607,1317,3689,26.77,Unknown product,41.67,31.24
20648,121,690,83.19,Product Mapped,52.63,74.02


# Saving Model

In [61]:
rnn_model.save("ISUPPLY Product Matching Model.h5")

