# Import Libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from symspellpy import SymSpell
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Lambda

# Read Dataset

In [2]:
xls = pd.ExcelFile("Product Matching Dataset.xlsx")
master_df = pd.read_excel(xls, "Master File")
dataset_df = pd.read_excel(xls, "Dataset")

In [3]:
dataset_df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5


In [4]:
master_df.head()

Unnamed: 0,sku,product_name,product_name_ar,price
0,279,ANAFRONIL 75 MG 20 TAB,انافرونيل 75 مجم اس ار 20 قرص,75.0
1,2282,LOPRECOUGH SYRUP 100 ML,لوبريكاف شراب 100 مل,28.5
2,4331,TOMEX PLUS 50 TAB,تومكس بلس 50 قرص,60.0
3,1022,TAROLIMUS 0.03% OINT. 15 GM,تاروليمس 0.03 % مرهم 15 جم,129.0
4,116,GLIPTUS PLUS 50/1000 MG 30 TAB,جليبتس بلس 50/1000 مجم 30 قرص,192.0


In [5]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83562 entries, 0 to 83561
Data columns (total 4 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sku                          83562 non-null  int64  
 1   marketplace_product_name_ar  83562 non-null  object 
 2   seller_item_name             83562 non-null  object 
 3   price                        83562 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.6+ MB


# Preprocessing

In [6]:
from qalsadi.lemmatizer import Lemmatizer

# Initialize the Arabic lemmatizer
lemmatizer = Lemmatizer()

def normalize_text(text):
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove Arabic diacritics (tashkeel)
    text = re.sub(r"[\u064B-\u065F]", "", text)
    
    # Remove non-Arabic, non-English characters except numbers
    text = re.sub(r'[^\u0621-\u064Aa-zA-Z0-9\s]', ' ', text)
    
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Standardize specific terms
    text = re.sub(r'قرص|\bق\b|\bك\b|اقراص|كبسوله', 'كبسول', text)
    text = re.sub(r'([\u0600-\u06FF])\1+', r'\1', text)  # Remove Arabic repetition
    text = re.sub(r'[إأآ]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ة', 'ه ', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    
    # Separate numbers that stick to Arabic/English words
    text = re.sub(r"(\d+)([a-zA-Z\u0600-\u06FF]+)", r"\1 \2", text)  # Number followed by Arabic/English
    text = re.sub(r"([a-zA-Z\u0600-\u06FF]+)(\d+)", r"\1 \2", text)  # Arabic/English followed by number
    
    # Remove standalone Arabic/English characters (but not numbers)
    text = re.sub(r"\b[^\W\d]\b", "", text)
    
    # Remove specific unwanted phrases
    text = re.sub(r'\b(?:سعر جديد|سعر|قديم|س جديد|س جدي|س ج|ركز)\b', '', text)
    text = re.sub(r'مرهم|اكريم', 'كريم', text)
    text = re.sub(r'مليجرام|\bم\b|مجم', 'مجم', text)
    text = re.sub(r'جرام|جم', 'جم', text)
    text = re.sub(r'شرائط|شريطين', 'كبسول', text)
    text = re.sub(r'امبولات|امبوله|حقن', 'امبول', text)
    text = re.sub(r'لبوس|لبوس اطفال', 'اقماع', text)
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    # Tokenize the text
    tokens = text.split()
    
    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join the lemmatized tokens back into a single string
    lemmatized_text = " ".join(lemmatized_tokens)
    
    return lemmatized_text

In [7]:
normalize_text("3helloo3 فيدروب 2800 وحدة دولية//مل نقط _ بالفم helloo 15 مل")

'3 helloo 3 فيدروب 2800 وحد دولي مل نقط فم helloo 15 مل'

In [8]:
normalize_text(" we.ok//  / Hello,World!  ")

'we ok hello world'

Apply preprocessing

In [9]:
dataset_df["cleaned_seller_item_name"] = dataset_df["seller_item_name"].astype(str).apply(normalize_text)

In [10]:
dataset_df.sample(10)

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price,cleaned_seller_item_name
8305,1412,كورتيبليكس ب 6 للاطفال 3 امبول,كورتيبلكس امبول اطفال س ج 27ج,27.0,كورتيبلكس امبول اطفال 27
79849,3009,فابوزول محلول استشاق 30 مل,فابوزول للاستنشاق/ميباكو,19.5,فابوزول استنشاق ميباكو
78743,2359,ابيمول 300 مجم 5 اقماع,ابيمول لبوس جلاكسو,15.0,ابيمول اقماع جلاكسو
42271,338,افيل 45.5 مجم/2مل 6 امبول عضل,افيل 6 امبول س * ج,39.0,افيل 6 امبول
74586,504,اجركس 75 مجم 60 قرص,اجركس75مم ق6شريط,33.0,اجركس 75 6 شريط
36347,4115,كروميوم - ميباكو 20 كبسول,كروميوم 200مج 20قرص .ض,40.0,كروميوم 200 ماج 20 كبسول
13076,4582,تكلو 30 قرص,تكلو 30 قرص س ج,60.0,تكلو 30 كبسول
50289,372,بيتاسيرك 16 مجم 60 قرص,بيتاسيرك 16 مجم 60 قرص سعر ج,219.0,بيتاسيرك 16 مجم 60 كبسول
16808,2575,موديوريتك 5/50 مجم 30 قرص,موديوريتك اقراص,15.0,موديوريتك كبسول
59187,1754,توسيفان-ان شراب 125 مل,توسيفان ان شراب,34.0,توسيفان ان شراب


In [11]:
# Scale Price
scaler = StandardScaler()
dataset_df["scaled_price"] = scaler.fit_transform(dataset_df[["price"]])

Check for class balancing

In [12]:
dataset_df['sku'].value_counts() / len(dataset_df) * 100

sku
1322    0.214212
252     0.214212
257     0.213015
854     0.213015
26      0.210622
          ...   
1255    0.191475
137     0.191475
146     0.191475
1469    0.191475
1603    0.191475
Name: count, Length: 500, dtype: float64

#### Preparing Data for RNN Model

In [13]:
# Label Encoding for SKU
label_encoder = LabelEncoder()
dataset_df["encoded_sku"] = label_encoder.fit_transform(dataset_df["sku"])

# Feature and Target
X = dataset_df[["cleaned_seller_item_name", "scaled_price"]]
Y = dataset_df["encoded_sku"]

In [14]:
# Tokenization for RNN
num_words = 5000
tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X["cleaned_seller_item_name"])
sequences = tokenizer.texts_to_sequences(X["cleaned_seller_item_name"])

# Padding
max_length = max(len(seq) for seq in sequences)
x_padded = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

#### Splitting Data

In [43]:
# Train-Test Split for RNN
X_train, X_test, y_train, y_test, X_train_texts, X_test_texts = train_test_split(x_padded, Y, dataset_df['cleaned_seller_item_name'], stratify=Y, shuffle=True)

#### RNN Model

In [16]:
class TemperatureScaling(tf.keras.layers.Layer):
    def __init__(self, temperature=1.0, **kwargs):
        super().__init__(**kwargs)
        self.temperature = tf.Variable(temperature, trainable=False)

    def call(self, inputs):
        return inputs / self.temperature

    def get_config(self):
        return {'temperature': self.temperature.numpy()}

In [44]:
# Build Improved RNN Model
rnn_model = Sequential([
    Embedding(input_dim=num_words, output_dim=256, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(GRU(32)),
    Dropout(0.2),
    Dense(32, activation="relu"),
    Dense(len(label_encoder.classes_), activation="softmax"),
])

rnn_model.pop()  # Remove final dense layer
rnn_model.pop()  # Remove previous temperature layer if exists

# Add new temperature scaling
rnn_model.add(Dense(len(label_encoder.classes_)))
rnn_model.add(TemperatureScaling(temperature=1.0))
rnn_model.add(tf.keras.layers.Activation('softmax'))

# rnn_model.add(Lambda(lambda x: x / 0.3))

rnn_model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.0005), metrics=["accuracy"])

# Early Stopping
early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

# Train RNN Model
rnn_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])



Epoch 1/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 50ms/step - accuracy: 0.2867 - loss: 4.7030 - val_accuracy: 0.9578 - val_loss: 0.9865
Epoch 2/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 45ms/step - accuracy: 0.9572 - loss: 0.8040 - val_accuracy: 0.9781 - val_loss: 0.2720
Epoch 3/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 45ms/step - accuracy: 0.9850 - loss: 0.2498 - val_accuracy: 0.9829 - val_loss: 0.1462
Epoch 4/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 47ms/step - accuracy: 0.9906 - loss: 0.1177 - val_accuracy: 0.9851 - val_loss: 0.1036
Epoch 5/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 47ms/step - accuracy: 0.9949 - loss: 0.0655 - val_accuracy: 0.9876 - val_loss: 0.0897
Epoch 6/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 46ms/step - accuracy: 0.9955 - loss: 0.0459 - val_accuracy: 0.9847 - val_loss: 0.09

<keras.src.callbacks.history.History at 0x24ccc400b60>

#### Evaluate RNN Model

In [45]:
# Evaluate RNN Model
loss, accuracy = rnn_model.evaluate(X_test, y_test)
print(f"RNN Model Accuracy: {accuracy * 100:.2f}%")

[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9888 - loss: 0.0795
RNN Model Accuracy: 98.89%


In [305]:
predictions = rnn_model.predict(X_test)
y_pred = np.argmax(predictions, axis=1)
confidence_score = np.max(predictions, axis=1)

[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step


In [306]:
# Convert predicted indices and actual labels back to SKUs
predicted_skus = label_encoder.inverse_transform(y_pred)
actual_skus = label_encoder.inverse_transform(y_test)

In [307]:
threshold = 0.6

results_df = pd.DataFrame({
    "Predicted_SKU": predicted_skus,
    "Actual_SKU": actual_skus,
    "Confidence_Score": confidence_score
})

results_df["Prediction_Message"] = results_df["Confidence_Score"].apply(
    lambda conf: "Unknown product" if conf < threshold else "Product Mapped"
)

In [308]:
results_df.sample(10)

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message
20028,3516,3516,0.998074,Product Mapped
17258,1254,1254,0.998588,Product Mapped
1866,360,360,0.999835,Product Mapped
20885,3132,3132,0.998312,Product Mapped
4893,2006,2006,0.998856,Product Mapped
2890,457,457,0.999049,Product Mapped
18710,444,444,0.997304,Product Mapped
9184,1031,1031,0.999291,Product Mapped
1162,152,152,0.999353,Product Mapped
15775,874,874,0.999549,Product Mapped


In [309]:
# get all rows where the confidence score is less than the threshold
low_conf = results_df[results_df["Confidence_Score"] < threshold]
low_conf.sample(10)

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message
2434,4,788,0.33423,Unknown product
13109,1701,990,0.357356,Unknown product
14479,427,111,0.501607,Unknown product
16275,137,137,0.446695,Unknown product
2502,867,1594,0.136381,Unknown product
16360,941,2355,0.353492,Unknown product
18061,1323,1323,0.568035,Unknown product
13310,1701,491,0.417404,Unknown product
8494,622,1022,0.366986,Unknown product
19444,788,2006,0.361203,Unknown product


In [310]:
# get all rows where the Predicted SKU is not equal to the Actual SKU
results_df[results_df["Predicted_SKU"] != results_df["Actual_SKU"]]

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message
17,1615,1072,0.657380,Product Mapped
85,4188,733,0.190860,Unknown product
218,116,9,0.948273,Product Mapped
293,585,2068,0.156742,Unknown product
383,90,58,0.216803,Unknown product
...,...,...,...,...
20149,1757,2104,0.120556,Unknown product
20255,3021,2542,0.991538,Product Mapped
20275,8771,1022,0.657885,Product Mapped
20522,690,121,0.645592,Product Mapped


In [311]:
def confidence_score(X_test, y_test):
    probabilities = rnn_model.predict(X_test)

    confidence_scores = np.max(probabilities, axis=1)

    predicted_indices = np.argmax(probabilities, axis=1)

    predicted_labels = label_encoder.inverse_transform(predicted_indices)

    true_labels = label_encoder.inverse_transform(y_test)

    confidence_threshold = 0.9

    test_results = []
    incorrect_count = 0
    high_confidence_count = 0

    for i in range(len(X_test)):
        confidence = confidence_scores[i]
        predicted_class = predicted_labels[i]
        true_class = true_labels[i]

        
        if confidence > confidence_threshold:
            high_confidence_count += 1  
            if predicted_class != true_class:
                incorrect_count += 1  

        if confidence < confidence_threshold:
            predicted_class = "Unknown"

        test_results.append({
            'Predicted': predicted_class
        })
        
    if high_confidence_count > 0:
        error_percentage = (incorrect_count / high_confidence_count) * 100
    else:
        error_percentage = 0.0
        

    temp_df = pd.DataFrame(test_results)
    print(f"Percentage of incorrect classifications with confidence > 0.9: {error_percentage:.2f}%")
    print(temp_df.head())
    return temp_df

In [312]:
import Levenshtein

def levenshtein_similarity(s1, s2):
    s1 = s1.strip()
    s2 = s2.strip()
    if not s1 and not s2:
        return 1.0
    # Calculate the Levenshtein distance between s1 and s2
    lev_distance = Levenshtein.distance(s1, s2)
    max_len = max(len(s1), len(s2))
    # Ensure we don't divide by zero
    if max_len == 0:
        return 1.0
    similarity = 1 - (lev_distance / max_len)
    return similarity

In [313]:
# Create a mapping from SKU to master product text
sku_to_master_text = dict(zip(master_df['sku'], master_df['product_name_ar']))

In [314]:
similarity_scores = []
# adjusted_confidence_scores = []

for seller_text, pred_sku, orig_conf in zip(X_test_texts, results_df['Predicted_SKU'], results_df['Confidence_Score']):
    # Get the corresponding master text. If not found, you can set similarity to 0 (or a small value).
    master_text = sku_to_master_text.get(pred_sku, "")
    sim = levenshtein_similarity(seller_text, master_text) if master_text else 0.0
    similarity_scores.append(sim)
    
    # Combine the original model confidence with the similarity score.
    # Here, we use a simple multiplication, but you might consider a weighted average or other method.
    adjusted_conf = orig_conf * sim
    # adjusted_confidence_scores.append(adjusted_conf)

In [315]:
# Add these new scores to your results dataframe.
results_df["Similarity_Score"] = similarity_scores

In [316]:
# Ensure that your confidence and similarity scores are in the [0, 1] range.
alpha = 0.7  # Adjust this weight based on your preferences

results_df["Weighted_Combined_Score"] = results_df.apply(
    lambda row: (alpha * row["Confidence_Score"] + (1 - alpha) * row["Similarity_Score"]), axis=1
)

In [317]:
# Convert scores to percentage form and round to two decimal digits
results_df["Confidence_Score"] = results_df["Confidence_Score"].apply(lambda x: round(x * 100, 2))
results_df["Similarity_Score"] = results_df["Similarity_Score"].apply(lambda x: round(x * 100, 2))

In [319]:
results_df["Prediction_Message"] = results_df["Weighted_Combined_Score"].apply(
    lambda conf: "Unknown product" if conf < 0.6 else "Product Mapped"
)

In [320]:
results_df["Weighted_Combined_Score"] = results_df["Weighted_Combined_Score"].apply(lambda x: round(x * 100, 2))

Here we will use Weighted_Combined_Score

In [321]:
results_df.sample(10)

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message,Similarity_Score,Weighted_Combined_Score
1397,2510,2510,99.8,Product Mapped,72.41,91.59
14220,2517,2517,99.24,Product Mapped,76.92,92.54
17122,1206,1206,99.89,Product Mapped,79.17,93.68
11000,1525,1525,99.81,Product Mapped,58.33,87.37
3735,1296,1296,99.94,Product Mapped,78.26,93.44
18732,166,166,99.95,Product Mapped,54.84,86.42
12343,377,377,99.81,Product Mapped,50.0,84.87
4954,660,660,98.45,Product Mapped,35.0,79.41
15031,1795,1795,99.87,Product Mapped,83.33,94.91
10750,1619,1619,99.91,Product Mapped,50.0,84.94


In [322]:
# get the percentage of incorrect predictions
incorrect_predictions = results_df[results_df["Predicted_SKU"] != results_df["Actual_SKU"]]
incorrect_percentage = len(incorrect_predictions) / len(results_df) * 100
print(f"Percentage of incorrect predictions: {np.round(incorrect_percentage, 2)}%")

Percentage of incorrect predictions: 1.11%


In [329]:
# get the percentage of unknown predictions
unknown_predictions = results_df[results_df["Weighted_Combined_Score"] < 60]
unknown_percentage = len(unknown_predictions) / len(results_df) * 100
print(f"Percentage of unknown products: {np.round(unknown_percentage, 2)}%")

Percentage of unknown products: 1.06%


In [330]:
# get the percentage of incorrect predictions with high confidence score
incorrect_high_confidence = incorrect_predictions[incorrect_predictions["Weighted_Combined_Score"] >= 90]
incorrect_high_confidence_percentage = len(incorrect_high_confidence) / len(results_df) * 100
print(f"Percentage of incorrect predictions with confidence > 90: {np.round(incorrect_high_confidence_percentage, 2)}%")

Percentage of incorrect predictions with confidence > 90: 0.02%


In [328]:
results_df[results_df["Predicted_SKU"] != results_df["Actual_SKU"]]

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message,Similarity_Score,Weighted_Combined_Score
17,1615,1072,65.74,Unknown product,15.62,50.70
85,4188,733,19.09,Unknown product,19.23,19.13
218,116,9,94.83,Product Mapped,44.83,79.83
293,585,2068,15.67,Unknown product,19.05,16.69
383,90,58,21.68,Unknown product,15.38,19.79
...,...,...,...,...,...,...
20149,1757,2104,12.06,Unknown product,20.69,14.65
20255,3021,2542,99.15,Product Mapped,46.15,83.25
20275,8771,1022,65.79,Unknown product,4.55,47.42
20522,690,121,64.56,Product Mapped,66.67,65.19


In [327]:
incorrect_predictions[incorrect_predictions["Weighted_Combined_Score"] >= threshold]

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message,Similarity_Score,Weighted_Combined_Score
17,1615,1072,65.74,Unknown product,15.62,50.70
85,4188,733,19.09,Unknown product,19.23,19.13
218,116,9,94.83,Product Mapped,44.83,79.83
293,585,2068,15.67,Unknown product,19.05,16.69
383,90,58,21.68,Unknown product,15.38,19.79
...,...,...,...,...,...,...
20149,1757,2104,12.06,Unknown product,20.69,14.65
20255,3021,2542,99.15,Product Mapped,46.15,83.25
20275,8771,1022,65.79,Unknown product,4.55,47.42
20522,690,121,64.56,Product Mapped,66.67,65.19
