# Import Libraries

In [70]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from symspellpy import SymSpell
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Lambda

# Read Dataset

In [3]:
xls = pd.ExcelFile("Product Matching Dataset.xlsx")
master_df = pd.read_excel(xls, "Master File")
dataset_df = pd.read_excel(xls, "Dataset")

In [4]:
dataset_df.head()

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5


In [5]:
master_df.head()

Unnamed: 0,sku,product_name,product_name_ar,price
0,279,ANAFRONIL 75 MG 20 TAB,انافرونيل 75 مجم اس ار 20 قرص,75.0
1,2282,LOPRECOUGH SYRUP 100 ML,لوبريكاف شراب 100 مل,28.5
2,4331,TOMEX PLUS 50 TAB,تومكس بلس 50 قرص,60.0
3,1022,TAROLIMUS 0.03% OINT. 15 GM,تاروليمس 0.03 % مرهم 15 جم,129.0
4,116,GLIPTUS PLUS 50/1000 MG 30 TAB,جليبتس بلس 50/1000 مجم 30 قرص,192.0


In [6]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83562 entries, 0 to 83561
Data columns (total 4 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sku                          83562 non-null  int64  
 1   marketplace_product_name_ar  83562 non-null  object 
 2   seller_item_name             83562 non-null  object 
 3   price                        83562 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.6+ MB


# Preprocessing

In [None]:
from qalsadi.lemmatizer import Lemmatizer

# Initialize the Arabic lemmatizer
lemmatizer = Lemmatizer()

def normalize_text(text):
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove Arabic diacritics (tashkeel)
    text = re.sub(r"[\u064B-\u065F]", "", text)
    
    # Remove non-Arabic, non-English characters except numbers
    text = re.sub(r'[^\u0621-\u064Aa-zA-Z0-9\s]', ' ', text)
    
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Standardize specific terms
    text = re.sub(r'قرص|\bق\b|\bك\b|اقراص|كبسوله', 'كبسول', text)
    text = re.sub(r'([\u0600-\u06FF])\1+', r'\1', text)  # Remove Arabic repetition
    text = re.sub(r'[إأآ]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ة', 'ه ', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    
    # Separate numbers that stick to Arabic/English words
    text = re.sub(r"(\d+)([a-zA-Z\u0600-\u06FF]+)", r"\1 \2", text)  # Number followed by Arabic/English
    text = re.sub(r"([a-zA-Z\u0600-\u06FF]+)(\d+)", r"\1 \2", text)  # Arabic/English followed by number
    
    # Remove standalone Arabic/English characters (but not numbers)
    text = re.sub(r"\b[^\W\d]\b", "", text)
    
    # Remove specific unwanted phrases
    text = re.sub(r'\b(?:سعر جديد|سعر|قديم|س جديد|س جدي|س ج|ركز)\b', '', text)
    text = re.sub(r'مرهم|اكريم', 'كريم', text)
    text = re.sub(r'مليجرام|\bم\b|مجم', 'مجم', text)
    text = re.sub(r'جرام|جم', 'جم', text)
    text = re.sub(r'شرائط|شريطين', 'كبسول', text)
    text = re.sub(r'امبولات|امبوله|حقن', 'امبول', text)
    text = re.sub(r'لبوس|لبوس اطفال', 'اقماع', text)
    
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    # Tokenize the text
    tokens = text.split()
    
    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join the lemmatized tokens back into a single string
    lemmatized_text = " ".join(lemmatized_tokens)
    
    return lemmatized_text

In [17]:
normalize_text("3helloo3 فيدروب 2800 وحدة دولية//مل نقط _ بالفم helloo 15 مل")

'3 helloo 3 فيدروب 2800 وحد دولي مل نقط فم helloo 15 مل'

In [18]:
normalize_text(" we.ok//  / Hello,World!  ")

'we ok hello world'

Apply preprocessing

In [19]:
dataset_df["cleaned_seller_item_name"] = dataset_df["seller_item_name"].astype(str).apply(normalize_text)

In [22]:
dataset_df.sample(10)

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price,cleaned_seller_item_name
24355,4403,دوزين 1 مجم 20 قرص,دوزين 1مجم س ج,14.0,دوزين 1 مجم
25421,980,لارى برو 20 قرص,لارى برو 20قرص,44.0,لاري برو 20 كبسول
54298,2226,ابيكسيدون 1 مجم 20 قرص,ابيكسدون 1مجم اقراص 1,54.0,ابيكسدون 1 مجم كبسول 1
25328,924,دانتريلاكس مركب 30 كبسولة,دانتريلاكس مركب 30قرص,153.0,دانتريلاكس مركب 30 كبسول
24231,4384,فاستل 120 مجم 20 قرص,فاستيل 120مجم اقراص/جديد,42.0,فاستيل 120 مجم كبسول جديد
22348,2429,الليربان 1مجم/5مل شراب 100 مل,الليربان شراب,21.0,اليربان شراب
79564,2956,اماجلوست 2/30 مجم 30 قرص,اماجلوست 30/2مجم اقراص,67.5,اماجلوست 30 2 مجم كبسول
77861,1884,ليليبيل 10 مجم 20 قرص,ليليبل 10 مجم اقراص س.ج,120.0,ليليبل 10 مجم كبسول
66437,166,بانادول كولد آند فلو داي 24 قرص,بانادول كولد ان فلو اقراص,76.0,بانادول ولد ان لو كبسول
16900,2737,بترو 30 قرص,بترو 20 قرص,69.0,تروي 20 كبسول


In [32]:
# Scale Price
scaler = StandardScaler()
dataset_df["scaled_price"] = scaler.fit_transform(dataset_df[["price"]])

Check for class balancing

In [33]:
dataset_df['sku'].value_counts() / len(dataset_df) * 100

sku
1322    0.214212
252     0.214212
257     0.213015
854     0.213015
26      0.210622
          ...   
1255    0.191475
137     0.191475
146     0.191475
1469    0.191475
1603    0.191475
Name: count, Length: 500, dtype: float64

#### Preparing Data for RNN Model

In [34]:
# Label Encoding for SKU
label_encoder = LabelEncoder()
dataset_df["encoded_sku"] = label_encoder.fit_transform(dataset_df["sku"])

# Feature and Target
X = dataset_df[["cleaned_seller_item_name", "scaled_price"]]
Y = dataset_df["encoded_sku"]

In [29]:
# Tokenization for RNN
num_words = 5000
tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X["cleaned_seller_item_name"])
sequences = tokenizer.texts_to_sequences(X["cleaned_seller_item_name"])

# Padding
max_length = max(len(seq) for seq in sequences)
x_padded = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

#### Splitting Data

In [39]:
# Train-Test Split for RNN
X_train, X_test, y_train, y_test = train_test_split(x_padded, Y, stratify=Y, shuffle=True)

#### RNN Model

In [77]:
class TemperatureScaling(tf.keras.layers.Layer):
    def __init__(self, temperature=1.0, **kwargs):
        super().__init__(**kwargs)
        self.temperature = tf.Variable(temperature, trainable=False)

    def call(self, inputs):
        return inputs / self.temperature

    def get_config(self):
        return {'temperature': self.temperature.numpy()}

In [78]:
# Build Improved RNN Model
rnn_model = Sequential([
    Embedding(input_dim=num_words, output_dim=256, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(GRU(32)),
    Dropout(0.2),
    Dense(32, activation="relu"),
    Dense(len(label_encoder.classes_), activation="softmax"),
])

rnn_model.pop()  # Remove final dense layer
rnn_model.pop()  # Remove previous temperature layer if exists

# Add new temperature scaling
rnn_model.add(Dense(len(label_encoder.classes_)))
rnn_model.add(TemperatureScaling(temperature=1.0))
rnn_model.add(tf.keras.layers.Activation('softmax'))

rnn_model.add(Lambda(lambda x: x / 0.3))

rnn_model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.0005), metrics=["accuracy"])

# Early Stopping
early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

# Train RNN Model
rnn_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])



Epoch 1/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 59ms/step - accuracy: 0.2793 - loss: 4.6878 - val_accuracy: 0.9561 - val_loss: 1.2418
Epoch 2/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 49ms/step - accuracy: 0.9576 - loss: 1.1200 - val_accuracy: 0.9718 - val_loss: 0.4918
Epoch 3/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 54ms/step - accuracy: 0.9745 - loss: 0.4474 - val_accuracy: 0.9741 - val_loss: 0.1919
Epoch 4/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 59ms/step - accuracy: 0.9850 - loss: 0.1827 - val_accuracy: 0.9830 - val_loss: 0.1224
Epoch 5/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 51ms/step - accuracy: 0.9924 - loss: 0.0963 - val_accuracy: 0.9857 - val_loss: 0.1014
Epoch 6/100
[1m1959/1959[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 44ms/step - accuracy: 0.9943 - loss: 0.0615 - val_accuracy: 0.9870 - val_loss: 0

<keras.src.callbacks.history.History at 0x1bfdb50d3d0>

#### Evaluate RNN Model

In [79]:
# Evaluate RNN Model
loss, accuracy = rnn_model.evaluate(X_test, y_test)
print(f"RNN Model Accuracy: {accuracy * 100:.2f}%")

[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.9890 - loss: 0.0851
RNN Model Accuracy: 98.88%


In [80]:
predictions = rnn_model.predict(X_test)
y_pred = np.argmax(predictions, axis=1)
confidence_score = np.max(predictions, axis=1)

[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step


In [81]:
# Convert predicted indices and actual labels back to SKUs
predicted_skus = label_encoder.inverse_transform(y_pred)
actual_skus = label_encoder.inverse_transform(y_test)

In [82]:
threshold = 0.6

results_df = pd.DataFrame({
    "Predicted_SKU": predicted_skus,
    "Actual_SKU": actual_skus,
    "Confidence_Score": confidence_score
})

results_df["Prediction_Message"] = results_df["Confidence_Score"].apply(
    lambda conf: "Unknown product" if conf < threshold else "Product prediction accepted"
)

In [83]:
results_df.sample(10)

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message
10539,478,478,3.332189,Product prediction accepted
6332,2510,2510,3.333113,Product prediction accepted
4746,1051,1051,3.333063,Product prediction accepted
4671,796,796,3.332765,Product prediction accepted
12467,3640,3640,3.332966,Product prediction accepted
18706,1619,1619,3.332179,Product prediction accepted
13515,3501,3501,3.332779,Product prediction accepted
2183,977,977,3.332739,Product prediction accepted
5502,1064,1064,3.333169,Product prediction accepted
4037,456,456,3.333011,Product prediction accepted


In [84]:
# get all rows where the confidence score is less than the threshold
results_df[results_df["Confidence_Score"] < threshold]

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message
204,1816,446,0.561019,Unknown product
5079,557,137,0.580685,Unknown product
9267,5215,1003,0.479369,Unknown product
17600,242,2453,0.387895,Unknown product


In [85]:
# get all rows where the Predicted SKU is not equal to the Actual SKU
results_df[results_df["Predicted_SKU"] != results_df["Actual_SKU"]]

Unnamed: 0,Predicted_SKU,Actual_SKU,Confidence_Score,Prediction_Message
32,1326,4139,1.611848,Product prediction accepted
147,10,428,0.710270,Product prediction accepted
204,1816,446,0.561019,Unknown product
221,420,771,1.104077,Product prediction accepted
404,1649,1861,2.554418,Product prediction accepted
...,...,...,...,...
20250,460,1675,1.343858,Product prediction accepted
20276,1317,476,2.606412,Product prediction accepted
20357,22,2750,1.886122,Product prediction accepted
20622,487,1631,2.724891,Product prediction accepted


In [86]:
# get the percentage of incorrect predictions
incorrect_predictions = results_df[results_df["Predicted_SKU"] != results_df["Actual_SKU"]]
incorrect_percentage = len(incorrect_predictions) / len(results_df) * 100
incorrect_percentage

1.1248863146809631

In [87]:
# get the percentage of unknown predictions
unknown_predictions = results_df[results_df["Confidence_Score"] < threshold]
unknown_percentage = len(unknown_predictions) / len(results_df) * 100
unknown_percentage

0.019147001100952563

In [88]:
# get the percentage of incorrect predictions with high confidence score
incorrect_high_confidence = incorrect_predictions[incorrect_predictions["Confidence_Score"] >= threshold]
incorrect_high_confidence_percentage = len(incorrect_high_confidence) / len(results_df) * 100
incorrect_high_confidence_percentage

1.1057393135800107

In [89]:
def confidence_score(X_test, y_test):
    probabilities = rnn_model.predict(X_test)

    confidence_scores = np.max(probabilities, axis=1)

    predicted_indices = np.argmax(probabilities, axis=1)

    predicted_labels = label_encoder.inverse_transform(predicted_indices)

    true_labels = label_encoder.inverse_transform(y_test)

    confidence_threshold = 0.9

    test_results = []
    incorrect_count = 0
    high_confidence_count = 0

    for i in range(len(X_test)):
        confidence = confidence_scores[i]
        predicted_class = predicted_labels[i]
        true_class = true_labels[i]

        
        if confidence > confidence_threshold:
            high_confidence_count += 1  
            if predicted_class != true_class:
                incorrect_count += 1  

        if confidence < confidence_threshold:
            predicted_class = "Unknown"

        test_results.append({
            'Predicted': predicted_class
        })
        
    if high_confidence_count > 0:
        error_percentage = (incorrect_count / high_confidence_count) * 100
    else:
        error_percentage = 0.0
        

    temp_df = pd.DataFrame(test_results)
    print(f"Percentage of incorrect classifications with confidence > 0.9: {error_percentage:.2f}%")
    print(temp_df.head())
    return temp_df

In [90]:
temp_df = confidence_score(X_test, y_test)

[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step
Percentage of incorrect classifications with confidence > 0.9: 1.02%
  Predicted
0       491
1       369
2       543
3       421
4      2793


In [91]:
def predict_sku_with_confidence(input_text, threshold=0.6):
    """
    Given an input product text, normalize, tokenize, and pad it,
    then predict the SKU using the trained RNN model.
    Returns a dictionary with the predicted SKU (or None), the confidence,
    and a message indicating whether the prediction is accepted or unknown.
    """
    # Normalize the input text using your existing normalization function
    cleaned_text = normalize_text(input_text)
    
    # Tokenize and pad the sequence for the RNN model
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding="post", truncating="post")
    
    # Get prediction probabilities from the model
    preds = rnn_model.predict(padded_sequence)
    confidence = float(np.max(preds))  # Maximum softmax probability
    predicted_index = int(np.argmax(preds))
    
    # Convert the encoded SKU back to the actual SKU using the label encoder
    predicted_sku = label_encoder.inverse_transform([predicted_index])[0]
    
    # Determine if the product is known or unknown based on the confidence threshold
    if confidence < threshold:
        return {
            "sku": None,
            "confidence": confidence,
            "message": "Unknown product: Confidence below threshold"
        }
    else:
        return {
            "sku": predicted_sku,
            "confidence": confidence,
            "message": "Product prediction accepted"
        }

# Example usage:
example_text = "أدخل هنا اسم المنتج من البائع"
result = predict_sku_with_confidence(example_text, threshold=0.6)
print(result)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
{'sku': 237, 'confidence': 2.8258211612701416, 'message': 'Product prediction accepted'}


In [92]:
from difflib import SequenceMatcher

# Create a dictionary mapping SKU to the corresponding master product name.
# Make sure the master file has columns 'sku' and 'product_name'.
sku_to_product = master_df.set_index("sku")["product_name"].to_dict()

def get_similarity_score(text1, text2):
    """
    Compute a similarity score between two texts using difflib's SequenceMatcher.
    Returns a float between 0 and 1.
    """
    return SequenceMatcher(None, text1, text2).ratio()

# Define a function that performs prediction and similarity computation.
def predict_and_score(row, threshold=0.6):
    seller_name = row["cleaned_seller_item_name"]
    
    # Predict SKU and get confidence level.
    pred_result = predict_sku_with_confidence(seller_name, threshold=threshold)
    predicted_sku = pred_result["sku"]
    confidence = pred_result["confidence"]
    message = pred_result["message"]
    
    # Compute similarity score if we have a valid predicted SKU that exists in the master file.
    if predicted_sku is not None and predicted_sku in sku_to_product:
        # Retrieve and normalize the master product name.
        master_name = sku_to_product[predicted_sku]
        normalized_master_name = normalize_text(master_name)
        # Compute similarity between seller's name and the master name.
        similarity_score = get_similarity_score(seller_name, normalized_master_name)
    else:
        similarity_score = 0.0  # Unknown product or no match.
    
    return pd.Series({
        "predicted_sku": predicted_sku,
        "confidence": confidence,
        "similarity_score": similarity_score,
        "message": message
    })

# Apply the function to each row in your dataset to generate the results DataFrame.
results_df = dataset_df.apply(predict_and_score, axis=1)

# (Optional) Combine with original seller names for easier reference.
results_df["seller_item_name"] = dataset_df["cleaned_seller_item_name"]

# Rearranging columns for clarity.
results_df = results_df[["seller_item_name", "predicted_sku", "confidence", "similarity_score", "message"]]

print(results_df.head())


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3

KeyboardInterrupt: 