# Import Libraries

In [18]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from symspellpy import SymSpell
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read Dataset

In [19]:
df = pd.read_excel("Product Matching Dataset.xlsx" , sheet_name="Dataset")
df.head(n=10)

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
5,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
6,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
7,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 1 شريط * 14 كبسولة,56.5
8,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم,56.5
9,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 كبسول س ج,56.5


# Preprocessing

In [20]:
stop_words = set(stopwords.words("arabic","english"))
# text_column = df["seller_item_name"].astype(str)  # Ensure text format

In [21]:
def preprocessing(text):
    # Remove stopwords
    text = " ".join(word for word in text.split() if word not in stop_words)
    # Remove punctuation & special characters (Keep Arabic, English, and Numbers)
    text = re.sub(r"[^\w\s\u0600-\u06FF]", " ", text)
    # Remove Arabic diacritics (Tashkeel)
    text = re.sub(r"[\u064B-\u065F]", "", text)

    text = re.sub(r'[إأآ]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ة', 'ه ', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ــ', '', text)
    text = re.sub(r"(\d+)", r" \1 ", text)
    text = re.sub(r"\b[\u0600-\u06FF]\b", "", text)
    text = re.sub(r'([\u0600-\u06FF])\1{2,}', r'\1\1', text)
    text = re.sub(r'(.)\1+', r'\1', text) # remove repetitions
    text = re.sub(r'\b\w\b', '', text)  # Removes standalone single characters
    text = re.sub(r'\b(?:سعر جديد|س جديد|س جدي|س ج|ركز)\b', '', text)# Remove specific unwanted phrases
    text = re.sub( r'مرهم|اكريم', 'كريم', text)
    text = re.sub( r'قرص|اقراص|كبسوله', 'كبسول', text)
    text = re.sub( r'اقراص|شريط|شرائط|شريطين', 'قرص', text)
    text = re.sub( r'امبولات|امبوله|حقن', 'امبول', text)
    text = re.sub( r'لبوس|لبوس اطفال|لبوس اطفال فاركو', 'اقماع للاطفال', text)
    text = re.sub(r"\s+", " ", text).strip()
    
    return text

In [22]:
df["cleaned_seller_item_name"] = df["seller_item_name"].astype(str).apply(preprocessing)

# Build SymSpell Dictionary

In [23]:
# word_freq_dict = {}
# for product in df["marketplace_product_name_ar"].dropna():
#     normalized_product = str(product).strip()
#     word_freq_dict[normalized_product] = word_freq_dict.get(normalized_product, 0) + 1

# # Save dictionary
# dict_file = "product_dictionary.txt"
# with open(dict_file, "w", encoding="utf-8") as f:
#     for word, freq in word_freq_dict.items():
#         f.write(f"{word} {freq}\n")

# # Initialize SymSpell
# sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# sym_spell.load_dictionary(dict_file, term_index=0, count_index=1, encoding="utf-8")

# # 3. Apply SymSpell Correction
# def correct_text(text):
#     suggestions = sym_spell.lookup(text, verbosity=2, max_edit_distance=2)
#     return suggestions[0].term if suggestions else text

# df["corrected_seller_item_name"] = df["cleaned_seller_item_name"].apply(correct_text)

# Train Models

In [25]:
x = df["cleaned_seller_item_name"]
y = df["sku"]
vectorizer = TfidfVectorizer(max_features=500)
x_tfidf = vectorizer.fit_transform(x)

X_train, X_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.3, shuffle=True, stratify=y, random_state=42)

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on Test Data
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8676


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Convert text labels (SKU) into numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df["sku"])

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["cleaned_seller_item_name"])
sequences = tokenizer.texts_to_sequences(df["cleaned_seller_item_name"])

# Pad sequences to ensure consistent input size
max_length = max(len(seq) for seq in sequences)
x_padded = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(x_padded, y_encoded, test_size=0.3, shuffle=True, stratify=y_encoded, random_state=42)


model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_length),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(32, activation="relu"),
    Dense(len(label_encoder.classes_), activation="softmax")  # Output layer for classification
])

# Compile model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

loss, accuracy = model.evaluate(X_test, y_test)
print(f"RNN Model Accuracy: {accuracy:.4f}")




Epoch 1/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 28ms/step - accuracy: 0.0270 - loss: 5.0703 - val_accuracy: 0.1530 - val_loss: 2.9970
Epoch 2/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 27ms/step - accuracy: 0.1521 - loss: 2.9155 - val_accuracy: 0.2949 - val_loss: 2.1919
Epoch 3/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 30ms/step - accuracy: 0.2590 - loss: 2.2728 - val_accuracy: 0.4767 - val_loss: 1.6218
Epoch 4/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 32ms/step - accuracy: 0.4011 - loss: 1.7646 - val_accuracy: 0.6906 - val_loss: 1.0945
Epoch 5/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 32ms/step - accuracy: 0.5663 - loss: 1.3005 - val_accuracy: 0.8157 - val_loss: 0.7008
Epoch 6/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 36ms/step - accuracy: 0.6997 - loss: 0.9110 - val_accuracy: 0.8631 - val_loss: 0.5304
Epoc

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Ensure text is cleaned before processing
df["cleaned_seller_item_name"] = df["seller_item_name"].astype(str).apply(preprocessing)

# Add an "unknown" class for medicines not in the dataset
df.loc[df["sku"].isna(), "sku"] = "unknown"

# Encode labels (SKUs)
label_encoder = LabelEncoder()
df["encoded_sku"] = label_encoder.fit_transform(df["sku"])  # Convert SKU to numbers

# Tokenize text data
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")  # Handle unknown words
tokenizer.fit_on_texts(df["cleaned_seller_item_name"])
sequences = tokenizer.texts_to_sequences(df["cleaned_seller_item_name"])

# Pad sequences to make them uniform
max_length = max(len(seq) for seq in sequences)
x_padded = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

# Prepare train/test sets
X_train, X_test, y_train, y_test = train_test_split(x_padded, df["encoded_sku"], test_size=0.3, shuffle=True, stratify=y, random_state=42)

# Build the model
model = Sequential()
num_classes = len(label_encoder.classes_)  # Get the number of unique classes

model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))  # Adjust input dimensions
model.add(LSTM(units=64, return_sequences=True))  # Add LSTM layer
model.add(Dropout(0.2))  # Dropout for regularization
model.add(LSTM(units=32))  # Add another LSTM layer
model.add(Dropout(0.2))  # Dropout for regularization
model.add(Dense(units=num_classes, activation='softmax'))  # Use softmax for multi-class classification

# Compile the model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate accuracy
loss, accuracy = model.evaluate(X_test, y_test)
print(f"RNN Model Accuracy: {accuracy:.4f}")




Epoch 1/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 27ms/step - accuracy: 0.0546 - loss: 5.2529 - val_accuracy: 0.2953 - val_loss: 3.1487
Epoch 2/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 28ms/step - accuracy: 0.2832 - loss: 2.9906 - val_accuracy: 0.4274 - val_loss: 2.2830
Epoch 3/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 32ms/step - accuracy: 0.3929 - loss: 2.2845 - val_accuracy: 0.5272 - val_loss: 1.7855
Epoch 4/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 29ms/step - accuracy: 0.4830 - loss: 1.8616 - val_accuracy: 0.5937 - val_loss: 1.4865
Epoch 5/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 30ms/step - accuracy: 0.5483 - loss: 1.5618 - val_accuracy: 0.6842 - val_loss: 1.1994
Epoch 6/20
[1m1828/1828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 41ms/step - accuracy: 0.6221 - loss: 1.3156 - val_accuracy: 0.7550 - val_loss: 0.9652
Epo

In [None]:
def predict_sku(medicine_name, threshold=0.5):
    # Preprocess the input name
    cleaned_text = preprocessing(medicine_name)
    
    # Convert to sequence
    seq = tokenizer.texts_to_sequences([cleaned_text])
    padded_seq = pad_sequences(seq, maxlen=max_length, padding="post", truncating="post")

    # Get model predictions
    predictions = model.predict(padded_seq)[0]
    
    # Get highest probability SKU
    max_prob = np.max(predictions)
    predicted_label = np.argmax(predictions)

    # If confidence is low, return "unknown"
    if max_prob < threshold:
        return "unknown"

    # Convert label back to SKU
    return label_encoder.inverse_transform([predicted_label])[0]

# Example Predictions
print(predict_sku("بانادول اكسترا"))  # Should return the correct SKU or "unknown"
print(predict_sku("فيروجلوبين 30"))  # Should return "unknown"


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
15
