# Import Libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from symspellpy import SymSpell
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read Dataset

In [2]:
df = pd.read_excel("Product Matching Dataset.xlsx" , sheet_name="Dataset")
df.head(n=10)

Unnamed: 0,sku,marketplace_product_name_ar,seller_item_name,price
0,1322,استوهالت 40 مجم 14 كبسول,ESTOHALT 40 MG 14 CAP,56.5
1,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
2,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
3,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
4,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
5,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
6,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 14 ك,56.5
7,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم 1 شريط * 14 كبسولة,56.5
8,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 مجم,56.5
9,1322,استوهالت 40 مجم 14 كبسول,استوهالت 40 كبسول س ج,56.5


# Preprocessing

In [3]:
stop_words = set(stopwords.words("arabic","english"))
# text_column = df["seller_item_name"].astype(str)  # Ensure text format

In [4]:
def preprocessing(text):
    # Remove stopwords
    text = " ".join(word for word in text.split() if word not in stop_words)
    # Remove punctuation & special characters (Keep Arabic, English, and Numbers)
    text = re.sub(r"[^\w\s\u0600-\u06FF]", " ", text)
    # Remove Arabic diacritics (Tashkeel)
    text = re.sub(r"[\u064B-\u065F]", "", text)

    text = re.sub(r'[إأآ]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ة', 'ه ', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ــ', '', text)
    text = re.sub(r"(\d+)", r" \1 ", text)
    text = re.sub(r"\b[\u0600-\u06FF]\b", "", text)
    text = re.sub(r'([\u0600-\u06FF])\1{2,}', r'\1\1', text)
    text = re.sub(r'(.)\1+', r'\1', text) # remove repetitions
    # Remove short/incomplete words (single characters or partial words)
    text = re.sub(r'\b\w\b', '', text)  # Removes standalone single characters
    text = re.sub(r'\b(?:سعر جديد|س جديد|س جدي|س ج|ركز)\b', '', text)# Remove specific unwanted phrases
    text = re.sub( r'مرهم|اكريم', 'كريم', text)
    text = re.sub( r'قرص|اقراص|كبسوله', 'كبسول', text)
    text = re.sub( r'اقراص|شريط|شرائط|شريطين', 'قرص', text)
    text = re.sub( r'امبولات|امبوله|حقن', 'امبول', text)
    text = re.sub( r'لبوس|لبوس اطفال|لبوس اطفال فاركو', 'اقماع للاطفال', text)
    text = re.sub(r"\s+", " ", text).strip()
    
    return text

In [5]:
df["cleaned_seller_item_name"] = df["seller_item_name"].astype(str).apply(preprocessing)

# Build SymSpell Dictionary

In [6]:
word_freq_dict = {}
for product in df["marketplace_product_name_ar"].dropna():
    normalized_product = str(product).strip()
    word_freq_dict[normalized_product] = word_freq_dict.get(normalized_product, 0) + 1

# Save dictionary
dict_file = "product_dictionary.txt"
with open(dict_file, "w", encoding="utf-8") as f:
    for word, freq in word_freq_dict.items():
        f.write(f"{word} {freq}\n")

# Initialize SymSpell
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary(dict_file, term_index=0, count_index=1, encoding="utf-8")

# 3. Apply SymSpell Correction
def correct_text(text):
    suggestions = sym_spell.lookup(text, verbosity=2, max_edit_distance=2)
    return suggestions[0].term if suggestions else text

df["corrected_seller_item_name"] = df["cleaned_seller_item_name"].apply(correct_text)

# Train Models

In [7]:
x = df["corrected_seller_item_name"]
y = df["sku"]
vectorizer = TfidfVectorizer(max_features=500)
x_tfidf = vectorizer.fit_transform(x)

X_train, X_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.3, random_state=42)

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on Test Data
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8631


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.utils import to_categorical



# Build the model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))  # Adjust input dimensions
model.add(LSTM(units=64, return_sequences=True))  # Add LSTM layer
model.add(Dropout(0.2))  # Dropout for regularization
model.add(Dense(num_classes, activation='softmax'))  # Use softmax for multi-class classification

# Compile the model with categorical cross-entropy
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")


MemoryError: Unable to allocate 4.36 GiB for an array with shape (58493, 10000) and data type int64