In [None]:
%pip install -r requirements.txt
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from joblib import dump, load
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
import re
from pyarabic import araby
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import classification_report


In [13]:
master = "./Masterfile.xlsx"
training = "./Dedup_Dataset.xlsx"
test = "./test.xlsx"

# dynamic test column names
ITEM_NAME = "product name"
ITEM_CODE = "item code"
ITEM_PRICE = "price"

In [6]:
def normalize_arabic(text):
    """Harsh and Kinda extreme preprocessing but for the clarity"""
    if not isinstance(text, str):
        return ""
    text = araby.normalize_hamza(text)
    text = araby.normalize_ligature(text)
    text = araby.normalize_alef(text)
    text = araby.normalize_teh(text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[!"#\'()*+,.:;<=>?@[\\]^_`{|}~]', '', text)  # Remove punctuation
    text = re.sub(r'\b(?:سعر|جديد|قديم|س ق|س ج|س|ق|ج|س.ج|س.ق)\b.*', '', text)  # Remove specific words and everything after them
    text = re.sub(r'(سعر|جديد|قديم|س ق|س ج)', '', text)  # Remove specific words even if they are part of another word
    text = re.sub(r'(.)\1+', r'\1', text)  # Remove repeated letters
    return text


def optimize_normalize_arabic(text):
    """Enhanced Arabic text normalization that preserves % and / characters"""
    if not isinstance(text, str):
        return ""
    text = araby.normalize_hamza(araby.normalize_ligature(araby.normalize_alef(araby.normalize_teh(text))))
    
    # Replace unwanted characters except % and /
    text = re.sub(r'[^\w\s%/]', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove specific Arabic terms and everything after them
    text = re.sub(r'\b(?:سعر|جديد|قديم|س ق|س ج|س|ق|ج|س.ج|س.ق)\b.*', '', text)
    
    # Remove specific terms even if they're part of another word
    text = re.sub(r'(سعر|جديد|قديم|س ق|س ج)', '', text)
    
    # Remove repeated letters
    text = re.sub(r'(.)\1+', r'\1', text)
    
    return ' '.join(text.split())

In [8]:
def compute_optimized_features_batch(pairs, tfidf_vectorizer):
    """Compute features for multiple pairs at once with vectorization"""
    features = np.zeros((len(pairs), 4))
    
    texts1, texts2 = zip(*pairs)
    
    tfidf_vectors1 = tfidf_vectorizer.transform(texts1)
    tfidf_vectors2 = tfidf_vectorizer.transform(texts2)
    
    cosine_sims = cosine_similarity(tfidf_vectors1, tfidf_vectors2)
    
    for i, (text1, text2) in enumerate(pairs):
        features[i, 0] = fuzz.ratio(text1, text2)
        features[i, 1] = fuzz.token_set_ratio(text1, text2)
        
        tokens1 = set(text1.split())
        tokens2 = set(text2.split())
        features[i, 2] = len(tokens1 & tokens2) / len(tokens1 | tokens2) if tokens1 or tokens2 else 0.0
        features[i, 3] = cosine_sims[i, i]
    
    return features

def prepare_bayesian_training_data(train_file, master_file, neg_samples_per_query=5):
    """
    Prepare training data for Bayesian classification approach with robust data validation
    """
    print("Reading data files...")
    train_df = pd.read_excel(train_file, usecols=['sku', 'seller_item_name'])
    master_df = pd.read_excel(master_file, usecols=['sku', 'product_name_ar'])
    
    # Remove any duplicates and null values
    train_df = train_df.dropna(subset=['sku', 'seller_item_name']).drop_duplicates(subset=['sku'])
    master_df = master_df.dropna(subset=['sku', 'product_name_ar']).drop_duplicates(subset=['sku'])
    
    print(f"Training samples: {len(train_df)}")
    print(f"Master products: {len(master_df)}")
    
    # Find matching SKUs between training and master data
    matching_skus = set(train_df['sku']).intersection(set(master_df['sku']))
    print(f"Found {len(matching_skus)} matching SKUs")
    
    if len(matching_skus) == 0:
        raise ValueError("No matching SKUs found between training and master data")
    
    # Filter to only matching SKUs
    train_df = train_df[train_df['sku'].isin(matching_skus)]
    
    print("Normalizing text...")
    with ThreadPoolExecutor() as executor:
        train_df['Normalized_Product'] = list(executor.map(
            optimize_normalize_arabic, train_df['seller_item_name']
        ))
        master_df['Normalized_Product'] = list(executor.map(
            optimize_normalize_arabic, master_df['product_name_ar']
        ))
    
    print("Initializing TF-IDF vectorizer...")
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=3000,
        min_df=2,
        dtype=np.float32
    )
    
    all_texts = pd.concat([train_df['Normalized_Product'], master_df['Normalized_Product']]).unique()
    tfidf_vectorizer.fit(all_texts)
    with open('tfidf_vectorizer.pkl', 'wb') as file:
        pickle.dump(tfidf_vectorizer, file)
    
    print("Creating training pairs...")
    pairs = []
    labels = []
    
    # Create a master product lookup dictionary for efficiency
    master_lookup = master_df.set_index('sku')['Normalized_Product'].to_dict()
    
    for idx, query_row in train_df.iterrows():
        if idx % 100 == 0:
            print(f"Processing query {idx}/{len(train_df)}")
            
        query_sku = query_row['sku']
        query_text = query_row['Normalized_Product']
        
        # Get positive match from lookup dictionary
        positive_match = master_lookup[query_sku]
        
        # Add positive pair
        pairs.append((query_text, positive_match))
        labels.append(1)
        
        # Sample negative matches from products with different SKUs
        negative_skus = np.random.choice(
            [sku for sku in master_lookup.keys() if sku != query_sku],
            size=min(neg_samples_per_query, len(master_lookup)-1),
            replace=False
        )
        
        for neg_sku in negative_skus:
            negative_match = master_lookup[neg_sku]
            pairs.append((query_text, negative_match))
            labels.append(0)
    
    if not pairs:
        raise ValueError("No valid training pairs could be created")
    
    # Compute features for all pairs
    print("Computing features for all pairs...")
    X = compute_optimized_features_batch(pairs, tfidf_vectorizer)
    y = np.array(labels)
    
    print(f"Total pairs: {len(X)}")
    print(f"Positive pairs: {sum(y)}")
    print(f"Negative pairs: {len(y) - sum(y)}")
    
    return X, y, tfidf_vectorizer

def train_bayesian_model(train_file, master_file):
    """Train a Gaussian Naive Bayes model for product matching"""
    print("Preparing training data...")
    X, y, tfidf_vectorizer = prepare_bayesian_training_data(train_file, master_file)
    
    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    print("Training Gaussian Naive Bayes model...")
    model = GaussianNB()
    model.fit(X_train, y_train)
    
    # Evaluate the model
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    y_pred = model.predict(X_test)
    print(f"Train accuracy: {train_accuracy:.4f}")
    print(f"Test accuracy: {test_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Save the model
    with open("bayesian_product_matcher.pkl", 'wb') as file:
        pickle.dump(model, file)
    print("Model saved as bayesian_product_matcher.pkl")
    
    return model, tfidf_vectorizer

# Train the model
print("Starting Bayesian model training...")
model, vectorizer = train_bayesian_model(training, master)

print("Training completed!")


Starting Bayesian model training...
Preparing training data...
Reading data files...
Training samples: 501
Master products: 1000
Found 500 matching SKUs
Normalizing text...
Initializing TF-IDF vectorizer...
Creating training pairs...
Processing query 0/500
Processing query 43400/500
Processing query 44600/500
Processing query 46600/500
Computing features for all pairs...
Total pairs: 3000
Positive pairs: 500
Negative pairs: 2500
Splitting data...
Training Gaussian Naive Bayes model...
Train accuracy: 0.9404
Test accuracy: 0.9550

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       500
           1       0.83      0.91      0.87       100

    accuracy                           0.95       600
   macro avg       0.91      0.94      0.92       600
weighted avg       0.96      0.95      0.96       600

Model saved as bayesian_product_matcher.pkl
Training completed!


In [15]:
def detect_language(text):
    """
    Detect if text is primarily Arabic or English based on character count
    Returns 'arabic' if primarily Arabic, 'english' if primarily English
    """
    # Count English letters (a-z, A-Z)
    english_count = len(re.findall(r'[a-zA-Z]', text))
    
    # If more than 3 English letters, consider it English
    if english_count > 3:
        return 'english'
    return 'arabic'

def normalize_english(text):
    """Normalize English text"""
    if not isinstance(text, str):
        return ''
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and extra spaces
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove common product-related words and everything after them
    text = re.sub(r'\b(?:price|new|old|p n|p o)\b.*', '', text)
    
    # Remove repeated letters (e.g., 'goood' -> 'good')
    text = re.sub(r'(.)\1+', r'\1', text)
    
    return ' '.join(text.split())

def fast_normalize_arabic(text):
    """Extremely fast Arabic text normalization focusing only on critical operations"""
    if not isinstance(text, str):
        return ''
        
    text = araby.normalize_hamza(text)
    text = araby.normalize_ligature(text)
    text = araby.normalize_alef(text)
    text = araby.normalize_teh(text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[!"#\'()*+,.:;<=>?@[\\]^_`{|}~]', '', text)
    text = re.sub(r'\b(?:سعر|جديد|قديم|س ق|س ج|س|ق|ج|س.ج|س.ق)\b.*', '', text)
    text = re.sub(r'(سعر|جديد|قديم|س ق|س ج)', '', text)
    text = re.sub(r'(.)\1+', r'\1', text)
    return ' '.join(text.split())

def normalize_text(text):
    """
    Normalize text based on detected language
    """
    if not isinstance(text, str):
        return ''
        
    language = detect_language(text)
    if language == 'english':
        return normalize_english(text)
    return fast_normalize_arabic(text)

def compute_token_overlap(text1, text2):
    """Compute token overlap between two texts"""
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())
    if not tokens1 or not tokens2:
        return 0.0
    return len(tokens1 & tokens2) / len(tokens1 | tokens2)

def precompute_tfidf_matrix(texts, vectorizer):
    """Precompute TF-IDF matrix for all texts"""
    return vectorizer.transform(texts)

def compute_batch_features(query_texts, master_texts, query_tfidf, master_tfidf):
    """Compute features for a batch of text pairs efficiently"""
    # Calculate cosine similarities for the entire batch at once
    cosine_sims = cosine_similarity(query_tfidf, master_tfidf)
    
    n_queries = len(query_texts)
    n_masters = len(master_texts)
    
    # Initialize feature matrices
    fuzz_ratios = np.zeros((n_queries, n_masters))
    token_set_ratios = np.zeros((n_queries, n_masters))
    token_overlaps = np.zeros((n_queries, n_masters))
    
    # Compute features in parallel
    def compute_pair_features(i, j):
        return (
            fuzz.ratio(query_texts[i], master_texts[j]),
            fuzz.token_set_ratio(query_texts[i], master_texts[j]),
            compute_token_overlap(query_texts[i], master_texts[j])
        )
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for i in range(n_queries):
            for j in range(n_masters):
                futures.append(executor.submit(compute_pair_features, i, j))
        
        for idx, future in enumerate(futures):
            i = idx // n_masters
            j = idx % n_masters
            ratio, token_set, overlap = future.result()
            fuzz_ratios[i, j] = ratio
            token_set_ratios[i, j] = token_set
            token_overlaps[i, j] = overlap
    
    features = np.stack([
        fuzz_ratios,
        token_set_ratios,
        token_overlaps,
        cosine_sims
    ], axis=2)
    
    return features

def match_products_bilingual(query_file, master_file, bayesian_model, vectorizer, threshold=0.5, output_file="MatchedResults_Pairwise.xlsx"):
    """Bilingual matching function supporting both Arabic and English text"""
    start_time = time.time()
    print("Starting bilingual matching process...")
    
    # Load data efficiently
    query_df = pd.read_excel(query_file, usecols=[ITEM_CODE, ITEM_NAME, ITEM_PRICE])
    master_df = pd.read_excel(master_file)
    
    print(f"Loaded {len(query_df)} query products and {len(master_df)} master products")
    
    # Detect language for each query product
    print("Detecting languages and normalizing texts...")
    query_df['detected_language'] = query_df[ITEM_NAME].apply(detect_language)
    query_df['Normalized_Product'] = query_df[ITEM_NAME].apply(normalize_text)
    
    # Initialize master product columns
    master_df['Normalized_Product_AR'] = master_df['product_name_ar'].apply(fast_normalize_arabic)
    master_df['Normalized_Product_EN'] = master_df['product_name'].apply(normalize_english)
    
    # Process Arabic and English queries separately
    results = []
    
    # Handle Arabic queries
    arabic_queries = query_df[query_df['detected_language'] == 'arabic']
    if len(arabic_queries) > 0:
        print("Processing Arabic queries...")
        arabic_query_tfidf = precompute_tfidf_matrix(arabic_queries['Normalized_Product'], vectorizer)
        arabic_master_tfidf = precompute_tfidf_matrix(master_df['Normalized_Product_AR'], vectorizer)
        
        arabic_features = compute_batch_features(
            arabic_queries['Normalized_Product'].tolist(),
            master_df['Normalized_Product_AR'].tolist(),
            arabic_query_tfidf,
            arabic_master_tfidf
        )
        
        # Process Arabic predictions
        arabic_predictions = process_predictions(
            arabic_queries,
            master_df,
            arabic_features,
            bayesian_model,
            threshold
        )
        results.extend(arabic_predictions)
    
    # Handle English queries
    english_queries = query_df[query_df['detected_language'] == 'english']
    if len(english_queries) > 0:
        print("Processing English queries...")
        english_query_tfidf = precompute_tfidf_matrix(english_queries['Normalized_Product'], vectorizer)
        english_master_tfidf = precompute_tfidf_matrix(master_df['Normalized_Product_EN'], vectorizer)
        
        english_features = compute_batch_features(
            english_queries['Normalized_Product'].tolist(),
            master_df['Normalized_Product_EN'].tolist(),
            english_query_tfidf,
            english_master_tfidf
        )
        
        # Process English predictions
        english_predictions = process_predictions(
            english_queries,
            master_df,
            english_features,
            bayesian_model,
            threshold
        )
        results.extend(english_predictions)
    
    # Create and save results DataFrame
    results_df = pd.DataFrame(results)
    results_df.to_excel(output_file, index=False)
    
    end_time = time.time()
    processing_time = end_time - start_time
    print(f"Matching completed in {processing_time:.2f} seconds")
    print(f"Average time per product: {(processing_time/len(results))*1000:.2f} ms")
    
    return results_df

def process_predictions(query_df, master_df, features, bayesian_model, threshold):
    """
    Process predictions for a set of queries using Naive Bayes model
    
    Parameters:
    - query_df: DataFrame containing query products
    - master_df: DataFrame containing master products
    - features: Computed similarity features
    - bayesian_model: GaussianNB model
    - threshold: Minimum probability threshold for accepting matches
    
    Returns:
    - List of dictionaries containing match results
    """
    n_queries, n_masters, n_features = features.shape
    features_reshaped = features.reshape(-1, n_features)
    
    # Get prediction probabilities from the Bayesian model
    predictions = bayesian_model.predict_proba(features_reshaped)
    
    # Extract probabilities for the positive class (class 1)
    if predictions.shape[1] >= 2:  # For multi-class case
        predictions = predictions[:, 1]  # Take the probability of class 1
    else:  # For binary case with only one probability output
        predictions = np.ones(len(predictions))  # Default to 1 if only one class

    predictions = predictions.reshape(n_queries, n_masters)
    
    best_match_indices = np.argmax(predictions, axis=1)
    best_match_scores = np.max(predictions, axis=1)
    
    results = []
    for i, (_, query_row) in enumerate(query_df.iterrows()):
        if best_match_scores[i] >= threshold:
            master_row = master_df.iloc[best_match_indices[i]]
            results.append({
                "Query SKU": query_row[ITEM_CODE],
                "Query Product": query_row[ITEM_NAME],
                "Price": query_row[ITEM_PRICE],
                "Matched Master SKU": master_row['sku'],
                "Matched Master Product": master_row['product_name_ar'] if query_row['detected_language'] == 'arabic' else master_row['product_name'],
                "Match Probability": best_match_scores[i],
                "Language": query_row['detected_language']
            })
        else:
            results.append({
                "Query SKU": query_row[ITEM_CODE],
                "Query Product": query_row[ITEM_NAME],
                "Price": query_row[ITEM_PRICE],
                "Matched Master SKU": None,
                "Matched Master Product": None,
                "Match Probability": best_match_scores[i],
                "Language": query_row['detected_language']
            })
    
    return results

def run_matching_with_timing(query_file, master_file, bayesian_model, vectorizer, output_file="MatchedResults.xlsx"):
    """Run matching with detailed timing information"""
    print("Starting bilingual matching process...")
    start_time = time.time()
    
    results = match_products_bilingual(query_file, master_file, bayesian_model, vectorizer, output_file=output_file)
    
    end_time = time.time()
    total_time = end_time - start_time
    print(f"\nPerformance Summary:")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Average time per product: {(total_time/len(results))*1000:.2f} ms")
    
    return results


In [16]:
import pickle

# Load the Bayesian model you trained earlier
# You can use the model variable that already exists or reload from the saved file
model_from_file = None
try:
    with open("bayesian_product_matcher.pkl", 'rb') as file:
        model_from_file = pickle.load(file)
    print("Bayesian model loaded from file successfully.")
except Exception as e:
    print(f"Could not load model from file: {e}")
    print("Using the model from memory instead.")

# Use the loaded model or the one in memory
bayesian_model = model_from_file if model_from_file is not None else model

print("Running the matching process with Bayesian model...")
results = run_matching_with_timing(
    test,
    master,
    bayesian_model,
    vectorizer,
    output_file="MatchedResults_Bayesian.xlsx"
)

Bayesian model loaded from file successfully.
Running the matching process with Bayesian model...
Starting bilingual matching process...
Starting bilingual matching process...
Loaded 72 query products and 1000 master products
Detecting languages and normalizing texts...
Processing Arabic queries...
Processing English queries...
Matching completed in 3.68 seconds
Average time per product: 51.07 ms

Performance Summary:
Total processing time: 3.68 seconds
Average time per product: 51.07 ms
