In [6]:
%pip install -r requirements.txt
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from joblib import dump, load
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import xgboost as xgb
from sklearn.model_selection import train_test_split
import re
from pyarabic import araby
from concurrent.futures import ThreadPoolExecutor

^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


# Variables for Easy Accessibility

* **master**: Path to the master file containing the correct names.
* **training**: Path to the desired training data.
* **test**: File to match data with and test the model.

### Column Requirements

- **Master File**: Columns should be in the format `(sku, product_name_ar, product_name)`.
- **Training File**: Columns should be in the format `(sku, ..., seller_item_name)`.  // Look at Dedup_Dataset.xlsx structure
- **Test File**: Columns should be in the format `(item code , product name , price)`.
  added a dynamic naming for the test file

The model uses the `sku` in the master file to find matches, so only the `sku` is needed in the training data.


In [None]:
master = "./Masterfile.xlsx"
training = "./Dedup_Dataset.xlsx"
test = "./test.xlsx"

# dynamic test column names
ITEM_NAME = "product name"
ITEM_CODE = "item code"
ITEM_PRICE = "price"

# Arabic Preprocessing

The two functions provided serve the same purpose. The first function is written in a more readable format to illustrate the approach taken.

In [None]:
def normalize_arabic(text):
    """Harsh and Kinda extreme preprocessing but for the clarity"""
    if not isinstance(text, str):
        return ""
    text = araby.normalize_hamza(text)
    text = araby.normalize_ligature(text)
    text = araby.normalize_alef(text)
    text = araby.normalize_teh(text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[!"#\'()*+,.:;<=>?@[\\]^_`{|}~]', '', text)  # Remove punctuation
    text = re.sub(r'\b(?:سعر|جديد|قديم|س ق|س ج|س|ق|ج|س.ج|س.ق)\b.*', '', text)  # Remove specific words and everything after them
    text = re.sub(r'(سعر|جديد|قديم|س ق|س ج)', '', text)  # Remove specific words even if they are part of another word
    text = re.sub(r'(.)\1+', r'\1', text)  # Remove repeated letters
    return text


def optimize_normalize_arabic(text):
    """Enhanced Arabic text normalization that preserves % and / characters"""
    if not isinstance(text, str):
        return ""
    text = araby.normalize_hamza(araby.normalize_ligature(araby.normalize_alef(araby.normalize_teh(text))))
    
    # Replace unwanted characters except % and /
    text = re.sub(r'[^\w\s%/]', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove specific Arabic terms and everything after them
    text = re.sub(r'\b(?:سعر|جديد|قديم|س ق|س ج|س|ق|ج|س.ج|س.ق)\b.*', '', text)
    
    # Remove specific terms even if they're part of another word
    text = re.sub(r'(سعر|جديد|قديم|س ق|س ج)', '', text)
    
    # Remove repeated letters
    text = re.sub(r'(.)\1+', r'\1', text)
    
    return ' '.join(text.split())

# Model and its Features
## 1. using Binary Logistic

In [None]:
def compute_optimized_features_batch(pairs, tfidf_vectorizer):
    """Compute features for multiple pairs at once with corrected vectorization"""
    features = np.zeros((len(pairs), 4))
    
    # Extract text pairs
    texts1, texts2 = zip(*pairs)
    
    # Compute TF-IDF vectors for all texts at once
    tfidf_vectors1 = tfidf_vectorizer.transform(texts1)
    tfidf_vectors2 = tfidf_vectorizer.transform(texts2)
    
    # Compute cosine similarities in one go
    cosine_sims = cosine_similarity(tfidf_vectors1, tfidf_vectors2)
    
    for i, (text1, text2) in enumerate(pairs):
        # Fuzzy string matching features
        features[i, 0] = fuzz.ratio(text1, text2)
        features[i, 1] = fuzz.token_set_ratio(text1, text2)
        
        # Token overlap
        tokens1 = set(text1.split())
        tokens2 = set(text2.split())
        features[i, 2] = len(tokens1 & tokens2) / len(tokens1 | tokens2) if tokens1 or tokens2 else 0.0
        
        # TF-IDF cosine similarity
        features[i, 3] = cosine_sims[i, i]
    
    return features

def prepare_optimized_training_data(train_file, master_file, sample_negatives=2):
    """Optimized training data preparation with corrected batch processing"""
    # Read data efficiently
    print("Reading data files...")
    train_df = pd.read_excel(train_file, usecols=['sku', 'seller_item_name'])
    master_df = pd.read_excel(master_file, usecols=['sku', 'product_name_ar'])
    
    # Parallel text normalization
    print("Normalizing text...")
    with ThreadPoolExecutor() as executor:
        train_df['Normalized_Product'] = list(executor.map(
            optimize_normalize_arabic, train_df['seller_item_name']
        ))
        master_df['Normalized_Product'] = list(executor.map(
            optimize_normalize_arabic, master_df['product_name_ar']
        ))
    
    # Create positive pairs using vectorized operations
    print("Creating positive pairs...")
    train_master_merged = train_df.merge(
        master_df, on='sku', suffixes=('_train', '_master')
    )
    pos_pairs = list(zip(
        train_master_merged['Normalized_Product_train'],
        train_master_merged['Normalized_Product_master']
    ))
    
    # Create negative pairs efficiently
    print("Creating negative pairs...")
    neg_pairs = []
    for _, row in train_df.iterrows():
        negative_samples = master_df[master_df['sku'] != row['sku']].sample(
            n=min(sample_negatives, len(master_df)-1),
            random_state=42
        )
        neg_pairs.extend([
            (row['Normalized_Product'], neg_row['Normalized_Product'])
            for _, neg_row in negative_samples.iterrows()
        ])
    
    # Combine all pairs
    all_pairs = pos_pairs + neg_pairs
    
    # Prepare TF-IDF vectorizer
    print("Computing TF-IDF features...")
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=3000,
        min_df=2,
        dtype=np.float32
    )
    
    # Fit vectorizer on all texts
    all_texts = [text for pair in all_pairs for text in pair]
    tfidf_vectorizer.fit(all_texts)
    
    # Process features in batches
    print("Computing features in batches...")
    batch_size = 1000
    X = []
    for i in range(0, len(all_pairs), batch_size):
        batch_pairs = all_pairs[i:i+batch_size]
        batch_features = compute_optimized_features_batch(
            batch_pairs,
            tfidf_vectorizer
        )
        X.append(batch_features)
    
    X = np.vstack(X)
    y = np.concatenate([
        np.ones(len(pos_pairs)),
        np.zeros(len(neg_pairs))
    ])
    
    return X, y, tfidf_vectorizer

def train_optimized_model(train_file, master_file):
    """Train model with optimized parameters and early stopping"""
    print("Preparing training data...")
    X, y, tfidf_vectorizer = prepare_optimized_training_data(train_file, master_file)
    
    print("Splitting data...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Convert to DMatrix for faster training
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 3,
        'tree_method': 'hist',
        'random_state': 42
    }
    
    print("Training model...")
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, 'train'), (dval, 'val')],
        early_stopping_rounds=10,
        verbose_eval=100
    )
    
    # Save model
    model.save_model("logistic_boosting_model.json")
    print("Model saved as logistic_boosting_model.json")
    
    return model, tfidf_vectorizer




In [None]:
#Model Initialization

model, vectorizer = train_optimized_model(training, master)

Preparing training data...
Reading data files...
Normalizing text...
Creating positive pairs...
Creating negative pairs...


KeyboardInterrupt: 

# Using rank:pairwise

In [10]:
import lightgbm as lgb

def compute_optimized_features_batch(pairs, tfidf_vectorizer):
    """Compute features for multiple pairs at once with vectorization"""
    features = np.zeros((len(pairs), 4))
    
    texts1, texts2 = zip(*pairs)
    
    tfidf_vectors1 = tfidf_vectorizer.transform(texts1)
    tfidf_vectors2 = tfidf_vectorizer.transform(texts2)
    
    cosine_sims = cosine_similarity(tfidf_vectors1, tfidf_vectors2)
    
    for i, (text1, text2) in enumerate(pairs):
        features[i, 0] = fuzz.ratio(text1, text2)
        features[i, 1] = fuzz.token_set_ratio(text1, text2)
        
        tokens1 = set(text1.split())
        tokens2 = set(text2.split())
        features[i, 2] = len(tokens1 & tokens2) / len(tokens1 | tokens2) if tokens1 or tokens2 else 0.0
        features[i, 3] = cosine_sims[i, i]
    
    return features

def prepare_ranking_training_data(train_file, master_file, neg_samples_per_query=5):
    """
    Prepare training data for learning to rank approach with robust data validation
    """
    print("Reading data files...")
    train_df = pd.read_excel(train_file, usecols=['sku', 'seller_item_name'])
    master_df = pd.read_excel(master_file, usecols=['sku', 'product_name_ar'])
    
    # Remove any duplicates and null values
    train_df = train_df.dropna(subset=['sku', 'seller_item_name']).drop_duplicates(subset=['sku'])
    master_df = master_df.dropna(subset=['sku', 'product_name_ar']).drop_duplicates(subset=['sku'])
    
    print(f"Training samples: {len(train_df)}")
    print(f"Master products: {len(master_df)}")
    
    # Find matching SKUs between training and master data
    matching_skus = set(train_df['sku']).intersection(set(master_df['sku']))
    print(f"Found {len(matching_skus)} matching SKUs")
    
    if len(matching_skus) == 0:
        raise ValueError("No matching SKUs found between training and master data")
    
    # Filter to only matching SKUs
    train_df = train_df[train_df['sku'].isin(matching_skus)]
    
    print("Normalizing text...")
    with ThreadPoolExecutor() as executor:
        train_df['Normalized_Product'] = list(executor.map(
            optimize_normalize_arabic, train_df['seller_item_name']
        ))
        master_df['Normalized_Product'] = list(executor.map(
            optimize_normalize_arabic, master_df['product_name_ar']
        ))
    
    print("Initializing TF-IDF vectorizer...")
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=3000,
        min_df=2,
        dtype=np.float32
    )
    
    all_texts = pd.concat([train_df['Normalized_Product'], master_df['Normalized_Product']]).unique()
    tfidf_vectorizer.fit(all_texts)
    with open('tfidf_vectorizer.pkl', 'wb') as file:
        pickle.dump(tfidf_vectorizer, file)
    
    print("Creating ranking groups...")
    features_list = []
    labels_list = []
    groups_list = []
    query_ids = []
    
    # Create a master product lookup dictionary for efficiency
    master_lookup = master_df.set_index('sku')['Normalized_Product'].to_dict()
    
    for idx, query_row in train_df.iterrows():
        if idx % 100 == 0:
            print(f"Processing query {idx}/{len(train_df)}")
            
        query_sku = query_row['sku']
        query_text = query_row['Normalized_Product']
        
        # Get positive match from lookup dictionary
        positive_match = master_lookup[query_sku]
        
        # Sample negative matches from products with different SKUs
        negative_skus = np.random.choice(
            [sku for sku in master_lookup.keys() if sku != query_sku],
            size=min(neg_samples_per_query, len(master_lookup)-1),
            replace=False
        )
        negative_matches = [master_lookup[sku] for sku in negative_skus]
        
        # Create pairs for this query
        all_pairs = [(query_text, positive_match)] + [(query_text, neg) for neg in negative_matches]
        
        # Compute features for all pairs in this group
        group_features = compute_optimized_features_batch(all_pairs, tfidf_vectorizer)
        
        # Create labels (1 for positive match, 0 for negative matches)
        group_labels = np.zeros(len(all_pairs))
        group_labels[0] = 1
        
        features_list.append(group_features)
        labels_list.append(group_labels)
        groups_list.append(len(all_pairs))
        query_ids.extend([idx] * len(all_pairs))  # Assign same query ID to all pairs in group
    
    if not features_list:
        raise ValueError("No valid training pairs could be created")
    
    X = np.vstack(features_list)
    y = np.concatenate(labels_list)
    groups = np.array(groups_list)
    qids = np.array(query_ids)
    
    print(f"Created {len(groups)} ranking groups")
    print(f"Total pairs: {len(X)}")
    print(f"Positive pairs: {sum(y)}")
    print(f"Negative pairs: {len(y) - sum(y)}")
    
    return X, y, groups, qids, tfidf_vectorizer

def train_ranking_model(train_file, master_file):
    """Train a ranking-based model using LightGBM's ranking objective"""
    print("Preparing training data...")
    X, y, groups, qids, tfidf_vectorizer = prepare_ranking_training_data(train_file, master_file)
    
    print("Splitting data...")
    unique_qids = np.unique(qids)
    n_queries = len(unique_qids)
    train_query_idx = np.random.choice(n_queries, int(0.8 * n_queries), replace=False)
    train_queries = unique_qids[train_query_idx]
    
    train_mask = np.isin(qids, train_queries)
    val_mask = ~train_mask
    
    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[val_mask], y[val_mask]
    qids_train = qids[train_mask]
    qids_val = qids[val_mask]
    
    train_data = lgb.Dataset(X_train, label=y_train, group=[sum(qids_train == qid) for qid in np.unique(qids_train)])
    valid_data = lgb.Dataset(X_val, label=y_val, group=[sum(qids_val == qid) for qid in np.unique(qids_val)], reference=train_data)
    
    params = {
        'objective': 'lambdarank',
        'metric': ['ndcg@5', 'map@5'],
        'boosting': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'min_data_in_leaf': 50,
        'random_state': 42,
        'verbose': -1
    }
    
    print("Training LightGBM model...")
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[train_data, valid_data],
        valid_names=['train', 'valid']
    )
    
    model.save_model("ranking_product_matcher_lgb.txt")
    print("Model saved as ranking_product_matcher_lgb.txt")
    
    return model, tfidf_vectorizer


# Train the model
print("Starting model training...")
model, vectorizer = train_ranking_model(training, master)

print("Training completed!")


Starting model training...
Preparing training data...
Reading data files...
Training samples: 501
Master products: 1000
Found 500 matching SKUs
Normalizing text...
Initializing TF-IDF vectorizer...
Creating ranking groups...
Processing query 0/500
Processing query 43400/500
Processing query 44600/500
Processing query 46600/500
Created 500 ranking groups
Total pairs: 3000
Positive pairs: 500.0
Negative pairs: 2500.0
Splitting data...
Training LightGBM model...
Model saved as ranking_product_matcher_lgb.txt
Training completed!


## Matching Function

In [11]:
import pyarabic.araby as araby

def detect_language(text):
    """
    Detect if text is primarily Arabic or English based on character count
    Returns 'arabic' if primarily Arabic, 'english' if primarily English
    """
    
    # If more than 3 English letters, consider it English
    if english_count > 3:
        return 'english'
    return 'arabic'

def normalize_english(text):
    """Normalize English text"""
    if not isinstance(text, str):
        return ''
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and extra spaces
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove common product-related words and everything after them
    text = re.sub(r'\b(?:price|new|old|p n|p o)\b.*', '', text)
    
    # Remove repeated letters (e.g., 'goood' -> 'good')
    text = re.sub(r'(.)\1+', r'\1', text)
    
    return ' '.join(text.split())

def fast_normalize_arabic(text):
    """Extremely fast Arabic text normalization focusing only on critical operations"""
    if not isinstance(text, str):
        return ''
        
    text = araby.normalize_hamza(text)
    text = araby.normalize_ligature(text)
    text = araby.normalize_alef(text)
    text = araby.normalize_teh(text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[!"#\'()*+,.:;<=>?@[\\]^_`{|}~]', '', text)
    text = re.sub(r'\b(?:سعر|جديد|قديم|س ق|س ج|س|ق|ج|س.ج|س.ق)\b.*', '', text)
    text = re.sub(r'(سعر|جديد|قديم|س ق|س ج)', '', text)
    text = re.sub(r'(.)\1+', r'\1', text)
    return ' '.join(text.split())

def normalize_text(text):
    """
    Normalize text based on detected language
    """
    if not isinstance(text, str):
        return ''
        
    language = detect_language(text)
    if language == 'english':
        return normalize_english(text)
    return fast_normalize_arabic(text)

def compute_token_overlap(text1, text2):
    """Compute token overlap between two texts"""
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())
    if not tokens1 or not tokens2:
        return 0.0
    return len(tokens1 & tokens2) / len(tokens1 | tokens2)

def precompute_tfidf_matrix(texts, vectorizer):
    """Precompute TF-IDF matrix for all texts"""
    return vectorizer.transform(texts)

def compute_batch_features(query_texts, master_texts, query_tfidf, master_tfidf):
    """Compute features for a batch of text pairs efficiently"""
    # Calculate cosine similarities for the entire batch at once
    cosine_sims = cosine_similarity(query_tfidf, master_tfidf)
    
    n_queries = len(query_texts)
    n_masters = len(master_texts)
    
    # Initialize feature matrices
    fuzz_ratios = np.zeros((n_queries, n_masters))
    token_set_ratios = np.zeros((n_queries, n_masters))
    token_overlaps = np.zeros((n_queries, n_masters))
    
    # Compute features in parallel
    def compute_pair_features(i, j):
        return (
            fuzz.ratio(query_texts[i], master_texts[j]),
            fuzz.token_set_ratio(query_texts[i], master_texts[j]),
            compute_token_overlap(query_texts[i], master_texts[j])
        )
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for i in range(n_queries):
            for j in range(n_masters):
                futures.append(executor.submit(compute_pair_features, i, j))
        
        for idx, future in enumerate(futures):
            i = idx // n_masters
            j = idx % n_masters
            ratio, token_set, overlap = future.result()
            fuzz_ratios[i, j] = ratio
            token_set_ratios[i, j] = token_set
            token_overlaps[i, j] = overlap
    
    features = np.stack([
        fuzz_ratios,
        token_set_ratios,
        token_overlaps,
        cosine_sims
    ], axis=2)
    
    return features

def match_products_bilingual(query_file, master_file, model, vectorizer, threshold=0.5, output_file="MatchedResults_Pairwise.xlsx"):
    """Bilingual matching function supporting both Arabic and English text"""
    start_time = time.time()
    print("Starting bilingual matching process...")
    
    # Load data efficiently
    query_df = pd.read_excel(query_file, usecols=[ITEM_CODE, ITEM_NAME , ITEM_PRICE])
    master_df = pd.read_excel(master_file)
    
    print(f"Loaded {len(query_df)} query products and {len(master_df)} master products")
    
    # Detect language for each query product
    print("Detecting languages and normalizing texts...")
    query_df['detected_language'] = query_df['product name'].apply(detect_language)
    query_df['Normalized_Product'] = query_df['product name'].apply(normalize_text)
    
    # Initialize master product columns
    master_df['Normalized_Product_AR'] = master_df['product_name_ar'].apply(fast_normalize_arabic)
    master_df['Normalized_Product_EN'] = master_df['product_name'].apply(normalize_english)
    
    # Process Arabic and English queries separately
    results = []
    
    # Handle Arabic queries
    arabic_queries = query_df[query_df['detected_language'] == 'arabic']
    if len(arabic_queries) > 0:
        print("Processing Arabic queries...")
        arabic_query_tfidf = precompute_tfidf_matrix(arabic_queries['Normalized_Product'], vectorizer)
        arabic_master_tfidf = precompute_tfidf_matrix(master_df['Normalized_Product_AR'], vectorizer)
        
        arabic_features = compute_batch_features(
            arabic_queries['Normalized_Product'].tolist(),
            master_df['Normalized_Product_AR'].tolist(),
            arabic_query_tfidf,
            arabic_master_tfidf
        )
        
        # Process Arabic predictions
        arabic_predictions = process_predictions(
            arabic_queries,
            master_df,
            arabic_features,
            model,
            threshold
        )
        results.extend(arabic_predictions)
    
    # Handle English queries
    english_queries = query_df[query_df['detected_language'] == 'english']
    if len(english_queries) > 0:
        print("Processing English queries...")
        english_query_tfidf = precompute_tfidf_matrix(english_queries['Normalized_Product'], vectorizer)
        english_master_tfidf = precompute_tfidf_matrix(master_df['Normalized_Product_EN'], vectorizer)
        
        english_features = compute_batch_features(
            english_queries['Normalized_Product'].tolist(),
            master_df['Normalized_Product_EN'].tolist(),
            english_query_tfidf,
            english_master_tfidf
        )
        
        # Process English predictions
        english_predictions = process_predictions(
            english_queries,
            master_df,
            english_features,
            model,
            threshold
        )
        results.extend(english_predictions)
    
    # Create and save results DataFrame
    results_df = pd.DataFrame(results)
    results_df.to_excel(output_file, index=False)
    
    end_time = time.time()
    processing_time = end_time - start_time
    print(f"Matching completed in {processing_time:.2f} seconds")
    print(f"Average time per product: {(processing_time/len(results))*1000:.2f} ms")
    
    return results_df

def process_predictions(query_df, master_df, features, model, threshold):
    """
    Process predictions for a set of queries using XGBoost's predict method
    
    Parameters:
    - query_df: DataFrame containing query products
    - master_df: DataFrame containing master products
    - features: Computed similarity features
    - model: XGBoost model (Booster object)
    - threshold: Minimum probability threshold for accepting matches
    
    Returns:
    - List of dictionaries containing match results
    """
    n_queries, n_masters, n_features = features.shape
    features_reshaped = features.reshape(-1, n_features)
    
    # Convert features to DMatrix for XGBoost
    dtest = xgb.DMatrix(features_reshaped)
    
    # Get raw predictions and convert to probabilities using softmax
    predictions = model.predict(dtest)
    
    # If the model outputs raw scores (not probabilities), convert to probabilities
    if len(predictions.shape) == 1:  # If predictions are 1-dimensional
        predictions = 1 / (1 + np.exp(-predictions))  # Apply sigmoid for binary classification
    else:  # If predictions are 2-dimensional (multiple classes)
        predictions = predictions[:, 1]  # Take the probability of class 1
    
    predictions = predictions.reshape(n_queries, n_masters)
    
    best_match_indices = np.argmax(predictions, axis=1)
    best_match_scores = np.max(predictions, axis=1)
    
    results = []
    for i, (_, query_row) in enumerate(query_df.iterrows()):
        if best_match_scores[i] >= threshold:
            master_row = master_df.iloc[best_match_indices[i]]
            results.append({
                "Query SKU": query_row[ITEM_CODE],
                "Query Product": query_row[ITEM_NAME],
                "Price" : query_row[ITEM_PRICE],
                "Matched Master SKU": master_row['sku'],
                "Matched Master Product": master_row['product_name_ar'] if query_row['detected_language'] == 'arabic' else master_row['product_name'],
                "Match Probability": best_match_scores[i],
                "Language": query_row['detected_language']
            })
        else:
            results.append({
                "Query SKU": query_row[ITEM_CODE],
                "Query Product": query_row[ITEM_NAME],
                "Price" : query_row[ITEM_PRICE],
                "Matched Master SKU": None,
                "Matched Master Product": None,
                "Match Probability": best_match_scores[i],
                "Language": query_row['detected_language']
            })
    
    return results

def run_matching_with_timing(query_file, master_file, model, vectorizer , output_file="MatchedResults.xlsx"):
    """Run matching with detailed timing information"""
    print("Starting bilingual matching process...")
    start_time = time.time()
    
    results = match_products_bilingual(query_file, master_file, model, vectorizer, output_file=output_file)
    
    end_time = time.time()
    total_time = end_time - start_time
    print(f"\nPerformance Summary:")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Average time per product: {(total_time/len(results))*1000:.2f} ms")
    
    return results

In [12]:
import lightgbm as lgb
import pickle

# Load the LightGBM ranking model
model = lgb.Booster(model_file='ranking_product_matcher_lgb.txt')

# Load the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

print("LightGBM model and vectorizer loaded successfully.")

results = run_matching_with_timing(
    test,
    master,
    model,
    vectorizer,
    output_file="MatchedResults_LightGBM.xlsx"
)


LightGBM model and vectorizer loaded successfully.
Starting bilingual matching process...
Starting bilingual matching process...


FileNotFoundError: [Errno 2] No such file or directory: './test.xlsx'

In [None]:
import xgboost as xgb
import pickle

# Load the Logistic XGBoost model
model = xgb.Booster()
model.load_model('logistic_boosting_model.json')

# Load the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

print("Model and vectorizer loaded successfully.")
results = run_matching_with_timing(
    test,
    master,
    model,
    vectorizer,
    output_file="MatchedResults_Logistic.xlsx"
)