# In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from sklearn.metrics import accuracy_score
import re
import Levenshtein
from fuzzywuzzy import fuzz
from nltk.tokenize import word_tokenize
from pyarabic import araby

# Fetch the NLTK resources this notebook requests (same order as before).
# nltk.download performs a network/disk lookup each time the cell is run.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# Punctuation replaced by a space in normalize_text. '[', '\' and ']' are
# escaped so they stay *inside* the character class; unescaped, the ']' closed
# the class early and the remainder was parsed as a separate alternation, which
# meant no punctuation was ever removed. '$', '&' and '-' are not in the
# original character list and are therefore left untouched.
_PUNCTUATION_RE = re.compile(r'[!"#%\'()*+,./:;<=>?@\[\\\]^_`{|}~]')
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_text(text):
    """
    Normalize Arabic text by applying various preprocessing steps.

    Steps, in order: pyarabic hamza, ligature, alef and teh normalization,
    then punctuation is replaced by spaces and runs of whitespace are
    collapsed to a single space with leading/trailing whitespace stripped.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example a NaN
            read from an empty spreadsheet cell) is treated as missing.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Apply Arabic-specific normalizations
    text = araby.normalize_hamza(text)     # Normalize different forms of hamza
    text = araby.normalize_ligature(text)  # Normalize ligatures
    # NOTE(review): normalize_hamza runs first, so hamza-carrying alef forms
    # may already have been rewritten before this step - confirm the order.
    text = araby.normalize_alef(text)      # Unify alef variants
    text = araby.normalize_teh(text)       # Convert teh marbuta to heh

    # Remove punctuation and extra spaces
    text = _PUNCTUATION_RE.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

class ContrastiveLoss(nn.Module):
    """
    Margin-based contrastive loss computed from precomputed pair distances.

    Pairs labelled 0 are penalised by their squared distance; pairs labelled 1
    are penalised by the squared amount by which their distance falls short of
    ``margin``. The result is the mean over all pairs.

    NOTE(review): this means label 0 pulls a pair together and label 1 pushes
    it apart - confirm that matches how the training data encodes matches.
    """
    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, distance, label):
        # Active when label == 0: the plain squared distance.
        pull_term = (1 - label) * distance.pow(2)
        # Active when label == 1: squared hinge on (margin - distance).
        shortfall = (self.margin - distance).clamp(min=0.0)
        push_term = label * shortfall.pow(2)
        return (pull_term + push_term).mean()

class ProductMatchingDataset(Dataset):
    """
    Dataset class for product matching that prepares text pairs and their labels.

    Every row of the input frame yields one item: the embedding of the
    normalized ``marketplace_product_name_ar`` value, the embedding of the
    normalized ``seller_item_name`` value, and the row's ``label``.
    All embeddings are computed once, in the constructor.

    Args:
        df: DataFrame with the columns ``marketplace_product_name_ar``,
            ``seller_item_name`` and ``label``. It is not modified.
        model: Object whose ``encode(list_of_str, convert_to_numpy=True)``
            returns one embedding per input string.
    """
    def __init__(self, df, model):
        self.model = model

        # Normalize into local lists rather than adding columns to `df`:
        # callers pass slices of a larger frame, and assigning columns on those
        # mutates the caller's data and triggers SettingWithCopyWarning.
        master_names = df['marketplace_product_name_ar'].apply(normalize_text).tolist()
        seller_names = df['seller_item_name'].apply(normalize_text).tolist()

        # Create embeddings for all texts
        master_embeddings = self.encode_texts(master_names)
        seller_embeddings = self.encode_texts(seller_names)

        # Row i of each embedding matrix belongs to row i of `df`.
        self.pairs = list(zip(master_embeddings, seller_embeddings))
        self.labels = df['label'].tolist()

    def encode_texts(self, texts):
        """Embed ``texts`` (each coerced to ``str``) and return a numpy array."""
        return np.array(self.model.encode([str(t) for t in texts], convert_to_numpy=True))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(embedding_1, embedding_2, label)`` as float32 tensors."""
        emb1, emb2 = self.pairs[idx]
        label = self.labels[idx]
        return torch.tensor(emb1, dtype=torch.float32), \
               torch.tensor(emb2, dtype=torch.float32), \
               torch.tensor(label, dtype=torch.float32)

class ImprovedSiameseNetwork(nn.Module):
    """
    Siamese projection head for product matching.

    Both inputs go through the same two-stage MLP (embedding_dim -> 512 -> 256,
    with batch normalization, ReLU and dropout after the first stage), are
    L2-normalized, and the distance between the two projections is returned.
    """
    def __init__(self, embedding_dim):
        super().__init__()

        # Layers are created in the original order so that parameter
        # initialisation and the "projection.<index>" state_dict keys are
        # exactly what they were before.
        first_stage = [
            nn.Linear(embedding_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
        ]
        second_stage = [
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
        ]
        self.projection = nn.Sequential(*first_stage, *second_stage)

    def forward_one(self, x):
        """Project one batch of embeddings and scale each row to unit L2 norm."""
        return F.normalize(self.projection(x), p=2, dim=1)

    def forward(self, x1, x2):
        """Return the pairwise distance between the projections of x1 and x2."""
        left, right = self.forward_one(x1), self.forward_one(x2)
        return F.pairwise_distance(left, right)

def train_improved_model(training_file, model, epochs=10, batch_size=64, lr=0.001):
    """
    Train the improved Siamese network with various enhancements.
    Returns the best performing model based on validation loss.

    The workbook is split 80/20 (fixed seed) into training and validation
    rows. Training uses AdamW, gradient-norm clipping, a reduce-on-plateau
    learning-rate schedule and early stopping after 5 epochs without a new
    best validation loss.

    Args:
        training_file: Path to an Excel workbook containing the columns that
            ProductMatchingDataset reads.
        model: Sentence encoder used to embed the product names; it must
            provide ``encode`` and ``get_sentence_embedding_dimension``.
        epochs: Maximum number of passes over the training split.
        batch_size: Mini-batch size used by both data loaders.
        lr: Initial learning rate for AdamW.

    Returns:
        An ImprovedSiameseNetwork, in eval mode, holding the weights from the
        epoch with the lowest average validation loss. If no epoch ever
        improved on the initial value (for example ``epochs=0``) the weights
        are the untrained initialisation.
    """
    # Load and split data
    df = pd.read_excel(training_file)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    # Create data loaders
    train_dataset = ProductMatchingDataset(train_df, model)
    val_dataset = ProductMatchingDataset(val_df, model)

    # BatchNorm1d cannot compute batch statistics from a single sample while
    # in training mode, so a trailing batch of exactly one example is dropped.
    n_train = len(train_dataset)
    drop_singleton_batch = n_train > 1 and n_train % batch_size == 1
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=drop_singleton_batch,
    )
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize model and training components
    embedding_dim = model.get_sentence_embedding_dimension()
    siamese_model = ImprovedSiameseNetwork(embedding_dim)
    best_model = ImprovedSiameseNetwork(embedding_dim)  # Keep track of best model
    criterion = ContrastiveLoss(margin=1.0)
    optimizer = torch.optim.AdamW(siamese_model.parameters(), lr=lr, weight_decay=0.01)
    # `verbose` is deprecated on PyTorch schedulers, so learning-rate changes
    # are reported explicitly after each scheduler step instead.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    # Training loop with early stopping
    best_val_loss = float('inf')
    patience = 5
    patience_counter = 0

    for epoch in range(epochs):
        # Training phase
        siamese_model.train()
        total_train_loss = 0

        for emb1, emb2, labels in train_loader:
            optimizer.zero_grad()
            distances = siamese_model(emb1, emb2)
            loss = criterion(distances, labels)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(siamese_model.parameters(), max_norm=1.0)

            optimizer.step()
            total_train_loss += loss.item()

        # Validation phase
        siamese_model.eval()
        total_val_loss = 0

        with torch.no_grad():
            for emb1, emb2, labels in val_loader:
                distances = siamese_model(emb1, emb2)
                val_loss = criterion(distances, labels)
                total_val_loss += val_loss.item()

        # Calculate average losses (per batch, not per sample)
        avg_train_loss = total_train_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Training Loss: {avg_train_loss:.4f}")
        print(f"Validation Loss: {avg_val_loss:.4f}")

        # Learning rate scheduling
        previous_lr = optimizer.param_groups[0]['lr']
        scheduler.step(avg_val_loss)
        current_lr = optimizer.param_groups[0]['lr']
        if current_lr != previous_lr:
            print(f"Reducing learning rate from {previous_lr:.6g} to {current_lr:.6g}")

        # Update best model if current model is better
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model.load_state_dict(siamese_model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    # A freshly constructed module starts in training mode; hand back the
    # snapshot ready for inference (dropout off, batch-norm running stats).
    return best_model.eval()
def match_dataset_improved(master_file, dataset_file, output_file, model, siamese_model, threshold=0.5):
    """
    Match products between master and dataset files using the trained model.

    Every product in ``dataset_file`` is paired with the master product at the
    smallest model distance, and the result table is written to
    ``output_file``. Progress and summary statistics are printed.

    NOTE: a later cell in this file redefines this function with SKU accuracy
    reporting; once that cell has run, this version is shadowed.

    Args:
        master_file: Excel workbook with ``product_name_ar`` and ``sku`` columns.
        dataset_file: Excel workbook with a ``product_name_ar`` column.
        output_file: Path of the Excel workbook to write.
        model: Sentence encoder providing ``encode(texts, convert_to_numpy=True)``.
        siamese_model: Module that maps two equally shaped batches of
            embeddings to one distance per row.
        threshold: Distances strictly below this value are labelled 'High'
            confidence, all others 'Low'.

    Raises:
        ValueError: If the master file contains no rows.
    """
    result_columns = ['Seller Item', 'Matched Item', 'sku', 'Match Score', 'Confidence']

    master_df = pd.read_excel(master_file).reset_index(drop=True)
    dataset_df = pd.read_excel(dataset_file).reset_index(drop=True)

    # Without master rows there is nothing to take a minimum over; fail with a
    # clear message instead of an opaque torch.cat error further down.
    if master_df.empty:
        raise ValueError("Master file contains no products to match against")

    # Normalize text
    master_df['Normalized Name'] = master_df['product_name_ar'].apply(normalize_text)
    dataset_df['Normalized Name'] = dataset_df['product_name_ar'].apply(normalize_text)

    # Create embeddings
    print("Creating embeddings for master products...")
    master_embeddings = model.encode(master_df['Normalized Name'].tolist(), convert_to_numpy=True)
    print("Creating embeddings for dataset products...")
    dataset_embeddings = model.encode(dataset_df['Normalized Name'].tolist(), convert_to_numpy=True)

    results = []
    siamese_model.eval()
    batch_size = 32

    print(f"Processing {len(dataset_df)} products in batches of {batch_size}...")

    with torch.no_grad():
        for i in range(0, len(dataset_df), batch_size):
            # Slicing past the end simply yields a shorter final batch.
            dataset_batch = torch.tensor(
                dataset_embeddings[i:i + batch_size],
                dtype=torch.float32
            )
            n_queries = dataset_batch.size(0)
            distance_blocks = []

            for j in range(0, len(master_df), batch_size):
                master_batch = torch.tensor(
                    master_embeddings[j:j + batch_size],
                    dtype=torch.float32
                )
                n_masters = master_batch.size(0)

                # Build every (dataset row, master row) pair of the two batches
                # and flatten to the 2-D layout the model is called with.
                dataset_flat = dataset_batch.unsqueeze(1).expand(
                    -1, n_masters, -1
                ).reshape(-1, dataset_batch.size(-1))
                master_flat = master_batch.unsqueeze(0).expand(
                    n_queries, -1, -1
                ).reshape(-1, master_batch.size(-1))

                distances = siamese_model(dataset_flat, master_flat)
                distance_blocks.append(distances.reshape(n_queries, n_masters))

            # Columns of the concatenated matrix line up with master_df rows.
            all_distances = torch.cat(distance_blocks, dim=1)
            min_distances, min_indices = torch.min(all_distances, dim=1)

            # Convert PyTorch tensors to numpy arrays for DataFrame indexing
            min_distances = min_distances.cpu().numpy()
            min_indices = min_indices.cpu().numpy()

            # Both indices are valid by construction: dataset_idx stays below
            # len(dataset_df) and master_idx is an argmin over master columns.
            for k, (master_idx, distance) in enumerate(zip(min_indices, min_distances)):
                dataset_idx = i + k
                results.append({
                    'Seller Item': dataset_df.loc[dataset_idx, 'product_name_ar'],
                    'Matched Item': master_df.loc[master_idx, 'product_name_ar'],
                    'sku': master_df.loc[master_idx, 'sku'],
                    'Match Score': 1 - distance,
                    'Confidence': 'High' if distance < threshold else 'Low'
                })

    print("Creating results DataFrame...")
    # Explicit columns keep the frame well-formed even when there are no rows.
    result_df = pd.DataFrame(results, columns=result_columns)

    print(f"Saving results to {output_file}...")
    result_df.to_excel(output_file, index=False)
    print(f"Matching completed. Output saved to {output_file}")

    # Print matching statistics
    high_count = int((result_df['Confidence'] == 'High').sum())
    low_count = int((result_df['Confidence'] == 'Low').sum())
    print("\nMatching Statistics:")
    print(f"Total products processed: {len(dataset_df)}")
    print(f"Total matches found: {len(results)}")
    print(f"High confidence matches: {high_count}")
    print(f"Low confidence matches: {low_count}")

# Example usage
if __name__ == "__main__":
    try:
        # Load the multilingual sentence-embedding model
        model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

        # Define file paths (read as globals by the matching call further down)
        training_file = "/kaggle/input/augmented-dataset/PreDataset.xlsx"
        master_file = "/kaggle/input/product-matching-dataset/Masterfile.xlsx"
        test_file = "/kaggle/input/test-set-data/augmented_test_set.xlsx"
        output_file = "/kaggle/working/results.xlsx"

        # Train the model
        siamese_model = train_improved_model(training_file, model)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        # Re-raise so the traceback is kept. Swallowing the error here would
        # leave `siamese_model` undefined and make the later matching call
        # fail with an unrelated NameError.
        raise


# In [None]:
def match_dataset_improved(master_file, dataset_file, output_file, model, siamese_model, threshold=0.5):
    """
    Match products between master and dataset files using the trained model.
    Includes accuracy calculation by comparing predicted SKUs with true SKUs.

    Every product in ``dataset_file`` is paired with the master product at the
    smallest model distance. The matches go to a 'Matches' sheet and the
    summary metrics to a 'Summary' sheet of ``output_file``; the same metrics
    are printed.

    Args:
        master_file: Excel workbook with ``product_name_ar`` and ``sku`` columns.
        dataset_file: Excel workbook with ``product_name_ar`` and ``sku``
            columns; its ``sku`` is treated as the ground truth.
        output_file: Path of the Excel workbook to write.
        model: Sentence encoder providing ``encode(texts, convert_to_numpy=True)``.
        siamese_model: Module that maps two equally shaped batches of
            embeddings to one distance per row.
        threshold: Distances strictly below this value are labelled 'High'
            confidence, all others 'Low'.

    Returns:
        ``(overall_accuracy, high_conf_accuracy)`` as percentages. Each is 0.0
        when there are no rows to average over.

    Raises:
        ValueError: If the master file contains no rows.
    """
    result_columns = ['Seller Item', 'Matched Item', 'Predicted SKU', 'True SKU',
                      'Match Score', 'Confidence']

    # Load and prepare the dataframes
    master_df = pd.read_excel(master_file).reset_index(drop=True)
    dataset_df = pd.read_excel(dataset_file).reset_index(drop=True)

    # Without master rows there is nothing to take a minimum over; fail with a
    # clear message instead of an opaque torch.cat error further down.
    if master_df.empty:
        raise ValueError("Master file contains no products to match against")

    # Normalize text for better matching
    master_df['Normalized Name'] = master_df['product_name_ar'].apply(normalize_text)
    dataset_df['Normalized Name'] = dataset_df['product_name_ar'].apply(normalize_text)

    # Create embeddings using the sentence encoder
    print("Creating embeddings for master products...")
    master_embeddings = model.encode(master_df['Normalized Name'].tolist(), convert_to_numpy=True)
    print("Creating embeddings for dataset products...")
    dataset_embeddings = model.encode(dataset_df['Normalized Name'].tolist(), convert_to_numpy=True)

    results = []
    siamese_model.eval()
    batch_size = 32

    print(f"Processing {len(dataset_df)} products in batches of {batch_size}...")

    with torch.no_grad():
        for i in range(0, len(dataset_df), batch_size):
            # Slicing past the end simply yields a shorter final batch.
            dataset_batch = torch.tensor(
                dataset_embeddings[i:i + batch_size],
                dtype=torch.float32
            )
            n_queries = dataset_batch.size(0)
            distance_blocks = []

            for j in range(0, len(master_df), batch_size):
                master_batch = torch.tensor(
                    master_embeddings[j:j + batch_size],
                    dtype=torch.float32
                )
                n_masters = master_batch.size(0)

                # Build every (dataset row, master row) pair of the two batches
                # and flatten to the 2-D layout the model is called with.
                dataset_flat = dataset_batch.unsqueeze(1).expand(
                    -1, n_masters, -1
                ).reshape(-1, dataset_batch.size(-1))
                master_flat = master_batch.unsqueeze(0).expand(
                    n_queries, -1, -1
                ).reshape(-1, master_batch.size(-1))

                # Calculate distances using the Siamese model
                distances = siamese_model(dataset_flat, master_flat)
                distance_blocks.append(distances.reshape(n_queries, n_masters))

            # Columns of the concatenated matrix line up with master_df rows.
            all_distances = torch.cat(distance_blocks, dim=1)
            min_distances, min_indices = torch.min(all_distances, dim=1)

            # Convert to numpy for easier processing
            min_distances = min_distances.cpu().numpy()
            min_indices = min_indices.cpu().numpy()

            # Both indices are valid by construction: dataset_idx stays below
            # len(dataset_df) and master_idx is an argmin over master columns.
            for k, (master_idx, distance) in enumerate(zip(min_indices, min_distances)):
                dataset_idx = i + k
                results.append({
                    'Seller Item': dataset_df.loc[dataset_idx, 'product_name_ar'],
                    'Matched Item': master_df.loc[master_idx, 'product_name_ar'],
                    'Predicted SKU': master_df.loc[master_idx, 'sku'],
                    'True SKU': dataset_df.loc[dataset_idx, 'sku'],
                    'Match Score': 1 - distance,
                    'Confidence': 'High' if distance < threshold else 'Low'
                })

    # Create DataFrame from results
    print("Creating results DataFrame...")
    # Explicit columns keep the frame well-formed even when there are no rows.
    result_df = pd.DataFrame(results, columns=result_columns)

    # Calculate accuracy metrics
    # NOTE(review): this compares raw cell values; if the two workbooks store
    # SKUs with different dtypes (e.g. int vs. str) nothing will compare
    # equal - confirm against the data.
    correct_matches = result_df['Predicted SKU'] == result_df['True SKU']
    high_conf_mask = result_df['Confidence'] == 'High'

    # Guard the means: averaging an empty selection would yield NaN.
    overall_accuracy = float(correct_matches.mean() * 100) if len(result_df) else 0.0
    high_conf_accuracy = (
        float(correct_matches[high_conf_mask].mean() * 100)
        if high_conf_mask.any() else 0.0
    )

    high_count = int(high_conf_mask.sum())
    low_count = int((~high_conf_mask).sum())

    # Add accuracy metrics to the output
    print("\nMatching Statistics:")
    print(f"Total products processed: {len(dataset_df)}")
    print(f"Total matches found: {len(results)}")
    print(f"Overall accuracy: {overall_accuracy:.2f}%")
    print(f"High confidence matches: {high_count}")
    print(f"High confidence accuracy: {high_conf_accuracy:.2f}%")
    print(f"Low confidence matches: {low_count}")

    # Save results with accuracy information
    print(f"\nSaving results to {output_file}...")
    with pd.ExcelWriter(output_file) as writer:
        result_df.to_excel(writer, sheet_name='Matches', index=False)

        # Create a summary sheet with accuracy metrics
        summary_data = {
            'Metric': ['Total Products', 'Total Matches', 'Overall Accuracy',
                      'High Confidence Matches', 'High Confidence Accuracy',
                      'Low Confidence Matches'],
            'Value': [len(dataset_df), len(results), f"{overall_accuracy:.2f}%",
                     high_count, f"{high_conf_accuracy:.2f}%",
                     low_count]
        }
        pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

    print("Matching completed. Results and summary saved to Excel file.")

    return overall_accuracy, high_conf_accuracy
# Evaluate on the test workbook. The file paths, `model` and `siamese_model`
# are the globals created by the "Example usage" block, so that block must
# have completed successfully before this line runs.
match_dataset_improved(
    master_file=master_file,
    dataset_file=test_file,
    output_file=output_file,
    model=model,
    siamese_model=siamese_model,
)