In [None]:
# Install required libraries
!pip install torch sentence-transformers pyarabic nltk

# Download NLTK data
import nltk
nltk.download('punkt')

# Core imports used by the rest of the notebook
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
import numpy as np
import re
from pyarabic import araby
from nltk.tokenize import word_tokenize

class ArabicTextPreprocessor:
    """Clean raw product text before it is handed to the sentence encoder.

    The full pipeline (see ``preprocess``) lower-cases the text, strips
    diacritics and elongation marks through ``pyarabic.araby``, unifies a
    few letter variants and finally turns symbols into single spaces.
    """

    # Ordered (regex, replacement) pairs applied one after another by
    # ``normalize_hamza``.
    _LETTER_RULES = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
    )

    def __init__(self):
        """Create a preprocessor; no configuration or state is needed."""

    def remove_tashkeel(self, text):
        """Return ``text`` with Arabic diacritics (tashkeel) stripped."""
        stripped = araby.strip_tashkeel(text)
        return stripped

    def remove_tatweel(self, text):
        """Return ``text`` with the Arabic elongation mark (tatweel) stripped."""
        stripped = araby.strip_tatweel(text)
        return stripped

    def normalize_hamza(self, text):
        """Map alef/hamza variants and alef maqsura onto a canonical letter.

        The substitutions in ``_LETTER_RULES`` are applied in order.
        """
        for pattern, replacement in self._LETTER_RULES:
            text = re.sub(pattern, replacement, text)
        return text

    def remove_special_chars(self, text):
        """Replace symbols with spaces and collapse runs of whitespace.

        Any character that is neither a ``\\w`` word character, whitespace,
        nor in the listed Arabic letters/range becomes a single space; the
        result is trimmed at both ends.
        """
        without_symbols = re.sub(r'[^\w\sاأإآء-ي]', ' ', text)
        single_spaced = re.sub(r'\s+', ' ', without_symbols)
        return single_spaced.strip()

    def preprocess(self, text):
        """Run the whole cleaning pipeline; non-string input yields ``""``."""
        if not isinstance(text, str):
            return ""

        text = text.lower()
        # Order matters: diacritics and elongation are removed before the
        # letter normalisation and the final symbol clean-up.
        pipeline = (
            self.remove_tashkeel,
            self.remove_tatweel,
            self.normalize_hamza,
            self.remove_special_chars,
        )
        for step in pipeline:
            text = step(text)
        return text

class EnhancedVAELoss(nn.Module):
    """VAE objective combining reconstruction, KL and similarity terms.

    ``total = alpha * (summed MSE + (1 - mean cosine similarity))
            + beta * KL divergence
            + gamma * MSE(latent cosine matrix, input cosine matrix)``

    Args:
        alpha: Weight of the reconstruction term.
        beta: Weight of the KL divergence term.
        gamma: Weight of the similarity-preservation term.
    """

    def __init__(self, alpha=1.0, beta=0.5, gamma=0.3):
        super().__init__()
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma

    @staticmethod
    def _pairwise_cosine(batch):
        """Return the (rows x rows) cosine-similarity matrix of a 2-D batch."""
        unit_rows = F.normalize(batch, p=2, dim=1)
        return torch.mm(unit_rows, unit_rows.t())

    def forward(self, recon_x, x, mu, log_var, z):
        """Compute the weighted loss and a breakdown of its components.

        Args:
            recon_x: Reconstruction produced by the decoder.
            x: Original input batch (rows are compared along ``dim=1``).
            mu: Latent means.
            log_var: Latent log-variances.
            z: Sampled latent codes.

        Returns:
            ``(total_loss, metrics)`` where ``metrics`` maps
            ``recon_loss``, ``cos_sim_loss``, ``kl_loss`` and
            ``similarity_loss`` to Python floats.
        """
        # Reconstruction: summed squared error plus a cosine-distance term.
        squared_error = F.mse_loss(recon_x, x, reduction='sum')
        cosine_gap = 1 - F.cosine_similarity(recon_x, x, dim=1).mean()

        # KL divergence, summed over all elements.
        kl_term = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

        # Pairwise similarities between latent codes should mirror those
        # between the original inputs.
        structure_gap = F.mse_loss(
            self._pairwise_cosine(z),
            self._pairwise_cosine(x),
        )

        weighted_reconstruction = self.alpha * (squared_error + cosine_gap)
        total = (
            weighted_reconstruction
            + self.beta * kl_term
            + self.gamma * structure_gap
        )

        breakdown = {
            'recon_loss': squared_error.item(),
            'cos_sim_loss': cosine_gap.item(),
            'kl_loss': kl_term.item(),
            'similarity_loss': structure_gap.item(),
        }
        return total, breakdown

class ProductVAE(nn.Module):
    """Variational auto-encoder over fixed-size sentence embeddings.

    Args:
        embedding_dim: Size of the input (and reconstructed) vectors.
        hidden_dim: Width of the first encoder / last decoder hidden layer.
        latent_dim: Size of the latent code.
    """

    def __init__(self, embedding_dim, hidden_dim=256, latent_dim=128):
        super().__init__()
        bottleneck_dim = hidden_dim // 2

        # Encoder: embedding -> hidden (with a residual block) -> bottleneck.
        encoder_layers = [
            nn.Linear(embedding_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            ResidualBlock(hidden_dim),
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.LayerNorm(bottleneck_dim),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Two heads on the bottleneck: the latent mean and the value that
        # ``reparameterize`` treats as the log-variance.
        self.fc_mu = nn.Linear(bottleneck_dim, latent_dim)
        self.fc_var = nn.Linear(bottleneck_dim, latent_dim)

        # Decoder: latent -> bottleneck (with a residual block) -> hidden
        # -> embedding.
        decoder_layers = [
            nn.Linear(latent_dim, bottleneck_dim),
            nn.LayerNorm(bottleneck_dim),
            nn.ReLU(),
            ResidualBlock(bottleneck_dim),
            nn.Linear(bottleneck_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def encode(self, x):
        """Return ``(mu, log_var)`` for the input batch ``x``."""
        hidden = self.encoder(x)
        mu = self.fc_mu(hidden)
        log_var = self.fc_var(hidden)
        return mu, log_var

    def reparameterize(self, mu, log_var):
        """Sample ``mu + eps * std`` with ``eps`` drawn from a standard normal."""
        sigma = torch.exp(0.5 * log_var)
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def decode(self, z):
        """Map latent codes ``z`` back to embedding space."""
        return self.decoder(z)

    def forward(self, x):
        """Return ``(reconstruction, mu, log_var, z)`` for the batch ``x``."""
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        reconstruction = self.decode(z)
        return reconstruction, mu, log_var, z

class ResidualBlock(nn.Module):
    """Width-preserving MLP block with a skip connection.

    The inner stack is Linear -> LayerNorm -> ReLU -> Dropout(0.1) -> Linear,
    all of size ``dim``; its output is added to the block's input.
    """

    def __init__(self, dim):
        super().__init__()
        layers = [
            nn.Linear(dim, dim),
            nn.LayerNorm(dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(dim, dim),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``x`` plus the transformed ``x``."""
        transformed = self.block(x)
        return x + transformed

class EnhancedProductDataset(Dataset):
    """Dataset of pre-computed embeddings for the ``seller_item_name`` column.

    Every name is cleaned with ``preprocessor.preprocess`` and encoded once,
    up front, with ``bert_model.encode``; items are rows of the resulting
    float32 tensor.

    Args:
        df: DataFrame containing a ``seller_item_name`` column.
        bert_model: Encoder exposing ``encode(texts, convert_to_numpy=True)``.
        preprocessor: Object exposing ``preprocess(text)``.
    """

    def __init__(self, df, bert_model, preprocessor):
        self.model = bert_model
        self.preprocessor = preprocessor

        # Clean every seller item name, then embed the whole list in one call.
        cleaned_names = df['seller_item_name'].apply(
            self.preprocessor.preprocess
        ).tolist()
        encoded = self.model.encode(cleaned_names, convert_to_numpy=True)
        self.embeddings = torch.tensor(encoded, dtype=torch.float32)

    def __len__(self):
        """Number of embedded items."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return the embedding stored at position ``idx``."""
        return self.embeddings[idx]

def train_enhanced_vae(model, train_loader, optimizer, device, epochs=10):
    """Train ``model`` with ``EnhancedVAELoss`` and print per-epoch metrics.

    Args:
        model: Module whose forward returns ``(recon, mu, log_var, z)``.
        train_loader: Iterable of input batches exposing a ``dataset``
            attribute whose length is used to average the metrics.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Metrics are only printed.
    """
    criterion = EnhancedVAELoss()
    model.train()

    metric_names = (
        'total_loss', 'recon_loss', 'cos_sim_loss', 'kl_loss', 'similarity_loss',
    )

    for epoch in range(epochs):
        running = dict.fromkeys(metric_names, 0)

        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()

            recon_batch, mu, log_var, z = model(data)
            loss, metrics = criterion(recon_batch, data, mu, log_var, z)

            loss.backward()
            optimizer.step()

            # Accumulate the total loss plus each component reported by the
            # criterion.
            running['total_loss'] += loss.item()
            for name, value in metrics.items():
                running[name] += value

        print(f'\nEpoch: {epoch+1}')
        # NOTE(review): every metric is divided by the number of samples,
        # including the criterion terms that are already batch means.
        sample_count = len(train_loader.dataset)
        for name, accumulated in running.items():
            mean_value = accumulated / sample_count
            print(f'Average {name}: {mean_value:.4f}')

def match_products(query_text, master_df, vae_model, bert_model, preprocessor, device, top_k=5):
    """Return the master products closest to ``query_text`` in VAE latent space.

    The query and every ``product_name_ar`` entry are preprocessed, embedded
    with ``bert_model.encode``, projected to their latent means with
    ``vae_model.encode`` and ranked by cosine similarity.

    Args:
        query_text: Raw text to look up.
        master_df: DataFrame with a ``product_name_ar`` column.
        vae_model: Model exposing ``eval()`` and ``encode(x) -> (mu, log_var)``.
        bert_model: Encoder exposing ``encode(list_of_texts)``.
        preprocessor: Object exposing ``preprocess(text)``.
        device: Device the embeddings are moved to.
        top_k: Maximum number of matches to return.

    Returns:
        A list of at most ``top_k`` dicts, best match first, with the keys
        ``product`` (original name), ``score`` (cosine similarity as a float)
        and ``processed_text`` (preprocessed name). Empty when ``master_df``
        has no rows or ``top_k`` is not positive.
    """
    vae_model.eval()

    # Nothing to rank against: avoid encoding an empty batch.
    if master_df.empty:
        return []

    # Preprocess the query and the master catalogue.
    processed_query = preprocessor.preprocess(query_text)
    master_texts = master_df['product_name_ar'].apply(
        preprocessor.preprocess
    ).tolist()

    query_embedding = torch.tensor(
        bert_model.encode([processed_query]),
        dtype=torch.float32
    ).to(device)
    master_embeddings = torch.tensor(
        bert_model.encode(master_texts),
        dtype=torch.float32
    ).to(device)

    # Use the latent means (not sampled codes) so matching is deterministic.
    with torch.no_grad():
        query_mu, _ = vae_model.encode(query_embedding)
        master_mu, _ = vae_model.encode(master_embeddings)

    # The query is a single row; view it as (1, latent_dim) and broadcast it
    # against every master row. (The previous unsqueeze+expand produced a 3-D
    # tensor and failed with a size/dimension mismatch.)
    query_rows = query_mu.reshape(1, -1).expand_as(master_mu)
    similarities = F.cosine_similarity(query_rows, master_mu, dim=1)

    # Clamp k to the valid range so torch.topk never sees an invalid value.
    k = max(0, min(top_k, len(master_texts)))
    top_k_scores, top_k_indices = torch.topk(similarities, k)

    # Convert to plain Python numbers once; tensor scalars are not reliable
    # positional indexers for pandas.
    return [
        {
            'product': master_df.iloc[idx]['product_name_ar'],
            'score': score,
            'processed_text': master_texts[idx]
        }
        for score, idx in zip(top_k_scores.tolist(), top_k_indices.tolist())
    ]


# Example usage: build the components, embed the training data and train the
# VAE. The names assigned here are module-level (notebook) globals; the later
# matching cell reads `vae_model` from this scope.
if __name__ == "__main__":
    # Use the GPU when one is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Define file paths (update these paths as needed)
    training_file = "/kaggle/input/augmented-dataset/PreDataset.xlsx"
    master_file = "/kaggle/input/product-matching-dataset/Masterfile.xlsx"
    # NOTE: test_file and output_file are defined but not used in this block
    test_file = "/kaggle/input/test-set-data/augmented_test_set.xlsx"
    output_file = "/kaggle/working/camel_results.xlsx"
    
    # Load the training and master spreadsheets
    # (master_df is loaded here but not consumed within this block)
    import pandas as pd
    training_df = pd.read_excel(training_file)
    master_df = pd.read_excel(master_file)
    
    # Initialize components; the VAE input size is taken from the sentence
    # encoder's embedding dimension so the two always agree
    preprocessor = ArabicTextPreprocessor()
    bert_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    vae_model = ProductVAE(
        embedding_dim=bert_model.get_sentence_embedding_dimension(),
        hidden_dim=256,
        latent_dim=128
    ).to(device)
    
    # Create dataset with preprocessing (embeddings are computed once, inside
    # the dataset constructor) and batch it for training
    dataset = EnhancedProductDataset(training_df, bert_model, preprocessor)
    train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
    
    # Train with enhanced loss (default number of epochs)
    optimizer = torch.optim.AdamW(vae_model.parameters(), lr=1e-3, weight_decay=0.01)
    train_enhanced_vae(vae_model, train_loader, optimizer, device)

In [None]:
import torch
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm

def match_dataset_vae(master_file, dataset_file, output_file, vae_model, encoder, preprocessor, threshold=0.5, batch_size=32):
    """Match every seller item to its closest master product and save the result.

    Both spreadsheets are loaded, their names preprocessed and embedded with
    ``encoder``, projected to latent means with ``vae_model.encode`` and
    compared by cosine similarity. The best master product for each seller
    item is written to ``output_file``.

    Args:
        master_file: Excel file with ``product_name_ar`` and ``sku`` columns.
        dataset_file: Excel file with a ``seller_item_name`` column.
        output_file: Path of the Excel file to write.
        vae_model: Trained model exposing ``encode(x) -> (mu, log_var)``.
        encoder: Sentence encoder exposing ``encode(list_of_texts)``.
        preprocessor: Object exposing ``preprocess(text)``.
        threshold: Scores strictly above this value are labelled ``'High'``
            confidence, all others ``'Low'``.
        batch_size: Number of seller items compared against the master list
            per step; must be positive.

    Returns:
        None. Results are written to ``output_file`` with the columns
        ``Seller Item``, ``Matched Item``, ``sku``, ``Match Score`` and
        ``Confidence``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vae_model = vae_model.to(device)
    vae_model.eval()

    # Load data
    master_df = pd.read_excel(master_file)
    dataset_df = pd.read_excel(dataset_file)

    # Normalize text using the preprocessor
    master_df['Normalized Name'] = master_df['product_name_ar'].apply(preprocessor.preprocess)
    dataset_df['Normalized Name'] = dataset_df['seller_item_name'].apply(preprocessor.preprocess)

    # Create embeddings
    print("Creating embeddings for master products...")
    master_embeddings = encoder.encode(master_df['Normalized Name'].tolist())
    print("Creating embeddings for dataset products...")
    dataset_embeddings = encoder.encode(dataset_df['Normalized Name'].tolist())

    # Convert embeddings to tensors
    master_embeddings = torch.tensor(master_embeddings, dtype=torch.float32).to(device)
    dataset_embeddings = torch.tensor(dataset_embeddings, dtype=torch.float32).to(device)

    # Use the latent means (not sampled codes) so matching is deterministic
    print("Encoding embeddings into latent space...")
    with torch.no_grad():
        master_mu, _ = vae_model.encode(master_embeddings)
        dataset_mu, _ = vae_model.encode(dataset_embeddings)

    # Pull the columns out once instead of doing a per-row ``iloc`` lookup.
    # The seller text comes from ``seller_item_name`` -- the same column that
    # was normalised and embedded above.
    seller_items = dataset_df['seller_item_name'].tolist()
    master_names = master_df['product_name_ar'].tolist()
    master_skus = master_df['sku'].tolist()

    results = []

    print("Finding matches...")
    with torch.no_grad():
        for start in tqdm(range(0, len(dataset_df), batch_size)):
            dataset_batch = dataset_mu[start:start + batch_size]

            # Cosine similarity between each batch row and every master row:
            # (batch, 1, latent) vs (1, num_master, latent) -> (batch, num_master)
            similarities = F.cosine_similarity(
                dataset_batch.unsqueeze(1),
                master_mu.unsqueeze(0),
                dim=-1
            )

            # Best master product for each item in the batch
            max_scores, max_indices = torch.max(similarities, dim=1)

            # Convert the whole batch to Python numbers in one transfer.
            batch_matches = zip(max_scores.tolist(), max_indices.tolist())
            for offset, (score, master_idx) in enumerate(batch_matches):
                results.append({
                    'Seller Item': seller_items[start + offset],
                    'Matched Item': master_names[master_idx],
                    'sku': master_skus[master_idx],
                    'Match Score': score,
                    'Confidence': 'High' if score > threshold else 'Low'
                })

    # Save results
    result_df = pd.DataFrame(results)
    result_df.to_excel(output_file, index=False)
    print(f"Matching completed. Results saved to {output_file}")

# Example usage: run the VAE-based matcher over a spreadsheet of seller items.
# NOTE: this cell does not define `vae_model`, `ArabicTextPreprocessor` or
# `SentenceTransformer`; they must already exist from the earlier training cell.
if __name__ == "__main__":
    try:
        # Define file paths
        master_file = "/kaggle/input/product-matching-dataset/Masterfile.xlsx"
        dataset_file = "/kaggle/input/manual-testing/results.xlsx"
        output_file = "/kaggle/working/vae_results.xlsx"
        
        # Initialize components (a fresh preprocessor and sentence encoder)
        preprocessor = ArabicTextPreprocessor()
        encoder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        
        # Perform matching with the default confidence threshold
        print("Performing product matching...")
        match_dataset_vae(master_file, dataset_file, output_file, vae_model, encoder, preprocessor)
        
    except Exception as e:
        # Top-level boundary: report the failure message instead of a traceback
        print(f"An error occurred: {str(e)}")
