In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    ElectraTokenizerFast,
    ElectraModel,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import random
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)


2025-06-28 15:31:51.051452: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751124711.251832      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751124711.318388      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
class ArabicTextDataset(Dataset):
    """Custom dataset for Arabic text multi-task classification"""
    
    def __init__(self, texts, emotions, offensive, hate, tokenizer, max_length=512):
        self.texts = texts
        self.emotions = emotions
        self.offensive = offensive
        self.hate = hate
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        
        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'emotion': torch.tensor(self.emotions[idx], dtype=torch.long),
            'offensive': torch.tensor(self.offensive[idx], dtype=torch.long),
            'hate': torch.tensor(self.hate[idx], dtype=torch.long)
        }

print("Dataset class defined successfully!")


Dataset class defined successfully!


In [None]:
class MultiTaskAraELECTRA(nn.Module):
    """Multi-task classification model using Araberta"""
    
    def __init__(self, model_name, num_emotions, num_offensive, num_hate, dropout=0.3):
        super(MultiTaskAraELECTRA, self).__init__()
        
        # Load pre-trained AraELECTRA model
        self.electra = ElectraModel.from_pretrained(model_name)
        
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)
        
        # Classification heads for each task
        hidden_size = self.electra.config.hidden_size
        
        self.emotion_classifier = nn.Linear(hidden_size, num_emotions)
        self.offensive_classifier = nn.Linear(hidden_size, num_offensive)
        self.hate_classifier = nn.Linear(hidden_size, num_hate)
    
    def forward(self, input_ids, attention_mask):
        # Get ELECTRA outputs
        outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        
        # Use last hidden state and apply mean pooling
        # ELECTRA doesn't have pooler_output, so we use mean pooling
        last_hidden_state = outputs.last_hidden_state
        
        # Mean pooling over sequence length
        pooled_output = torch.mean(last_hidden_state, dim=1)
        pooled_output = self.dropout(pooled_output)
        
        # Get predictions for each task
        emotion_logits = self.emotion_classifier(pooled_output)
        offensive_logits = self.offensive_classifier(pooled_output)
        hate_logits = self.hate_classifier(pooled_output)
        
        return emotion_logits, offensive_logits, hate_logits

print("Multi-task AraELECTRA model class defined successfully!")


Multi-task AraELECTRA model class defined successfully!


In [None]:
class ArabicMultiTaskClassifier:
    """Main classifier class for Arabic multi-task text classification"""
    
    def __init__(self, model_name='aubmindlab/bert-base-arabertv2', max_length=512):
        self.model_name = model_name
        self.max_length = max_length
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Initialize tokenizer - use ElectraTokenizer for AraELECTRA
        self.tokenizer = ElectraTokenizerFast.from_pretrained(model_name)
        
        # Label encoders for each task
        self.emotion_encoder = LabelEncoder()
        self.offensive_encoder = LabelEncoder()
        self.hate_encoder = LabelEncoder()
        
        print(f"Using device: {self.device}")
    
    def load_data(self, file_path):
        """Load and preprocess data from Excel or CSV file"""
        print("Loading data...")
        
        # Determine file type and read accordingly
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file format. Please use CSV (.csv) or Excel (.xlsx, .xls) files.")
        
        # Basic data info
        print(f"Dataset shape: {df.shape}")
        print("\nColumn names:", df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())
        
        # Check for missing values
        print("\nMissing values:")
        print(df.isnull().sum())
        
        # Remove rows with missing text
        df = df.dropna(subset=['text'])
        
        # Fill missing labels with 'unknown' or most frequent value
        df['Emotion'] = df['Emotion'].fillna('neutral')
        df['Offensive'] = df['Offensive'].fillna('no')
        df['Hate'] = df['Hate'].fillna('not_hate')
        
        # Encode labels
        df['emotion_encoded'] = self.emotion_encoder.fit_transform(df['Emotion'])
        df['offensive_encoded'] = self.offensive_encoder.fit_transform(df['Offensive'])
        df['hate_encoded'] = self.hate_encoder.fit_transform(df['Hate'])
        
        # Print label distributions
        print("\nLabel distributions:")
        print("Emotions:", df['Emotion'].value_counts())
        print("Offensive:", df['Offensive'].value_counts())
        print("Hate:", df['Hate'].value_counts())
        
        return df
    
    def create_model(self, num_emotions, num_offensive, num_hate):
        """Create and initialize the multi-task model"""
        print("Creating AraELECTRA model...")
        
        model = MultiTaskAraELECTRA(
            self.model_name,
            num_emotions,
            num_offensive,
            num_hate
        )
        
        model.to(self.device)
        return model
    
    def train_model_full_data(self, model, train_loader, num_epochs=5, learning_rate=2e-5):
        """Train the multi-task model on full dataset without validation"""
        print("Starting training on full dataset...")
        
        # Loss functions for each task
        criterion_emotion = nn.CrossEntropyLoss()
        criterion_offensive = nn.CrossEntropyLoss()
        criterion_hate = nn.CrossEntropyLoss()
        
        # Optimizer and scheduler
        optimizer = AdamW(model.parameters(), lr=learning_rate)
        total_steps = len(train_loader) * num_epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )
        
        # Training history
        train_losses = []
        
        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch + 1}/{num_epochs}")
            
            # Training phase
            model.train()
            total_train_loss = 0
            
            for batch in tqdm(train_loader, desc="Training"):
                optimizer.zero_grad()
                
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                emotion_labels = batch['emotion'].to(self.device)
                offensive_labels = batch['offensive'].to(self.device)
                hate_labels = batch['hate'].to(self.device)
                
                # Forward pass
                emotion_logits, offensive_logits, hate_logits = model(input_ids, attention_mask)
                
                # Calculate losses
                emotion_loss = criterion_emotion(emotion_logits, emotion_labels)
                offensive_loss = criterion_offensive(offensive_logits, offensive_labels)
                hate_loss = criterion_hate(hate_logits, hate_labels)
                
                # Combined loss (weighted sum)
                total_loss = emotion_loss + offensive_loss + hate_loss
                
                # Backward pass
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                
                total_train_loss += total_loss.item()
            
            avg_train_loss = total_train_loss / len(train_loader)
            train_losses.append(avg_train_loss)
            
            print(f"Average training loss: {avg_train_loss:.4f}")
        
        return train_losses
    
    def predict_text(self, model, text):
        """Predict labels for a single text"""
        model.eval()
        
        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        
        with torch.no_grad():
            emotion_logits, offensive_logits, hate_logits = model(input_ids, attention_mask)
            
            # Get predictions
            emotion_pred = torch.argmax(emotion_logits, dim=1).item()
            offensive_pred = torch.argmax(offensive_logits, dim=1).item()
            hate_pred = torch.argmax(hate_logits, dim=1).item()
            
            # Get probabilities
            emotion_probs = torch.softmax(emotion_logits, dim=1)[0]
            offensive_probs = torch.softmax(offensive_logits, dim=1)[0]
            hate_probs = torch.softmax(hate_logits, dim=1)[0]
            
            # Convert to original labels
            emotion_label = self.emotion_encoder.inverse_transform([emotion_pred])[0]
            offensive_label = self.offensive_encoder.inverse_transform([offensive_pred])[0]
            hate_label = self.hate_encoder.inverse_transform([hate_pred])[0]
            
            return {
                'emotion': {
                    'label': emotion_label,
                    'confidence': emotion_probs[emotion_pred].item()
                },
                'offensive': {
                    'label': offensive_label,
                    'confidence': offensive_probs[offensive_pred].item()
                },
                'hate': {
                    'label': hate_label,
                    'confidence': hate_probs[hate_pred].item()
                }
            }

print("Main classifier class defined successfully!")


Main classifier class defined successfully!


In [None]:
def train_arabic_classifier_full_data(data_file_path, num_epochs=3, batch_size=16, learning_rate=2e-5):
    """
    Train the model using ALL training data without splitting
    
    Args:
        data_file_path: Path to training data (CSV or Excel)
        num_epochs: Number of training epochs
        batch_size: Batch size for training
        learning_rate: Learning rate for optimizer
    
    Returns:
        classifier: Trained classifier object
        model: Trained model
    """
    
    print("="*60)
    print("ARABIC MULTI-TASK TEXT CLASSIFICATION WITH ARAELECTRA")
    print("TRAINING ON FULL DATASET")
    print("="*60)
    
    # Initialize classifier with AraELECTRA
    classifier = ArabicMultiTaskClassifier(
        model_name='aubmindlab/araelectra-base-discriminator'
    )
    
    # Load and prepare ALL training data
    df = classifier.load_data(data_file_path)
    
    # Use ALL data for training (no train/val/test split)
    train_dataset = ArabicTextDataset(
        df['text'].values,
        df['emotion_encoded'].values,
        df['offensive_encoded'].values,
        df['hate_encoded'].values,
        classifier.tokenizer,
        classifier.max_length
    )
    
    # Create data loader for full training set
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    # Get number of classes for each task
    num_emotions = len(classifier.emotion_encoder.classes_)
    num_offensive = len(classifier.offensive_encoder.classes_)
    num_hate = len(classifier.hate_encoder.classes_)
    
    print(f"\nTraining on {len(df)} samples")
    print(f"Number of emotion classes: {num_emotions}")
    print(f"Number of offensive classes: {num_offensive}")
    print(f"Number of hate classes: {num_hate}")
    
    # Create model
    model = classifier.create_model(num_emotions, num_offensive, num_hate)
    
    # Train model on full dataset
    train_losses = classifier.train_model_full_data(
        model, train_loader, num_epochs=num_epochs, learning_rate=learning_rate
    )
    
    # Save model
    torch.save(model.state_dict(), 'aubmindlab/bert-base-arabertv2.pth')
    print("\nModel saved as 'aubmindlab/bert-base-arabertv2.pth'")
    
    return classifier, model

print("Training function defined successfully!")


Training function defined successfully!


In [6]:
def predict_and_save_validation(classifier, model, validation_file_path, output_file_path=None):
    """
    Make predictions on validation file and save results
    
    Args:
        classifier: Trained classifier object
        model: Trained model
        validation_file_path: Path to validation CSV/Excel file
        output_file_path: Path to save predictions (optional)
    
    Returns:
        results_df: DataFrame with predictions
    """
    print(f"Loading validation file: {validation_file_path}")
    
    # Load validation file
    if validation_file_path.endswith('.csv'):
        val_df = pd.read_csv(validation_file_path)
    elif validation_file_path.endswith(('.xlsx', '.xls')):
        val_df = pd.read_excel(validation_file_path)
    else:
        raise ValueError("Unsupported file format. Please use CSV or Excel files.")
    
    print(f"Validation file loaded with {len(val_df)} samples")
    
    # Check if 'text' column exists
    if 'text' not in val_df.columns:
        raise ValueError("Validation file must contain a 'text' column")
    
    # Make predictions for all samples
    model.eval()
    all_predictions = []
    
    print("Making predictions on validation data...")
    for i, text in enumerate(tqdm(val_df['text'], desc="Predicting")):
        try:
            prediction = classifier.predict_text(model, str(text))
            all_predictions.append(prediction)
        except Exception as e:
            print(f"Error predicting text at index {i}: {e}")
            # Add default prediction for failed cases
            all_predictions.append({
                'emotion': {'label': 'neutral', 'confidence': 0.0},
                'offensive': {'label': 'no', 'confidence': 0.0},
                'hate': {'label': 'not_hate', 'confidence': 0.0}
            })
    
    # Create results DataFrame
    results_df = val_df.copy()
    results_df['emotion_predicted'] = [pred['emotion']['label'] for pred in all_predictions]
    results_df['emotion_confidence'] = [pred['emotion']['confidence'] for pred in all_predictions]
    results_df['offensive_predicted'] = [pred['offensive']['label'] for pred in all_predictions]
    results_df['offensive_confidence'] = [pred['offensive']['confidence'] for pred in all_predictions]
    results_df['hate_predicted'] = [pred['hate']['label'] for pred in all_predictions]
    results_df['hate_confidence'] = [pred['hate']['confidence'] for pred in all_predictions]
    
    # Set output filename if not provided
    if output_file_path is None:
        output_file_path = f"validation_predictions_{validation_file_path.split('/')[-1].split('.')[0]}.xlsx"
    
    # Save predictions
    results_df.to_excel(output_file_path, index=False)
    print(f"\nPredictions saved to '{output_file_path}'")
    
    # Show sample predictions
    print("\n" + "="*80)
    print("SAMPLE PREDICTIONS")
    print("="*80)
    
    for i in range(min(5, len(results_df))):
        row = results_df.iloc[i]
        text = str(row['text'])
        print(f"\nExample {i+1}:")
        print(f"Text: {text[:100]}{'...' if len(text) > 100 else ''}")
        print(f"Predictions -> Emotion: {row['emotion_predicted']} (conf: {row['emotion_confidence']:.3f})")
        print(f"              Offensive: {row['offensive_predicted']} (conf: {row['offensive_confidence']:.3f})")
        print(f"              Hate: {row['hate_predicted']} (conf: {row['hate_confidence']:.3f})")
        print("-" * 80)
    
    return results_df

print("Prediction function defined successfully!")


Prediction function defined successfully!


In [7]:
# Step 1: Train model on ALL training data
print("Starting training process...")
classifier, model = train_arabic_classifier_full_data(
    data_file_path='/kaggle/input/train44/train.csv',  # Your training file
    num_epochs=3,
    batch_size=16,
    learning_rate=2e-5
)

print("\nTraining completed successfully!")

# Step 2: Make predictions on validation file and save results
print("\nStarting prediction process...")
validation_results = predict_and_save_validation(
    classifier=classifier,
    model=model,
    validation_file_path='/kaggle/input/validation/validation.csv',  # Your validation file
    output_file_path='/kaggle/working/validation_predictions_araelectra.xlsx'  # Output file
)

print("\nAll processes completed successfully!")
print(f"Final predictions saved with {len(validation_results)} samples")


Starting training process...
ARABIC MULTI-TASK TEXT CLASSIFICATION WITH ARAELECTRA
TRAINING ON FULL DATASET


tokenizer_config.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Using device: cuda
Loading data...
Dataset shape: (5960, 5)

Column names: ['id', 'text', 'Emotion', 'Offensive', 'Hate']

First few rows:
     id                                               text       Emotion  \
0  2537  أحد التجار الشباب العمانيين يقول للاسف لما يكو...       neutral   
1  5579  @JALHARBISKY مجموعه القدرة الجنسيه👍<LF> <LF>بد...      optimism   
2  6092        @rwn4o حبيبييي والله اكثثثرر يارب امين🥺♥️♥️          love   
3  2540  #وصال_دوت_FM<LF>مع سميرة الفطيسية @Samira_Alfu...       neutral   
4  3159  من ينتزع ارواح اطفالنا من أجسادها بكل وحشية عل...  anticipation   

  Offensive Hate  
0        no  NaN  
1        no  NaN  
2        no  NaN  
3        no  NaN  
4        no  NaN  

Missing values:
id              0
text            0
Emotion         0
Offensive       0
Hate         4216
dtype: int64

Label distributions:
Emotions: Emotion
anger           1551
disgust          777
neutral          661
love             593
joy              533
anticipation     491
opti

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

Starting training on full dataset...

Epoch 1/3


Training: 100%|██████████| 373/373 [05:15<00:00,  1.18it/s]


Average training loss: 2.4768

Epoch 2/3


Training: 100%|██████████| 373/373 [05:14<00:00,  1.19it/s]


Average training loss: 1.9411

Epoch 3/3


Training: 100%|██████████| 373/373 [05:14<00:00,  1.19it/s]


Average training loss: 1.6898

Model saved as 'arabic_multitask_araelectra_full_model.pth'

Training completed successfully!

Starting prediction process...
Loading validation file: /kaggle/input/validation/validation.csv
Validation file loaded with 1277 samples
Making predictions on validation data...


Predicting: 100%|██████████| 1277/1277 [00:28<00:00, 44.67it/s]



Predictions saved to '/kaggle/working/validation_predictions_araelectra.xlsx'

SAMPLE PREDICTIONS

Example 1:
Text: النيوك عندي مثل شرب الفناجيل #PS4share https://t.co/NgkLViK32J
Predictions -> Emotion: neutral (conf: 0.752)
              Offensive: no (conf: 0.904)
              Hate: not_hate (conf: 0.980)
--------------------------------------------------------------------------------

Example 2:
Text: لن أتعاطف مع نادي بعض جمهوره الطقطقه عنده شتم و قذفوا والدة عبدالرزاق حمدالله واساءوا له داخل الملعب...
Predictions -> Emotion: anger (conf: 0.594)
              Offensive: yes (conf: 0.956)
              Hate: not_hate (conf: 0.818)
--------------------------------------------------------------------------------

Example 3:
Text: يا ربي ايه الظلم ده😢<LF>ام لخمس اطفال تثتغيث<LF>لفقوا تهم لزوجها عشان رفض يشتغل معاهم مرشد<LF>كسروا ...
Predictions -> Emotion: anger (conf: 0.813)
              Offensive: no (conf: 0.727)
              Hate: not_hate (conf: 0.967)
------------------------