In [1]:
from google.colab import drive
drive.mount('/content/drive')
model_save_path = "/content/drive/MyDrive/emotions_classifier/best_model/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install --upgrade transformers torch datasets



In [3]:
labels_name=[
          "admiration",
          "amusement",
          "anger",
          "annoyance",
          "approval",
          "caring",
          "confusion",
          "curiosity",
          "desire",
          "disappointment",
          "disapproval",
          "disgust",
          "embarrassment",
          "excitement",
          "fear",
          "gratitude",
          "grief",
          "joy",
          "love",
          "nervousness",
          "optimism",
          "pride",
          "realization",
          "relief",
          "remorse",
          "sadness",
          "surprise",
          "neutral"
        ]

In [4]:
!pip install datasets

from datasets import load_dataset

dataset = load_dataset("AnasAlokla/multilingual_go_emotions")





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Now you can access the dataset, e.g., print some examples from the train split
print(dataset['train'][0])

{'text': 'Kardeşim. Ben de burada oturuyorum, uyanık bir şekilde, tek burun deliğimden zar zor nefes alıyorum. Her on saniyede bir burnumu silmeye çalışıyorum ve bir şeyler oluyor.', 'labels': '[27]', 'id': 'TReebym1y', 'language': 'tr'}


In [6]:
dataset['train'][0]['labels']

'[27]'

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from datasets import load_dataset
import numpy as np
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report
)
import ast
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import torch.cuda.amp as amp  # Mixed precision training

# Advanced device selection
def select_best_device():
    if torch.cuda.is_available():
        # Prefer multi-GPU training if available
        if torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUs!")
            return torch.device("cuda")
        else:
            # Check GPU properties
            gpu = torch.cuda.current_device()
            gpu_name = torch.cuda.get_device_name(gpu)
            gpu_capability = torch.cuda.get_device_capability(gpu)
            print(f"Using GPU: {gpu_name} (Compute Capability: {gpu_capability})")
            return torch.device("cuda")
    else:
        print("No GPU available. Using CPU (training will be slow).")
        return torch.device("cpu")

# Use the advanced device selection
device = select_best_device()

class GoEmotionsDataset(Dataset):
    def __init__(self, texts, labels, label_map, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.label_map = label_map
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])

        # Convert string label representation to multi-hot encoding
        label_indices = ast.literal_eval(self.labels[idx])
        multi_hot_labels = np.zeros(len(self.label_map), dtype=np.float32)
        for label_idx in label_indices:
            multi_hot_labels[label_idx] = 1.0

        # Tokenize text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(multi_hot_labels, dtype=torch.float)
        }

def compute_metrics(preds, labels):
    """
    Compute comprehensive metrics for multi-label, multi-class classification

    Parameters:
    -----------
    preds : numpy.ndarray
        Predicted labels (binary matrix)
    labels : numpy.ndarray
        True labels (binary matrix)

    Returns:
    --------
    dict
        Comprehensive classification metrics
    """
    # Ensure inputs are numpy arrays
    preds = np.array(preds)
    labels = np.array(labels)

    # Validate input shapes
    if preds.shape != labels.shape:
        raise ValueError("Predictions and labels must have the same shape")

    # Metrics per class
    per_class_metrics = []

    for i in range(preds.shape[1]):
        class_metrics = {
            'class_index': i,
            'accuracy': accuracy_score(labels[:, i], preds[:, i]),
            'precision': precision_score(labels[:, i], preds[:, i], zero_division=0),
            'recall': recall_score(labels[:, i], preds[:, i], zero_division=0),
            'f1_score': f1_score(labels[:, i], preds[:, i], zero_division=0)
        }
        per_class_metrics.append(class_metrics)

    # Macro and micro averaging
    macro_metrics = {
        'macro_precision': np.mean([m['precision'] for m in per_class_metrics]),
        'macro_recall': np.mean([m['recall'] for m in per_class_metrics]),
        'macro_f1_score': np.mean([m['f1_score'] for m in per_class_metrics])
    }

    # Micro averaging (treat all predictions as a single binary classification problem)
    micro_precision = precision_score(labels.ravel(), preds.ravel(), zero_division=0)
    micro_recall = recall_score(labels.ravel(), preds.ravel(), zero_division=0)
    micro_f1 = f1_score(labels.ravel(), preds.ravel(), zero_division=0)

    # Hamming loss (fraction of incorrectly predicted labels)
    hamming_loss = np.mean(np.not_equal(labels, preds))

    # Exact match ratio (percentage of samples with all labels correctly predicted)
    exact_match_ratio = np.mean(np.all(labels == preds, axis=1))

    return {
        'per_class_metrics': per_class_metrics,
        'macro_metrics': macro_metrics,
        'micro_precision': micro_precision,
        'micro_recall': micro_recall,
        'micro_f1_score': micro_f1,
        'hamming_loss': hamming_loss,
        'exact_match_ratio': exact_match_ratio,
        'overall_accuracy': np.mean([m['accuracy'] for m in per_class_metrics])
    }

def compute_metrics2(preds, labels):
    """
    Compute accuracy and F1 score for multi-label classification
    """
    accuracies = []
    f1_scores = []

    for i in range(preds.shape[1]):
        accuracies.append(accuracy_score(labels[:, i], preds[:, i]))
        f1_scores.append(f1_score(labels[:, i], preds[:, i], average='binary'))

    return {
        'accuracy': np.mean(accuracies),
        'f1_score': np.mean(f1_scores)
    }

def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, device, epochs=3, patience=3):
    # Initialize mixed precision scaler
    scaler = amp.GradScaler()

    best_val_loss = float('inf')
    early_stopping_counter = 0
    best_model_state = None

    # Tracking metrics for plotting
    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []
    train_f1_scores, val_f1_scores = [], []

    # Create checkpoint directory
    os.makedirs('checkpoints', exist_ok=True)

    # Prepare for multi-GPU training if available
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    for epoch in range(epochs):
        # Early stopping check
        if early_stopping_counter >= patience:
            print(f"\nEarly stopping triggered after {epoch} epochs.")
            break

        model.train()
        total_train_loss = 0
        all_train_preds, all_train_labels = [], []

        # Progress bar for training
        progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}', unit='batch')

        for batch_idx, batch in enumerate(progress_bar):
            optimizer.zero_grad()

            # Move data to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Mixed precision training
            with amp.autocast(enabled=device.type == 'cuda'):
                outputs = model(
                    input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss

            # Scaled loss for mixed precision
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_train_loss += loss.item()

            # Get predictions for metrics
            logits = outputs.logits
            preds = (torch.sigmoid(logits) > 0.5).float()

            all_train_preds.append(preds.cpu().numpy())
            all_train_labels.append(labels.cpu().numpy())

        # Compute training metrics
        all_train_preds = np.concatenate(all_train_preds)
        all_train_labels = np.concatenate(all_train_labels)
        train_metrics = compute_metrics(all_train_preds, all_train_labels)

        # Validation
        model.eval()
        total_val_loss = 0
        all_val_preds, all_val_labels = [], []

        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                with amp.autocast(enabled=device.type == 'cuda'):
                    outputs = model(
                        input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    loss = outputs.loss

                total_val_loss += loss.item()

                # Get predictions
                logits = outputs.logits
                preds = (torch.sigmoid(logits) > 0.5).float()

                all_val_preds.append(preds.cpu().numpy())
                all_val_labels.append(labels.cpu().numpy())

        # Compute validation metrics
        all_val_preds = np.concatenate(all_val_preds)
        all_val_labels = np.concatenate(all_val_labels)
        val_metrics = compute_metrics(all_val_preds, all_val_labels)

        # Compute average losses
        avg_train_loss = total_train_loss / len(train_dataloader)
        avg_val_loss = total_val_loss / len(val_dataloader)

        # Store metrics for plotting
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        train_accuracies.append(train_metrics['overall_accuracy'])
        val_accuracies.append(val_metrics['overall_accuracy'])
        train_f1_scores.append(train_metrics['micro_f1_score'])
        val_f1_scores.append(val_metrics['micro_f1_score'])

        # Print epoch summary
        print(f"\nEpoch {epoch+1}/{epochs}")
        print(f"Average Training Loss: {avg_train_loss:.4f}")
        print(f"Average Validation Loss: {avg_val_loss:.4f}")
        print(f"Training Accuracy: {train_metrics['overall_accuracy']:.4f}")
        print(f"Validation Accuracy: {val_metrics['overall_accuracy']:.4f}")
        print(f"Training F1 Score: {train_metrics['micro_f1_score']:.4f}")
        print(f"Validation F1 Score: {val_metrics['micro_f1_score']:.4f}")

        # Early Stopping Logic
        if avg_val_loss < best_val_loss:
            # Reset early stopping counter
            early_stopping_counter = 0

            # Update best validation loss
            best_val_loss = avg_val_loss

            # Save the best model state
            best_model_state = model.state_dict() if not isinstance(model, torch.nn.DataParallel) else model.module.state_dict()

            # Save best model checkpoint
            #model.save_pretrained('checkpoints/best_model/')
            model.save_pretrained(model_save_path)

            print(">>> New best model saved! <<<")
        else:
            # Increment early stopping counter
            early_stopping_counter += 1
            print(f"No improvement. Early stopping counter: {early_stopping_counter}/{patience}")

        # Model checkpoint for current epoch
        checkpoint_path = f'checkpoints/model_epoch_{epoch+1}/'
        model.save_pretrained(checkpoint_path)

    # Plot learning curves
    plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Loss Curves')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 3, 2)
    plt.plot(train_accuracies, label='Training Accuracy')
    plt.plot(val_accuracies, label='Validation Accuracy')
    plt.title('Accuracy Curves')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 3, 3)
    plt.plot(train_f1_scores, label='Training F1 Score')
    plt.plot(val_f1_scores, label='Validation F1 Score')
    plt.title('F1 Score Curves')
    plt.xlabel('Epoch')
    plt.ylabel('F1 Score')
    plt.legend()

    plt.tight_layout()
    plt.savefig(model_save_path+'learning_curves.png')
    plt.close()

    return model

def main():
    # Load dataset
    dataset = load_dataset('AnasAlokla/multilingual_go_emotions')

    # Prepare tokenizer
    tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-multilingual-cased')

    #save tokenizer
    tokenizer.save_pretrained(model_save_path)

    # Prepare labels
    label_columns = labels_name
    num_labels = len(label_columns)

    # Create label map (index to label mapping)
    label_map = {i: label for i, label in enumerate(label_columns)}

    # Split dataset
    train_texts = dataset['train']['text']
    train_labels = dataset['train']['labels']
    val_texts = dataset['validation']['text']
    val_labels = dataset['validation']['labels']

    # Create datasets
    train_dataset = GoEmotionsDataset(train_texts, train_labels, label_map, tokenizer)
    val_dataset = GoEmotionsDataset(val_texts, val_labels, label_map, tokenizer)

    # Optimize batch size based on GPU memory
    batch_size = (128+64) * (torch.cuda.device_count() if torch.cuda.is_available() else 1)

    # Create dataloaders with pin_memory for faster data transfer to GPU
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=4
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        pin_memory=True,
        num_workers=4
    )

    # Initialize model
    model = BertForSequenceClassification.from_pretrained(
        'google-bert/bert-base-multilingual-cased',
        num_labels=num_labels,
        problem_type='multi_label_classification'
    ).to(device)

    # Prepare optimizer and scheduler with weight decay
    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,
        weight_decay=0.01,  # Added weight decay
        eps=1e-8
    )
    epochs=10
    total_steps = len(train_dataloader) * epochs  # 3 epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),  # 10% warmup
        num_training_steps=total_steps
    )

    # Train model
    trained_model = train_model(
        model,
        train_dataloader,
        val_dataloader,
        optimizer,
        scheduler,
        device,
        epochs=epochs,
        patience=3  # Early stopping patience

    )
    # save model
    #model_save_path = "/content/drive/MyDrive/emotions_classifier/"

    #trained_model.save_pretrained(model_save_path)

    # Evaluation code remains the same as in the original script
    # Optional: Evaluate and print classification report
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = (torch.sigmoid(logits) > 0.5).float()

            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.numpy())

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    # Print classification report for each emotion
    print("\nClassification Report:")
    for i, emotion in enumerate(label_columns):
        print(f"\n{emotion}:")
        print(classification_report(
            all_labels[:, i],
            all_preds[:, i],
            target_names=labels_name
        ))

    # Save label map for future reference
    with open('label_map.json', 'w') as f:
        json.dump(label_map, f)
"""
if __name__ == "__main__":
    main()

Using GPU: Tesla T4 (Compute Capability: (7, 5))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = amp.GradScaler()
  with amp.autocast(enabled=device.type == 'cuda'):
Epoch 1/10: 100%|██████████| 1357/1357 [20:20<00:00,  1.11batch/s]
  with amp.autocast(enabled=device.type == 'cuda'):



Epoch 1/10
Average Training Loss: 0.2350
Average Validation Loss: 0.1290
Training Accuracy: 0.9351
Validation Accuracy: 0.9625
Training F1 Score: 0.0558
Validation F1 Score: 0.2805
>>> New best model saved! <<<


  with amp.autocast(enabled=device.type == 'cuda'):
Epoch 2/10: 100%|██████████| 1357/1357 [20:22<00:00,  1.11batch/s]
  with amp.autocast(enabled=device.type == 'cuda'):



Epoch 2/10
Average Training Loss: 0.1125
Average Validation Loss: 0.1010
Training Accuracy: 0.9650
Validation Accuracy: 0.9664
Training F1 Score: 0.4030
Validation F1 Score: 0.4575
>>> New best model saved! <<<


  with amp.autocast(enabled=device.type == 'cuda'):
Epoch 3/10: 100%|██████████| 1357/1357 [20:22<00:00,  1.11batch/s]
  with amp.autocast(enabled=device.type == 'cuda'):



Epoch 3/10
Average Training Loss: 0.0949
Average Validation Loss: 0.0957
Training Accuracy: 0.9681
Validation Accuracy: 0.9672
Training F1 Score: 0.4995
Validation F1 Score: 0.5022
>>> New best model saved! <<<


  with amp.autocast(enabled=device.type == 'cuda'):
Epoch 4/10: 100%|██████████| 1357/1357 [20:22<00:00,  1.11batch/s]
  with amp.autocast(enabled=device.type == 'cuda'):



Epoch 4/10
Average Training Loss: 0.0859
Average Validation Loss: 0.0946
Training Accuracy: 0.9704
Validation Accuracy: 0.9668
Training F1 Score: 0.5591
Validation F1 Score: 0.5162
>>> New best model saved! <<<


  with amp.autocast(enabled=device.type == 'cuda'):
Epoch 5/10: 100%|██████████| 1357/1357 [20:23<00:00,  1.11batch/s]
  with amp.autocast(enabled=device.type == 'cuda'):



Epoch 5/10
Average Training Loss: 0.0788
Average Validation Loss: 0.0962
Training Accuracy: 0.9725
Validation Accuracy: 0.9665
Training F1 Score: 0.6041
Validation F1 Score: 0.5278
No improvement. Early stopping counter: 1/3


  with amp.autocast(enabled=device.type == 'cuda'):
Epoch 6/10: 100%|██████████| 1357/1357 [20:23<00:00,  1.11batch/s]
  with amp.autocast(enabled=device.type == 'cuda'):



Epoch 6/10
Average Training Loss: 0.0726
Average Validation Loss: 0.0979
Training Accuracy: 0.9745
Validation Accuracy: 0.9656
Training F1 Score: 0.6415
Validation F1 Score: 0.5180
No improvement. Early stopping counter: 2/3


  with amp.autocast(enabled=device.type == 'cuda'):
Epoch 7/10: 100%|██████████| 1357/1357 [20:24<00:00,  1.11batch/s]
  with amp.autocast(enabled=device.type == 'cuda'):



Epoch 7/10
Average Training Loss: 0.0675
Average Validation Loss: 0.1010
Training Accuracy: 0.9763
Validation Accuracy: 0.9651
Training F1 Score: 0.6732
Validation F1 Score: 0.5245
No improvement. Early stopping counter: 3/3

Early stopping triggered after 7 epochs.
