# Multilingual Hate Speech Classification

This notebook compares transformer-based approaches for multilingual hate speech detection:

1. **XLM-RoBERTa**: Discriminative model with classification head
2. **mT5**: Generative model producing text labels ("yes"/"no")

## Workflow
1. Setup and Installation
2. Dataset Loading and Preprocessing
3. Evaluation Metrics
4. XLM-RoBERTa Model Training and Evaluation
5. mT5 Model Training and Evaluation
6. Multilingual Inference Demo

In [1]:
# Install required packages
!pip install -q evaluate transformers datasets torch

In [2]:
"""Import required libraries and set random seeds for reproducibility."""

from collections import Counter
from typing import Any, Dict, List, Tuple

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import ClassLabel, DatasetDict, Value, concatenate_datasets, load_dataset
from transformers import (
    DataCollatorForSeq2Seq,
    LogitsProcessor,
    LogitsProcessorList,
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizer,
    pipeline,
)

# Seed every RNG this notebook touches directly so reruns are repeatable.
RANDOM_SEED = 42
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(RANDOM_SEED)

print("‚úì Libraries imported successfully")

‚úì Libraries imported successfully


## 2. Dataset Loading and Preprocessing

In [3]:
"""Load and create balanced multilingual hate speech dataset."""

# Tunable settings for data loading, sampling and tokenization.
DATA_PATH = "/content/drive/MyDrive/Uni/data/Dataset/Training/MultiLanguageTrainDataset.csv"
SAMPLE_SIZE = 30000
TEST_SIZE = 0.2
MAX_LENGTH = 128

# Read the complete CSV as a single split.
print("Loading dataset...")
dataset = load_dataset('csv', data_files=DATA_PATH, split='train')

# Partition the rows by class (labels are stored as floats in the CSV).
hate_ds = dataset.filter(lambda row: row['label'] == 1.0)
no_hate_ds = dataset.filter(lambda row: row['label'] == 0.0)

print(f"Original: {len(hate_ds)} hate, {len(no_hate_ds)} non-hate")

# Draw the same number of rows from each class, capped by the smaller class.
samples_per_class = min(SAMPLE_SIZE // 2, len(hate_ds), len(no_hate_ds))
hate_sample = hate_ds.shuffle(seed=RANDOM_SEED).select(range(samples_per_class))
no_hate_sample = no_hate_ds.shuffle(seed=RANDOM_SEED).select(range(samples_per_class))

# Merge both halves and turn the label column into a ClassLabel feature,
# which `stratify_by_column` below relies on.
balanced_dataset = concatenate_datasets([hate_sample, no_hate_sample]).cast_column(
    "label", ClassLabel(names=['not hate', 'hate'])
)

# Stratified train/test split.
print("Splitting dataset...")
train_test_split = balanced_dataset.train_test_split(
    stratify_by_column='label',
    seed=RANDOM_SEED,
    test_size=TEST_SIZE,
)

dataset_dict = DatasetDict(
    {split_name: train_test_split[split_name] for split_name in ('train', 'test')}
)

# Report split sizes and per-class counts as a sanity check.
print(f"\n‚úì Dataset: {len(dataset_dict['train'])} train, {len(dataset_dict['test'])} test")
for heading, split_name in (("  Train: ", 'train'), ("  Test:  ", 'test')):
    split_labels = dataset_dict[split_name]['label']
    print(f"{heading}{split_labels.count(1)} hate, {split_labels.count(0)} non-hate")

Loading dataset...
Original: 84633 hate, 135348 non-hate


Casting the dataset:   0%|          | 0/30000 [00:00<?, ? examples/s]

Splitting dataset...

‚úì Dataset: 24000 train, 6000 test
  Train: 12000 hate, 12000 non-hate
  Test:  3000 hate, 3000 non-hate


## 3. Evaluation Metrics

In [4]:
"""Define evaluation metrics for model performance."""

# Load each `evaluate` metric once, keyed by the metric's own name.
METRICS = {
    metric_name: evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
}


def compute_metrics(predictions: np.ndarray, labels: np.ndarray) -> Dict[str, float]:
    """Score predicted class ids against the reference class ids.

    Accuracy is computed as-is; precision, recall and F1 use
    ``average="weighted"`` (per-class scores weighted by class support), so
    the reported recall is numerically equal to accuracy.

    Args:
        predictions: Predicted class labels.
        labels: Ground-truth class labels.

    Returns:
        Mapping with the keys ``accuracy``, ``precision``, ``recall``, ``f1``.
    """
    scores = {}
    for name in ('accuracy', 'precision', 'recall', 'f1'):
        # Accuracy takes no averaging argument; the other three do.
        extra_kwargs = {} if name == 'accuracy' else {'average': "weighted"}
        outcome = METRICS[name].compute(
            predictions=predictions, references=labels, **extra_kwargs
        )
        scores[name] = outcome[name]
    return scores


print("‚úì Metrics loaded")

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


‚úì Metrics loaded


## 4. XLM-RoBERTa Model Training

In [5]:
"""Preprocess and train XLM-RoBERTa model."""

# Initialize tokenizer.
# Loaded from the same "xlm-roberta-base" checkpoint as the model; this object
# is reused by `preprocess_xlm`, the Trainer and the inference pipeline.
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")


def preprocess_xlm(examples: Dict[str, List]) -> Dict[str, Any]:
    """Tokenize a batch of texts for XLM-RoBERTa and attach the targets.

    Texts are truncated to ``MAX_LENGTH`` tokens; padding is left to the data
    collator. The batch's ``label`` column is copied into ``labels``.
    """
    encoded = xlm_tokenizer(
        examples["text"],
        max_length=MAX_LENGTH,
        truncation=True,
    )
    encoded["labels"] = examples["label"]
    return encoded


# Tokenize both splits, store targets as int64, then drop the columns that
# are no longer needed.
xlm_dataset = (
    dataset_dict
    .map(preprocess_xlm, batched=True)
    .cast_column("labels", Value("int64"))
    .remove_columns(['Unnamed: 0', 'text', 'language'])
)

print("‚úì XLM-RoBERTa preprocessing complete")

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/24000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6000 [00:00<?, ? examples/s]

‚úì XLM-RoBERTa preprocessing complete


In [6]:
"""Train XLM-RoBERTa model."""

# Fine-tuning hyperparameters for XLM-RoBERTa, consumed by TrainingArguments below.
NUM_EPOCHS = 5        # number of training epochs
BATCH_SIZE = 16       # per-device batch size for both training and evaluation
LEARNING_RATE = 2e-5  # learning rate passed to TrainingArguments


def compute_xlm_metrics(eval_pred: Tuple[np.ndarray, np.ndarray]) -> Dict[str, float]:
    """Turn classifier logits into class ids and score them.

    Args:
        eval_pred: Pair of (logits, reference labels) supplied by the Trainer.

    Returns:
        The metric dictionary produced by ``compute_metrics``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    return compute_metrics(np.argmax(logits, axis=-1), labels)


# Load the pretrained encoder with a freshly initialised 2-class head.
xlm_model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=2,
    problem_type="single_label_classification"
)

# Configure training: evaluate and checkpoint once per epoch, then restore the
# checkpoint with the best weighted F1.
# NOTE(review): the evaluation set passed to the Trainer below is the test
# split, so best-checkpoint selection is done on test data and the reported
# scores are optimistic. Consider carving out a separate validation split.
training_args = TrainingArguments(
    output_dir="./results_xlm",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
)

# Train.
# `processing_class` replaces the deprecated `tokenizer` argument, which is
# what triggered the warning recorded in this cell's output.
xlm_trainer = Trainer(
    model=xlm_model,
    args=training_args,
    train_dataset=xlm_dataset["train"],
    eval_dataset=xlm_dataset["test"],
    processing_class=xlm_tokenizer,
    compute_metrics=compute_xlm_metrics,
)

print(f"Training XLM-RoBERTa for {NUM_EPOCHS} epochs...")
xlm_trainer.train()
print("‚úì Training complete")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  xlm_trainer = Trainer(


Training XLM-RoBERTa for 5 epochs...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4644,0.462044,0.782667,0.788373,0.782667,0.781586
2,0.3903,0.460363,0.7975,0.8075,0.7975,0.79584
3,0.2872,0.523524,0.813333,0.817809,0.813333,0.812674
4,0.2179,0.580328,0.815833,0.81851,0.815833,0.815446
5,0.1723,0.737509,0.82,0.821457,0.82,0.819796


‚úì Training complete


### 4.1 XLM-RoBERTa Evaluation

In [7]:
"""Evaluate XLM-RoBERTa model."""

xlm_results = xlm_trainer.evaluate()

# Only these entries are scores in the 0-1 range. Runtime, throughput and
# epoch are plain numbers and must not be rendered as percentages.
xlm_percent_metrics = {"eval_accuracy", "eval_precision", "eval_recall", "eval_f1"}

print("\n" + "=" * 50)
print("XLM-ROBERTA RESULTS")
print("=" * 50)
for metric, value in xlm_results.items():
    if not isinstance(value, float) or 'loss' in metric:
        continue
    line = f"{metric.upper():<20}: {value:.4f}"
    if metric in xlm_percent_metrics:
        line += f" ({value * 100:.2f}%)"
    print(line)
print("=" * 50)


XLM-ROBERTA RESULTS
EVAL_ACCURACY       : 0.8200 (82.00%)
EVAL_PRECISION      : 0.8215 (82.15%)
EVAL_RECALL         : 0.8200 (82.00%)
EVAL_F1             : 0.8198 (81.98%)
EVAL_RUNTIME        : 14.8580 (1485.80%)
EVAL_SAMPLES_PER_SECOND: 403.8230 (40382.30%)
EVAL_STEPS_PER_SECOND: 25.2390 (2523.90%)
EPOCH               : 5.0000 (500.00%)


In [8]:
"""Helper functions for mT5 text-to-text generation."""


def map_text_to_labels(prediction_texts: List[str]) -> List[int]:
    """Convert text predictions to binary labels.

    A prediction counts as hate (1) when it contains the whole word "yes"
    (case-insensitive, surrounding punctuation ignored) or consists solely of
    "y". Everything else maps to non-hate (0), including "no", "n", empty
    strings and unrecognised text.

    Matching is done on whole words rather than substrings, so outputs such
    as "eyes" or "yesterday" are not mistaken for a positive answer.

    Args:
        prediction_texts: List of generated text predictions.

    Returns:
        List of integer labels (0=non-hate, 1=hate), one per input text.
    """
    labels = []
    for pred_text in prediction_texts:
        # Lower-case, split on whitespace and strip punctuation from each word.
        words = [
            word.strip(".,;:!?\"'()[]")
            for word in pred_text.lower().split()
        ]
        is_hate = "yes" in words or words == ["y"]
        labels.append(1 if is_hate else 0)  # Anything else defaults to non-hate.
    return labels


def compute_mt5_metrics(
    eval_pred: Tuple[Any, np.ndarray],
    tokenizer: MT5Tokenizer
) -> Dict[str, float]:
    """Decode generated and reference token ids, then score them as classes.

    Both arrays have their -100 entries replaced by the pad token id before
    decoding. The decoded strings are mapped to 0/1 with
    ``map_text_to_labels`` and scored with ``compute_metrics``. A few example
    pairs and the predicted/true class counts are printed as a side effect.

    Args:
        eval_pred: Pair of (generated token ids, label token ids); the first
            item may itself be a tuple, in which case its first element is used.
        tokenizer: mT5 tokenizer used for decoding.

    Returns:
        Dictionary containing classification metrics.
    """
    token_ids, label_ids = eval_pred
    if isinstance(token_ids, tuple):
        token_ids = token_ids[0]

    # -100 is not a valid token id, so swap it for padding before decoding.
    pad_id = tokenizer.pad_token_id
    decoded = []
    for ids in (token_ids, label_ids):
        ids = np.where(ids != -100, ids, pad_id)
        decoded.append(tokenizer.batch_decode(ids, skip_special_tokens=True))
    decoded_preds, decoded_labels = decoded

    # Show up to ten prediction/reference pairs.
    print("\nüîç Sample predictions:")
    for guess, truth in list(zip(decoded_preds, decoded_labels))[:10]:
        print(f"  Predicted: '{guess}' | True: '{truth}'")

    # Text -> class ids.
    pred_labels = map_text_to_labels(decoded_preds)
    true_labels = map_text_to_labels(decoded_labels)

    # Class balance of predictions vs. references.
    yes_pred, yes_true = sum(pred_labels), sum(true_labels)
    print(f"\nüìä Predictions: Yes={yes_pred}, No={len(pred_labels) - yes_pred}")
    print(f"üìä True labels: Yes={yes_true}, No={len(true_labels) - yes_true}")

    return compute_metrics(np.array(pred_labels), np.array(true_labels))


print("‚úì mT5 helper functions defined")

‚úì mT5 helper functions defined


## 5. mT5 Model Training

In [9]:
"""Preprocess data for mT5 text-to-text format."""

# Checkpoint used for the mT5 tokenizer here and for the model in the training cell.
MT5_MODEL_CHECKPOINT = "google/mt5-small"
# NOTE(review): this cell's recorded output warns that the checkpoint declares
# 'T5Tokenizer' while it is loaded through 'MT5Tokenizer'; confirm the
# resulting tokenization is the intended one.
mt5_tokenizer = MT5Tokenizer.from_pretrained(MT5_MODEL_CHECKPOINT)


def preprocess_mt5(examples: Dict[str, List]) -> Dict[str, Any]:
    """Build prompt/answer pairs for mT5 and tokenize both sides.

    Each text is prefixed with the question "Is this hate speech? " and
    truncated to ``MAX_LENGTH`` tokens. The target is the word "yes" for
    label 1 and "no" otherwise, tokenized with a limit of 3 tokens and stored
    under ``labels``.
    """
    prompts = []
    answers = []
    for text, label in zip(examples["text"], examples["label"]):
        prompts.append(f"Is this hate speech? {text}")
        answers.append("yes" if label == 1 else "no")

    model_inputs = mt5_tokenizer(prompts, truncation=True, max_length=MAX_LENGTH)
    answer_encoding = mt5_tokenizer(answers, truncation=True, max_length=3)
    model_inputs["labels"] = answer_encoding["input_ids"]

    return model_inputs


# Tokenize both splits, then keep only the columns produced by the tokenizer.
mt5_dataset = (
    dataset_dict
    .map(preprocess_mt5, batched=True)
    .remove_columns(["text", "label", "Unnamed: 0", "language"])
)

print("‚úì mT5 preprocessing complete")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

‚úì mT5 preprocessing complete


### 5.1 Constrained Generation Processor

In [10]:
"""Custom trainer with constrained yes/no generation."""


class ForceYesNoLogitsProcessor(LogitsProcessor):
    """Restrict generation to a single 'yes'/'no' answer followed by EOS."""

    def __init__(self, tokenizer: MT5Tokenizer):
        """Look up the token ids for "yes", "no" and EOS from the tokenizer."""
        self.tokenizer = tokenizer
        self.eos_token_id = tokenizer.eos_token_id
        self.yes_token_ids = tokenizer.encode("yes", add_special_tokens=False)
        self.no_token_ids = tokenizer.encode("no", add_special_tokens=False)
        # Every token id that counts as answer content.
        self.content_tokens = {*self.yes_token_ids, *self.no_token_ids}

        print(f"‚úì Constrained decoding: yes={self.yes_token_ids}, no={self.no_token_ids}")

    def __call__(
        self,
        input_ids: torch.LongTensor,
        scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        """Mask the next-token scores so only the permitted tokens survive.

        Rows that have not yet produced an answer token may only pick one of
        the yes/no token ids; rows that already contain one may only pick EOS.
        All other positions receive -inf.
        """
        # Start with everything forbidden, then open up the allowed ids per row.
        penalty = torch.full_like(scores, float('-inf'))
        answer_ids = list(self.content_tokens)

        for row, sequence in enumerate(input_ids.tolist()):
            if self.content_tokens.isdisjoint(sequence):
                # No answer emitted yet: permit only the yes/no tokens.
                penalty[row, answer_ids] = 0
            else:
                # Answer already present: the sequence has to end here.
                penalty[row, self.eos_token_id] = 0

        return scores + penalty


class ConstrainedSeq2SeqTrainer(Seq2SeqTrainer):
    """Seq2Seq trainer with constrained yes/no generation.

    Overrides ``prediction_step`` so that, when generation-based evaluation is
    enabled, sequences are produced through ``ForceYesNoLogitsProcessor``.
    """

    def __init__(self, *args, use_constrained_generation=True, **kwargs):
        """Forward all arguments to ``Seq2SeqTrainer`` and set up the processor.

        Args:
            use_constrained_generation: When true, build a
                ``ForceYesNoLogitsProcessor`` from the trainer's
                ``processing_class`` and use it during generation.
        """
        super().__init__(*args, **kwargs)
        self.use_constrained_generation = use_constrained_generation
        if use_constrained_generation:
            # Built after super().__init__ because it reads self.processing_class.
            self.yes_no_processor = ForceYesNoLogitsProcessor(self.processing_class)

    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys=None
    ):
        """Override to add constrained generation during evaluation.

        Returns:
            Tuple ``(loss, generated_tokens, labels)``. ``loss`` is None when
            the batch has no labels; ``labels`` is whatever
            ``inputs.get("labels")`` yields, padded with -100 if needed.
        """
        # Without generation, or when only the loss is wanted, use the stock behaviour.
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)

        has_labels = "labels" in inputs
        inputs = self._prepare_inputs(inputs)

        # Fall back to 3 tokens / greedy search when the arguments leave these unset.
        gen_kwargs = {
            "max_length": self.args.generation_max_length or 3,
            "num_beams": self.args.generation_num_beams or 1,
        }

        if self.use_constrained_generation:
            gen_kwargs["logits_processor"] = LogitsProcessorList([self.yes_no_processor])

        with torch.no_grad():
            generated_tokens = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **gen_kwargs,
            )

        # A separate forward pass provides the evaluation loss.
        loss = None
        if has_labels:
            with torch.no_grad():
                outputs = model(**inputs)
                loss = outputs.loss.mean().detach()

        labels = inputs.get("labels")

        # Pad tensors to same length.
        if labels is not None:
            max_length = max(generated_tokens.shape[1], labels.shape[1])

            if generated_tokens.shape[1] < max_length:
                pad_length = max_length - generated_tokens.shape[1]
                generated_tokens = torch.nn.functional.pad(
                    generated_tokens,
                    (0, pad_length),
                    value=self.processing_class.pad_token_id
                )

            if labels.shape[1] < max_length:
                pad_length = max_length - labels.shape[1]
                # -100 is later replaced by the pad id in compute_mt5_metrics.
                labels = torch.nn.functional.pad(labels, (0, pad_length), value=-100)

        return (loss, generated_tokens, labels)


print("‚úì Constrained trainer defined")

‚úì Constrained trainer defined


### 5.2 Train mT5 Model

In [11]:
"""Train mT5 model with constrained generation."""

# Fine-tuning hyperparameters for mT5.
MT5_LEARNING_RATE = 3e-4
MT5_BATCH_SIZE = 16
MT5_EPOCHS = 15

# Initialize model and data collator (the collator pads inputs and labels per batch).
mt5_model = MT5ForConditionalGeneration.from_pretrained(MT5_MODEL_CHECKPOINT)
mt5_data_collator = DataCollatorForSeq2Seq(
    tokenizer=mt5_tokenizer,
    model=mt5_model,
    padding=True
)

# Configure training: evaluate with generation once per epoch and restore the
# checkpoint with the best weighted F1 at the end.
# NOTE(review): the evaluation set passed to the trainer below is the test
# split, so best-checkpoint selection is done on test data and the reported
# scores are optimistic. Consider carving out a separate validation split.
mt5_training_args = Seq2SeqTrainingArguments(
    output_dir="./results_mt5",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=MT5_LEARNING_RATE,
    per_device_train_batch_size=MT5_BATCH_SIZE,
    per_device_eval_batch_size=MT5_BATCH_SIZE,
    num_train_epochs=MT5_EPOCHS,
    weight_decay=0.001,
    warmup_ratio=0.1,
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    predict_with_generate=True,
    generation_max_length=3,
    generation_num_beams=1,
    report_to="none",
    save_total_limit=2,
    lr_scheduler_type="cosine",
)

# Initialize trainer.
# `processing_class` replaces the deprecated `tokenizer` argument, which is
# what triggered the warning recorded in this cell's output. The custom
# trainer reads `self.processing_class` to build its logits processor.
mt5_trainer = ConstrainedSeq2SeqTrainer(
    model=mt5_model,
    args=mt5_training_args,
    train_dataset=mt5_dataset["train"],
    eval_dataset=mt5_dataset["test"],
    processing_class=mt5_tokenizer,
    data_collator=mt5_data_collator,
    compute_metrics=lambda eval_pred: compute_mt5_metrics(eval_pred, mt5_tokenizer),
    use_constrained_generation=True,
)

# Train.
print(f"\nTraining mT5 for {MT5_EPOCHS} epochs...")
print(f"Train: {len(mt5_dataset['train'])}, Test: {len(mt5_dataset['test'])}")
mt5_trainer.train()
print("‚úì mT5 training complete")

  super().__init__(*args, **kwargs)


‚úì Constrained decoding: yes=[36339], no=[375]

Training mT5 for 15 epochs...
Train: 24000, Test: 6000


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3162,0.29237,0.706333,0.706922,0.706333,0.706124
2,0.2731,0.262219,0.7475,0.763673,0.7475,0.743568
3,0.2289,0.248811,0.775,0.775824,0.775,0.774832
4,0.1728,0.249506,0.793833,0.794883,0.793833,0.79365
5,0.1819,0.249531,0.794833,0.795282,0.794833,0.794755
6,0.1418,0.30888,0.788167,0.788328,0.788167,0.788137
7,0.1547,0.306789,0.785667,0.785753,0.785667,0.785651
8,0.0955,0.341782,0.7895,0.789662,0.7895,0.789471
9,0.053,0.45984,0.788,0.790387,0.788,0.787563
10,0.0705,0.421972,0.785167,0.785169,0.785167,0.785166



üîç Sample predictions:
  Predicted: 'no' | True: 'no'
  Predicted: 'yes' | True: 'no'
  Predicted: 'no' | True: 'no'
  Predicted: 'yes' | True: 'no'
  Predicted: 'yes' | True: 'yes'
  Predicted: 'yes' | True: 'yes'
  Predicted: 'no' | True: 'no'
  Predicted: 'no' | True: 'no'
  Predicted: 'no' | True: 'yes'
  Predicted: 'no' | True: 'no'

üìä Predictions: Yes=3160, No=2840
üìä True labels: Yes=3000, No=3000

üîç Sample predictions:
  Predicted: 'no' | True: 'no'
  Predicted: 'no' | True: 'no'
  Predicted: 'no' | True: 'no'
  Predicted: 'yes' | True: 'no'
  Predicted: 'yes' | True: 'yes'
  Predicted: 'yes' | True: 'yes'
  Predicted: 'no' | True: 'no'
  Predicted: 'no' | True: 'no'
  Predicted: 'no' | True: 'yes'
  Predicted: 'no' | True: 'no'

üìä Predictions: Yes=2257, No=3743
üìä True labels: Yes=3000, No=3000

üîç Sample predictions:
  Predicted: 'no' | True: 'no'
  Predicted: 'yes' | True: 'no'
  Predicted: 'no' | True: 'no'
  Predicted: 'yes' | True: 'no'
  Predicted: 'yes

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


‚úì mT5 training complete


In [12]:
"""Evaluate mT5 model."""

mt5_results = mt5_trainer.evaluate()

# Only these entries are scores in the 0-1 range. Runtime, throughput and
# epoch are plain numbers and must not be rendered as percentages.
mt5_percent_metrics = {"eval_accuracy", "eval_precision", "eval_recall", "eval_f1"}

print("\n" + "=" * 50)
print("mT5 RESULTS")
print("=" * 50)
for metric, value in mt5_results.items():
    if not isinstance(value, float) or 'loss' in metric:
        continue
    line = f"{metric.upper():<20}: {value:.4f}"
    if metric in mt5_percent_metrics:
        line += f" ({value * 100:.2f}%)"
    print(line)
print("=" * 50)


üîç Sample predictions:
  Predicted: 'no' | True: 'no'
  Predicted: 'yes' | True: 'no'
  Predicted: 'no' | True: 'no'
  Predicted: 'yes' | True: 'no'
  Predicted: 'yes' | True: 'yes'
  Predicted: 'yes' | True: 'yes'
  Predicted: 'yes' | True: 'no'
  Predicted: 'no' | True: 'no'
  Predicted: 'yes' | True: 'yes'
  Predicted: 'no' | True: 'no'

üìä Predictions: Yes=2883, No=3117
üìä True labels: Yes=3000, No=3000

mT5 RESULTS
EVAL_ACCURACY       : 0.7948 (79.48%)
EVAL_PRECISION      : 0.7953 (79.53%)
EVAL_RECALL         : 0.7948 (79.48%)
EVAL_F1             : 0.7948 (79.48%)
EVAL_RUNTIME        : 32.1106 (3211.06%)
EVAL_SAMPLES_PER_SECOND: 186.8540 (18685.40%)
EVAL_STEPS_PER_SECOND: 11.6780 (1167.80%)
EPOCH               : 15.0000 (1500.00%)


## 6. Multilingual Inference Demo

In [13]:
"""Inference functions for both models."""


def run_xlm_inference(
    trainer: Trainer,
    tokenizer: XLMRobertaTokenizer,
    text: str
) -> Dict[str, Any]:
    """Classify one text with the fine-tuned XLM-RoBERTa model.

    Args:
        trainer: Trainer whose ``model`` attribute holds the classifier.
        tokenizer: Tokenizer matching that model.
        text: Raw input text.

    Returns:
        The first result of the text-classification pipeline; the demo cells
        read its ``label`` and ``score`` entries.
    """
    # A new pipeline is built on every call; the first GPU is used if available.
    classifier = pipeline(
        "text-classification",
        model=trainer.model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
    )
    # Truncate exactly as during training so long inputs cannot exceed the
    # encoder's maximum sequence length.
    return classifier(text, truncation=True, max_length=MAX_LENGTH)[0]


def run_mt5_inference(
    model: MT5ForConditionalGeneration,
    tokenizer: MT5Tokenizer,
    text: str
) -> str:
    """Ask the fine-tuned mT5 model whether one text is hate speech.

    Args:
        model: Fine-tuned mT5 model.
        tokenizer: Tokenizer matching that model.
        text: Raw input text.

    Returns:
        The decoded generation with special tokens removed.
    """
    # Same prompt prefix and truncation length as used in `preprocess_mt5`.
    formatted_input = f"Is this hate speech? {text}"
    inputs = tokenizer(
        formatted_input,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
    ).to(model.device)

    # NOTE(review): this is unconstrained beam search, whereas evaluation used
    # greedy decoding restricted to yes/no; outputs here may differ from the
    # behaviour that the reported metrics describe.
    output_sequences = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=8,
        num_beams=4,
        early_stopping=True,
        do_sample=False
    )

    return tokenizer.decode(output_sequences[0], skip_special_tokens=True)


print("‚úì Inference functions defined")

‚úì Inference functions defined


In [14]:
"""Run multilingual inference on non-hate speech samples."""

# One benign sentence per language. The literals are proper UTF-8 so the
# models receive the intended characters rather than mis-encoded text.
NON_HATE_SAMPLES = {
    "English": "This is a wonderful piece of text.",
    "French": "C'est un texte merveilleux.",
    "German": "Das ist ein wundervoller Text.",
    "Italian": "Questo è un testo meraviglioso."
}

results = []

for lang, text in NON_HATE_SAMPLES.items():
    print(f"\n{'='*60}\n🌐 {lang}\n{'='*60}")

    # Run the same sentence through both fine-tuned models.
    xlm_pred = run_xlm_inference(xlm_trainer, xlm_tokenizer, text)
    mt5_pred = run_mt5_inference(mt5_model, mt5_tokenizer, text)

    results.append({
        "Language": lang,
        "Type": "Non-Hate",
        "Input": text,
        "XLM-R Label": xlm_pred.get("label", "N/A"),
        "XLM-R Score": round(xlm_pred.get("score", 0), 4),
        "mT5": mt5_pred
    })

# Display results.
df_results = pd.DataFrame(results)
print(f"\n{'='*60}\n📊 NON-HATE SPEECH RESULTS\n{'='*60}")
print(df_results.to_string(index=False))

Device set to use cuda:0



üåê English


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0



üåê French

üåê German

üåê Italian

📊 NON-HATE SPEECH RESULTS
Language     Type                              Input XLM-R Label  XLM-R Score mT5
 English Non-Hate This is a wonderful piece of text.     LABEL_0       0.9992  no
  French Non-Hate        C'est un texte merveilleux.     LABEL_0       0.9994  no
  German Non-Hate     Das ist ein wundervoller Text.     LABEL_0       0.9988  no
 Italian Non-Hate    Questo è un testo meraviglioso.     LABEL_0       0.9991  no


In [15]:
"""Run multilingual inference on hate speech samples."""

# One hostile sentence per language. The literals are proper UTF-8 so the
# models receive the intended characters rather than mis-encoded text.
HATE_SAMPLES = {
    "English": "I hate you and your stupid ideas.",
    "French": "Je te déteste et tes idées stupides.",
    "German": "Ich hasse dich und deine dummen Ideen.",
    "Italian": "Ti odio e le tue stupide idee."
}

hate_results = []

for lang, text in HATE_SAMPLES.items():
    print(f"\n{'='*60}\n🌐 {lang}\n{'='*60}")

    # Run the same sentence through both fine-tuned models.
    xlm_pred = run_xlm_inference(xlm_trainer, xlm_tokenizer, text)
    mt5_pred = run_mt5_inference(mt5_model, mt5_tokenizer, text)

    hate_results.append({
        "Language": lang,
        "Type": "Hate",
        "Input": text,
        "XLM-R Label": xlm_pred.get("label", "N/A"),
        "XLM-R Score": round(xlm_pred.get("score", 0), 4),
        "mT5": mt5_pred
    })

# Display results.
df_hate = pd.DataFrame(hate_results)
print(f"\n{'='*60}\n📊 HATE SPEECH RESULTS\n{'='*60}")
print(df_hate.to_string(index=False))

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0



üåê English

üåê French

üåê German


Device set to use cuda:0



üåê Italian

📊 HATE SPEECH RESULTS
Language Type                                  Input XLM-R Label  XLM-R Score mT5
 English Hate      I hate you and your stupid ideas.     LABEL_0       0.6672  no
  French Hate   Je te déteste et tes idées stupides.     LABEL_1       0.9970  no
  German Hate Ich hasse dich und deine dummen Ideen.     LABEL_1       0.9945  no
 Italian Hate         Ti odio e le tue stupide idee.     LABEL_1       0.9979 yes


In [16]:
"""Combine and export all inference results."""

# Stack the non-hate rows on top of the hate rows in one table.
all_results = [*results, *hate_results]
df_all = pd.DataFrame(all_results)

# Write the combined table next to the notebook.
csv_path = "multilingual_inference_results.csv"
df_all.to_csv(csv_path, index=False)

# Show the combined table and where it was saved.
print(f"\n{'='*60}\nüåç COMBINED MULTILINGUAL RESULTS\n{'='*60}")
print(df_all.to_string(index=False))
print(f"\n‚úÖ Results saved to: {csv_path}\n{'='*60}")


🌍 COMBINED MULTILINGUAL RESULTS
Language     Type                                  Input XLM-R Label  XLM-R Score mT5
 English Non-Hate     This is a wonderful piece of text.     LABEL_0       0.9992  no
  French Non-Hate            C'est un texte merveilleux.     LABEL_0       0.9994  no
  German Non-Hate         Das ist ein wundervoller Text.     LABEL_0       0.9988  no
 Italian Non-Hate        Questo è un testo meraviglioso.     LABEL_0       0.9991  no
 English     Hate      I hate you and your stupid ideas.     LABEL_0       0.6672  no
  French     Hate   Je te déteste et tes idées stupides.     LABEL_1       0.9970  no
  German     Hate Ich hasse dich und deine dummen Ideen.     LABEL_1       0.9945  no
 Italian     Hate         Ti odio e le tue stupide idee.     LABEL_1       0.9979 yes

✅ Results saved to: multilingual_inference_results.csv
