In [1]:
# Import necessary libraries
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AdamW
from sklearn.metrics import classification_report

# Step 1: Load and preprocess the dataset
data = pd.read_csv('./Bengali Heatspeech dataset.csv').dropna()

# Tokenize the text using XLM-Roberta tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
tokenized_data = tokenizer(
    list(data['Text']),  # List of text samples
    padding=True,        # Pad to the longest sequence
    truncation=True,     # Truncate sequences longer than max_length
    max_length=128,      # Maximum sequence length
    return_tensors="pt"  # Return PyTorch tensors
)

# Extract input IDs and attention masks
input_ids = tokenized_data["input_ids"]
attention_masks = tokenized_data["attention_mask"]

# Extract labels
labels = data[['Race', 'Behaviour', 'Physical', 'Class', 'Religion', 'Disability', 'Ethnicity', 'Gender', 'Sexual Orientation', 'Political']]
labels = torch.tensor(labels.values, dtype=torch.float32)

# Split the data into training and validation sets
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

# Create TensorDatasets
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64)

In [None]:
# Step 2: Define the Multi-task Learning Model
class MultiTaskModel(nn.Module):
    def __init__(self, base_model, num_tasks):
        super(MultiTaskModel, self).__init__()
        self.base_model = base_model
        self.task_heads = nn.ModuleList([nn.Linear(768, 1) for _ in range(num_tasks)])  # 768 is the hidden size of XLM-Roberta

    def forward(self, input_ids, attention_mask):
        # Shared encoder output
        shared_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        # Task-specific outputs
        task_outputs = [head(shared_output) for head in self.task_heads]
        return task_outputs


# Load the pre-trained XLM-Roberta model
base_model = AutoModel.from_pretrained("xlm-roberta-base")

# Extract column names from the original labels DataFrame
label_names = data[['Race', 'Behaviour', 'Physical', 'Class', 'Religion', 'Disability', 'Ethnicity', 'Gender', 'Sexual Orientation', 'Political']].columns.tolist()

# Number of tasks
num_tasks = len(label_names)

# Initialize the multi-task model
multitask_model = MultiTaskModel(base_model, num_tasks)

# Step 3: Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(multitask_model.parameters(), lr=1e-5)

# Step 4: Define the training loop
# def compute_loss(outputs, labels):
#     total_loss = 0
#     for i, output in enumerate(outputs):
#         total_loss += criterion(output.squeeze(), labels[:, i])
#     return total_loss

def compute_loss(outputs, labels, class_indices=None):
    total_loss = 0
    if class_indices is not None:
        # Use only the outputs and labels for the specified class indices
        outputs = [outputs[i] for i in class_indices]
        for i, output in enumerate(outputs):
            total_loss += criterion(output.squeeze(), labels[:, i])
    else:
        # General case: use all outputs and labels
        for i, output in enumerate(outputs):
            total_loss += criterion(output.squeeze(), labels[:, i])
    return total_loss


# Define a threshold for stopping
class LossMonitor:
    def __init__(self, patience=3):
        self.previous_loss = float('inf')  # Initialize with a very high value
        self.patience = patience          # Number of epochs to tolerate increasing loss
        self.counter = 0                  # Counter for consecutive increases

    def check_loss(self, current_loss):
        if current_loss > self.previous_loss or current_loss < 0:
            self.counter += 1
            print(f"Warning: Loss increased or became negative! Counter: {self.counter}")
        else:
            self.counter = 0  # Reset counter if loss improves

        self.previous_loss = current_loss

        # Stop training if loss increases for `patience` consecutive epochs
        return self.counter >= self.patience


# Training the model
epochs = 200
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multitask_model.to(device)

# Initialize the loss monitor
loss_monitor = LossMonitor(patience=3)

for epoch in range(epochs):
    multitask_model.train()
    total_train_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = multitask_model(input_ids=input_ids, attention_mask=attention_mask)
        loss = compute_loss(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(multitask_model.parameters(), max_norm=1.0)  # Enable gradient clipping
        optimizer.step()

        total_train_loss += loss.item()

    # Calculate average loss for the epoch
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Check if training should stop
    if loss_monitor.check_loss(avg_train_loss):
        print("Stopping training due to increasing or invalid loss.")
        break

    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss}")

2025-06-07 02:05:08.045652: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-07 02:05:08.054700: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749240308.064345  376422 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749240308.067535  376422 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749240308.075653  376422 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/200, Training Loss: 5.108789652585983
Epoch 2/200, Training Loss: 3.6857836320996284
Epoch 3/200, Training Loss: 3.427482984960079
Epoch 4/200, Training Loss: 3.308367505669594
Epoch 5/200, Training Loss: 3.1876787170767784
Epoch 6/200, Training Loss: 3.016574703156948
Epoch 7/200, Training Loss: 2.9044854566454887
Epoch 8/200, Training Loss: 2.8155576810240746
Epoch 9/200, Training Loss: 2.7580373361706734
Epoch 10/200, Training Loss: 2.705940544605255
Epoch 11/200, Training Loss: 2.654635727405548
Epoch 12/200, Training Loss: 2.5661684535443783
Epoch 13/200, Training Loss: 2.526083506643772
Epoch 14/200, Training Loss: 2.460718672722578
Epoch 15/200, Training Loss: 2.425331048667431
Epoch 16/200, Training Loss: 2.368496961891651
Epoch 17/200, Training Loss: 2.2924506589770317
Epoch 18/200, Training Loss: 2.2342964708805084
Epoch 19/200, Training Loss: 2.186908580362797
Epoch 20/200, Training Loss: 2.110852997750044
Epoch 21/200, Training Loss: 2.0857756324112415
Epoch 22/200,

In [None]:
import numpy as np
from sklearn.metrics import classification_report, f1_score

# Step 5: Evaluate the model
multitask_model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        # Get model outputs
        outputs = multitask_model(input_ids=input_ids, attention_mask=attention_mask)

        # Ensure outputs are stacked correctly
        predictions = torch.cat([torch.sigmoid(output) for output in outputs], dim=1).cpu()  # Concatenate task outputs along dimension 1
        all_predictions.append(predictions)
        all_labels.append(labels.cpu())

# Flatten predictions and labels
all_predictions = torch.cat(all_predictions, dim=0)  # Concatenate along batch dimension
all_labels = torch.cat(all_labels, dim=0)

# Convert to NumPy for threshold tuning
all_predictions = all_predictions.numpy()
all_labels = all_labels.numpy()

# Tune thresholds for each label
best_thresholds = []
for i in range(all_labels.shape[1]):  # Iterate over each label
    best_f1 = 0
    best_threshold = 0.5
    for threshold in np.arange(0.1, 1.0, 0.1):  # Test thresholds from 0.1 to 0.9
        preds = (all_predictions[:, i] > threshold).astype(int)
        f1 = f1_score(all_labels[:, i], preds)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    best_thresholds.append(best_threshold)

print("Best thresholds:", best_thresholds)

# Apply thresholds to predictions
final_predictions = np.zeros_like(all_predictions)
for i, threshold in enumerate(best_thresholds):
    final_predictions[:, i] = (all_predictions[:, i] > threshold).astype(int)

# Extract column names from the original labels DataFrame
label_names = data[['Race', 'Behaviour', 'Physical', 'Class', 'Religion', 'Disability', 'Ethnicity', 'Gender', 'Sexual Orientation', 'Political']].columns.tolist()

# Generate classification report
print(classification_report(all_labels, final_predictions, target_names=label_names))

---

### **1. General Training**
- **Purpose**: Train the model on all tasks (labels) using the full dataset.
- **Outcome**:
  - The shared base model (`XLM-Roberta`) learns general representations for all tasks.
  - The task-specific heads are trained to predict each label, but resource-limited classes may not perform well due to insufficient data or imbalance.

---

### **2. Identify Low-Performing Classes**
- After general training, evaluate the model on the validation set and analyze the **classification report**.
- Look for:
  - **Low F1-scores**: Indicates poor performance.
  - **Low support**: Indicates insufficient data for the class.
- Example (from your earlier report):
  - `Disability`: F1-score = 0.00 (support = 11)
  - `Physical`: F1-score = 0.21 (support = 30)
  - `Gender`: F1-score = 0.33 (support = 45)
  - `Sexual Orientation`: F1-score = 0.40 (support = 36)

---

### **3. Few-Shot Fine-Tuning**
- **Purpose**: Focus on improving the performance of resource-limited classes.
- **Why It Works**:
  - The shared base model (`XLM-Roberta`) already has a strong foundation from general training.
  - Fine-tuning on resource-limited classes allows the task-specific heads for these classes to adapt further without forgetting the general knowledge learned earlier.
- **Steps**:
  1. Extract a subset of the dataset containing examples for the resource-limited classes.
  2. Fine-tune the model on this subset.

---

### **4. Benefits of This Approach**
1. **Shared Knowledge**:
   - The shared base model retains the general knowledge learned during the first phase of training.
   - This helps the resource-limited classes benefit from the representations learned for other tasks.

2. **Efficient Use of Data**:
   - Few-shot fine-tuning focuses only on the resource-limited classes, making efficient use of the limited data available for these classes.

3. **Improved Balance**:
   - By improving the performance of low-performing classes, the overall balance across all tasks is improved.

---

### **Key Insight**
The **shared weights** in the base model ensure that the knowledge learned during general training is not lost. Fine-tuning only adjusts the task-specific heads and slightly refines the shared representations, making it an efficient and effective approach.

Let me know if you'd like help implementing or optimizing this workflow further! 🚀
---

### **1. General Training**
- **Purpose**: Train the model on all tasks (labels) using the full dataset.
- **Outcome**:
  - The shared base model (`XLM-Roberta`) learns general representations for all tasks.
  - The task-specific heads are trained to predict each label, but resource-limited classes may not perform well due to insufficient data or imbalance.

---

### **2. Identify Low-Performing Classes**
- After general training, evaluate the model on the validation set and analyze the **classification report**.
- Look for:
  - **Low F1-scores**: Indicates poor performance.
  - **Low support**: Indicates insufficient data for the class.
- Example (from your earlier report):
  - `Disability`: F1-score = 0.00 (support = 11)
  - `Physical`: F1-score = 0.21 (support = 30)
  - `Gender`: F1-score = 0.33 (support = 45)
  - `Sexual Orientation`: F1-score = 0.40 (support = 36)

---

### **3. Few-Shot Fine-Tuning**
- **Purpose**: Focus on improving the performance of resource-limited classes.
- **Why It Works**:
  - The shared base model (`XLM-Roberta`) already has a strong foundation from general training.
  - Fine-tuning on resource-limited classes allows the task-specific heads for these classes to adapt further without forgetting the general knowledge learned earlier.
- **Steps**:
  1. Extract a subset of the dataset containing examples for the resource-limited classes.
  2. Fine-tune the model on this subset.

---

### **4. Benefits of This Approach**
1. **Shared Knowledge**:
   - The shared base model retains the general knowledge learned during the first phase of training.
   - This helps the resource-limited classes benefit from the representations learned for other tasks.

2. **Efficient Use of Data**:
   - Few-shot fine-tuning focuses only on the resource-limited classes, making efficient use of the limited data available for these classes.

3. **Improved Balance**:
   - By improving the performance of low-performing classes, the overall balance across all tasks is improved.

---

### **Key Insight**
The **shared weights** in the base model ensure that the knowledge learned during general training is not lost. Fine-tuning only adjusts the task-specific heads and slightly refines the shared representations, making it an efficient and effective approach.

Let me know if you'd like help implementing or optimizing this workflow further! 🚀

In [None]:
def compute_loss(outputs, labels, class_indices=None):
    total_loss = 0
    if class_indices is not None:
        # Use only the outputs and labels for the specified class indices
        outputs = [outputs[i] for i in class_indices]  # Select outputs for resource-limited classes
        for i, output in enumerate(outputs):
            total_loss += criterion(output.squeeze(), labels[:, i])  # Match labels to selected outputs
    else:
        # General case: use all outputs and labels
        for i, output in enumerate(outputs):
            total_loss += criterion(output.squeeze(), labels[:, i])
    return total_loss

In [None]:
# Step 1: Identify Resource-Limited Classes
resource_limited_classes = ['Race', 'Physical', 'Class', 'Religion', 'Disability', 'Ethnicity']

# Step 2: Prepare Few-Shot Dataset
# Filter the dataset for resource-limited classes
few_shot_data = data[data[resource_limited_classes].sum(axis=1) > 0]  # Rows where at least one resource-limited class is positive

# Tokenize the few-shot dataset
few_shot_tokenized = tokenizer(
    list(few_shot_data['Text']),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Extract input IDs, attention masks, and labels
few_shot_inputs = few_shot_tokenized["input_ids"]
few_shot_masks = few_shot_tokenized["attention_mask"]
few_shot_labels = torch.tensor(few_shot_data[resource_limited_classes].values, dtype=torch.float32)

# Create a TensorDataset and DataLoader
few_shot_dataset = TensorDataset(few_shot_inputs, few_shot_masks, few_shot_labels)
few_shot_dataloader = DataLoader(few_shot_dataset, batch_size=8, shuffle=True)




# Create train_data and val_data using the indices
train_data = data.loc[train_indices]
val_data = data.loc[val_indices]

# Filter the training data for resource-limited classes
resource_limited_classes = ['Race', 'Physical', 'Class', 'Religion', 'Disability', 'Ethnicity']
few_shot_data = train_data[train_data[resource_limited_classes].sum(axis=1) > 0]  # Use only training data

# Tokenize the few-shot dataset
few_shot_tokenized = tokenizer(
    list(few_shot_data['Text']),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Extract input IDs, attention masks, and labels
few_shot_inputs = few_shot_tokenized["input_ids"]
few_shot_masks = few_shot_tokenized["attention_mask"]
few_shot_labels = torch.tensor(few_shot_data[resource_limited_classes].values, dtype=torch.float32)

# Create a TensorDataset and DataLoader
few_shot_dataset = TensorDataset(few_shot_inputs, few_shot_masks, few_shot_labels)
few_shot_dataloader = DataLoader(few_shot_dataset, batch_size=8, shuffle=True)

In [None]:
# Get the indices of the resource-limited classes
resource_indices = [label_names.index(cls) for cls in resource_limited_classes]

# Few-Shot Fine-Tuning
few_shot_epochs = 20
optimizer = AdamW(multitask_model.parameters(), lr=1e-5)

for epoch in range(few_shot_epochs):
    multitask_model.train()
    total_loss = 0

    for batch in few_shot_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = multitask_model(input_ids=input_ids, attention_mask=attention_mask)
        loss = compute_loss(outputs, labels, class_indices=resource_indices)  # Pass resource class indices
        loss.backward()
        torch.nn.utils.clip_grad_norm_(multitask_model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(few_shot_dataloader)
        # Check if training should stop
    if loss_monitor.check_loss(avg_train_loss):
        print("Stopping training due to increasing or invalid loss.")
        break

    # print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss}")
    print(f"Few-Shot Epoch {epoch + 1}/{few_shot_epochs}, Loss: {avg_loss}")

In [None]:
import numpy as np
from sklearn.metrics import classification_report, f1_score

# Step 5: Evaluate the model
multitask_model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        # Get model outputs
        outputs = multitask_model(input_ids=input_ids, attention_mask=attention_mask)

        # Ensure outputs are stacked correctly
        predictions = torch.cat([torch.sigmoid(output) for output in outputs], dim=1).cpu()  # Concatenate task outputs along dimension 1
        all_predictions.append(predictions)
        all_labels.append(labels.cpu())

# Flatten predictions and labels
all_predictions = torch.cat(all_predictions, dim=0)  # Concatenate along batch dimension
all_labels = torch.cat(all_labels, dim=0)

# Convert to NumPy for threshold tuning
all_predictions = all_predictions.numpy()
all_labels = all_labels.numpy()

# Tune thresholds for each label
best_thresholds = []
for i in range(all_labels.shape[1]):  # Iterate over each label
    best_f1 = 0
    best_threshold = 0.5
    for threshold in np.arange(0.1, 1.0, 0.1):  # Test thresholds from 0.1 to 0.9
        preds = (all_predictions[:, i] > threshold).astype(int)
        f1 = f1_score(all_labels[:, i], preds)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    best_thresholds.append(best_threshold)

print("Best thresholds:", best_thresholds)

# Apply thresholds to predictions
final_predictions = np.zeros_like(all_predictions)
for i, threshold in enumerate(best_thresholds):
    final_predictions[:, i] = (all_predictions[:, i] > threshold).astype(int)

# Extract column names from the original labels DataFrame
label_names = data[['Race', 'Behaviour', 'Physical', 'Class', 'Religion', 'Disability', 'Ethnicity', 'Gender', 'Sexual Orientation', 'Political']].columns.tolist()

# Generate classification report
print(classification_report(all_labels, final_predictions, target_names=label_names))

This classification report provides a detailed evaluation of the model's performance across multiple classes. Here's a breakdown of the metrics and their implications:

---

### **Metrics Explained**
1. **Precision**:
   - Precision measures the proportion of true positive predictions out of all positive predictions made by the model.
   - Formula: `Precision = True Positives / (True Positives + False Positives)`
   - High precision indicates fewer false positives.

2. **Recall**:
   - Recall measures the proportion of true positive predictions out of all actual positive instances.
   - Formula: `Recall = True Positives / (True Positives + False Negatives)`
   - High recall indicates fewer false negatives.

3. **F1-Score**:
   - The F1-score is the harmonic mean of precision and recall, balancing both metrics.
   - Formula: `F1-Score = 2 * (Precision * Recall) / (Precision + Recall)`
   - High F1-scores indicate a good balance between precision and recall.

4. **Support**:
   - Support refers to the number of actual instances for each class in the dataset.

---

### **Class-Level Analysis**
1. **Race**:
   - Precision: 0.71, Recall: 0.80, F1-Score: 0.75
   - The model performs well for this class, with balanced precision and recall.

2. **Behaviour**:
   - Precision: 0.50, Recall: 0.82, F1-Score: 0.62
   - High recall but lower precision indicates the model is good at identifying positive instances but may produce more false positives.

3. **Physical**:
   - Precision: 0.83, Recall: 1.00, F1-Score: 0.91
   - Excellent performance with perfect recall and high precision.

4. **Class**:
   - Precision: 0.61, Recall: 0.36, F1-Score: 0.45
   - Lower recall indicates the model struggles to identify positive instances for this class.

5. **Religion**:
   - Precision: 0.66, Recall: 0.79, F1-Score: 0.72
   - Balanced performance with good recall and precision.

6. **Disability**:
   - Precision: 0.83, Recall: 0.91, F1-Score: 0.87
   - Strong performance despite low support (11 samples).

7. **Ethnicity**:
   - Precision: 0.65, Recall: 0.83, F1-Score: 0.73
   - Good recall and balanced precision.

8. **Gender**:
   - Precision: 0.68, Recall: 0.89, F1-Score: 0.77
   - High recall and good precision indicate strong performance.

9. **Sexual Orientation**:
   - Precision: 0.80, Recall: 0.89, F1-Score: 0.84
   - Excellent performance with high precision and recall.

10. **Political**:
    - Precision: 0.77, Recall: 0.59, F1-Score: 0.67
    - Lower recall indicates the model misses some positive instances for this class.

---

### **Overall Metrics**
1. **Micro Average**:
   - Precision: 0.63, Recall: 0.78, F1-Score: 0.69
   - Micro average aggregates metrics globally across all classes, treating all instances equally.

2. **Macro Average**:
   - Precision: 0.70, Recall: 0.79, F1-Score: 0.73
   - Macro average computes metrics for each class independently and averages them, giving equal weight to all classes.

3. **Weighted Average**:
   - Precision: 0.65, Recall: 0.78, F1-Score: 0.69
   - Weighted average considers the support of each class, giving more weight to classes with higher support.

4. **Samples Average**:
   - Precision: 0.58, Recall: 0.66, F1-Score: 0.59
   - Samples average evaluates multi-label classification by averaging metrics across all samples.

---

### **Insights**
1. **Improved Performance for Resource-Limited Classes**:
   - Classes like `Disability`, `Physical`, `Gender`, and `Sexual Orientation` show significant improvement in precision, recall, and F1-score, likely due to the few-shot fine-tuning.

2. **Balanced Performance**:
   - The macro average F1-score (0.73) indicates balanced performance across all classes, including those with lower support.

3. **Areas for Improvement**:
   - Classes like `Class` and `Political` have lower recall, suggesting the model struggles to identify positive instances for these classes.

---

### **Conclusion**
The model demonstrates strong overall performance, particularly for resource-limited classes after few-shot fine-tuning. However, further optimization may be needed for classes with lower recall, such as `Class` and `Political`. Let me know if you'd like to explore specific improvements! 🚀

The type of **few-shot learning** used here is **fine-tuning-based few-shot learning**. This approach leverages a pre-trained model (e.g., XLM-Roberta) and fine-tunes it on a small subset of data for specific tasks or classes. Here's a breakdown of the methodology:

---

### **Type of Few-Shot Learning: Fine-Tuning-Based Few-Shot Learning**

#### **Key Characteristics**
1. **Pre-trained Model**:
   - The model starts with pre-trained weights from a large dataset (e.g., XLM-Roberta trained on multilingual text).
   - These weights provide general knowledge that can be transferred to the target tasks.

2. **Fine-Tuning**:
   - The model is fine-tuned on a small subset of labeled data for resource-limited classes.
   - This allows the model to adapt its task-specific heads and shared representations to better handle the few-shot data.

3. **Task-Specific Focus**:
   - Instead of retraining the model on all tasks, the fine-tuning focuses only on the resource-limited classes (e.g., `Disability`, `Physical`, `Gender`, `Sexual Orientation`).

---

### **Why Fine-Tuning-Based Few-Shot Learning?**
This approach is suitable because:
1. **Pre-trained Knowledge**:
   - The shared base model (XLM-Roberta) already has general knowledge from the initial training phase.
   - Fine-tuning refines this knowledge for specific tasks or classes.

2. **Small Data Availability**:
   - Few-shot learning is ideal for resource-limited classes with low support (e.g., `Disability` with only 11 samples).

3. **Efficiency**:
   - Fine-tuning only adjusts the task-specific heads and slightly refines the shared representations, making it computationally efficient.

---

### **Alternative Few-Shot Learning Approaches**
If you want to explore other few-shot learning paradigms, here are some alternatives:

#### **1. Meta-Learning-Based Few-Shot Learning**
- **Example**: **MAML (Model-Agnostic Meta-Learning)** or **Prototypical Networks**.
- **How It Works**:
  - The model is trained to quickly adapt to new tasks with minimal data.
  - Instead of fine-tuning, the model learns a meta-knowledge representation that generalizes across tasks.
- **Use Case**:
  - Ideal for scenarios where the model needs to adapt to entirely new tasks or classes.

#### **2. Prompt-Based Few-Shot Learning**
- **Example**: GPT-style models with in-context learning.
- **How It Works**:
  - The model is provided with a few examples in the input prompt (e.g., "Here are 3 examples of class X").
  - No fine-tuning is required; the model uses its pre-trained knowledge to infer the task.
- **Use Case**:
  - Suitable for large language models with extensive pre-training.

#### **3. Augmentation-Based Few-Shot Learning**
- **How It Works**:
  - Augment the few-shot dataset using techniques like paraphrasing, back-translation, or adding noise.
  - Train the model on the augmented dataset to improve generalization.
- **Use Case**:
  - Useful when the few-shot dataset is extremely small.

---

### **Why Fine-Tuning Was Chosen Here**
- **Pre-trained Model**: XLM-Roberta is already trained on a large multilingual corpus, making it ideal for transfer learning.
- **Resource-Limited Classes**: Fine-tuning allows targeted improvement for specific classes without retraining the entire model.
- **Efficiency**: Fine-tuning is computationally efficient compared to meta-learning or prompt-based approaches.

---

Let me know if you'd like to explore other few-shot learning paradigms or need further clarification! 🚀

In [1]:
# Import necessary libraries
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AdamW
from sklearn.metrics import classification_report, f1_score
import numpy as np

# Step 1: Load the dataset
data = pd.read_csv('./Bengali Heatspeech dataset.csv').dropna()

# Step 2: Dynamically set the number of tasks
label_columns = ['Race', 'Behaviour', 'Physical', 'Class', 'Religion', 'Disability', 'Ethnicity', 'Gender', 'Sexual Orientation', 'Political']
num_tasks = len(label_columns)

# Step 3: Split the dataset into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 4: Tokenize the data
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize training data
train_tokenized = tokenizer(
    list(train_data['Text']),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Tokenize validation data
val_tokenized = tokenizer(
    list(val_data['Text']),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Extract input IDs, attention masks, and labels for training and validation data
train_inputs = train_tokenized["input_ids"]
train_masks = train_tokenized["attention_mask"]
train_labels = torch.tensor(train_data[label_columns].values, dtype=torch.float32)

val_inputs = val_tokenized["input_ids"]
val_masks = val_tokenized["attention_mask"]
val_labels = torch.tensor(val_data[label_columns].values, dtype=torch.float32)

# Step 5: Create DataLoaders
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64)

# Step 6: Define the Multi-task Learning Model
class MultiTaskModel(nn.Module):
    def __init__(self, base_model, num_tasks):
        super(MultiTaskModel, self).__init__()
        self.base_model = base_model
        self.task_heads = nn.ModuleList([nn.Linear(768, 1) for _ in range(num_tasks)])  # 768 is the hidden size of XLM-Roberta

    def forward(self, input_ids, attention_mask):
        shared_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        task_outputs = [head(shared_output) for head in self.task_heads]
        return task_outputs

base_model = AutoModel.from_pretrained("xlm-roberta-base")
multitask_model = MultiTaskModel(base_model, num_tasks)

# Step 7: Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(multitask_model.parameters(), lr=1e-5)

# Step 8: Training Loop
epochs = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multitask_model.to(device)

for epoch in range(epochs):
    multitask_model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = multitask_model(input_ids=input_ids, attention_mask=attention_mask)
        # Debugging: Print shapes
        # print(f"Number of outputs: {len(outputs)}")
        # print(f"Labels shape: {labels.shape}")
        # Ensure outputs and labels are aligned
        assert len(outputs) == labels.shape[1], "Mismatch between model outputs and labels"
        loss = sum(criterion(output.squeeze(), labels[:, i]) for i, output in enumerate(outputs))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(multitask_model.parameters(), max_norm=1.0)
        optimizer.step()
        total_train_loss += loss.item()
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss}")

# Step 9: Evaluate the Model
multitask_model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = multitask_model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.cat([torch.sigmoid(output) for output in outputs], dim=1).cpu()
        all_predictions.append(predictions)
        all_labels.append(labels.cpu())

all_predictions = torch.cat(all_predictions, dim=0).numpy()
all_labels = torch.cat(all_labels, dim=0).numpy()

best_thresholds = []
for i in range(all_labels.shape[1]):
    best_f1 = 0
    best_threshold = 0.5
    for threshold in np.arange(0.1, 1.0, 0.1):
        preds = (all_predictions[:, i] > threshold).astype(int)
        f1 = f1_score(all_labels[:, i], preds)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    best_thresholds.append(best_threshold)

print("Best thresholds:", best_thresholds)

final_predictions = np.zeros_like(all_predictions)
for i, threshold in enumerate(best_thresholds):
    final_predictions[:, i] = (all_predictions[:, i] > threshold).astype(int)

print(classification_report(all_labels, final_predictions, target_names=label_columns))

2025-06-07 02:17:40.850819: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-07 02:17:40.863944: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749241060.877585  387582 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749241060.881638  387582 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749241060.893670  387582 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/50, Training Loss: 6.2243523597717285
Epoch 2/50, Training Loss: 3.790535628795624
Epoch 3/50, Training Loss: 3.467610940337181
Epoch 4/50, Training Loss: 3.343405559659004
Epoch 5/50, Training Loss: 3.1992236226797104
Epoch 6/50, Training Loss: 3.085482895374298
Epoch 7/50, Training Loss: 2.955268256366253
Epoch 8/50, Training Loss: 2.8634586110711098
Epoch 9/50, Training Loss: 2.79030305147171
Epoch 10/50, Training Loss: 2.7488602995872498
Epoch 11/50, Training Loss: 2.6752632558345795
Epoch 12/50, Training Loss: 2.643180102109909
Epoch 13/50, Training Loss: 2.5848512649536133
Epoch 14/50, Training Loss: 2.5357800610363483
Epoch 15/50, Training Loss: 2.4633786976337433
Epoch 16/50, Training Loss: 2.411793239414692
Epoch 17/50, Training Loss: 2.3641993403434753
Epoch 18/50, Training Loss: 2.3147896640002728
Epoch 19/50, Training Loss: 2.2783000953495502
Epoch 20/50, Training Loss: 2.218956932425499
Epoch 21/50, Training Loss: 2.136879589408636
Epoch 22/50, Training Loss: 2.105

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import numpy as np
from sklearn.metrics import f1_score
# Step 1: Generate predictions and labels
multitask_model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = multitask_model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.cat([torch.sigmoid(output) for output in outputs], dim=1).cpu()
        all_predictions.append(predictions)
        all_labels.append(labels.cpu())

# Flatten predictions and labels
all_predictions = torch.cat(all_predictions, dim=0).numpy()
all_labels = torch.cat(all_labels, dim=0).numpy()

# Step 2: Tune thresholds for each class
best_thresholds = []
for i in range(all_labels.shape[1]):  # Iterate over each class
    best_f1 = 0
    best_threshold = 0.5
    for threshold in np.arange(0.1, 1.0, 0.1):  # Test thresholds from 0.1 to 0.9
        preds = (all_predictions[:, i] > threshold).astype(int)
        f1 = f1_score(all_labels[:, i], preds)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    best_thresholds.append(best_threshold)

print("Best thresholds:", best_thresholds)

# Step 3: Apply thresholds to predictions
final_predictions = np.zeros_like(all_predictions)
for i, threshold in enumerate(best_thresholds):
    final_predictions[:, i] = (all_predictions[:, i] > threshold).astype(int)

# Step 4: Evaluate the model
from sklearn.metrics import classification_report
print(classification_report(all_labels, final_predictions, target_names=label_columns))

In [2]:
# Step 1: Identify Resource-Limited Classes
resource_limited_classes = ['Race', 'Physical', 'Class', 'Religion', 'Disability', 'Ethnicity']

# Step 2: Map Resource-Limited Classes to Indices
resource_indices = [label_columns.index(cls) for cls in resource_limited_classes]

# Step 3: Filter Few-Shot Data from Training Data
few_shot_data = train_data[train_data[resource_limited_classes].sum(axis=1) > 0]

# Step 4: Tokenize the Few-Shot Dataset
few_shot_tokenized = tokenizer(
    list(few_shot_data['Text']),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

few_shot_inputs = few_shot_tokenized["input_ids"]
few_shot_masks = few_shot_tokenized["attention_mask"]
few_shot_labels = torch.tensor(few_shot_data[resource_limited_classes].values, dtype=torch.float32)

# Step 5: Create Few-Shot DataLoader
few_shot_dataset = TensorDataset(few_shot_inputs, few_shot_masks, few_shot_labels)
few_shot_dataloader = DataLoader(few_shot_dataset, batch_size=8, shuffle=True)

for param in multitask_model.base_model.parameters():
    param.requires_grad = False
# Step 6: Fine-Tune the Model
few_shot_epochs = 20
optimizer = AdamW(multitask_model.parameters(), lr=1e-5)

few_shot_epochs = 5
for epoch in range(few_shot_epochs):
    multitask_model.train()
    total_loss = 0
    for batch in few_shot_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = multitask_model(input_ids=input_ids, attention_mask=attention_mask)
        # Compute loss only for resource-limited classes
        loss = sum(criterion(outputs[idx].squeeze(), labels[:, i]) for i, idx in enumerate(resource_indices))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(multitask_model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(few_shot_dataloader)
    print(f"Few-Shot Epoch {epoch + 1}/{few_shot_epochs}, Loss: {avg_loss}")



Few-Shot Epoch 1/5, Loss: 1.0140941497825442
Few-Shot Epoch 2/5, Loss: 0.990690614212127
Few-Shot Epoch 3/5, Loss: 0.9603790286041441
Few-Shot Epoch 4/5, Loss: 0.9416010152725947
Few-Shot Epoch 5/5, Loss: 0.9176006478922708


In [4]:
import numpy as np
from sklearn.metrics import f1_score

# Step 1: Generate predictions and labels
multitask_model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = multitask_model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.cat([torch.sigmoid(output) for output in outputs], dim=1).cpu()
        all_predictions.append(predictions)
        all_labels.append(labels.cpu())

# Flatten predictions and labels
all_predictions = torch.cat(all_predictions, dim=0).numpy()
all_labels = torch.cat(all_labels, dim=0).numpy()

# Step 2: Tune thresholds for each class
best_thresholds = []
for i in range(all_labels.shape[1]):  # Iterate over each class
    best_f1 = 0
    best_threshold = 0.5
    for threshold in np.arange(0.1, 1.0, 0.1):  # Test thresholds from 0.1 to 0.9
        preds = (all_predictions[:, i] > threshold).astype(int)
        f1 = f1_score(all_labels[:, i], preds)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    best_thresholds.append(best_threshold)

print("Best thresholds:", best_thresholds)

# Step 3: Apply thresholds to predictions
final_predictions = np.zeros_like(all_predictions)
for i, threshold in enumerate(best_thresholds):
    final_predictions[:, i] = (all_predictions[:, i] > threshold).astype(int)

# Step 4: Evaluate the model
from sklearn.metrics import classification_report
print(classification_report(all_labels, final_predictions, target_names=label_columns))

Best thresholds: [np.float64(0.1), np.float64(0.30000000000000004), np.float64(0.30000000000000004), np.float64(0.4), np.float64(0.1), 0.5, np.float64(0.1), np.float64(0.4), np.float64(0.5), np.float64(0.1)]
                    precision    recall  f1-score   support

              Race       0.32      0.48      0.39        64
         Behaviour       0.54      0.76      0.63       194
          Physical       0.20      0.23      0.22        30
             Class       0.38      0.21      0.27        39
          Religion       0.50      0.65      0.57        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.47      0.62      0.54       110
            Gender       0.49      0.40      0.44        45
Sexual Orientation       0.56      0.39      0.46        36
         Political       0.65      0.69      0.67       106

         micro avg       0.49      0.59      0.54       716
         macro avg       0.41      0.44      0.42       716
      weig

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AdamW
from sklearn.metrics import classification_report, f1_score
import numpy as np

def load_and_preprocess_data(file_path, label_columns):
    """Load and preprocess the dataset"""
    data = pd.read_csv(file_path).dropna()
    labels = torch.tensor(data[label_columns].values, dtype=torch.float32)
    return data, labels

def tokenize_data(texts, tokenizer, max_length=128):
    """Tokenize the input texts"""
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

def create_dataloaders(input_ids, attention_masks, labels, batch_size=64, test_size=0.2):
    """Create train and validation dataloaders"""
    train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
        input_ids, attention_masks, labels, test_size=test_size, random_state=42
    )
    
    train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
    val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
    
    return train_dataloader, val_dataloader, (train_labels, val_labels)

class MultiTaskModel(nn.Module):
    def __init__(self, base_model, num_tasks):
        super(MultiTaskModel, self).__init__()
        self.base_model = base_model
        self.task_heads = nn.ModuleList([nn.Linear(768, 1) for _ in range(num_tasks)])

    def forward(self, input_ids, attention_mask):
        shared_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        task_outputs = [head(shared_output) for head in self.task_heads]
        return task_outputs

def compute_loss(outputs, labels, criterion, class_indices=None, class_weights=None):
    """Compute the loss with optional class weights and indices"""
    total_loss = 0
    if class_indices is not None:
        for i, idx in enumerate(class_indices):
            weight = class_weights[i] if class_weights is not None else 1.0
            total_loss += weight * criterion(outputs[idx].squeeze(), labels[:, i])
    else:
        for i, output in enumerate(outputs):
            total_loss += criterion(output.squeeze(), labels[:, i])
    return total_loss

def train_epoch(model, dataloader, optimizer, criterion, device, class_indices=None, class_weights=None):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = compute_loss(outputs, labels, criterion, class_indices, class_weights)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, device, label_names):
    """Evaluate the model and return predictions and metrics"""
    model.eval()
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.cat([torch.sigmoid(output) for output in outputs], dim=1).cpu()
            all_predictions.append(predictions)
            all_labels.append(labels.cpu())
    
    all_predictions = torch.cat(all_predictions, dim=0).numpy()
    all_labels = torch.cat(all_labels, dim=0).numpy()
    
    # Find best thresholds
    best_thresholds = []
    for i in range(all_labels.shape[1]):
        best_f1 = 0
        best_threshold = 0.5
        for threshold in np.arange(0.1, 1.0, 0.1):
            preds = (all_predictions[:, i] > threshold).astype(int)
            f1 = f1_score(all_labels[:, i], preds)
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        best_thresholds.append(best_threshold)
    
    # Apply thresholds
    final_predictions = np.zeros_like(all_predictions)
    for i, threshold in enumerate(best_thresholds):
        final_predictions[:, i] = (all_predictions[:, i] > threshold).astype(int)
    
    return final_predictions, all_labels, best_thresholds

def main():
    # Configuration
    label_columns = ['Race', 'Behaviour', 'Physical', 'Class', 'Religion', 
                    'Disability', 'Ethnicity', 'Gender', 'Sexual Orientation', 'Political']
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Load and preprocess data
    data, labels = load_and_preprocess_data('./Bengali Heatspeech dataset.csv', label_columns)
    
    # Initialize tokenizer and tokenize data
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    tokenized = tokenize_data(list(data['Text']), tokenizer)
    
    # Create dataloaders
    train_dataloader, val_dataloader, (train_labels, val_labels) = create_dataloaders(
        tokenized["input_ids"],
        tokenized["attention_mask"],
        labels
    )
    
    # Initialize model
    base_model = AutoModel.from_pretrained("xlm-roberta-base")
    model = MultiTaskModel(base_model, len(label_columns)).to(device)
    
    # Training configuration
    criterion = nn.BCEWithLogitsLoss()
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    # Initial training
    print("Starting initial training...")
    for epoch in range(50):
        avg_loss = train_epoch(model, train_dataloader, optimizer, criterion, device)
        print(f"Epoch {epoch + 1}, Loss: {avg_loss}")
        
        if epoch % 5 == 0:  # Evaluate every 5 epochs
            predictions, labels, thresholds = evaluate_model(model, val_dataloader, device, label_columns)
            print("\nValidation Performance:")
            print(classification_report(labels, predictions, target_names=label_columns))
    
    # Few-shot fine-tuning
    print("\nStarting few-shot fine-tuning...")
    resource_limited_classes = ['Race', 'Physical', 'Class', 'Religion', 'Disability', 'Ethnicity']
    resource_indices = [label_columns.index(cls) for cls in resource_limited_classes]
    
    # Prepare few-shot data
    few_shot_data = data[data[resource_limited_classes].sum(axis=1) > 0]
    few_shot_tokenized = tokenize_data(list(few_shot_data['Text']), tokenizer)
    few_shot_labels = torch.tensor(few_shot_data[resource_limited_classes].values, dtype=torch.float32)
    
    few_shot_dataset = TensorDataset(
        few_shot_tokenized["input_ids"],
        few_shot_tokenized["attention_mask"],
        few_shot_labels
    )
    few_shot_dataloader = DataLoader(few_shot_dataset, batch_size=8, shuffle=True)
    
    # Calculate class weights for few-shot learning
    class_counts = few_shot_data[resource_limited_classes].sum(axis=0)
    class_weights = torch.tensor([1.0 / count if count > 0 else 1.0 for count in class_counts], 
                               dtype=torch.float32).to(device)
    
    # Fine-tune
    for epoch in range(5):
        avg_loss = train_epoch(
            model, 
            few_shot_dataloader, 
            optimizer, 
            criterion, 
            device, 
            class_indices=resource_indices,
            class_weights=class_weights
        )
        print(f"Few-shot Epoch {epoch + 1}, Loss: {avg_loss}")
    
    # Final evaluation
    print("\nFinal Evaluation:")
    predictions, labels, thresholds = evaluate_model(model, val_dataloader, device, label_columns)
    print("\nBest thresholds:", thresholds)
    print("\nClassification Report:")
    print(classification_report(labels, predictions, target_names=label_columns))

if __name__ == "__main__":
    main()

2025-06-07 02:48:42.993667: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-07 02:48:43.079798: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749242923.112890    2520 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749242923.123095    2520 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749242923.195588    2520 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Starting initial training...
Epoch 1, Loss: 5.629242122173309

Validation Performance:
                    precision    recall  f1-score   support

              Race       0.13      1.00      0.23        64
         Behaviour       0.39      1.00      0.56       194
          Physical       0.06      0.77      0.12        30
             Class       0.06      0.33      0.11        39
          Religion       0.16      1.00      0.28        81
        Disability       0.02      0.36      0.04        11
         Ethnicity       0.22      1.00      0.36       110
            Gender       0.09      0.89      0.16        45
Sexual Orientation       0.08      1.00      0.15        36
         Political       0.21      1.00      0.35       106

         micro avg       0.16      0.94      0.27       716
         macro avg       0.14      0.84      0.23       716
      weighted avg       0.22      0.94      0.34       716
       samples avg       0.16      0.78      0.26       716



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2, Loss: 3.6963652819395065
Epoch 3, Loss: 3.4588871747255325
Epoch 4, Loss: 3.3180352449417114
Epoch 5, Loss: 3.228292889893055
Epoch 6, Loss: 3.126803159713745

Validation Performance:
                    precision    recall  f1-score   support

              Race       0.32      0.75      0.45        64
         Behaviour       0.48      0.81      0.60       194
          Physical       0.15      0.10      0.12        30
             Class       0.10      0.62      0.17        39
          Religion       0.33      0.58      0.42        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.46      0.66      0.54       110
            Gender       0.24      0.44      0.31        45
Sexual Orientation       0.29      0.75      0.42        36
         Political       0.55      0.75      0.63       106

         micro avg       0.35      0.67      0.46       716
         macro avg       0.29      0.55      0.37       716
      weighted avg       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7, Loss: 3.037480317056179
Epoch 8, Loss: 2.9275747388601303
Epoch 9, Loss: 2.8551408275961876
Epoch 10, Loss: 2.7899747267365456
Epoch 11, Loss: 2.7150727435946465

Validation Performance:
                    precision    recall  f1-score   support

              Race       0.37      0.64      0.47        64
         Behaviour       0.56      0.70      0.62       194
          Physical       0.17      0.27      0.21        30
             Class       0.21      0.41      0.28        39
          Religion       0.58      0.52      0.55        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.53      0.74      0.62       110
            Gender       0.26      0.64      0.37        45
Sexual Orientation       0.37      0.47      0.41        36
         Political       0.63      0.72      0.67       106

         micro avg       0.46      0.62      0.53       716
         macro avg       0.37      0.51      0.42       716
      weighted avg    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 12, Loss: 2.624038189649582
Epoch 13, Loss: 2.5840100087225437
Epoch 14, Loss: 2.55990132689476
Epoch 15, Loss: 2.5104625895619392
Epoch 16, Loss: 2.4391228035092354

Validation Performance:
                    precision    recall  f1-score   support

              Race       0.48      0.45      0.46        64
         Behaviour       0.53      0.76      0.62       194
          Physical       0.12      0.10      0.11        30
             Class       0.29      0.26      0.27        39
          Religion       0.52      0.63      0.57        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.53      0.77      0.63       110
            Gender       0.29      0.51      0.37        45
Sexual Orientation       0.34      0.53      0.41        36
         Political       0.70      0.70      0.70       106

         micro avg       0.49      0.62      0.55       716
         macro avg       0.38      0.47      0.41       716
      weighted avg   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 17, Loss: 2.387563604861498
Epoch 18, Loss: 2.3072171807289124
Epoch 19, Loss: 2.271461881697178
Epoch 20, Loss: 2.1843342669308186
Epoch 21, Loss: 2.123511478304863

Validation Performance:
                    precision    recall  f1-score   support

              Race       0.37      0.55      0.44        64
         Behaviour       0.63      0.65      0.64       194
          Physical       0.20      0.30      0.24        30
             Class       0.34      0.38      0.36        39
          Religion       0.53      0.64      0.58        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.52      0.70      0.60       110
            Gender       0.28      0.51      0.36        45
Sexual Orientation       0.35      0.58      0.44        36
         Political       0.62      0.68      0.65       106

         micro avg       0.48      0.60      0.54       716
         macro avg       0.38      0.50      0.43       716
      weighted avg   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 22, Loss: 2.112840037792921
Epoch 23, Loss: 2.0486152209341526
Epoch 24, Loss: 1.9969901219010353
Epoch 25, Loss: 1.9157525785267353
Epoch 26, Loss: 1.8659468851983547

Validation Performance:
                    precision    recall  f1-score   support

              Race       0.39      0.48      0.43        64
         Behaviour       0.58      0.67      0.62       194
          Physical       0.22      0.23      0.23        30
             Class       0.30      0.28      0.29        39
          Religion       0.48      0.77      0.59        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.51      0.69      0.58       110
            Gender       0.30      0.56      0.39        45
Sexual Orientation       0.37      0.61      0.46        36
         Political       0.65      0.67      0.66       106

         micro avg       0.48      0.61      0.54       716
         macro avg       0.38      0.50      0.43       716
      weighted avg 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 27, Loss: 1.8442133963108063
Epoch 28, Loss: 1.7935220710933208
Epoch 29, Loss: 1.7498814798891544
Epoch 30, Loss: 1.6849523559212685
Epoch 31, Loss: 1.6454904302954674

Validation Performance:
                    precision    recall  f1-score   support

              Race       0.40      0.53      0.46        64
         Behaviour       0.46      0.85      0.60       194
          Physical       0.16      0.23      0.19        30
             Class       0.21      0.38      0.27        39
          Religion       0.48      0.75      0.59        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.48      0.65      0.55       110
            Gender       0.58      0.31      0.41        45
Sexual Orientation       0.38      0.56      0.45        36
         Political       0.58      0.74      0.65       106

         micro avg       0.44      0.65      0.53       716
         macro avg       0.37      0.50      0.42       716
      weighted avg

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 32, Loss: 1.5688375011086464
Epoch 33, Loss: 1.5530081205070019
Epoch 34, Loss: 1.455374775454402
Epoch 35, Loss: 1.4524581916630268
Epoch 36, Loss: 1.3636515624821186

Validation Performance:
                    precision    recall  f1-score   support

              Race       0.41      0.45      0.43        64
         Behaviour       0.49      0.74      0.59       194
          Physical       0.15      0.27      0.19        30
             Class       0.35      0.18      0.24        39
          Religion       0.53      0.62      0.57        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.46      0.61      0.53       110
            Gender       0.39      0.49      0.43        45
Sexual Orientation       0.52      0.47      0.49        36
         Political       0.70      0.61      0.65       106

         micro avg       0.48      0.57      0.52       716
         macro avg       0.40      0.44      0.41       716
      weighted avg 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 37, Loss: 1.327190676704049
Epoch 38, Loss: 1.2679235581308603
Epoch 39, Loss: 1.2250130465254188
Epoch 40, Loss: 1.1855799742043018
Epoch 41, Loss: 1.1420283475890756

Validation Performance:
                    precision    recall  f1-score   support

              Race       0.37      0.50      0.43        64
         Behaviour       0.58      0.62      0.60       194
          Physical       0.15      0.27      0.19        30
             Class       0.23      0.33      0.27        39
          Religion       0.51      0.69      0.59        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.48      0.60      0.53       110
            Gender       0.45      0.40      0.42        45
Sexual Orientation       0.50      0.56      0.53        36
         Political       0.56      0.68      0.62       106

         micro avg       0.47      0.57      0.51       716
         macro avg       0.38      0.46      0.42       716
      weighted avg 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 42, Loss: 1.092144774273038
Epoch 43, Loss: 1.0658020144328475
Epoch 44, Loss: 1.0514668114483356
Epoch 45, Loss: 0.9753288077190518
Epoch 46, Loss: 0.9702897761017084

Validation Performance:
                    precision    recall  f1-score   support

              Race       0.30      0.53      0.39        64
         Behaviour       0.51      0.77      0.62       194
          Physical       0.13      0.30      0.19        30
             Class       0.40      0.21      0.27        39
          Religion       0.50      0.68      0.58        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.48      0.53      0.50       110
            Gender       0.41      0.42      0.42        45
Sexual Orientation       0.40      0.56      0.47        36
         Political       0.62      0.70      0.65       106

         micro avg       0.45      0.60      0.52       716
         macro avg       0.38      0.47      0.41       716
      weighted avg 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 47, Loss: 0.9070147937163711
Epoch 48, Loss: 0.8517710326705128
Epoch 49, Loss: 0.8309390826616436
Epoch 50, Loss: 0.7831956269219518

Starting few-shot fine-tuning...
Few-shot Epoch 1, Loss: 0.009012974544841432
Few-shot Epoch 2, Loss: 0.0076876540188642595
Few-shot Epoch 3, Loss: 0.006970446816771606
Few-shot Epoch 4, Loss: 0.005417284438442579
Few-shot Epoch 5, Loss: 0.0046823401217649975

Final Evaluation:

Best thresholds: [np.float64(0.7000000000000001), np.float64(0.1), np.float64(0.9), np.float64(0.9), np.float64(0.6), np.float64(0.9), np.float64(0.5), np.float64(0.1), np.float64(0.2), np.float64(0.30000000000000004)]

Classification Report:
                    precision    recall  f1-score   support

              Race       0.68      0.77      0.72        64
         Behaviour       0.45      0.77      0.57       194
          Physical       0.50      0.60      0.55        30
             Class       0.84      0.41      0.55        39
          Religion       0.68      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AdamW
from sklearn.metrics import classification_report
import numpy as np

class MultiLabelClassifier(nn.Module):
    def __init__(self, num_labels):
        super(MultiLabelClassifier, self).__init__()
        self.base_model = AutoModel.from_pretrained("xlm-roberta-base")
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, num_labels)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state[:, 0, :]
        sequence_output = self.dropout(sequence_output)
        return self.classifier(sequence_output)

def evaluate_model(model, dataloader, label_names, device):
    """Evaluate the model on the validation set"""
    model.eval()
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask)
            predictions = torch.sigmoid(outputs)
            all_predictions.append(predictions.cpu())
            all_labels.append(labels.cpu())
    
    all_predictions = torch.cat(all_predictions, dim=0).numpy()
    all_labels = torch.cat(all_labels, dim=0).numpy()
    
    # Convert predictions to binary (0 or 1)
    final_predictions = (all_predictions > 0.5).astype(int)
    
    print("\nClassification Report:")
    print(classification_report(all_labels, final_predictions, target_names=label_names))
    return all_predictions, all_labels

def train_model(model, train_dataloader, val_dataloader, label_columns, epochs=10):
    """Train the model"""
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Move model to device
    model = model.to(device)
    
    # Set up training parameters
    criterion = nn.BCEWithLogitsLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    best_loss = float('inf')
    patience = 10
    patience_counter = 0
    
    for epoch in range(epochs):
        # Training
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_loss}")
        
        # Validation
        if (epoch + 1) % 2 == 0:  # Evaluate every 2 epochs
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    input_ids, attention_mask, labels = [b.to(device) for b in batch]
                    outputs = model(input_ids, attention_mask)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item()
            
            avg_val_loss = val_loss / len(val_dataloader)
            print(f"Validation Loss: {avg_val_loss}")
            
            # Early stopping
            if avg_val_loss < best_loss:
                best_loss = avg_val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered")
                    break
            
            # Evaluate model performance
            evaluate_model(model, val_dataloader, label_columns, device)

def main():
    # Load and preprocess data
    data = pd.read_csv('./Bengali Heatspeech dataset.csv').dropna()
    label_columns = ['Race', 'Behaviour', 'Physical', 'Class', 'Religion', 
                    'Disability', 'Ethnicity', 'Gender', 'Sexual Orientation', 'Political']
    
    # Tokenize texts
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    encoded_data = tokenizer(
        list(data['Text']),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    
    # Prepare labels
    labels = torch.tensor(data[label_columns].values, dtype=torch.float32)
    
    # Split data
    train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
        encoded_data['input_ids'], 
        encoded_data['attention_mask'],
        labels,
        test_size=0.2,
        random_state=42
    )
    
    # Create dataloaders
    train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
    val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
    
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=32)
    
    # Initialize and train model
    model = MultiLabelClassifier(num_labels=len(label_columns))
    train_model(model, train_dataloader, val_dataloader, label_columns, epochs=55)

if __name__ == "__main__":
    main()

Using device: cuda




Epoch 1/55, Training Loss: 0.4458373368732513
Epoch 2/55, Training Loss: 0.34477542459018645
Validation Loss: 0.3423271421343088

Classification Report:
                    precision    recall  f1-score   support

              Race       0.00      0.00      0.00        64
         Behaviour       0.58      0.54      0.56       194
          Physical       0.00      0.00      0.00        30
             Class       0.00      0.00      0.00        39
          Religion       0.00      0.00      0.00        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.00      0.00      0.00       110
            Gender       0.00      0.00      0.00        45
Sexual Orientation       0.00      0.00      0.00        36
         Political       0.54      0.26      0.35       106

         micro avg       0.57      0.19      0.28       716
         macro avg       0.11      0.08      0.09       716
      weighted avg       0.24      0.19      0.20       716
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/55, Training Loss: 0.31791016837907216
Epoch 4/55, Training Loss: 0.2944117555069545
Validation Loss: 0.29346819780766964

Classification Report:
                    precision    recall  f1-score   support

              Race       0.50      0.02      0.03        64
         Behaviour       0.61      0.51      0.56       194
          Physical       0.00      0.00      0.00        30
             Class       0.00      0.00      0.00        39
          Religion       0.64      0.51      0.57        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.57      0.38      0.46       110
            Gender       0.55      0.13      0.21        45
Sexual Orientation       0.00      0.00      0.00        36
         Political       0.75      0.66      0.70       106

         micro avg       0.64      0.36      0.46       716
         macro avg       0.36      0.22      0.25       716
      weighted avg       0.52      0.36      0.41       716
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/55, Training Loss: 0.27701051556874834
Epoch 6/55, Training Loss: 0.2703271082469395
Validation Loss: 0.29967109579592943

Classification Report:
                    precision    recall  f1-score   support

              Race       0.00      0.00      0.00        64
         Behaviour       0.64      0.45      0.53       194
          Physical       0.00      0.00      0.00        30
             Class       0.00      0.00      0.00        39
          Religion       0.72      0.41      0.52        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.66      0.21      0.32       110
            Gender       0.47      0.16      0.23        45
Sexual Orientation       0.60      0.25      0.35        36
         Political       0.73      0.60      0.66       106

         micro avg       0.67      0.31      0.42       716
         macro avg       0.38      0.21      0.26       716
      weighted avg       0.52      0.31      0.38       716
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7/55, Training Loss: 0.2561719933199504
Epoch 8/55, Training Loss: 0.24840444917716678
Validation Loss: 0.30620233342051506

Classification Report:
                    precision    recall  f1-score   support

              Race       0.42      0.08      0.13        64
         Behaviour       0.62      0.56      0.59       194
          Physical       0.00      0.00      0.00        30
             Class       0.50      0.03      0.05        39
          Religion       0.66      0.33      0.44        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.52      0.39      0.45       110
            Gender       0.50      0.18      0.26        45
Sexual Orientation       0.75      0.17      0.27        36
         Political       0.68      0.60      0.64       106

         micro avg       0.61      0.37      0.46       716
         macro avg       0.46      0.23      0.28       716
      weighted avg       0.56      0.37      0.42       716
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 9/55, Training Loss: 0.2354184625640748
Epoch 10/55, Training Loss: 0.22159323947770254
Validation Loss: 0.31245482712984085

Classification Report:
                    precision    recall  f1-score   support

              Race       0.38      0.08      0.13        64
         Behaviour       0.68      0.49      0.57       194
          Physical       0.00      0.00      0.00        30
             Class       0.50      0.10      0.17        39
          Religion       0.64      0.53      0.58        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.60      0.35      0.44       110
            Gender       0.54      0.16      0.24        45
Sexual Orientation       0.62      0.36      0.46        36
         Political       0.64      0.68      0.66       106

         micro avg       0.63      0.39      0.48       716
         macro avg       0.46      0.27      0.32       716
      weighted avg       0.57      0.39      0.44       716
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 11/55, Training Loss: 0.205495275438778
Epoch 12/55, Training Loss: 0.1927917001266328
Validation Loss: 0.32752672769129276

Classification Report:
                    precision    recall  f1-score   support

              Race       0.41      0.14      0.21        64
         Behaviour       0.63      0.58      0.61       194
          Physical       0.00      0.00      0.00        30
             Class       0.44      0.10      0.17        39
          Religion       0.64      0.54      0.59        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.58      0.35      0.44       110
            Gender       0.65      0.24      0.35        45
Sexual Orientation       0.60      0.50      0.55        36
         Political       0.68      0.49      0.57       106

         micro avg       0.62      0.41      0.49       716
         macro avg       0.46      0.30      0.35       716
      weighted avg       0.57      0.41      0.46       716
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 13/55, Training Loss: 0.18086657613988907
Epoch 14/55, Training Loss: 0.16603801837043156
Validation Loss: 0.3492587376385927

Classification Report:
                    precision    recall  f1-score   support

              Race       0.38      0.19      0.25        64
         Behaviour       0.63      0.56      0.59       194
          Physical       0.00      0.00      0.00        30
             Class       0.38      0.13      0.19        39
          Religion       0.58      0.54      0.56        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.51      0.47      0.49       110
            Gender       0.55      0.40      0.46        45
Sexual Orientation       0.61      0.47      0.53        36
         Political       0.67      0.61      0.64       106

         micro avg       0.58      0.45      0.51       716
         macro avg       0.43      0.34      0.37       716
      weighted avg       0.53      0.45      0.48       716
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 15/55, Training Loss: 0.15606361651231374
Epoch 16/55, Training Loss: 0.14345517636291563
Validation Loss: 0.3744709473103285

Classification Report:
                    precision    recall  f1-score   support

              Race       0.44      0.12      0.20        64
         Behaviour       0.59      0.57      0.58       194
          Physical       0.00      0.00      0.00        30
             Class       0.38      0.13      0.19        39
          Religion       0.59      0.60      0.60        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.57      0.25      0.34       110
            Gender       0.52      0.29      0.37        45
Sexual Orientation       0.65      0.42      0.51        36
         Political       0.66      0.53      0.59       106

         micro avg       0.59      0.40      0.47       716
         macro avg       0.44      0.29      0.34       716
      weighted avg       0.54      0.40      0.44       716
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 17/55, Training Loss: 0.12908654908339182
Epoch 18/55, Training Loss: 0.12055867993169361
Validation Loss: 0.3849548101425171

Classification Report:
                    precision    recall  f1-score   support

              Race       0.42      0.08      0.13        64
         Behaviour       0.59      0.62      0.60       194
          Physical       0.00      0.00      0.00        30
             Class       0.25      0.05      0.09        39
          Religion       0.64      0.57      0.60        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.49      0.27      0.35       110
            Gender       0.44      0.24      0.31        45
Sexual Orientation       0.60      0.42      0.49        36
         Political       0.67      0.62      0.65       106

         micro avg       0.58      0.41      0.48       716
         macro avg       0.41      0.29      0.32       716
      weighted avg       0.52      0.41      0.44       716
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 19/55, Training Loss: 0.11019885788361232
Epoch 20/55, Training Loss: 0.10439523356774497
Validation Loss: 0.4090309012681246

Classification Report:
                    precision    recall  f1-score   support

              Race       0.36      0.06      0.11        64
         Behaviour       0.57      0.62      0.60       194
          Physical       0.00      0.00      0.00        30
             Class       0.33      0.08      0.12        39
          Religion       0.59      0.63      0.61        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.55      0.27      0.36       110
            Gender       0.40      0.40      0.40        45
Sexual Orientation       0.56      0.50      0.53        36
         Political       0.67      0.66      0.66       106

         micro avg       0.57      0.44      0.49       716
         macro avg       0.40      0.32      0.34       716
      weighted avg       0.51      0.44      0.45       716
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 21/55, Training Loss: 0.09278861320917568
Epoch 22/55, Training Loss: 0.08155515405630308
Validation Loss: 0.41994452476501465

Classification Report:
                    precision    recall  f1-score   support

              Race       0.44      0.11      0.17        64
         Behaviour       0.61      0.53      0.57       194
          Physical       0.12      0.03      0.05        30
             Class       0.50      0.05      0.09        39
          Religion       0.64      0.40      0.49        81
        Disability       0.00      0.00      0.00        11
         Ethnicity       0.55      0.34      0.42       110
            Gender       0.52      0.27      0.35        45
Sexual Orientation       0.52      0.44      0.48        36
         Political       0.65      0.61      0.63       106

         micro avg       0.59      0.38      0.46       716
         macro avg       0.46      0.28      0.33       716
      weighted avg       0.55      0.38      0.44       716
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 23/55, Training Loss: 0.07507275466969798
Epoch 24/55, Training Loss: 0.06317263313879569
Validation Loss: 0.43767437525093555
Early stopping triggered
