In [None]:
import ast 
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from collections import Counter
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
import shutil
import os 
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, RobertaForSequenceClassification
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import ast
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Directory holding the preprocessed EPIC splits. The default ("/") reproduces
# the location that used to be hard-coded here; set the EPIC_DATA_DIR
# environment variable to read the files from somewhere else.
DATA_DIR = os.environ.get("EPIC_DATA_DIR", "/")

train = pd.read_csv(os.path.join(DATA_DIR, "train_df_preprocessed_EPIC-2.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test_df_preprocessed_EPIC-2.csv"))
val = pd.read_csv(os.path.join(DATA_DIR, "val_df_preprocessed_EPIC-2.csv"))

In [None]:
from huggingface_hub import login

# Never store an access token in the notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset (or empty) token=None is
# passed, which makes login() ask for the token interactively instead.
login(token=os.environ.get("HF_TOKEN") or None)

In [6]:
# Checkpoint identifier, plus a variant with every "/" replaced by "-" so it
# can be embedded in directory names.
model_name = 'roberta-large'
model_name_filename = "-".join(model_name.split("/"))

In [7]:
# Load the tokenizer for the checkpoint configured in `model_name` rather than
# repeating the checkpoint id as a second literal.
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [None]:



def process_annotators(df, exclude_annotator=None):
    """
    Expand the 'mapped_labels' column into one 'answer{k}' column per annotator.

    String entries of 'mapped_labels' are parsed with ast.literal_eval; other
    entries are kept as they are. The largest list length found determines how
    many 'answer{k}' columns (1-based) are created. A row that has no label at
    a given position, or whose entry has no length at all (e.g. a missing
    value), receives 0 in that column.

    The DataFrame is modified in place and also returned.

    Args:
        df (pd.DataFrame): DataFrame containing the 'mapped_labels' column.
        exclude_annotator (int, optional): 1-based annotator index whose
            'answer{k}' column must not be present in the result.

    Returns:
        pd.DataFrame: The same DataFrame object, with the annotator columns added.
    """
    df['mapped_labels'] = df['mapped_labels'].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

    # Entries that are not lists count as zero annotators.
    annotator_counts = df['mapped_labels'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    # .max() of an empty Series is NaN, which range() cannot consume.
    max_num_annotators = 0 if annotator_counts.empty else int(annotator_counts.max())
    print(f"Maximum number of annotators: {max_num_annotators}")

    def label_at(labels, position):
        # Entries without a length (None / NaN) are treated like rows with no
        # annotations instead of raising TypeError.
        try:
            size = len(labels)
        except TypeError:
            return 0
        return labels[position] if position < size else 0

    for i in range(max_num_annotators):
        if exclude_annotator and i + 1 == exclude_annotator:
            continue
        # Bind the position explicitly so the lambda does not depend on the
        # loop variable being read at call time.
        df[f"answer{i+1}"] = df['mapped_labels'].apply(lambda x, position=i: label_at(x, position))

    # Also drop the excluded column if it was already present on the input frame.
    if exclude_annotator:
        excluded_col = f"answer{exclude_annotator}"
        if excluded_col in df.columns:
            df.drop(columns=[excluded_col], inplace=True)

    return df




# Expand mapped_labels into per-annotator answer columns for each split.
# Annotator 8 is excluded from the training split only.
train = process_annotators(train, exclude_annotator=8)
test = process_annotators(test)
val = process_annotators(val)




Maximum number of annotators: 8
Maximum number of annotators: 7
Maximum number of annotators: 7


In [None]:

# The import cell binds `Dataset` twice: first to the Hugging Face class and
# later to torch.utils.data.Dataset, which has no from_pandas(). Import the
# Hugging Face class under an explicit alias so the conversion works on a
# fresh kernel.
from datasets import Dataset as HFDataset

train_ = HFDataset.from_pandas(train, preserve_index=False)
test_ = HFDataset.from_pandas(test, preserve_index=False)
val_ = HFDataset.from_pandas(val, preserve_index=False)

In [19]:
# Bundle the three splits into a single DatasetDict keyed by split name.
dataset = DatasetDict({'train': train_, 'test': test_, 'val': val_})

In [None]:
def tokenize_func(examples):
    """
    Tokenize the 'merged_text' field of a batch and carry the annotator labels along.

    The text is padded / truncated to 512 tokens. Every 'answer{k}' column with
    k between 1 and the number of 'answer*' columns in the batch is copied into
    the returned encoding.
    """
    encoded = tokenizer(
        examples['merged_text'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )

    answer_count = sum(1 for name in examples if name.startswith('answer'))
    for idx in range(1, answer_count + 1):
        key = f'answer{idx}'
        if key in examples:
            encoded[key] = examples[key]

    return encoded


# Tokenize every split in batches; tokenize_func pads/truncates to 512 tokens.
train_tokenized = train_.map(tokenize_func, batched=True)
val_tokenized = val_.map(tokenize_func, batched=True)
test_tokenized = test_.map(tokenize_func, batched=True)

Map: 100%|██████████| 2100/2100 [00:01<00:00, 1897.09 examples/s]
Map: 100%|██████████| 450/450 [00:00<00:00, 1382.74 examples/s]
Map: 100%|██████████| 450/450 [00:00<00:00, 1566.55 examples/s]


In [21]:
# Aliases for the tokenized training and validation splits.
train_dataset = train_tokenized
val_dataset = val_tokenized

In [None]:
def create_annotator_dataset(dataset, annotator_label):
    """
    Build a dataset whose 'label' column holds one annotator's answers.

    An existing 'label' column is removed first, then `annotator_label` is
    renamed to 'label' and the result is formatted as torch tensors restricted
    to 'input_ids', 'attention_mask' and 'label'.
    """
    has_label = 'label' in dataset.column_names
    base = dataset.remove_columns(['label']) if has_label else dataset

    relabelled = base.rename_column(annotator_label, 'label')
    relabelled.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return relabelled


def create_annotator_specific_datasets(tokenized_dataset, annotator_column):
    """Forward both arguments to create_annotator_dataset and return its result."""
    return create_annotator_dataset(tokenized_dataset, annotator_column)


max_num_annotators = len([col for col in train_dataset.column_names if col.startswith('answer')])

# An annotator only gets train/val/test datasets when its answer column exists
# in every split; renaming a missing column would fail. Checking the columns
# replaces a skip that was hard-coded to a single annotator index.
columns_in_all_splits = (
    set(train_tokenized.column_names)
    & set(val_tokenized.column_names)
    & set(test_tokenized.column_names)
)

datasets = {}
for i in range(1, max_num_annotators + 1):
    annotator_label = f'answer{i}'
    if annotator_label not in columns_in_all_splits:
        print(f"Skipping {annotator_label} for annotator {i}")
        continue

    datasets[f'train_a{i}'] = create_annotator_specific_datasets(train_tokenized, annotator_label)
    datasets[f'val_a{i}'] = create_annotator_specific_datasets(val_tokenized, annotator_label)
    datasets[f'test_a{i}'] = create_annotator_specific_datasets(test_tokenized, annotator_label)


In [25]:
# Batch collator built from the shared tokenizer; handed to the Trainer in train_annotator_model.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """
    Compute accuracy, macro F1 and mean cross-entropy for an evaluation pass.

    Args:
        eval_pred: Pair of (logits, labels).

    Returns:
        dict: Keys 'accuracy', 'f1' and 'cross_entropy'.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # Cross-entropy from softmax probabilities; the 1e-9 keeps log() finite.
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    one_hot_labels = np.eye(probs.shape[1])[labels]
    cross_entropy = -np.sum(one_hot_labels * np.log(probs + 1e-9)) / len(labels)

    metrics = {}
    metrics['accuracy'] = accuracy_score(labels, predicted)
    metrics['f1'] = f1_score(labels, predicted, average='macro')
    metrics['cross_entropy'] = cross_entropy
    return metrics

In [27]:
# Results directory derived from the model name.
# NOTE(review): train_for_all_annotators assigns its own local output_dir
# ("output_a{i}"), so this value does not appear to be used by the training
# code visible in this notebook — confirm before relying on it.
output_dir = f"./multiclassification_ensemble_epic{model_name_filename}/results/human"

In [None]:

def _remove_non_best_checkpoints(output_dir, best_model_dir):
    # Delete every `checkpoint-*` directory under output_dir except the best one.
    best_path = os.path.abspath(best_model_dir)
    for entry in os.listdir(output_dir):
        entry_path = os.path.join(output_dir, entry)
        # Leave anything that is not a checkpoint directory untouched.
        if not (entry.startswith("checkpoint-") and os.path.isdir(entry_path)):
            continue
        # Compare normalised absolute paths so that "./out/x" and "out/x"
        # are recognised as the same checkpoint and the best one is kept.
        if os.path.abspath(entry_path) != best_path:
            shutil.rmtree(entry_path)


def train_annotator_model(train_dataset, val_dataset, output_dir):
    """
    Fine-tune a binary sequence classifier on one annotator's labels.

    Training evaluates and saves once per epoch, stops early after 3
    evaluations without improvement, and reloads the checkpoint with the best
    'eval_f1' at the end. Afterwards all other checkpoint directories under
    `output_dir` are deleted.

    Args:
        train_dataset: Training dataset for this annotator.
        val_dataset: Validation dataset for this annotator.
        output_dir (str): Directory where checkpoints are written.

    Returns:
        The trained model held by the Trainer.
    """
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=8,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        learning_rate=5e-5,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model="eval_f1",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    # Keep only the best checkpoint on disk.
    best_model_dir = trainer.state.best_model_checkpoint
    if best_model_dir:
        _remove_non_best_checkpoints(output_dir, best_model_dir)

    return trainer.model

In [None]:
def train_for_all_annotators(max_num_annotators):
    """
    Train one model per annotator, for annotators 1..max_num_annotators.

    Each annotator's train/validation datasets are looked up in the global
    `datasets` dict and its checkpoints are written to "output_a{i}".

    Returns:
        dict: Maps 'model_a{i}' to the trained model of annotator i.
    """
    return {
        f'model_a{idx}': train_annotator_model(
            datasets[f'train_a{idx}'],
            datasets[f'val_a{idx}'],
            f"output_a{idx}",
        )
        for idx in range(1, max_num_annotators + 1)
    }



# Number of annotator answer columns present in the tokenized training split.
max_num_annotators = len([col for col in train_tokenized.column_names if col.startswith('answer')])


# Train one model per annotator; the result maps 'model_a{i}' to each trained model.
models = train_for_all_annotators(max_num_annotators)

In [None]:

def calculate_confidences_ensemble(df, models, tokenizer, device):
    """
    Score every text in df['merged_text'] with an ensemble and record its confidence.

    For each text the logits of all models are summed and a softmax is taken
    over the summed logits. The largest probability is stored in
    'confidence_scores' and the full probability vector in 'softmax_probs'.
    Both columns are written onto `df` in place.

    Args:
        df (pd.DataFrame): DataFrame with a 'merged_text' column.
        models: The ensemble members, given either as an iterable of models or
            as a dict whose values are the models (such as the dict returned
            by train_for_all_annotators).
        tokenizer: Tokenizer used to encode each text.
        device: Device the models and inputs are moved to.

    Returns:
        pd.DataFrame: The same DataFrame object with the two columns added.

    Raises:
        ValueError: If `models` contains no model.
    """
    # Accept a {name: model} mapping as well as a plain iterable, and
    # materialise it once so a one-shot iterable survives both passes below.
    ensemble = list(models.values()) if isinstance(models, dict) else list(models)
    if not ensemble:
        raise ValueError("models must contain at least one model")

    for model in ensemble:
        model.to(device)
        model.eval()

    confidences = []
    softmax_probs = []

    with torch.no_grad():
        for text in df['merged_text']:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

            ensemble_logits = None
            for model in ensemble:
                logits = model(**inputs).logits
                # Out-of-place add: never modify a tensor returned by a model.
                ensemble_logits = logits if ensemble_logits is None else ensemble_logits + logits

            # NOTE(review): the logits are summed, not averaged, so the softmax
            # sharpens as the ensemble grows — confirm this is intended.
            probabilities = torch.nn.functional.softmax(ensemble_logits, dim=-1).cpu().numpy()[0]
            confidences.append(probabilities.max())
            softmax_probs.append(probabilities)

    df['confidence_scores'] = confidences
    df['softmax_probs'] = softmax_probs

    return df

In [None]:


def get_predictions(model, tokenizer, text, device):
    """
    Classify a single text with one model.

    The text is padded / truncated to 512 tokens and the model is run without
    gradient tracking.

    Returns:
        tuple: (index of the most probable class, softmax probabilities of the
        first item in the batch).
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=512,
    ).to(device)

    with torch.no_grad():
        scores = model(**encoded).logits
        class_probs = torch.nn.functional.softmax(scores, dim=-1).cpu().numpy()[0]

    return np.argmax(class_probs), class_probs

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report




def majority_vote(predictions):
    """Return the label that occurs most often in `predictions` (as ranked by Counter.most_common)."""
    winner, _votes = Counter(predictions).most_common(1)[0]
    return winner

def evaluate_performance_conf(df, ensemble_models, tokenizer, device):
    """
    Evaluate a majority-vote ensemble against df['majority_label'] and print a report.

    Every model classifies each text in df['merged_text']; the ensemble
    prediction is the majority vote of the individual predictions and the
    ensemble confidence is the mean of the individual maximum probabilities.
    The results are written to the 'majority_preds' and 'confidence_scores'
    columns of `df` in place ('majority_label' is also cast to int). Accuracy,
    macro precision/recall/F1, the confusion matrix, the classification report
    and confidence averages are printed.

    Args:
        df (pd.DataFrame): DataFrame with 'merged_text' and 'majority_label' columns.
        ensemble_models: The ensemble members, given either as an iterable of
            models or as a dict whose values are the models (such as the dict
            returned by train_for_all_annotators).
        tokenizer: Tokenizer used to encode each text.
        device: Device the models and inputs are moved to.

    Raises:
        ValueError: If `ensemble_models` contains no model.
    """
    df['majority_label'] = df['majority_label'].astype(int)

    # Accept a {name: model} mapping as well as a plain iterable, and
    # materialise it once so a one-shot iterable survives both passes below.
    if isinstance(ensemble_models, dict):
        ensemble = list(ensemble_models.values())
    else:
        ensemble = list(ensemble_models)
    if not ensemble:
        raise ValueError("ensemble_models must contain at least one model")

    for model in ensemble:
        model.to(device)
        model.eval()

    majority_preds = []
    confidences = []

    with torch.no_grad():
        for text in df['merged_text']:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

            individual_predictions = []
            individual_confidences = []

            for model in ensemble:
                logits = model(**inputs).logits
                probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()[0]
                individual_predictions.append(np.argmax(probabilities))
                individual_confidences.append(probabilities.max())

            majority_preds.append(majority_vote(individual_predictions))
            confidences.append(np.mean(individual_confidences))

    df['majority_preds'] = majority_preds
    df['confidence_scores'] = confidences

    y_true = df['majority_label']
    y_pred = np.array(df['majority_preds'], dtype=int)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    print("Accuracy:", accuracy * 100)
    print("Precision:", precision * 100)
    print("Recall:", recall * 100)
    print("F1 Score:", f1 * 100)

    conf_matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    class_report = classification_report(y_true, y_pred)
    print("Classification Report:")
    print(class_report)

    def mean_or_nan(values):
        # np.mean([]) also yields nan but emits a RuntimeWarning; return nan
        # directly when a group (e.g. "incorrect predictions") is empty.
        return np.mean(values) if len(values) else float('nan')

    correct_confidence = mean_or_nan(
        [conf for pred, true, conf in zip(y_pred, y_true, confidences) if pred == true]
    )
    incorrect_confidence = mean_or_nan(
        [conf for pred, true, conf in zip(y_pred, y_true, confidences) if pred != true]
    )
    avg_confidence = mean_or_nan(confidences)

    print("Average Confidence Score:", avg_confidence * 100)
    print("Average Confidence for Correct Predictions:", correct_confidence * 100)
    print("Average Confidence for Incorrect Predictions:", incorrect_confidence * 100)