In [1]:
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
import logging
import time
import os
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tqdm import tqdm
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
import torch.nn.functional as F
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the train/test/val CSV files. It defaults to the original location
# but can be redirected with the STANCE_DATA_DIR environment variable, so the notebook
# is not tied to one user's home directory.
DATA_DIR = os.environ.get(
    "STANCE_DATA_DIR",
    "/home/bmuscato/venv_b/share/doc/networkx-3.1/dataset/code_TACL",
)

train = pd.read_csv(os.path.join(DATA_DIR, "train_sum_soft.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test_sum_soft.csv"))
val = pd.read_csv(os.path.join(DATA_DIR, "val_sum_soft.csv"))

In [3]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
from huggingface_hub import login

# SECURITY: an access token must never be written into the notebook source.
# The token that used to be hardcoded here has been exposed and should be revoked
# in the Hugging Face account settings.
# The token is now read from the HF_TOKEN environment variable; when it is not set,
# `token=None` makes `login` ask for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [5]:
# Hub identifier of the pretrained checkpoint used throughout the notebook.
model_name = 'google-bert/bert-large-uncased'
# Variant of the identifier that is safe inside a path: every "/" becomes "-".
model_name_filename = "-".join(model_name.split("/"))

In [6]:
# Load the tokenizer that belongs to the pretrained checkpoint.
# NOTE(review): this literal repeats the value of `model_name` defined in the previous
# cell; the two must be kept in sync by hand if the checkpoint is ever changed.
tokenizer = BertTokenizer.from_pretrained('google-bert/bert-large-uncased')

In [7]:
from datasets import Dataset, DatasetDict


def no_maj(df):
    """Return the rows of ``df`` whose 'majority_label' is not the string 'No majority'."""
    has_majority = df['majority_label'] != 'No majority'
    return df.loc[has_majority]


# Drop the rows without a majority label from every split.
train, test, val = (no_maj(frame) for frame in (train, test, val))

# Integer id assigned to each stance label.
label_encoding = {'Pro': 0, 'Against': 1, 'Neutral': 2, 'Not-about': 3}

for frame in (train, val, test):
    # One encoded column per annotator: answer1 -> answer1_label, and so on.
    for annotator in ('answer1', 'answer2', 'answer3'):
        frame[f'{annotator}_label'] = frame[annotator].map(label_encoding)
    # Encoded majority label, stored as int.
    frame['labels'] = frame['majority_label'].map(label_encoding).astype(int)
    






In [8]:

# Rows of the training split with no encoded third-annotator label get class id 0.
# NOTE(review): id 0 is 'Pro' in label_encoding, so this relabels missing annotations
# as 'Pro' — confirm that this is intended rather than dropping or masking those rows.
train['answer3_label'] = train['answer3_label'].fillna(0)

# Verify the results on the frame that was actually modified (train), and keep the
# validation count as well because the next cell casts these columns to int.
for split_name, frame in (('train', train), ('val', val)):
    print(
        f"NaN values in {split_name}['answer3_label'] after replacement:",
        frame['answer3_label'].isna().sum(),
    )


NaN values in val['answer3_label'] after replacement: 0


In [9]:

# Cast every per-annotator label column to int in each split.
for column in ('answer1_label', 'answer2_label', 'answer3_label'):
    train[column], test[column], val[column] = (
        train[column].astype(int),
        test[column].astype(int),
        val[column].astype(int),
    )


In [10]:
# Convert the three pandas splits into Hugging Face Dataset objects, dropping the
# DataFrame index in each case.
train_, test_, val_ = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train, test, val)
)

In [11]:
# Bundle the three splits into a single DatasetDict keyed by split name.
splits_by_name = {'train': train_, 'test': test_, 'val': val_}
dataset = DatasetDict(splits_by_name)

In [12]:

def tokenize_func(examples):
    """
    Tokenize a batch of examples and attach the per-annotator labels.

    The texts in ``examples['Input']`` are passed to the notebook-level ``tokenizer``,
    padded and truncated to 512 tokens. The columns 'answer1_label', 'answer2_label'
    and 'answer3_label' are copied into the result as 'a1_label', 'a2_label' and
    'a3_label'.

    Args:
        examples: Batch mapping with the keys 'Input', 'answer1_label',
            'answer2_label' and 'answer3_label'.

    Returns:
        The tokenizer output with the three extra label entries added.
    """
    encoded = tokenizer(
        examples['Input'],
        max_length=512,
        truncation=True,
        padding='max_length',
    )
    # Short output name <- source column holding that annotator's encoded label.
    label_columns = (
        ('a1_label', 'answer1_label'),
        ('a2_label', 'answer2_label'),
        ('a3_label', 'answer3_label'),
    )
    for short_name, source_column in label_columns:
        encoded[short_name] = examples[source_column]
    return encoded

# Tokenize datasets (batched), in the order train, validation, test.
train_tokenized, val_tokenized, test_tokenized = (
    split.map(tokenize_func, batched=True)
    for split in (train_, val_, test_)
)



Map: 100%|██████████| 619/619 [00:04<00:00, 140.76 examples/s]
Map: 100%|██████████| 139/139 [00:00<00:00, 140.02 examples/s]
Map: 100%|██████████| 139/139 [00:01<00:00, 114.06 examples/s]


In [13]:
# Aliases for the tokenized training and validation splits.
train_dataset, val_dataset = train_tokenized, val_tokenized

In [14]:
def create_annotator_dataset(dataset, annotator_label):
    """
    Build a dataset whose 'label' column holds one annotator's labels.

    Any existing 'label' column is removed first, then ``annotator_label`` is renamed
    to 'label'. The resulting dataset is formatted as torch tensors restricted to the
    columns 'input_ids', 'attention_mask' and 'label'.

    Args:
        dataset: Tokenized dataset containing the ``annotator_label`` column.
        annotator_label (str): Name of the column to expose as 'label'.

    Returns:
        The dataset produced by the rename, with the torch format applied.
    """
    existing_columns = dataset.column_names
    # Avoid a name clash with the column that is about to be renamed.
    if 'label' in existing_columns:
        dataset = dataset.remove_columns(['label'])

    annotator_dataset = dataset.rename_column(annotator_label, 'label')
    annotator_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return annotator_dataset

# Create annotator-specific datasets: for each annotator, one train, one validation
# and one test dataset whose 'label' column carries that annotator's labels.
tokenized_splits = (train_tokenized, val_tokenized, test_tokenized)

train_a1, val_a1, test_a1 = (
    create_annotator_dataset(split, 'a1_label') for split in tokenized_splits
)
train_a2, val_a2, test_a2 = (
    create_annotator_dataset(split, 'a2_label') for split in tokenized_splits
)
train_a3, val_a3, test_a3 = (
    create_annotator_dataset(split, 'a3_label') for split in tokenized_splits
)



In [15]:
# Batch collator built around the notebook's tokenizer; it is handed to every Trainer.
# NOTE(review): tokenize_func already pads each example to max_length=512, so this
# collator presumably has no additional padding to do — confirm whether dynamic
# padding was intended instead.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def compute_metrics(eval_pred):
    """
    Compute accuracy, macro F1 and mean cross-entropy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The logits are reduced with argmax over
            the last axis to get predictions; the labels are integer class ids.

    Returns:
        dict: Keys 'accuracy', 'f1' (macro-averaged) and 'cross_entropy'.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Cross-entropy: softmax the logits, then average -log(p + 1e-9) of the true class,
    # selected with a one-hot mask. The 1e-9 keeps the logarithm finite.
    class_probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    one_hot_labels = np.eye(class_probs.shape[1])[labels]
    log_probs = np.log(class_probs + 1e-9)
    mean_cross_entropy = -np.sum(one_hot_labels * log_probs) / len(labels)

    metrics = {}
    metrics['accuracy'] = accuracy_score(labels, predicted_classes)
    metrics['f1'] = f1_score(labels, predicted_classes, average='macro')
    metrics['cross_entropy'] = mean_cross_entropy
    return metrics

In [17]:
# Results directory derived from the filesystem-safe model name.
# NOTE(review): the training calls visible in this notebook pass "output_a1",
# "output_a2" and "output_a3" as output directories, so this variable appears to be
# unused — confirm before relying on it.
output_dir = f"./multiclassification_stance_ensemble/{model_name_filename}/results/human"

In [18]:
import shutil
def train_annotator_model(train_dataset, val_dataset, output_dir):
    """
    Fine-tune a 4-class BERT sequence classifier on one annotator's labels.

    Training runs for up to 8 epochs with evaluation and checkpointing after every
    epoch, early stopping with a patience of 3 evaluations, and selection of the best
    checkpoint by ``eval_f1``. After training, every ``checkpoint-*`` directory in
    ``output_dir`` other than the best one is deleted.

    Args:
        train_dataset: Dataset passed to ``Trainer`` as the training set.
        val_dataset: Dataset passed to ``Trainer`` as the evaluation set.
        output_dir (str): Directory in which ``Trainer`` writes its checkpoints.

    Returns:
        The model held by the trainer once training has finished.
    """
    # Same checkpoint id as the notebook-level `model_name`; kept literal here.
    model = BertForSequenceClassification.from_pretrained('google-bert/bert-large-uncased', num_labels=4)

    # NOTE(review): the tokenization output in this notebook reports 619 training
    # examples. With a batch size of 16 on a single device that is about 39 steps per
    # epoch, roughly 312 steps over 8 epochs — fewer than warmup_steps=500, so the
    # learning rate would never reach the configured 5e-5. Left unchanged to keep the
    # recorded results reproducible; consider warmup_ratio instead.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=8,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        learning_rate=5e-5,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model="eval_f1",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    best_model_dir = trainer.state.best_model_checkpoint
    if best_model_dir:
        # Compare normalized absolute paths so that different spellings of the same
        # directory ("out/x" vs "./out/x") can never cause the best checkpoint to be
        # deleted.
        best_model_path = os.path.abspath(best_model_dir)
        for entry in os.listdir(output_dir):
            entry_path = os.path.join(output_dir, entry)
            # Only Trainer checkpoint folders are cleaned up; any other directory that
            # happens to live in output_dir is left alone.
            if not entry.startswith("checkpoint-") or not os.path.isdir(entry_path):
                continue
            if os.path.abspath(entry_path) != best_model_path:
                shutil.rmtree(entry_path)

    return trainer.model

    

In [19]:
# Train one classifier per annotator, each on that annotator's train/validation
# datasets and each with its own checkpoint directory.
# Kept as three separate statements: a model that has finished training stays bound
# to its name even if a later run fails.
model_a1 = train_annotator_model(train_a1, val_a1, "output_a1")
model_a2 = train_annotator_model(train_a2, val_a2, "output_a2")
model_a3 = train_annotator_model(train_a3, val_a3, "output_a3")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Cross Entropy
1,1.3843,1.3633,0.338129,0.154448,1.363194
2,1.379,1.360293,0.366906,0.177451,1.360282
3,1.3438,1.342178,0.366906,0.134211,1.342207
4,1.3233,1.367497,0.330935,0.265958,1.368196
5,1.2353,1.347052,0.402878,0.322618,1.347641
6,1.1214,1.315094,0.395683,0.32376,1.314963
7,0.9656,1.388647,0.417266,0.372779,1.389425
8,0.8677,1.612297,0.417266,0.386488,1.610416


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Cross Entropy
1,1.5848,1.515302,0.18705,0.078788,1.515625
2,1.3814,1.372032,0.266187,0.228931,1.372046
3,1.3743,1.342614,0.338129,0.165896,1.343105
4,1.3538,1.328308,0.366906,0.159863,1.32862
5,1.3232,1.351398,0.266187,0.229932,1.351978
6,1.2182,1.295206,0.366906,0.257541,1.296008
7,1.1735,1.294911,0.374101,0.322149,1.296128
8,0.9629,1.341864,0.345324,0.328215,1.342599


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Cross Entropy
1,1.541,1.528831,0.165468,0.070988,1.528138
2,1.4036,1.357978,0.294964,0.236536,1.35867
3,1.3349,1.347343,0.28777,0.147492,1.34855
4,1.2982,1.325724,0.302158,0.21789,1.327365
5,1.2324,1.334041,0.323741,0.270174,1.33522
6,1.1351,1.306092,0.345324,0.307985,1.308791
7,1.0319,1.330214,0.359712,0.346553,1.331084
8,0.8914,1.488855,0.338129,0.272172,1.490191




In [20]:
# From here on `device` is the CPU; it is passed explicitly to the evaluation call
# later in the notebook, which moves the models onto it.
device = torch.device("cpu")

In [21]:
import torch

def calculate_confidences_ensemble(df, models, tokenizer, device):
    """
    Calculate confidence scores for each input using an ensemble of models.

    For every text in ``df['Input']`` the logits of all models are summed and a softmax
    is taken over the sum. Because the logits are summed (not averaged), the resulting
    distribution is sharper than that of any single model.

    Args:
        df (pd.DataFrame): DataFrame containing the input texts in the 'Input' column.
        models (list): List of trained models for the ensemble; must not be empty.
        tokenizer (PreTrainedTokenizer): Tokenizer for preprocessing the input texts.
        device (torch.device): Device to run the models on (e.g., "cpu" or "cuda").

    Returns:
        pd.DataFrame: The same ``df`` object, updated in place with the columns
        'confidence_scores' (highest class probability per row) and 'softmax_probs'
        (full probability vector per row).

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("calculate_confidences_ensemble requires at least one model")

    # Move all models to the device and set them to evaluation mode
    for model in models:
        model.to(device)
        model.eval()

    confidences = []
    softmax_probs = []  # softmax probabilities for each input

    with torch.no_grad():
        for text in df['Input']:
            # Tokenize and preprocess the text input
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

            # Sum the logits through a freshly stacked tensor. Accumulating with `+=`
            # into the first model's output would modify that tensor in place.
            ensemble_logits = torch.stack(
                [model(**inputs).logits for model in models]
            ).sum(dim=0)

            # Ensemble probabilities for the single input in the batch
            probabilities = torch.nn.functional.softmax(ensemble_logits, dim=-1).cpu().numpy()[0]
            confidences.append(probabilities.max())  # probability of the predicted class
            softmax_probs.append(probabilities)

    # Update the DataFrame
    df['confidence_scores'] = confidences
    df['softmax_probs'] = softmax_probs

    return df




In [22]:
from collections import Counter
# Same CPU device as assigned in an earlier cell; presumably repeated so that this
# cell can be run on its own.
device = torch.device("cpu")
def get_predictions(model, tokenizer, text, device):
    """
    Classify a single text with one model.

    The text is tokenized with truncation and padding to 512 tokens, moved to
    ``device`` and run through ``model`` without gradient tracking.

    Args:
        model: Model whose output exposes a ``logits`` attribute.
        tokenizer: Tokenizer called on ``text``; its result must support ``.to(device)``.
        text: Input text to classify.
        device: Device the encoded inputs are moved to.

    Returns:
        tuple: ``(predicted_class, probabilities)`` — the argmax class index and the
        softmax probabilities of the first (only) item in the batch.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        output = model(**encoded)
        class_probs = torch.nn.functional.softmax(output.logits, dim=-1).cpu().numpy()[0]

    return np.argmax(class_probs), class_probs

# Function to aggregate predictions by majority vote
def majority_vote(predictions):
    """
    Aggregate predictions using majority vote.

    Returns the most frequent value in ``predictions``. Ties are NOT broken by model
    confidence: among equally frequent values, the one that appears first in
    ``predictions`` is returned.

    Note: this function is defined again in a later cell of this notebook; when the
    notebook is run top to bottom, that later definition is the one in effect.
    """
    counts = Counter(predictions)
    return counts.most_common(1)[0][0]


In [23]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from collections import Counter


def majority_vote(predictions):
    """
    Return the most frequent value in ``predictions``.

    When several values share the highest count, the one that occurs first in
    ``predictions`` wins.
    """
    winner, _vote_count = Counter(predictions).most_common(1)[0]
    return winner

def evaluate_performance_conf(df, ensemble_models, tokenizer, device):
    """
    Evaluate the performance of an ensemble of models using majority voting.

    Every text in ``df['Input']`` is classified by each model. The final prediction is
    the majority vote over the models' argmax classes, and the stored confidence is the
    mean of the models' highest class probabilities.

    Args:
        df (pd.DataFrame): DataFrame containing the test set, with the columns 'Input'
            and 'majority_label'.
        ensemble_models (list): List of trained models for the ensemble.
        tokenizer (PreTrainedTokenizer): Tokenizer for preprocessing input texts.
        device (torch.device): Device to run the models on (e.g., "cpu" or "cuda").

    Returns:
        None: Prints evaluation metrics. ``df`` is updated in place with the columns
        'labels', 'majority_preds' and 'confidence_scores'.
    """
    # Label encoding for majority labels
    label_encoding = {'Pro': 0, 'Against': 1, 'Neutral': 2, 'Not-about': 3}
    df['labels'] = df['majority_label'].map(label_encoding).astype(int)

    # Move all models to the device and set to evaluation mode
    for model in ensemble_models:
        model.to(device)
        model.eval()

    # Store predictions and confidences
    majority_preds = []
    confidences = []

    with torch.no_grad():
        # Iterate through the test set and get predictions for each input
        for text in df['Input']:
            # Tokenize and preprocess the text input
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

            individual_predictions = []  # argmax class from each model
            individual_confidences = []  # highest class probability from each model

            for model in ensemble_models:
                logits = model(**inputs).logits
                probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()[0]
                individual_predictions.append(np.argmax(probabilities))
                individual_confidences.append(probabilities.max())

            # NOTE(review): when all models disagree, majority_vote returns the first
            # model's class, so the result depends on the order of ensemble_models.
            majority_preds.append(majority_vote(individual_predictions))

            # NOTE(review): this averages each model's own top probability, even for
            # models that voted for a different class than the majority. It is not the
            # ensemble's probability for the predicted class — confirm this is the
            # intended definition of confidence.
            confidences.append(np.mean(individual_confidences))

    # Add predictions and confidences to the DataFrame
    df['majority_preds'] = majority_preds
    df['confidence_scores'] = confidences

    # Get the true labels and majority predictions
    y_true = df['labels']
    y_pred = np.array(df['majority_preds'], dtype=int)

    # Calculate evaluation metrics ('macro' averaging for the multiclass setting)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    # Print performance metrics
    print("Accuracy:", accuracy * 100)
    print("Precision:", precision * 100)
    print("Recall:", recall * 100)
    print("F1 Score:", f1 * 100)

    # Confusion Matrix and Classification Report
    conf_matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    class_report = classification_report(y_true, y_pred)
    print("Classification Report:")
    print(class_report)

    # Confidence Analysis
    correct_confidences = [conf for pred, true, conf in zip(y_pred, y_true, confidences) if pred == true]
    incorrect_confidences = [conf for pred, true, conf in zip(y_pred, y_true, confidences) if pred != true]

    # np.mean of an empty list emits a RuntimeWarning before returning NaN. When every
    # prediction is correct (or every one is wrong), report NaN explicitly instead.
    correct_confidence = np.mean(correct_confidences) if correct_confidences else float('nan')
    incorrect_confidence = np.mean(incorrect_confidences) if incorrect_confidences else float('nan')
    avg_confidence = np.mean(confidences)

    print("Average Confidence Score:", avg_confidence * 100)
    print("Average Confidence for Correct Predictions:", correct_confidence * 100)
    print("Average Confidence for Incorrect Predictions:", incorrect_confidence * 100)


In [24]:
# Save each annotator model next to a copy of the shared tokenizer, so that every
# ./model_aN directory holds both the model files and the tokenizer files.
# The value displayed under this cell is the return value of the last call.
model_a1.save_pretrained("./model_a1")
tokenizer.save_pretrained("./model_a1")

model_a2.save_pretrained("./model_a2")
tokenizer.save_pretrained("./model_a2")

model_a3.save_pretrained("./model_a3")
tokenizer.save_pretrained("./model_a3")


('./model_a3/tokenizer_config.json',
 './model_a3/special_tokens_map.json',
 './model_a3/vocab.txt',
 './model_a3/added_tokens.json')

In [25]:
# Upload each annotator model to its own Hub repository, then upload the same
# tokenizer to each of those repositories.
# NOTE(review): this presumably relies on the Hub login performed earlier in the
# notebook for write access — confirm.
model_a1.push_to_hub("bmuscato/stance_ensemble_a1")
model_a2.push_to_hub("bmuscato/stance_ensemble_a2")
model_a3.push_to_hub("bmuscato/stance_ensemble_a3")

tokenizer.push_to_hub("bmuscato/stance_ensemble_a1")
tokenizer.push_to_hub("bmuscato/stance_ensemble_a2")
tokenizer.push_to_hub("bmuscato/stance_ensemble_a3")

model.safetensors: 100%|██████████| 1.34G/1.34G [00:35<00:00, 38.0MB/s]
model.safetensors: 100%|██████████| 1.34G/1.34G [00:29<00:00, 45.2MB/s]
model.safetensors: 100%|██████████| 1.34G/1.34G [00:32<00:00, 41.8MB/s]


CommitInfo(commit_url='https://huggingface.co/bmuscato/stance_ensemble_a3/commit/0d1b5fed73e6609d41686f4f97d7dc2030552587', commit_message='Upload tokenizer', commit_description='', oid='0d1b5fed73e6609d41686f4f97d7dc2030552587', pr_url=None, repo_url=RepoUrl('https://huggingface.co/bmuscato/stance_ensemble_a3', endpoint='https://huggingface.co', repo_type='model', repo_id='bmuscato/stance_ensemble_a3'), pr_revision=None, pr_num=None)

In [26]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hub repositories of the three annotator models, in ensemble order.
ensemble_repo_ids = (
    "bmuscato/stance_ensemble_a1",
    "bmuscato/stance_ensemble_a2",
    "bmuscato/stance_ensemble_a3",
)

# Reload every model from the Hub; the tokenizer is taken from the first repository.
ensemble_models = [
    AutoModelForSequenceClassification.from_pretrained(repo_id)
    for repo_id in ensemble_repo_ids
]
tokenizer = AutoTokenizer.from_pretrained(ensemble_repo_ids[0])

# Prints the metrics and adds the prediction/confidence columns to `test` in place.
evaluate_performance_conf(test, ensemble_models, tokenizer, device)


Accuracy: 43.16546762589928
Precision: 47.11174242424243
Recall: 43.49438652766639
F1 Score: 43.303133037630204
Confusion Matrix:
[[13  3 21  6]
 [ 7 11  8  3]
 [ 6  1 24 12]
 [ 4  1  7 12]]
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.30      0.36        43
           1       0.69      0.38      0.49        29
           2       0.40      0.56      0.47        43
           3       0.36      0.50      0.42        24

    accuracy                           0.43       139
   macro avg       0.47      0.43      0.43       139
weighted avg       0.46      0.43      0.43       139

Average Confidence Score: 50.075799226760864
Average Confidence for Correct Predictions: 50.352948904037476
Average Confidence for Incorrect Predictions: 49.86530244350433


In [27]:
# Write the test set, including the prediction and confidence columns added during
# evaluation, next to the input CSVs. The directory defaults to the original location
# and can be redirected with the STANCE_DATA_DIR environment variable.
# NOTE(review): the file name says "roberta" although the models evaluated in this
# notebook are BERT (bert-large-uncased) — confirm the intended name. It is kept
# unchanged here so existing consumers of the file keep working.
results_dir = os.environ.get(
    "STANCE_DATA_DIR",
    "/home/bmuscato/venv_b/share/doc/networkx-3.1/dataset/code_TACL",
)
test.to_csv(os.path.join(results_dir, "results_ensemble_roberta_stance.csv"))