<a href="https://colab.research.google.com/github/battemuulenn/DataScienceThesis/blob/main/Experiments_DistillBERT_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Due to the computing power available from the Google Collab, we have to mount to the drive to save the model files.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
#Here we control the randomness of our experiments, therefore we can be reproducable for further research.
import random
import numpy as np
import torch

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)


In [4]:
#Addding to the reprodicibility and transparency of the model training, we can utilize the WandB library to check on important metrics of the experiments.
#Use of WandB is not neccessary, but it will help us to make good decisions and not waste valuable resources.
import wandb
wandb.login()

import os
os.environ["WANDB_PROJECT"] = "moral-foundations-all-NEW-5epoch"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbattemuulenn[0m ([33mbattemuulenn-university-of-amsterdam[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
#This step is crucial so we know which transformations are used to convert the existing dataset suitable for the research.
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

df1 = pd.read_csv('filtered_moral_annotations_MFRC.csv')
df2 = pd.read_csv('filtered_moral_annotations_MFTC.csv')

shared_labels = ["care", "fairness", "loyalty", "authority", "non-moral"]

def filter_and_normalize_labels(label_str, shared_labels):
    labels = [l.strip().lower() for l in label_str.split(',')]
    return [l for l in labels if l in shared_labels]

df1['labels'] = df1['annotation'].apply(lambda x: filter_and_normalize_labels(x, shared_labels))
df2['labels'] = df2['annotation'].apply(lambda x: filter_and_normalize_labels(x, shared_labels))

df1 = df1[df1['labels'].map(len) > 0].reset_index(drop=True)
df2 = df2[df2['labels'].map(len) > 0].reset_index(drop=True)

mlb = MultiLabelBinarizer(classes=shared_labels)
mlb.fit([shared_labels])

df1['label_vec'] = mlb.transform(df1['labels']).tolist()
df2['label_vec'] = mlb.transform(df2['labels']).tolist()


In [7]:
#Using a dataset class allows us to handle tokenization and padding in a consistent manner, thus for experiments ensuring fair comparison for models.
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


In [8]:
#Here we import needed pre-trained models for our experiments, also metrics needed for classification results. Especially, this is
#important for our multi label classification problem and using Binary Cross Entropy loss function.
from transformers import (
    DistilBertTokenizerFast, DistilBertForSequenceClassification,
    BertTokenizerFast, BertForSequenceClassification,
    Trainer, TrainingArguments
)
from sklearn.metrics import f1_score, accuracy_score

def get_model_and_tokenizer(model_name, num_labels):
    if 'distilbert' in model_name:
        tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels, problem_type="multi_label_classification"
        )
    else:
        tokenizer = BertTokenizerFast.from_pretrained(model_name)
        model = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels, problem_type="multi_label_classification"
        )
    return tokenizer, model

def get_datasets(df, tokenizer):
    return TextDataset(
        texts=df['text'].tolist(),
        labels=df['label_vec'].tolist(),
        tokenizer=tokenizer
    )

def compute_metrics(pred):
    logits, labels = pred
    probs = 1 / (1 + np.exp(-logits))
    preds = (probs > 0.5).astype(int)
    f1 = f1_score(labels, preds, average='micro', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {'f1': f1, 'accuracy': acc}


In [9]:
def train_and_evaluate_scenario(
    train_df, test_df, model_name, mlb, epochs=5,
    run_name=None, save_model=False, save_dir=None, scenario_name=None, save_test_df_path=None
):
    num_labels = len(mlb.classes_)
    tokenizer, model = get_model_and_tokenizer(model_name, num_labels)
    train_dataset = get_datasets(train_df, tokenizer)
    test_dataset = get_datasets(test_df, tokenizer)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        logging_steps=50,
        learning_rate=2e-5,
        weight_decay=0.01,
        disable_tqdm=False,
        report_to="wandb",
        run_name=run_name,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=lambda p: compute_metrics((p.predictions, p.label_ids)),
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(f"Results for {model_name}: {eval_results}")

    #Save model and tokenizer
    if save_model and save_dir is not None:
        save_path = f"/content/drive/MyDrive/{save_dir}"
        os.makedirs(save_path, exist_ok=True)
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        print(f"Model and tokenizer saved to {save_path}")

    preds = trainer.predict(test_dataset)
    pred_logits = preds.predictions
    pred_probs = 1 / (1 + np.exp(-pred_logits))
    pred_bin = (pred_probs > 0.5).astype(int)
    pred_labels = mlb.inverse_transform(pred_bin)

    # Save test DataFrame with predictions
    test_results_df = test_df.copy()
    test_results_df['pred_labels'] = pred_labels
    test_results_df['pred_label_vec'] = list(pred_bin)
    if save_test_df_path is not None:
        test_results_df.to_pickle(save_test_df_path)
        print(f"Test results DataFrame saved to {save_test_df_path}")

    eval_results['scenario'] = scenario_name if scenario_name else run_name

    # Print and save metrics
    metrics_path = "/content/drive/MyDrive/model_performance_metrics_NEW_5epoch.csv"
    print(f"Performance metrics for {eval_results['scenario']}:")
    for k, v in eval_results.items():
        if k != 'scenario':
            print(f"{k}: {v}")

    import pandas as pd
    metrics_df = pd.DataFrame([eval_results])
    if not os.path.exists(metrics_path):
        metrics_df.to_csv(metrics_path, index=False)
    else:
        metrics_df.to_csv(metrics_path, mode='a', header=False, index=False)
    print(f"Metrics saved to {metrics_path}")

    return eval_results, test_results_df


In [11]:
eval_results_distilbert_mfrc_mftc, test_results_distilbert_mfrc_mftc = train_and_evaluate_scenario(
    df1, df2, 'distilbert-base-uncased', mlb,
    epochs=5,
    run_name="distilbert-mfrc-mftc",
    save_model=False,
    save_dir="distilbert-mfrc-mftc",
    scenario_name="DistilBERT MFRC→MFTC",
    save_test_df_path="/content/drive/MyDrive/test_results_distilbert_mfrc_mftc.pkl"
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5132
100,0.4085
150,0.3896
200,0.3858
250,0.3229
300,0.3627
350,0.3794
400,0.3488
450,0.3452
500,0.3272


Results for distilbert-base-uncased: {'eval_loss': 0.4208658039569855, 'eval_f1': 0.6723858659787629, 'eval_accuracy': 0.650197628458498, 'eval_runtime': 96.9386, 'eval_samples_per_second': 845.607, 'eval_steps_per_second': 52.858, 'epoch': 5.0}
Test results DataFrame saved to /content/drive/MyDrive/test_results_distilbert_mfrc_mftc.pkl
Performance metrics for DistilBERT MFRC→MFTC:
eval_loss: 0.4208658039569855
eval_f1: 0.6723858659787629
eval_accuracy: 0.650197628458498
eval_runtime: 96.9386
eval_samples_per_second: 845.607
eval_steps_per_second: 52.858
epoch: 5.0
Metrics saved to /content/drive/MyDrive/model_performance_metrics_NEW_5epoch.csv


In [13]:
eval_results_distilbert_mftc_mfrc, test_results_distilbert_mftc_mfrc = train_and_evaluate_scenario(
    df2, df1, 'distilbert-base-uncased', mlb,
    epochs=5,
    run_name="distilbert-mftc-mfrc",
    save_model=False,
    save_dir="distilbert-mftc-mfrc",
    scenario_name="DistilBERT MFTC→MFRC",
    save_test_df_path="/content/drive/MyDrive/test_results_distilbert_mftc_mfrc.pkl"
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.4697
100,0.3616
150,0.3644
200,0.3051
250,0.3176
300,0.3136
350,0.298
400,0.2613
450,0.2711
500,0.2565


Results for distilbert-base-uncased: {'eval_loss': 0.6880589723587036, 'eval_f1': 0.6232889030994584, 'eval_accuracy': 0.6427438871734957, 'eval_runtime': 59.681, 'eval_samples_per_second': 803.137, 'eval_steps_per_second': 50.2, 'epoch': 5.0}
Test results DataFrame saved to /content/drive/MyDrive/test_results_distilbert_mftc_mfrc.pkl
Performance metrics for DistilBERT MFTC→MFRC:
eval_loss: 0.6880589723587036
eval_f1: 0.6232889030994584
eval_accuracy: 0.6427438871734957
eval_runtime: 59.681
eval_samples_per_second: 803.137
eval_steps_per_second: 50.2
epoch: 5.0
Metrics saved to /content/drive/MyDrive/model_performance_metrics_NEW_5epoch.csv


In [16]:
eval_results_bert_mfrc_mftc, test_results_bert_mfrc_mftc = train_and_evaluate_scenario(
    df1, df2, 'bert-base-uncased', mlb,
    epochs=5,
    run_name="bert-mfrc-mftc",
    save_model=False,
    save_dir="bert-mfrc-mftc",
    scenario_name="BERT MFRC→MFTC",
    save_test_df_path="/content/drive/MyDrive/test_results_bert_mfrc_mftc.pkl"
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5073
100,0.3994
150,0.3821
200,0.3849
250,0.3266
300,0.363
350,0.3834
400,0.3465
450,0.343
500,0.3292


Results for bert-base-uncased: {'eval_loss': 0.416174978017807, 'eval_f1': 0.6728965287367268, 'eval_accuracy': 0.6491240911530767, 'eval_runtime': 165.5918, 'eval_samples_per_second': 495.024, 'eval_steps_per_second': 30.944, 'epoch': 5.0}
Test results DataFrame saved to /content/drive/MyDrive/test_results_bert_mfrc_mftc.pkl
Performance metrics for BERT MFRC→MFTC:
eval_loss: 0.416174978017807
eval_f1: 0.6728965287367268
eval_accuracy: 0.6491240911530767
eval_runtime: 165.5918
eval_samples_per_second: 495.024
eval_steps_per_second: 30.944
epoch: 5.0
Metrics saved to /content/drive/MyDrive/model_performance_metrics_NEW_5epoch.csv


In [18]:
eval_results_bert_mftc_mfrc, test_results_bert_mftc_mfrc = train_and_evaluate_scenario(
    df2, df1, 'bert-base-uncased', mlb,
    epochs=5,
    run_name="bert-mftc-mfrc",
    save_model=False,
    save_dir="bert-mftc-mfrc",
    scenario_name="BERT MFTC→MFRC",
    save_test_df_path="/content/drive/MyDrive/test_results_bert_mftc_mfrc.pkl"
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.4569
100,0.3486
150,0.3416
200,0.3001
250,0.3251
300,0.3105
350,0.3067
400,0.2628
450,0.2737
500,0.2654


Step,Training Loss
50,0.4569
100,0.3486
150,0.3416
200,0.3001
250,0.3251
300,0.3105
350,0.3067
400,0.2628
450,0.2737
500,0.2654


Results for bert-base-uncased: {'eval_loss': 0.6483014225959778, 'eval_f1': 0.6238737388872241, 'eval_accuracy': 0.6425561211716598, 'eval_runtime': 100.7873, 'eval_samples_per_second': 475.576, 'eval_steps_per_second': 29.726, 'epoch': 5.0}
Test results DataFrame saved to /content/drive/MyDrive/test_results_bert_mftc_mfrc.pkl
Performance metrics for BERT MFTC→MFRC:
eval_loss: 0.6483014225959778
eval_f1: 0.6238737388872241
eval_accuracy: 0.6425561211716598
eval_runtime: 100.7873
eval_samples_per_second: 475.576
eval_steps_per_second: 29.726
epoch: 5.0
Metrics saved to /content/drive/MyDrive/model_performance_metrics_NEW_5epoch.csv
