In [1]:
from tuwnlp.dataset import PropagandaDataset
from tuwnlp.utils import Language
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import DataLoader
import torch

In [2]:
dataset = PropagandaDataset("../data", languages=[Language.EN], tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased'))
train, val, test = torch.utils.data.random_split(dataset, [100, 50, 50])

200it [00:00, 5050.31it/s]


Language.EN LabelLevel.NARATIVES (200, 23)
(200, 23)


In [3]:
from torch import nn
class CustomBertClassifier(torch.nn.Module):
    def __init__(self, hidden_size, out_size):
        super(CustomBertClassifier, self).__init__()
        self.bert = DistilBertModel.from_pretrained(
            "distilbert-base-multilingual-cased", 
        )
        self.dropout1 = nn.Dropout()
        self.linear1 = nn.Linear(in_features=768, out_features=hidden_size, bias=False)
        self.batch_norm1 = nn.BatchNorm1d(num_features=512)
        self.relu1 =  nn.ReLU()
        self.dropout2 = nn.Dropout(p=0.8)
        self.linear2 = nn.Linear(in_features=hidden_size, out_features=out_size, bias=False)
        self.batch_norm2 = nn.BatchNorm1d(num_features=512)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.bert(**x)[0]
        x = self.dropout1(x)
        x = self.linear1(x)
        x = self.batch_norm1(x)
        x = self.relu1(x)
        x = self.dropout2(x)
        x = self.linear2(x)
        x = self.batch_norm2(x)
        x = torch.max(x,1, keepdim=False).values
        return self.sigmoid(x)
    
    def freeze_bert(self):
        """
        Freezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        only the wieghts of the custom classifier are modified.
        """
        for param in self.bert.named_parameters():
            param[1].requires_grad=False
    
    def unfreeze_bert(self):
        """
        Unfreezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        both the wieghts of the custom classifier and of the underlying BERT are modified.
        """
        for param in self.bert.named_parameters():
            param[1].requires_grad=True

    def freeze_embeddings(self):
        """
        Freezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        only the wieghts of the custom classifier are modified.
        """
        for param in self.bert.embeddings.named_parameters():
            param[1].requires_grad=False
    
    def unfreeze_embeddings(self):
        """
        Unfreezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        both the wieghts of the custom classifier and of the underlying BERT are modified.
        """
        for param in self.bert.embeddings.named_parameters():
            param[1].requires_grad=True

In [4]:
from tqdm import tqdm
from torch.optim import AdamW
import wandb

def training_step(dataloader, model, optimizer, loss_fn, freeze_bert = False, freeze_embeddings = False, runner = None):
    """Method to train the model"""
    model.train()
    model.freeze_bert() if freeze_bert else model.unfreeze_bert()
    model.freeze_embeddings() if not freeze_bert and freeze_embeddings else model.unfreeze_bert()
      
    epoch_loss = 0
 
    for x, y in tqdm(dataloader):        
        output = model(x)
        optimizer.zero_grad()
        loss = loss_fn(output, y.float())
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
        loss_val = loss.item()
        if runner is not None:
            runner.log({"train/loss": loss_val})

def eval_step(dataloader, model, loss_fn, runner = None):
    """Method to eval the model"""
    model.eval()
    epoch_loss = []
    
    with torch.no_grad():
        for x, y in tqdm(dataloader):        
            output = model(x)
            loss = loss_fn(output, y.float())
            loss_val = loss.item()
            epoch_loss.append(loss_val)
            if runner is not None:
                runner.log({"eval/loss": loss_val})
    return torch.mean(torch.tensor(epoch_loss)).item()

def eval_model(dataloader, model, scoring_functions):
    outputs = []
    ys = []
    model.eval()
    with torch.no_grad():
        for x,y in tqdm(dataloader):
            outputs.append(model(x))
            ys.append(y)

    y_pred = torch.concat(outputs).detach().numpy()
    ys = torch.concat(ys).float()
    res = {}
    for sf in scoring_functions:
        name = sf.__name__
        res[name] = sf(y_pred > 0.5, ys.float(), average="macro")

    return res

In [5]:
from sklearn.metrics import f1_score, recall_score, precision_score
scoring_functions = [f1_score, recall_score, precision_score]


lr = 0.0001
batch_size = 5
epochs = 2

runner = wandb.init(
    # set the wandb project where this run will be logged
    project="tuw-nlp2024-multilanguage-bert",

    # track hyperparameters and run metadata
    config={
    "learning_rate": lr,
    "architecture": "distiled-bert",
    "epochs": epochs
    }
)


model = CustomBertClassifier(256, 22)
optimizer = AdamW(model.parameters(), lr=lr)
loss_fn = loss_fn = torch.nn.BCELoss()
train_dl = DataLoader(train, batch_size=batch_size)
eval_dl = DataLoader(val, batch_size=batch_size)
test_dl = DataLoader(test, batch_size=batch_size)

best_loss = torch.inf
for i in range(epochs):
    training_step(train_dl, model, optimizer, loss_fn, freeze_bert=True, runner=runner)
    eval_loss = eval_step(eval_dl, model, loss_fn, runner=runner)

runner.finish()
eval_model(test_dl, model, scoring_functions)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33me12325298[0m ([33me12325298-tu-wien[0m). Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 20/20 [02:39<00:00,  7.96s/it]
100%|██████████| 10/10 [00:19<00:00,  1.92s/it]
100%|██████████| 20/20 [02:43<00:00,  8.20s/it]
100%|██████████| 10/10 [00:19<00:00,  1.95s/it]


0,1
eval/loss,▂▃▄▇▂▆▃▂▂▁▃▄▅█▂▇▄▃▂▂
train/loss,█▇▆▆▅▅▄▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▁▁▁▁▁▁

0,1
eval/loss,0.75951
train/loss,1.61688


100%|██████████| 10/10 [00:19<00:00,  1.92s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'f1_score': 0.09826925532132913,
 'recall_score': 0.06090909090909092,
 'precision_score': 0.5454545454545454}

In [6]:
dataset = PropagandaDataset("../data", languages=[Language.EN, Language.HI, Language.PT, Language.BG], tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased'))
train, val, test = torch.utils.data.random_split(dataset, [400, 163, 163])

200it [00:00, 4901.63it/s]


Language.EN LabelLevel.NARATIVES (200, 23)


115it [00:00, 1959.03it/s]


Language.HI LabelLevel.NARATIVES (115, 23)


200it [00:00, 1692.93it/s]


Language.PT LabelLevel.NARATIVES (200, 23)


211it [00:00, 3306.34it/s]


Language.BG LabelLevel.NARATIVES (211, 23)
(726, 23)


In [7]:
lr = 0.0001
batch_size = 5
epochs = 3

runner = wandb.init(
    # set the wandb project where this run will be logged
    project="tuw-nlp2024-multilanguage-bert",

    # track hyperparameters and run metadata
    config={
    "learning_rate": lr,
    "architecture": "distiled-bert",
    "epochs": epochs
    }
)


model = CustomBertClassifier(256, 22)
optimizer = AdamW(model.parameters(), lr=lr)
loss_fn = loss_fn = torch.nn.BCELoss()
train_dl = DataLoader(train, batch_size=batch_size)
eval_dl = DataLoader(val, batch_size=batch_size)
test_dl = DataLoader(test, batch_size=batch_size)

best_loss = torch.inf
for i in range(epochs):
    training_step(train_dl, model, optimizer, loss_fn, freeze_bert=True, runner=runner)
    eval_loss = eval_step(eval_dl, model, loss_fn, runner=runner)

runner.finish()
eval_model(test_dl, model, scoring_functions)

100%|██████████| 80/80 [10:11<00:00,  7.65s/it]
100%|██████████| 33/33 [00:58<00:00,  1.78s/it]
100%|██████████| 80/80 [10:03<00:00,  7.55s/it]
100%|██████████| 33/33 [00:58<00:00,  1.79s/it]
100%|██████████| 80/80 [10:35<00:00,  7.95s/it]
100%|██████████| 33/33 [01:05<00:00,  1.98s/it]


0,1
eval/loss,▄▃▂▅▅▆▆▅▃▂▃▅▃▂▆▅▆▃▆█▇▃▃▅▃▅▃▂▃▁▄▅▅▆▆▅▆▃▃▄
train/loss,██▆▆▅▅▅▄▄▅▃▃▃▄▃▃▃▃▃▂▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁

0,1
eval/loss,0.78916
train/loss,1.24011


100%|██████████| 33/33 [00:59<00:00,  1.81s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'f1_score': 0.09960433906754468,
 'recall_score': 0.05688789737869493,
 'precision_score': 0.5909090909090909}

In [8]:
dataset = PropagandaDataset("../data", languages=[Language.EN], tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased'))
train, val, test = torch.utils.data.random_split(dataset, [100, 50, 50])

200it [00:00, 5003.97it/s]


Language.EN LabelLevel.NARATIVES (200, 23)
(200, 23)


In [9]:
test_dl = DataLoader(test, batch_size=batch_size)
eval_model(test_dl, model, scoring_functions)

100%|██████████| 10/10 [00:19<00:00,  1.95s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'f1_score': 0.09748743842629337,
 'recall_score': 0.059090909090909104,
 'precision_score': 0.5909090909090909}