## LIBERT
More details about LIBERT can be found at this link:
https://github.com/anlausch/LIBERT

In [1]:
import os
# Set the Weights & Biases mode through the environment before the training libraries are imported.
# NOTE(review): "dryrun" presumably keeps runs local instead of syncing them online — confirm against the installed wandb version.
os.environ["WANDB_MODE"] = "dryrun"

In [7]:
from config import CONFIG
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments,BertConfig 
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torch.nn import CrossEntropyLoss
import torch 
import numpy as np
import pandas as pd
import os
from config import CONFIG

In [20]:
# Config variables
# Path of the pickled DataFrame used for training/validation; the trailing comment keeps the alternative corpus path.
data_set= "../data/kialo_references.pickle" #"../data/microtext_references.pickle" 

# specify the directory where model files are saved
model_dir = "./models/LIBERT/"

# Load the tokenizer, the configuration and the model weights from the same directory.
# The load message printed below reports that the pre-training heads (cls.*) of the checkpoint are not used
# by BertForSequenceClassification.
tokenizer =  BertTokenizerFast.from_pretrained(model_dir)
config = BertConfig.from_pretrained(model_dir)
libert_model = BertForSequenceClassification.from_pretrained(model_dir,
                                                            config=config)


Some weights of the model checkpoint at ./models/LIBERT/ were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at 

In [21]:
# Program
# Seed torch's random number generator so runs are repeatable.
# NOTE(review): only torch is seeded in this cell; numpy / Python `random` are not seeded here.
torch.manual_seed(0)

class RelationDataset(Dataset):
    """Torch dataset that tokenizes (premise, claim) pairs for sequence classification.

    Every item is a dict holding ``input_ids``, ``token_type_ids`` and
    ``attention_mask`` (flattened tensors padded/truncated to ``max_length``)
    and, when the frame has a ``label`` column, a scalar int64 ``labels`` tensor.

    Labels may be stored either as the strings ``"Attack"`` / ``"Support"`` or
    as the already-encoded class ids ``0`` / ``1``. Any other value raises
    ``ValueError`` instead of being silently treated as class 1.
    """

    # Class ids fed to the classifier: index 0 is Attack, index 1 is Support.
    LABEL_TO_ID = {"Attack": 0, "Support": 1}

    def __init__(self, data, tokenizer, max_length=128):
        """Store the frame and tokenizer; nothing is tokenized until an item is requested.

        Args:
            data: pandas DataFrame with ``premise`` and ``claim`` columns and an
                optional ``label`` column.
            tokenizer: object exposing ``encode_plus`` (the notebook passes a
                ``BertTokenizerFast``).
            max_length: length every encoded pair is padded/truncated to.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    @classmethod
    def _encode_label(cls, raw_label):
        """Translate a raw label (string name or 0/1 id) into its integer class id."""
        if isinstance(raw_label, str):
            if raw_label in cls.LABEL_TO_ID:
                return cls.LABEL_TO_ID[raw_label]
        else:
            try:
                as_int = int(raw_label)
            except (TypeError, ValueError):
                # None, NaN and other non-numeric values end up here.
                as_int = None
            # `as_int == raw_label` rejects fractional values such as 0.5.
            if as_int is not None and as_int == raw_label and as_int in cls.LABEL_TO_ID.values():
                return as_int
        raise ValueError(
            f"Unsupported label {raw_label!r}; expected one of "
            f"{sorted(cls.LABEL_TO_ID)} or {sorted(cls.LABEL_TO_ID.values())}"
        )

    def __getitem__(self, index):
        """Tokenize the pair at positional ``index`` and return the model inputs.

        Raises:
            ValueError: if the row's label is neither a known name nor a known id.
        """
        premise = self.data["premise"].iloc[index]
        claim = self.data["claim"].iloc[index]

        encoding = self.tokenizer.encode_plus(
            premise,
            claim,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten drops it.
        item = {
            key: encoding[key].flatten()
            for key in ('input_ids', 'token_type_ids', 'attention_mask')
        }

        if 'label' in self.data.columns:
            class_id = self._encode_label(self.data["label"].iloc[index])
            item['labels'] = torch.tensor(class_id, dtype=torch.int64)

        return item

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

In [22]:
def create_dataset(mode: str, tokenizer, shuffle=False):
    """Build a ``RelationDataset`` for one split of the pickled corpus.

    Reads the frame stored at the module-level ``data_set`` path and keeps the
    rows whose ``mode`` column equals ``mode`` and whose ``label`` is
    ``'Attack'`` or ``'Support'``.

    Args:
        mode: value of the ``mode`` column to select.
        tokenizer: tokenizer handed on to ``RelationDataset``.
        shuffle: accepted for the callers' sake but not used by this function.

    Returns:
        A ``RelationDataset`` wrapping the selected rows (original index kept).
    """
    frame = pd.read_pickle(data_set)
    in_split = frame['mode'] == mode
    has_known_label = frame['label'].isin(['Attack', 'Support'])
    return RelationDataset(frame[in_split & has_known_label], tokenizer)

In [23]:
# Build the training and validation datasets from the rows tagged "train" / "validate".
# NOTE: create_dataset does not use its third (shuffle) argument, so True/False has no effect here.
train_dataset = create_dataset("train", tokenizer, True)
validate_dataset = create_dataset("validate", tokenizer, False)

In [24]:
# Load datasets
# Re-read the pickled frame to inspect the class balance of the training split.
df = pd.read_pickle(data_set)

train_df = df[df['mode'] == 'train']
# Keep only the two labels the classifier is trained on; `labels` is reused later to derive the class weights.
labels = train_df[train_df['label'].isin(['Attack', 'Support'])]['label']
print('Train dataset class distribution:\n', labels.value_counts())

Train dataset class distribution:
 label
Attack     89164
Support    83416
Name: count, dtype: int64


In [25]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# "balanced" weights, one per distinct label in the order returned by np.unique.
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
dic_class_weights = {name: weight for name, weight in zip(np.unique(labels), class_weights)}
print("Class weights: ", dic_class_weights)

# Turn the weights into a float tensor on the selected device for use in the loss.
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

Class weights:  {'Attack': 0.9677672603292808, 'Support': 1.0344538218087658}


In [None]:
def compute_metrics(pred):
    """Score one evaluation pass with accuracy and binary precision/recall/F1.

    Args:
        pred: object exposing ``label_ids`` (gold class ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Tuple layout: (precision, recall, f1, support).
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }
    
class CustomTrainer(Trainer):
    """``Trainer`` that scores each batch with a caller-supplied loss function.

    The model is called without labels, so its built-in loss is bypassed and
    ``loss_fct`` (e.g. a class-weighted ``CrossEntropyLoss``) is applied to the
    logits instead.
    """

    def __init__(self, model, args, train_dataset, loss_fct, **kwargs):
        """
        Args:
            model: model to train; its output must expose ``.logits``.
            args: ``TrainingArguments`` forwarded to ``Trainer``.
            train_dataset: training dataset forwarded to ``Trainer``.
            loss_fct: callable ``(logits, labels) -> loss``.
            **kwargs: any further ``Trainer`` keyword arguments.
        """
        super().__init__(model, args, train_dataset=train_dataset, **kwargs)
        self.loss_fct = loss_fct

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and compute the custom loss.

        Args:
            model: the (possibly wrapped) model supplied by ``Trainer``.
            inputs: batch dict; its ``"labels"`` entry is removed before the
                forward pass.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted because newer ``Trainer`` versions
                pass it as a keyword; without it the override raises
                ``TypeError``. It is not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# Define your loss function
# Cross-entropy weighted per class with the tensor computed from the training label distribution.
loss_fct = CrossEntropyLoss(weight=class_weights)

# Define training arguments
# NOTE(review): no evaluation schedule is set here, so eval_dataset / compute_metrics are presumably only
# used when trainer.evaluate() is called explicitly — confirm against the installed transformers defaults.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate=3e-5,
    #logging_dir='./logs',
    logging_steps=10,
)

# Initialize your trainer with your custom loss function
trainer = CustomTrainer(
    model=libert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    compute_metrics=compute_metrics,
    loss_fct=loss_fct,
)

# Train the model
trainer.train()



Step,Training Loss
10,0.7952
20,0.7077
30,0.711
40,0.6995
50,0.6624
60,0.7124
70,0.7498
80,0.7202
90,0.739
100,0.6838


In [18]:
from sklearn.metrics import classification_report

mapping = {'Attack': 0, 'Support': 1}

# Load the test dataset
df = pd.read_pickle("../data/microtext_references.pickle")
test_rows = df[df['mode'] == 'test']

# RelationDataset encodes 'Attack'/'Support' itself, so hand it the original string labels.
test_dataset = RelationDataset(test_rows, tokenizer)

# `assign` returns a new frame, so nothing is written into a slice of `df`
# (the in-place column assignment used before raised SettingWithCopyWarning).
split = test_rows.assign(label=test_rows['label'].map(mapping))

# Make predictions: trainer.predict returns (predictions, label_ids, metrics); only the scores are needed.
raw_pred, _, _ = trainer.predict(test_dataset)
preds = raw_pred.argmax(axis=1)

# Print classification report
report = classification_report(split['label'].values, preds)
print(report)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  split['label'] = split['label'].map(mapping)


              precision    recall  f1-score   support

           0       0.69      0.60      0.64        15
           1       0.83      0.88      0.86        34

    accuracy                           0.80        49
   macro avg       0.76      0.74      0.75        49
weighted avg       0.79      0.80      0.79        49



In [19]:
# Save the trained model under the LIBERT model directory.
# NOTE(review): the folder is named LIBERT_MICRO while `data_set` in the config cell points at the kialo
# corpus — confirm the directory name matches the data the model was trained on.
trainer.save_model("./models/LIBERT/LIBERT_MICRO/")

In [None]:
## LIBERT
# More details about LIBERT can be found at this link:
# https://github.com/anlausch/LIBERT

# Config variables
# Path of the pickled DataFrame used for training/validation; the trailing comment keeps the alternative corpus path.
data_set = "../data/kialo_references.pickle" #"../data/microtext_references.pickle"

# specify the directory where model files are saved
model_dir = "./models/LIBERT/"

# Load the tokenizer, the configuration and the model weights from the same directory.
tokenizer = BertTokenizerFast.from_pretrained(model_dir)
config = BertConfig.from_pretrained(model_dir)
libert_model = BertForSequenceClassification.from_pretrained(model_dir,
                                                            config=config)

# Program
# Seed torch's random number generator so runs are repeatable.
# NOTE(review): only torch is seeded here; numpy / Python `random` are not seeded in this cell.
torch.manual_seed(0)

class RelationDataset(Dataset):
    """Torch dataset that tokenizes (premise, claim) pairs for sequence classification.

    Every item is a dict holding ``input_ids``, ``token_type_ids`` and
    ``attention_mask`` (flattened tensors padded/truncated to ``max_length``)
    and, when the frame has a ``label`` column, a scalar int64 ``labels`` tensor.

    Labels may be stored either as the strings ``"Attack"`` / ``"Support"`` or
    as the already-encoded class ids ``0`` / ``1``. Any other value raises
    ``ValueError`` instead of being silently treated as class 1.
    """

    # Class ids fed to the classifier: index 0 is Attack, index 1 is Support.
    LABEL_TO_ID = {"Attack": 0, "Support": 1}

    def __init__(self, data, tokenizer, max_length=128):
        """Store the frame and tokenizer; nothing is tokenized until an item is requested.

        Args:
            data: pandas DataFrame with ``premise`` and ``claim`` columns and an
                optional ``label`` column.
            tokenizer: object exposing ``encode_plus`` (the notebook passes a
                ``BertTokenizerFast``).
            max_length: length every encoded pair is padded/truncated to.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    @classmethod
    def _encode_label(cls, raw_label):
        """Translate a raw label (string name or 0/1 id) into its integer class id."""
        if isinstance(raw_label, str):
            if raw_label in cls.LABEL_TO_ID:
                return cls.LABEL_TO_ID[raw_label]
        else:
            try:
                as_int = int(raw_label)
            except (TypeError, ValueError):
                # None, NaN and other non-numeric values end up here.
                as_int = None
            # `as_int == raw_label` rejects fractional values such as 0.5.
            if as_int is not None and as_int == raw_label and as_int in cls.LABEL_TO_ID.values():
                return as_int
        raise ValueError(
            f"Unsupported label {raw_label!r}; expected one of "
            f"{sorted(cls.LABEL_TO_ID)} or {sorted(cls.LABEL_TO_ID.values())}"
        )

    def __getitem__(self, index):
        """Tokenize the pair at positional ``index`` and return the model inputs.

        Raises:
            ValueError: if the row's label is neither a known name nor a known id.
        """
        premise = self.data["premise"].iloc[index]
        claim = self.data["claim"].iloc[index]

        encoding = self.tokenizer.encode_plus(
            premise,
            claim,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten drops it.
        item = {
            key: encoding[key].flatten()
            for key in ('input_ids', 'token_type_ids', 'attention_mask')
        }

        if 'label' in self.data.columns:
            class_id = self._encode_label(self.data["label"].iloc[index])
            item['labels'] = torch.tensor(class_id, dtype=torch.int64)

        return item

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

def create_dataset(mode: str, tokenizer, shuffle=False):
    """Build a ``RelationDataset`` for one split of the pickled corpus.

    Reads the frame stored at the module-level ``data_set`` path and keeps the
    rows whose ``mode`` column equals ``mode`` and whose ``label`` is
    ``'Attack'`` or ``'Support'``.

    Args:
        mode: value of the ``mode`` column to select.
        tokenizer: tokenizer handed on to ``RelationDataset``.
        shuffle: accepted for the callers' sake but not used by this function.

    Returns:
        A ``RelationDataset`` wrapping the selected rows (original index kept).
    """
    frame = pd.read_pickle(data_set)
    wanted_rows = (frame['mode'] == mode) & frame['label'].isin(['Attack', 'Support'])
    selected = frame[wanted_rows]
    return RelationDataset(selected, tokenizer)

# Build the training and validation datasets from the rows tagged "train" / "validate".
# NOTE: create_dataset does not use its third (shuffle) argument, so True/False has no effect here.
train_dataset = create_dataset("train", tokenizer, True)
validate_dataset = create_dataset("validate", tokenizer, False)
    
# Load datasets
# Re-read the pickled frame to inspect the class balance of the training split.
df = pd.read_pickle(data_set)

train_df = df[df['mode'] == 'train']
# Keep only the two labels the classifier is trained on.
labels = train_df[train_df['label'].isin(['Attack', 'Support'])]['label']
print('Train dataset class distribution:\n', labels.value_counts())


def compute_metrics(pred):
    """Score one evaluation pass with accuracy and binary precision/recall/F1.

    Args:
        pred: object exposing ``label_ids`` (gold class ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Tuple layout: (precision, recall, f1, support).
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }
    
class CustomTrainer(Trainer):
    """``Trainer`` that scores each batch with a caller-supplied loss function.

    The model is called without labels, so its built-in loss is bypassed and
    ``loss_fct`` (e.g. a class-weighted ``CrossEntropyLoss``) is applied to the
    logits instead.
    """

    def __init__(self, model, args, train_dataset, loss_fct, **kwargs):
        """
        Args:
            model: model to train; its output must expose ``.logits``.
            args: ``TrainingArguments`` forwarded to ``Trainer``.
            train_dataset: training dataset forwarded to ``Trainer``.
            loss_fct: callable ``(logits, labels) -> loss``.
            **kwargs: any further ``Trainer`` keyword arguments.
        """
        super().__init__(model, args, train_dataset=train_dataset, **kwargs)
        self.loss_fct = loss_fct

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and compute the custom loss.

        Args:
            model: the (possibly wrapped) model supplied by ``Trainer``.
            inputs: batch dict; its ``"labels"`` entry is removed before the
                forward pass.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted because newer ``Trainer`` versions
                pass it as a keyword; without it the override raises
                ``TypeError``. It is not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# Define your loss function
# NOTE(review): `class_weights` is not computed anywhere in this cell; it relies on the tensor built by the
# earlier class-weight cell still being present in the kernel.
loss_fct = CrossEntropyLoss(weight=class_weights)

# Define training arguments
# NOTE(review): no evaluation schedule is set here, so eval_dataset / compute_metrics are presumably only
# used when trainer.evaluate() is called explicitly — confirm against the installed transformers defaults.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate=3e-5,
    #logging_dir='./logs',
    logging_steps=10,
)

# Initialize your trainer with your custom loss function
trainer = CustomTrainer(
    model=libert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    compute_metrics=compute_metrics,
    loss_fct=loss_fct,
)

# Train the model
trainer.train()