## ERNIE
More details about ERNIE you can find them in this link:
https://huggingface.co/docs/transformers/model_doc/ernie

In [1]:
import os
os.environ["WANDB_MODE"] = "dryrun"

In [2]:
from config import CONFIG
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torch.nn import CrossEntropyLoss
import torch 
import numpy as np
import pandas as pd
import os
from config import CONFIG


In [3]:
import wandb
wandb.login(key="78d69a339f5c9e47e83b23695b39e1f41fbe1fb3")


False

In [3]:
#Config Variables 
data_set= "../data/kialo_references.pickle" #"../data/microtext_references.pickle" 




In [10]:
# Program
torch.manual_seed(0)

class RelationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __getitem__(self, index):
        premise = self.data["premise"].iloc[index]
        claim = self.data["claim"].iloc[index]

        encoding = self.tokenizer.encode_plus(
            premise,
            claim,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )


        if 'label' in self.data.columns:
            
            label = torch.tensor(0 if self.data["label"].iloc[index] == "Attack" else 1, dtype=torch.int64)
            
            return {
            'input_ids': encoding['input_ids'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
         }
            
        else:
            return {
            'input_ids': encoding['input_ids'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
                    

    def __len__(self):
        return len(self.data)

In [12]:
def create_dataset(mode: str, tokenizer, shuffle=False):
    
    df = pd.read_pickle(data_set)
    split = df[df['mode'] == mode]
    split = split[split['label'].isin(['Attack', 'Support'])]
    #split.reset_index(drop=True)
    
    return RelationDataset(split, tokenizer)

In [13]:
#Create the datasets
tokenizer = BertTokenizerFast.from_pretrained('nghuyong/ernie-2.0-large-en')
train_dataset = create_dataset("train", tokenizer, False)
validate_dataset = create_dataset("validate", tokenizer, False)

In [15]:
# Load datasets
df = pd.read_pickle(data_set)

train_df = df[df['mode'] == 'train']
labels = train_df[train_df['label'].isin(['Attack', 'Support'])]['label']
print('Train dataset class distribution:\n', labels.value_counts())

Train dataset class distribution:
 label
Attack     89164
Support    83416
Name: count, dtype: int64


In [16]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(labels),
                                        y = labels                                                    
                                    )
dic_class_weights = dict(zip(np.unique(labels), class_weights))
print("Class weights: ", dic_class_weights)

# Convert the list to a tensor
class_weights = torch.tensor(class_weights, dtype=torch.float)
class_weights = class_weights.to(device)

Class weights:  {'Attack': 0.9677672603292808, 'Support': 1.0344538218087658}


In [17]:
# Load the pre-trained ERNIE model for sequence classification
ernie_model = BertForSequenceClassification.from_pretrained('nghuyong/ernie-2.0-large-en', 
                                                            num_labels=2, 
                                                            output_attentions = False, 
                                                            output_hidden_states = False)

You are using a model of type ernie to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at nghuyong/ernie-2.0-large-en were not used when initializing BertForSequenceClassification: ['ernie.encoder.layer.1.attention.self.query.bias', 'ernie.encoder.layer.7.output.dense.weight', 'ernie.encoder.layer.19.output.LayerNorm.weight', 'ernie.encoder.layer.4.attention.output.dense.weight', 'ernie.encoder.layer.14.output.dense.bias', 'ernie.encoder.layer.1.attention.self.value.weight', 'ernie.encoder.layer.2.attention.output.dense.bias', 'ernie.encoder.layer.6.intermediate.dense.bias', 'ernie.encoder.layer.10.intermediate.dense.bias', 'ernie.encoder.layer.8.intermediate.dense.bias', 'ernie.encoder.layer.8.attention.output.LayerNorm.weight', 'ernie.encoder.layer.14.output.LayerNorm.bias', 'ernie.encoder.layer.18.intermediate.dense.bias', 'ernie.encoder.layer.4.attention.self.query.bias', 'ernie.enco

In [107]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    
class CustomTrainer(Trainer):
    def __init__(self, model, args, train_dataset, loss_fct, **kwargs):
        super().__init__(model, args, train_dataset=train_dataset, **kwargs)
        self.loss_fct = loss_fct

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

# Define your loss function
loss_fct = CrossEntropyLoss(weight=class_weights)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate=3e-5,
    #logging_dir='./logs',
    logging_steps=10,
)

# Initialize your trainer with your custom loss function
trainer = CustomTrainer(
    model=ernie_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    compute_metrics=compute_metrics,
    loss_fct=loss_fct,
)

# Train the model
trainer.train()



Step,Training Loss
10,0.6846
20,0.8718
30,0.8235
40,0.76
50,1.0779
60,0.8777
70,1.3418
80,1.4278
90,0.7896
100,1.0299


TrainOutput(global_step=365, training_loss=0.8089660997260106, metrics={'train_runtime': 59.8257, 'train_samples_per_second': 24.154, 'train_steps_per_second': 6.101, 'total_flos': 336660205002240.0, 'train_loss': 0.8089660997260106, 'epoch': 5.0})

In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate=3e-5,
    #logging_dir='./logs',
    logging_steps=10,
)

# Create the Trainer and train
trainer = Trainer(
    model=ernie_model,               # the instantiated Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    #eval_dataset=validate_dataset,   # evaluation dataset
    compute_metrics=compute_metrics
)

trainer.train()



Step,Training Loss
10,0.6172
20,0.5955
30,0.5948
40,0.5596
50,0.5903


TrainOutput(global_step=50, training_loss=0.5914792919158935, metrics={'train_runtime': 35.5472, 'train_samples_per_second': 41.916, 'train_steps_per_second': 1.407, 'total_flos': 347144432839680.0, 'train_loss': 0.5914792919158935, 'epoch': 5.0})

In [108]:
from sklearn.metrics import classification_report

# Load the test dataset
mapping = {'Attack': 0, 'Support': 1}
df = pd.read_pickle(data_set)
split = df[df['mode'] == 'test']
split['label'] = split['label'].map(mapping)

test_dataset = RelationDataset(split, tokenizer)

# Make predictions
raw_pred, _, _ = trainer.predict(test_dataset)
preds = raw_pred.argmax(axis=1)

# Print classification report
report = classification_report(split['label'].values, preds)
print(report)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  split['label'] = split['label'].map(mapping)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        15
           1       0.69      1.00      0.82        34

    accuracy                           0.69        49
   macro avg       0.35      0.50      0.41        49
weighted avg       0.48      0.69      0.57        49



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [93]:
split['label'].values

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1])

In [94]:
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0])

In [114]:
# Save the model
trainer.save_model("./models/kialo")
tokenizer.save_pretrained("./models/")

In [58]:
trainer.evaluate()

{'eval_loss': 0.8110434412956238,
 'eval_accuracy': 0.6170212765957447,
 'eval_f1': 0.75,
 'eval_precision': 0.675,
 'eval_recall': 0.84375,
 'eval_runtime': 0.4056,
 'eval_samples_per_second': 115.879,
 'eval_steps_per_second': 2.466,
 'epoch': 8.0}

In [None]:
## ERNIE
More details about ERNIE you can find them in this link:
https://huggingface.co/docs/transformers/model_doc/ernie

# Program
torch.manual_seed(0)

class RelationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __getitem__(self, index):
        premise = self.data["premise"].iloc[index]
        claim = self.data["claim"].iloc[index]

        encoding = self.tokenizer.encode_plus(
            premise,
            claim,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )


        if 'label' in self.data.columns:
            
            label = torch.tensor(0 if self.data["label"].iloc[index] == "Attack" else 1, dtype=torch.int64)
            
            return {
            'input_ids': encoding['input_ids'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
         }
            
        else:
            return {
            'input_ids': encoding['input_ids'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
                    

    def __len__(self):
        return len(self.data)

def create_dataset(mode: str, tokenizer, shuffle=False):
    
    df = pd.read_pickle(data_set)
    split = df[df['mode'] == mode]
    split = split[split['label'] != 'Support: Example']
    #split.reset_index(drop=True)
    
    return RelationDataset(split, tokenizer)
    
#Create the datasets
tokenizer = BertTokenizerFast.from_pretrained('nghuyong/ernie-2.0-large-en')
train_dataset = create_dataset("train", tokenizer, False)
validate_dataset = create_dataset("validate", tokenizer, False)

# Load datasets
df = pd.read_pickle(data_set)

train_df = df[df['mode'] == 'train']
labels = train_df[train_df['label'] != 'Support: Example']['label']
print('Train dataset class distribution:\n', labels.value_counts())

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(labels),
                                        y = labels                                                    
                                    )
dic_class_weights = dict(zip(np.unique(labels), class_weights))
print("Class weights: ", dic_class_weights)

# Convert the list to a tensor
class_weights = torch.tensor(class_weights, dtype=torch.float)
class_weights = class_weights.to(device)

# Load the pre-trained ERNIE model for sequence classification
ernie_model = BertForSequenceClassification.from_pretrained('nghuyong/ernie-2.0-large-en', 
                                                            num_labels=2, 
                                                            output_attentions = False, 
                                                            output_hidden_states = False)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    
class CustomTrainer(Trainer):
    def __init__(self, model, args, train_dataset, loss_fct, **kwargs):
        super().__init__(model, args, train_dataset=train_dataset, **kwargs)
        self.loss_fct = loss_fct

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

# Define your loss function
loss_fct = CrossEntropyLoss(weight=class_weights)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate=3e-5,
    #logging_dir='./logs',
    logging_steps=10,
)

# Initialize your trainer with your custom loss function
trainer = CustomTrainer(
    model=ernie_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    compute_metrics=compute_metrics,
    loss_fct=loss_fct,
)

# Train the model
trainer.train()