## ERNIE
More details about ERNIE you can find them in this link:
https://huggingface.co/docs/transformers/model_doc/ernie

In [8]:
import os
os.environ["WANDB_MODE"] = "dryrun"

In [97]:
from config import CONFIG
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import torch 
import numpy as np
import pandas as pd
import os
from config import CONFIG


In [98]:
import wandb
wandb.login(key="78d69a339f5c9e47e83b23695b39e1f41fbe1fb3")




True

In [107]:
# Program
torch.manual_seed(0)

class RelationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __getitem__(self, index):
        premise = self.data["premise"].iloc[index]
        claim = self.data["claim"].iloc[index]

        encoding = self.tokenizer.encode_plus(
            premise,
            claim,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )


        if 'label' in self.data.columns:
            
            label = torch.tensor(0 if self.data["label"].iloc[index] == "Attack" else 1, dtype=torch.int64)
            
            return {
            'input_ids': encoding['input_ids'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
         }
            
        else:
            return {
            'input_ids': encoding['input_ids'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
                    

    def __len__(self):
        return len(self.data)

In [108]:
def create_dataset(mode: str, tokenizer, shuffle=False):
    
    df = pd.read_pickle("../data/kialo_references.pickle")
    split = df[df['mode'] == mode]
        
    return RelationDataset(split, tokenizer)

In [109]:
#Create the datasets
tokenizer = BertTokenizerFast.from_pretrained('nghuyong/ernie-2.0-large-en')
train_dataset = create_dataset("train", tokenizer, True)
validate_dataset = create_dataset("validate", tokenizer, False)

In [110]:
# Load the pre-trained ERNIE model for sequence classification
ernie_model = BertForSequenceClassification.from_pretrained('nghuyong/ernie-2.0-large-en', num_labels=2)

You are using a model of type ernie to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at nghuyong/ernie-2.0-large-en were not used when initializing BertForSequenceClassification: ['ernie.encoder.layer.7.attention.self.query.weight', 'ernie.encoder.layer.11.attention.self.query.bias', 'ernie.encoder.layer.14.output.dense.bias', 'ernie.encoder.layer.15.attention.self.key.weight', 'ernie.encoder.layer.0.output.LayerNorm.weight', 'ernie.encoder.layer.7.attention.output.LayerNorm.bias', 'ernie.encoder.layer.13.attention.output.dense.weight', 'ernie.encoder.layer.9.intermediate.dense.weight', 'ernie.encoder.layer.9.attention.self.query.weight', 'ernie.encoder.layer.4.attention.output.LayerNorm.weight', 'ernie.encoder.layer.12.intermediate.dense.weight', 'ernie.encoder.layer.3.attention.self.value.weight', 'ernie.encoder.layer.20.output.dense.bias', 'ernie.encoder.layer.1.output.LayerNorm.bia

In [111]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate=2e-5,
    logging_dir='./logs',            
    logging_steps=10,
)

# Create the Trainer and train
trainer = Trainer(
    model=ernie_model,                   # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=validate_dataset,             # evaluation dataset
    compute_metrics=compute_metrics      
)

trainer.train()



Step,Training Loss
10,0.7094
20,0.7021
30,0.6974
40,0.7205
50,0.7035
60,0.6802
70,0.6891
80,0.7267
90,0.6962
100,0.7374


TrainOutput(global_step=27070, training_loss=0.6987362545444642, metrics={'train_runtime': 21182.8478, 'train_samples_per_second': 40.89, 'train_steps_per_second': 1.278, 'total_flos': 2.0180274724345344e+17, 'train_loss': 0.6987362545444642, 'epoch': 5.0})

In [104]:
from sklearn.metrics import classification_report

mapping = {'Attack': 0, 'Support': 1}

# Load the test dataset
df = pd.read_pickle("../data/kialo_references.pickle")
split = df[df['mode'] == 'test']
split['label'] = split['label'].map(mapping)

test_dataset = RelationDataset(split, tokenizer)

# Make predictions
raw_pred, _, _ = trainer.predict(test_dataset)
#preds = np.argmax(raw_pred, axis=1)
preds = torch.nn.functional.softmax(torch.from_numpy(raw_pred), dim=-1).argmax(dim=-1).numpy()

# Print classification report
report = classification_report(split['label'].values, preds)
print(report)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  split['label'] = split['label'].map(mapping)


              precision    recall  f1-score   support

           0       0.44      0.47      0.45        15
           1       0.76      0.74      0.75        34

    accuracy                           0.65        49
   macro avg       0.60      0.60      0.60        49
weighted avg       0.66      0.65      0.66        49



In [106]:
split['label'].values

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1])

In [105]:
preds

array([1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1])

In [112]:
from sklearn.metrics import classification_report

mapping = {'Attack': 0, 'Support': 1}

# Load the test dataset
df = pd.read_pickle("../data/kialo_references.pickle")
#split = df[df['mode'] == 'test']
#true_labels = split['label'].map(mapping)
split = df[(df['mode'] == 'test') & (df['label'] != 'Rephrase')] #Kialo data set
true_labels = split['label'].map(mapping) #kialo data set
test_dataset = RelationDataset(split[['premise', 'claim']], tokenizer)

# Make predictions
raw_pred, x, y = trainer.predict(test_dataset)
preds = np.argmax(raw_pred, axis=1)
predicted_labels = [pred for pred in preds]

# Print classification report
report = classification_report(true_labels, predicted_labels)
print(report)


              precision    recall  f1-score   support

           0       0.50      1.00      0.66     13237
           1       0.00      0.00      0.00     13480

    accuracy                           0.50     26717
   macro avg       0.25      0.50      0.33     26717
weighted avg       0.25      0.50      0.33     26717



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [113]:
predicted_labels[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [114]:
# Save the model
trainer.save_model("./models/kialo")
#tokenizer.save_pretrained("./models/")

In [92]:
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_pickle("../data/v.pickle")
split = df[df['mode'] != 'test']
data = split
#data['labels'] = data['label'].map({'Support': 0, 'Attack': 1})

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(data[['premise', 'claim']], data['label'], test_size=0.2)

# Load tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('nghuyong/ernie-2.0-en')
model = BertForSequenceClassification.from_pretrained('nghuyong/ernie-2.0-en')

# Preprocess data
train_encodings = tokenizer(train_texts['premise'].tolist(), train_texts['claim'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts['premise'].tolist(), test_texts['claim'].tolist(), truncation=True, padding=True)

# Create PyTorch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor([self.labels[idx], 1-self.labels[idx]])  
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_df = pd.concat([train_texts, train_labels], axis=1)
test_df = pd.concat([test_texts, test_labels], axis=1)

train_dataset = RelationDataset(train_df, tokenizer)
test_dataset = RelationDataset(test_df, tokenizer)

# Initialize trainer
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate=3e-5,
    logging_dir='./logs',            
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset            # evaluation dataset
)

# Train model
trainer.train()


You are using a model of type ernie to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at nghuyong/ernie-2.0-en were not used when initializing BertForSequenceClassification: ['ernie.encoder.layer.10.attention.self.query.bias', 'ernie.encoder.layer.10.attention.output.LayerNorm.bias', 'ernie.encoder.layer.7.attention.self.query.weight', 'ernie.encoder.layer.9.attention.output.LayerNorm.bias', 'ernie.encoder.layer.11.attention.self.query.bias', 'ernie.encoder.layer.0.attention.self.key.bias', 'ernie.encoder.layer.8.attention.self.key.bias', 'ernie.encoder.layer.9.attention.self.key.bias', 'ernie.encoder.layer.7.attention.output.dense.weight', 'ernie.encoder.layer.1.attention.output.LayerNorm.bias', 'ernie.encoder.layer.8.attention.output.dense.weight', 'ernie.encoder.layer.0.output.LayerNorm.weight', 'ernie.encoder.layer.11.intermediate.dense.weight', 'ernie.encoder.layer.7.output.LayerNo

Step,Training Loss
10,0.6872
20,0.6085
30,0.6391
40,0.5229
50,0.5963
60,0.5653
70,0.6028
80,0.5557
90,0.5963


TrainOutput(global_step=90, training_loss=0.5971285078260634, metrics={'train_runtime': 12.0908, 'train_samples_per_second': 114.136, 'train_steps_per_second': 7.444, 'total_flos': 90773314099200.0, 'train_loss': 0.5971285078260634, 'epoch': 5.0})

In [93]:
from sklearn.metrics import classification_report

# Load test data and preprocess
# test_data is your test dataframe
df = pd.read_pickle("../data/microtext_references.pickle")
split = df[df['mode'] == 'test']
test_data = split
test_texts = test_data[['premise', 'claim']]
test_labels = split['label'].map({'Support': 1, 'Attack': 0})


test_encodings = tokenizer(test_texts['premise'].tolist(), test_texts['claim'].tolist(), truncation=True, padding=True)
test_dataset = Dataset(encodings=test_encodings)

# Make predictions
predictions, _, _ = trainer.predict(test_dataset)

# The predictions are a (num_examples, num_classes) dimensional tensor. 
# To convert these into a label (0 or 1 in your case), you can use argmax to get the most likely class label
predicted_labels = np.argmax(predictions, axis=1)

# Compare with true labels
print(classification_report(test_labels, predicted_labels, target_names=['Support', 'Attack']))


              precision    recall  f1-score   support

     Support       0.00      0.00      0.00        15
      Attack       0.69      1.00      0.82        34

    accuracy                           0.69        49
   macro avg       0.35      0.50      0.41        49
weighted avg       0.48      0.69      0.57        49



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [94]:
predictions

array([[-0.47765657,  0.6284036 ],
       [-0.3827848 ,  0.5059931 ],
       [-0.51059556,  0.60775185],
       [-0.4582515 ,  0.6417749 ],
       [-0.43594295,  0.68597144],
       [-0.31045875,  0.5102212 ],
       [-0.573338  ,  0.77391195],
       [-0.28459394,  0.44321075],
       [-0.5249832 ,  0.6675417 ],
       [-0.50010026,  0.6454623 ],
       [-0.46879306,  0.5562685 ],
       [-0.34884903,  0.5171508 ],
       [-0.35918817,  0.554414  ],
       [-0.490783  ,  0.67655236],
       [-0.30465046,  0.4759059 ],
       [-0.30493003,  0.58166105],
       [-0.4115145 ,  0.6375058 ],
       [-0.2485849 ,  0.43692085],
       [-0.4644021 ,  0.6671212 ],
       [-0.37350306,  0.48364866],
       [-0.52495414,  0.741391  ],
       [-0.42503536,  0.5445449 ],
       [-0.20830788,  0.40509287],
       [-0.4156199 ,  0.61989534],
       [-0.3139051 ,  0.53776634],
       [-0.33418903,  0.582731  ],
       [-0.47801027,  0.67387575],
       [-0.5094697 ,  0.7961269 ],
       [-0.3214899 ,

In [95]:
predicted_labels

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1])

In [96]:
test_labels

0     0
1     0
2     0
3     0
4     0
5     1
6     1
7     0
8     0
9     1
10    1
11    1
12    1
13    0
14    0
15    0
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    0
28    0
29    1
30    1
31    1
32    1
33    1
34    1
35    1
36    1
37    1
38    1
39    0
40    1
41    1
42    1
43    0
44    1
45    1
46    1
47    0
48    1
Name: label, dtype: int64