In [1]:
from datasets import load_dataset, load_from_disk
from transformers import BertModel, AutoTokenizer, DataCollatorWithPadding, PreTrainedModel, AutoConfig, Trainer, EarlyStoppingCallback, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import time, datetime
from datasets import load_metric
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

In [2]:
# Load the DatasetDict stored under ../data/sst5; the bare `sst` on the last line displays its splits and columns.
sst = load_from_disk("../data/sst5")
sst

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'ternary_label'],
        num_rows: 8544
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'ternary_label'],
        num_rows: 2210
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'ternary_label'],
        num_rows: 1101
    })
})

In [3]:
# Return columns as pandas objects so the next cells can call value_counts() and show a table.
sst.set_format("pandas")

In [4]:
# Relative frequency of each ternary label, printed for train, validation and test in that order.
for split_name in ("train", "validation", "test"):
    print(sst[split_name]['ternary_label'].value_counts(normalize=True))

2    0.422519
0    0.387406
1    0.190075
Name: ternary_label, dtype: float64
2    0.403270
0    0.388738
1    0.207993
Name: ternary_label, dtype: float64
0    0.412670
2    0.411312
1    0.176018
Name: ternary_label, dtype: float64


In [5]:
# Peek at the first five test rows to see `label` / `label_text` next to the derived `ternary_label`.
sst['test'][:5]

Unnamed: 0,text,label,label_text,ternary_label
0,"no movement , no yuks , not much of anything .",1,negative,0
1,"a gob of drivel so sickly sweet , even the eag...",0,very negative,0
2,` how many more voyages can this limping but d...,2,neutral,1
3,so relentlessly wholesome it made me want to s...,2,neutral,1
4,"gangs of new york is an unapologetic mess , wh...",0,very negative,0


In [6]:
# Undo the pandas output format chosen above (set_format with no arguments) before tokenizing.
sst.set_format()

In [7]:
# max_sequence_length = 2
# batch_size = 32
# learning_rate=2e-05
# num_epochs=2
# num_log_steps = 1000
# output_dir = "../output/"
# model_dir = "../models/"
# checkpoint = "bert-base-uncased"
# eval_steps = 1000

In [8]:
# Run configuration used by the cells below.
# NOTE(review): these values (6 tokens per sentence, evaluation every 5 steps) look like a quick
# smoke-test setup rather than settings for a real run — confirm before training on the full data.

# Length every text is truncated/padded to by tokenize_function
max_sequence_length = 6
# Per-device batch size for both training and evaluation
batch_size = 8
# Interval (in optimisation steps) used for both evaluation and checkpoint saving
eval_steps = 5
learning_rate=2e-05
num_train_epochs=3
# Prefix for the Trainer's output/checkpoint directory
output_dir = "../output/"
# Prefix for the directory the final model is saved to with save_pretrained
model_dir = "../models/"
# Pretrained checkpoint used for both the tokenizer and the encoder
checkpoint = "bert-base-uncased"
# Passed to EarlyStoppingCallback when the Trainer is built
early_stopping_patience = 10

In [9]:
# Tokenizer belonging to the same pretrained checkpoint that the classifier's encoder is loaded from.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
def tokenize_function(example):
    """Tokenize the "text" field, truncating and padding each entry to max_sequence_length."""
    texts = example["text"]
    encoded = tokenizer(
        texts,
        max_length=max_sequence_length,
        padding="max_length",
        truncation=True,
    )
    return encoded


# batched=True passes whole batches of rows to tokenize_function instead of one row at a time.
tokenized_datasets = sst.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at ../data/sst5/train/cache-511cc9f3c86622a9.arrow
Loading cached processed dataset at ../data/sst5/test/cache-caf78cf3dbbb038b.arrow
Loading cached processed dataset at ../data/sst5/validation/cache-f216719faa867d9e.arrow


In [11]:
# Keep only the 3-way target (renamed to "labels", the argument name Classifier.forward takes)
# and hand tensors to the model; the last line displays the remaining columns.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['label_text', 'label'])
    .rename_column("ternary_label", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [12]:
class Classifier(PreTrainedModel):
    """BERT encoder followed by a masked mean-pooling classification head.

    The encoder output at position 0 (the slot the earlier CLS-based head read from) is
    skipped; the remaining positions are averaged, weighted by the attention mask, then
    passed through dropout and a linear layer that produces one logit per class.
    """

    def __init__(self, checkpoint, n_classes):
        """Build the encoder and the classification head.

        Args:
            checkpoint: Name or path handed to ``AutoConfig.from_pretrained`` and
                ``BertModel.from_pretrained``.
            n_classes: Number of output classes (size of the logits' last dimension).
        """
        # Record the number of labels on the wrapper's own config so that it is part of
        # what save_pretrained writes out.
        super().__init__(AutoConfig.from_pretrained(checkpoint, num_labels=n_classes))
        self.n_classes = n_classes
        self.model = BertModel.from_pretrained(checkpoint, num_labels=self.n_classes)
        self.dropout = nn.Dropout(0.1)
        # Width of the encoder's hidden states, which is the input size of the head.
        self.hidden_dim = self.model.config.hidden_size
        self.classifier_layer = nn.Linear(self.hidden_dim, self.n_classes)

    def forward(self, input_ids=None, attention_mask=None, labels=None, token_type_ids=None):
        """Encode the batch, mean-pool the token states and classify.

        Args:
            input_ids: Token ids for the batch.
            attention_mask: 1 for real tokens, 0 for padding. When omitted, every
                position is treated as a real token.
            labels: Optional class indices; when given, a cross-entropy loss is returned.
            token_type_ids: Optional segment ids, forwarded to the encoder.

        Returns:
            SequenceClassifierOutput with ``loss`` (None when ``labels`` is None),
            ``logits`` and the encoder's ``hidden_states`` / ``attentions``.
        """
        if attention_mask is None:
            # The signature allows None, but the pooling below needs a concrete mask.
            attention_mask = torch.ones_like(input_ids)

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Skip position 0 and average the remaining positions, ignoring padded ones.
        embeds = outputs.last_hidden_state[:, 1:, :]
        mask = attention_mask[:, 1:].unsqueeze(-1).to(embeds.dtype)
        # Guard against rows without any attended token after position 0 (would give 0/0 = NaN).
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        mean_embeds = (embeds * mask).sum(dim=1) / token_counts

        # Dropout is only active in training mode; it is a no-op during evaluation.
        logits = self.classifier_layer(self.dropout(mean_embeds))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:

# Build the 3-class classifier on top of the pretrained checkpoint and move it to the
# selected device; the trailing `device` displays which one is in use.
model = Classifier(n_classes=3, checkpoint=checkpoint)
model = model.to(device)
device

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


device(type='cpu')

In [15]:
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision/recall/F1 for the Trainer.

    Args:
        p: A (predictions, labels) pair. ``predictions`` holds the per-class logits,
            or a tuple whose first element is the logits; ``labels`` holds the true
            class indices.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    pred, labels = p
    # If the model emitted extra outputs next to the logits, keep only the logits.
    if isinstance(pred, tuple):
        pred = pred[0]
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # zero_division=0 scores a class with no predicted (or no true) samples as 0, which is
    # what the default does as well, but without emitting an undefined-metric warning on
    # every evaluation.
    recall = recall_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [16]:
# Display the step interval that TrainingArguments below uses for both evaluation and saving.
eval_steps

5

In [17]:
# Evaluation and checkpoint saving run on the same step interval, and the checkpoint with the
# best macro F1 (the "f1" key returned by compute_metrics) is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=output_dir+"bert-base-uncased-sst3",
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    save_steps=eval_steps,
    save_total_limit=1,  # cap on retained checkpoints; older ones are deleted
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

In [18]:
from transformers import Trainer, EarlyStoppingCallback

# Trainer for the fine-tuning run. Only the first 32 rows of the train and validation
# splits are used here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(32)),
    eval_dataset=tokenized_datasets["validation"].select(range(32)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=early_stopping_patience),
    ],
)


In [19]:
# Run fine-tuning; evaluation, checkpoint saving and early stopping follow training_args and the callback above.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `Classifier.forward` and have been ignored: text. If text are not expected by `Classifier.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 32
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
5,No log,1.126506,0.34375,0.27381,0.328283,0.237179
10,No log,1.152663,0.34375,0.27381,0.328283,0.237179


The following columns in the evaluation set  don't have a corresponding argument in `Classifier.forward` and have been ignored: text. If text are not expected by `Classifier.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ../output/bert-base-uncased-sst3/checkpoint-5
Configuration saved in ../output/bert-base-uncased-sst3/checkpoint-5/config.json
Model weights saved in ../output/bert-base-uncased-sst3/checkpoint-5/pytorch_model.bin
tokenizer config file saved in ../output/bert-base-uncased-sst3/checkpoint-5/tokenizer_config.json
Special tokens file saved in ../output/bert-base-uncased-sst3/checkpoint-5/special_tokens_map.json
Deleting older checkpoint [../output/bert-base-uncased-sst3/checkpoint-3] due to args.save_total_limit
Deleting older checkpoint [../output/bert-base-uncased-sst3/checkpoint-4] due to args.save_total_limit
The followi

TrainOutput(global_step=12, training_loss=0.9552624225616455, metrics={'train_runtime': 14.232, 'train_samples_per_second': 6.745, 'train_steps_per_second': 0.843, 'total_flos': 296002594944.0, 'train_loss': 0.9552624225616455, 'epoch': 3.0})

In [20]:
# Second Trainer around the same model, built without train/eval datasets or callbacks;
# it is only used to run predictions.
trainer_eval = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Score a random sample of 1000 test sentences. The shuffle is seeded so that the same
# sample (and therefore the same report) is produced on every run.
test_sample = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
predictions = trainer_eval.predict(test_sample)
print(predictions.predictions.shape, predictions.label_ids.shape)
preds = np.argmax(predictions.predictions, axis=-1)
# zero_division=0 reports 0 for classes that are never predicted instead of warning about them.
print(classification_report(predictions.label_ids, preds, zero_division=0))

In [None]:
# Save the fine-tuned Classifier under model_dir (separate from the Trainer checkpoints under output_dir).
model.save_pretrained(model_dir+"bert-base-uncased-sst3")

In [None]:
#model.push_to_hub("redacted/bert-base-uncased-sst3")

In [None]:
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification
# NOTE(review): no cell in this notebook writes to "../output/huggingface/test_model" (checkpoints go
# under output_dir and the final model under model_dir) — confirm where this directory comes from.
# NOTE(review): the custom Classifier keeps its encoder in `self.model` and its head in
# `self.classifier_layer`; presumably those parameter names differ from the ones
# BertForSequenceClassification expects, so verify the fine-tuned weights are actually loaded.
model_finetuned = BertForSequenceClassification.from_pretrained("../output/huggingface/test_model")

In [21]:
import transformers
# Record the transformers version this notebook was executed with.
transformers.__version__

'4.17.0'