In [1]:
from datasets import load_dataset, DatasetDict, load_from_disk, Value
import math
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding

In [2]:
sst = load_dataset('sst')

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
sst

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 2210
    })
})

In [4]:
checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
max_sequence_length = 128
batch_size = 32
eval_steps = 100
learning_rate=2e-05
num_train_epochs=5
output_dir = "../output/"
model_dir = "../models/"
early_stopping_patience = 10

In [6]:
treebank_detok = TreebankWordDetokenizer()

sst_mod = sst.map(
    lambda row: {
        "text": treebank_detok.detokenize(row["sentence"].split()),
        "label": min(math.floor(row["label"] / 0.2), 4.0),
    })

def tokenize_function(example):
    return tokenizer(example["text"],  truncation=True, padding="max_length", max_length=max_sequence_length )


tokenized_datasets = sst_mod.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-e1b5b68499f4feaa.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-aae790b7cb8bdaf4.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-360686b5b4f4ee4d.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-4386bdc35c8d6088.arrow


  0%|          | 0/2 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-5cb0c5a475d55d24.arrow


In [7]:
tokenized_datasets['train']

Dataset({
    features: ['sentence', 'label', 'tokens', 'tree', 'text', 'input_ids', 'attention_mask'],
    num_rows: 8544
})

In [8]:
tokenized_datasets = tokenized_datasets.remove_columns(["text", "tree", "tokens", "sentence" ])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.cast_column("labels", Value("int32"))
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-187ac6628165b53a.arrow


Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-a7dbbcbbe5f1a44e.arrow


['labels', 'input_ids', 'attention_mask']

In [9]:
# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(
#     tokenized_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator
# )
# dev_dataloader = DataLoader(
#     tokenized_datasets["dev"], batch_size=16, collate_fn=data_collator
# )

In [10]:
# for batch in train_dataloader:
#     break
# {k: v.shape for k, v in batch.items()}

In [11]:
from transformers import AutoModelForSequenceClassification, RobertaForSequenceClassification
model = RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=5)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [12]:
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np

In [13]:
# def compute_metrics(eval_preds):
#     metric = load_metric("imdb)
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

def compute_metrics(p):    
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average="macro")
    precision = precision_score(y_true=labels, y_pred=pred, average="macro")
    f1 = f1_score(y_true=labels, y_pred=pred, average="macro")    
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [14]:

# training_args = TrainingArguments('bert-base-uncased-imdb', 
#                                   learning_rate=2e-05,
#                                   per_device_train_batch_size =batch_size, 
#                                   per_device_eval_batch_size =batch_size,

#                           )

training_args = TrainingArguments(
   output_dir+"roberta-base-sst",
   evaluation_strategy ='steps',
   eval_steps = eval_steps , # Evaluation and Save happens every eval_steps steps
   save_total_limit = 1, # Only last  model is saved. Older ones are deleted.
   learning_rate=learning_rate,
   per_device_train_batch_size=batch_size,
   per_device_eval_batch_size=batch_size,
   num_train_epochs=num_train_epochs,
   metric_for_best_model = 'f1',
   load_best_model_at_end=True)

In [15]:
from transformers import Trainer, EarlyStoppingCallback

In [16]:

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
    compute_metrics=compute_metrics,
)


In [17]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Runtime,Samples Per Second
100,No log,1.218976,0.448683,0.452156,0.446105,0.401833,5.4464,202.152
200,No log,1.167417,0.47139,0.473426,0.461793,0.455763,5.5186,199.508
300,No log,1.208803,0.480472,0.474123,0.490661,0.460842,5.5423,198.655
400,No log,1.203087,0.499546,0.508124,0.489437,0.453265,5.5675,197.754
500,1.106400,1.119519,0.531335,0.525532,0.506224,0.492585,5.5664,197.795
600,1.106400,1.140512,0.527702,0.52206,0.536847,0.521905,5.5771,197.414
700,1.106400,1.139394,0.526794,0.519161,0.523742,0.513698,5.5559,198.168
800,1.106400,1.116919,0.541326,0.545,0.540185,0.535082,5.5649,197.849
900,1.106400,1.194856,0.550409,0.54616,0.551357,0.537165,5.7609,191.116
1000,0.788200,1.271916,0.522252,0.513949,0.539139,0.515242,5.8247,189.024


TrainOutput(global_step=1335, training_loss=0.8695027669270833, metrics={'train_runtime': 838.2639, 'train_samples_per_second': 1.593, 'total_flos': 4089619704913920, 'epoch': 5.0})

In [18]:
model.save_pretrained(model_dir+"roberta-base-sst")

In [19]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(2210, 5) (2210,)


In [20]:
from sklearn.metrics import classification_report
preds = np.argmax(predictions.predictions, axis=-1)

In [21]:
print(classification_report(predictions.label_ids, preds))

              precision    recall  f1-score   support

           0       0.57      0.47      0.51       279
           1       0.60      0.67      0.63       633
           2       0.44      0.35      0.39       389
           3       0.58      0.62      0.60       510
           4       0.65      0.68      0.67       399

    accuracy                           0.58      2210
   macro avg       0.57      0.56      0.56      2210
weighted avg       0.57      0.58      0.57      2210

