In [None]:
%load_ext autoreload
%autoreload 2

First, we prepare the dataset.

In [None]:
import sys
# Add the local `src/` directory to the import search path. Presumably this is
# where the `datasets.tripadvisor` package imported in the next cell lives —
# verify. The entry is relative, so it depends on the kernel's working directory.
sys.path.append("src/")


In [None]:
from datasets.tripadvisor import TripAdvisorDataset

# Gather the dataset settings in one place, then build the TripAdvisor dataset
# from them.
dataset_kwargs = {
    'text_processor': 'word2vec',
    'text_processor_filters': ['lowercase', 'stopwordsfilter'],
    'embedding_path': '../data/embeddings/word2vec/glove.6B.50d.txt',
    'data_path': '../data/',
}
dataset = TripAdvisorDataset(**dataset_kwargs)

In [None]:
# Walk the dataset once and collect each field into its own parallel list
# (index i of every list refers to the same datapoint).
annotators, texts, labels = [], [], []

for sample in dataset:
    annotators.append(sample['annotator'])
    texts.append(sample['text'])
    # `.item()` is called on the label before storing it — presumably the label
    # is a tensor/array scalar being unwrapped into a plain Python number.
    labels.append(sample['label'].item())

Now we split the data into training, validation, and test sets (60% / 20% / 20%).

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Two-stage split producing 60% train / 20% validation / 20% test:
#   1. hold out 20% of all samples as the test set;
#   2. take 25% of the remaining 80% as the validation set
#      (0.25 x 0.8 = 0.2 of the total), leaving 60% for training.
# Both stages use the same fixed random_state.
split_seed = 1

x_trainval, x_test, y_trainval, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=split_seed
)
x_train, x_val, y_train, y_val = train_test_split(
    x_trainval, y_trainval, test_size=0.25, random_state=split_seed
)

In [None]:
from transformers import LongformerTokenizerFast

# Load the pretrained tokenizer for the Longformer base checkpoint.
checkpoint = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizerFast.from_pretrained(checkpoint)

In [None]:
def _encode_split(split_texts):
    """Tokenize one split of texts with truncation and padding enabled."""
    return tokenizer(split_texts, truncation=True, padding=True)


x_train_encoded = _encode_split(x_train)
x_val_encoded = _encode_split(x_val)
x_test_encoded = _encode_split(x_test)

Now we create a dataset class that wraps the encodings and labels so they are ready for `transformers.Trainer`.

In [None]:
import torch 

# https://huggingface.co/transformers/custom_datasets.html

class CurrentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Each item is a dict with one tensor per encoding field and, when labels
    were supplied, an additional ``'labels'`` tensor.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence.
            It must support ``.items()`` and ``.values()``.
        labels: Optional sequence of per-sample labels, aligned by index with
            ``encodings``. When ``None`` (e.g. unlabeled data used only for
            prediction), items carry no ``'labels'`` entry and the dataset
            length is taken from the encodings instead.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, fall back to the length of the first encoding field
        # (assumes every field holds one entry per sample); empty mapping -> 0.
        first_field = next(iter(self.encodings.values()), ())
        return len(first_field)


# Wrap each split's encodings and labels in a CurrentDataset.
train_dataset, val_dataset, test_dataset = (
    CurrentDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (x_train_encoded, y_train),
        (x_val_encoded, y_val),
        (x_test_encoded, y_test),
    )
)

The following cell just prints a message to the terminal to mark the start of the training stage.

In [None]:
# Progress marker printed when the training stage begins.
print('COMMENCING TRAINING & FINE-TUNING')

### Fine-tuning

Now we set up the model and the training arguments, and instantiate a `Trainer`. :)

In [None]:
from transformers import (
    LongformerForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# https://huggingface.co/transformers/custom_datasets.html

training_args = TrainingArguments(
    output_dir='../models',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 steps
)

# Seed the RNGs *before* loading the model: the sequence-classification head
# added on top of the pretrained encoder is randomly initialised, and Trainer
# only applies `training_args.seed` later, after the head already exists.
# Seeding here makes the head initialisation repeatable between runs.
set_seed(training_args.seed)

# NOTE(review): `num_labels` is not passed, so the model is built with the
# library's default number of classes — confirm this matches the label set
# produced by the dataset.
model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")

# NOTE(review): an eval_dataset is supplied but `training_args` sets no
# evaluation schedule — confirm whether periodic validation is intended.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
%tensorboard --logdir runs