# Regression training

Since the sentiment analysis tool should be able to return binary, 5-star, or decimal results, the problem is treated as a regression problem: the model predicts a single continuous score that can later be mapped to any of these output formats.

In [2]:
import os
from typing import Dict

import datasets
import wandb
from datasets import load_dataset, Dataset, DatasetDict, Value, concatenate_datasets
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [3]:
# Work from the local Hugging Face cache only.
# `datasets` reads HF_DATASETS_OFFLINE once, when the package is imported, and the
# package is already imported by the time this cell runs. Setting the environment
# variable alone would therefore be ignored, so the flag is also set on the loaded
# config module. The environment variable is kept for any subprocess.
os.environ['HF_DATASETS_OFFLINE'] = "1"
datasets.config.HF_DATASETS_OFFLINE = True

# Start the Weights & Biases run that the Trainer reports to.
wandb.init(project="sentiment-analysis")

[34m[1mwandb[0m: Currently logged in as: [33malbertocarot1[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Short dataset key -> the object returned by `load_dataset` without a `split`
# argument, i.e. a DatasetDict mapping split names to Dataset objects
# (later cells index it by split and assign new splits to it).
sa_datasets: Dict[str, DatasetDict] = {}

sa_datasets['amzpol'] = load_dataset("amazon_polarity")
sa_datasets['sst'] = load_dataset("sst")
sa_datasets['ds_imdb'] = load_dataset("imdb")
sa_datasets['movrat'] = load_dataset("movie_rationales")
sa_datasets['tweet'] = load_dataset("tweet_eval", "sentiment")
sa_datasets['rotten'] = load_dataset("rotten_tomatoes")

Found cached dataset amazon_polarity (/home/alberto/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/a27b32b7e7b88eb274a8fa8ba0f654f1fe998a87c22547557317793b5d2772dc)


  0%|          | 0/2 [00:00<?, ?it/s]

No config specified, defaulting to: sst/default
Found cached dataset sst (/home/alberto/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset imdb (/home/alberto/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset movie_rationales (/home/alberto/.cache/huggingface/datasets/movie_rationales/default/0.1.0/70ed6b72496c90835e8ee73ebf8d0e49f5ad3aa93f302c8a4b6c886143cfb779)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset tweet_eval (/home/alberto/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset rotten_tomatoes (/home/alberto/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

Two of the six datasets (IMDB and Amazon Polarity) don't have a validation set, so 12.5% of their training set is held out, stratified by label, and used for validation.

In [5]:
# Fraction of the original training set that is held out for validation.
VALIDATION_FRACTION = 0.125
# Fixed seed so that the train/validation partition is identical on every run.
SPLIT_SEED = 42

# Datasets that ship without a validation split get one carved out of 'train'.
for ds_name in ('ds_imdb', 'amzpol'):
    train_val_dataset = sa_datasets[ds_name]['train'].train_test_split(
        test_size=VALIDATION_FRACTION,
        stratify_by_column='label',
        seed=SPLIT_SEED,
    )
    # `train_test_split` names the held-out part 'test'; it is stored as 'validation'.
    sa_datasets[ds_name]['train'] = train_val_dataset['train']
    sa_datasets[ds_name]['validation'] = train_val_dataset['test']


The candidate models are chosen among the best-performing ones on this task, while also taking into account that the service is expected to be reasonably fast; computationally heavy models are therefore discarded.
XLM-RoBERTa and XLNet are the two strongest candidates.

In [6]:
# Maximum sequence length (in tokens) handled by the tokenizer
max_num_tokens = 512
# Checkpoint to fine-tune
starting_model = "xlm-roberta-base"  # "xlnet-large-cased"

tokenizer = AutoTokenizer.from_pretrained(
    starting_model,
    model_max_length=max_num_tokens,
    truncation=True,
)

# A single output unit: the model predicts one score per input
model = AutoModelForSequenceClassification.from_pretrained(
    starting_model,
    num_labels=1,
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

The datasets are tokenized, and the labels are converted to float values.

In [None]:
def normalize(samples):
    """Rescale a tweet sentiment label so that it lies between 0 and 1.

    A label of 1 becomes 0.5 and a label of 2 becomes 1; any other label is
    left as it is. The example is modified in place and also returned.
    """
    for raw_label, scaled_label in ((1, 0.5), (2, 1)):
        if samples["label"] == raw_label:
            samples["label"] = scaled_label
            break
    return samples

# Bring every processed split of every dataset to a common layout: the text in a
# 'text' column, the label cast to float32, plus the tokenizer outputs.
# NOTE(review): this cell overwrites the entries of `sa_datasets` in place, so
# re-running it on already processed data would presumably fail (e.g. the column
# renamed below no longer exists) — confirm before re-executing.
for ds_name, ds in sa_datasets.items():
    for split in ds.column_names.keys():
        # Only the three standard splits are processed; any other split is skipped
        if split not in ['train', 'test', 'validation']:
            continue
        # Get all relevant text in 'text' column and remove useless columns
        if ds_name in ['sst', 'financ']:  # NOTE(review): no 'financ' dataset is loaded in the cells shown — confirm it is still needed
            ds[split] = ds[split].rename_column("sentence", "text")
            ds[split] = ds[split].remove_columns(["tokens", "tree"])
        elif ds_name == 'movrat':
            ds[split] = ds[split].rename_column("review", "text")
            ds[split] = ds[split].remove_columns("evidences")
        elif ds_name == 'amzpol':
            # Merge title and content into a single 'text' field (done in pandas):
            # titles not ending with '.', '!' or '?' get '. ' appended, the others a single space
            df = ds[split].to_pandas()
            titles_with_punct = df['title'].str.endswith(('.', '!', '?'))
            df.loc[~titles_with_punct, 'title'] += '. '
            df.loc[titles_with_punct, 'title'] += ' '
            df['text'] = df['title'] + df['content']
            ds[split] = Dataset.from_pandas(df)
            ds[split] = ds[split].remove_columns(["title", "content"])
        elif ds_name == "tweet":
            # Rescale the labels with `normalize` (1 -> 0.5, 2 -> 1), one example at a time
            ds[split] = ds[split].map(normalize)

        # Every dataset ends up with a float32 'label' column
        ds[split] = ds[split].cast_column('label', Value("float32"))

        # Tokenize 'text' in batches, padding/truncating every example to `max_num_tokens` tokens
        ds[split] = ds[split].map(lambda rows: tokenizer(rows['text'], padding='max_length', max_length=max_num_tokens, truncation=True), batched=True)
        print(f"{ds_name = }, {split = } completed.")

Casting the dataset:   0%|          | 0/3150 [00:00<?, ?ba/s]

  0%|          | 0/3150 [00:00<?, ?ba/s]

ds_name = 'amzpol', split = 'train' completed.


Casting the dataset:   0%|          | 0/400 [00:00<?, ?ba/s]

  0%|          | 0/400 [00:00<?, ?ba/s]

In [None]:
def compute_metrics_for_regression(eval_pred):
    """Compute the regression metrics reported at every evaluation.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Both are used as NumPy arrays;
            ``labels`` is reshaped to a column vector to line up with
            ``logits`` (assumed to have shape ``(n, 1)`` — TODO confirm).

    Returns:
        A dict with the keys ``"rmse"``, ``"mse"``, ``"mae"``, ``"r2"`` and
        ``"accuracy"``.
    """
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mae = mean_absolute_error(labels, logits)
    mse = mean_squared_error(labels, logits)
    # RMSE is derived from the MSE instead of calling
    # `mean_squared_error(..., squared=False)`: that argument is deprecated and
    # absent from recent scikit-learn releases, and this avoids a second pass.
    rmse = mse ** 0.5

    r2 = r2_score(labels, logits)

    # Pseudo-accuracy: a prediction counts as correct when it is less than 0.5
    # away from its label (squared error < 0.25). For 0/1 labels and predictions
    # inside [0, 1] this equals binary classification with a 0.5 threshold.
    squared_errors = (logits - labels).flatten() ** 2
    accuracy = float((squared_errors < 0.25).mean())

    return {"rmse": rmse, "mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}

In [None]:
# Per-device batch size, used for both training and evaluation
batch_size = 4
training_args = TrainingArguments(
    output_dir ='./trained-models',
    num_train_epochs = 10,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    learning_rate = 2e-5,
    save_total_limit = 3,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    load_best_model_at_end = True,
    metric_for_best_model = 'rmse',
    # RMSE is an error, so lower is better. When left unset, the Trainer assumes
    # that a greater value is better for any metric other than the loss, and
    # would keep/reload the checkpoint with the *highest* RMSE.
    greater_is_better = False,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    # gradient_accumulation_steps=6,
    report_to="wandb"
)


In [None]:
# Gather the 'train' and 'validation' splits of every dataset, in dataset order
train_sets, validation_sets = [], []
split_buckets = {'train': train_sets, 'validation': validation_sets}

for ds_name, ds in sa_datasets.items():
    for split in ds.column_names.keys():
        bucket = split_buckets.get(split)
        if bucket is not None:
            bucket.append(ds[split])

# Merge the per-dataset splits into one training set and one validation set
train_set = concatenate_datasets(train_sets)
validation_set = concatenate_datasets(validation_sets)

In [None]:
# Build the Trainer on the merged train/validation sets and run the fine-tuning
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=validation_set,
    compute_metrics=compute_metrics_for_regression,
)
trainer = Trainer(**trainer_kwargs)

trainer.train()
# NOTE(review): despite its name, this holds the best value of the metric selected by
# `metric_for_best_model` in the training arguments, not necessarily an accuracy.
best_accuracy = trainer.state.best_metric

In [None]:
import torch

In [None]:
# Peak amount of GPU memory allocated on CUDA device 0, as tracked by PyTorch
torch.cuda.max_memory_allocated(0)

In [None]:
# Properties of CUDA device 0, as reported by PyTorch
torch.cuda.get_device_properties(0)

In [None]:
# Peak amount of GPU memory reserved by PyTorch on CUDA device 0
torch.cuda.max_memory_reserved(0)

In [None]:
# Ask PyTorch to release the GPU memory it is caching but not currently using
torch.cuda.empty_cache()