# Regression training

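Because the sentiment analysis tool should be able to return binary, 5-star, or decimal results, the task is treated as a regression problem: the model predicts a single continuous score, which can then be converted to whichever output scale is required.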

In [1]:
import os
from typing import Dict

import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set the HF_DATASETS_OFFLINE flag (offline mode of the Hugging Face `datasets` library).
# NOTE(review): `datasets` has already been imported by the time this runs —
# confirm the library still honours the variable when it is set after import.
os.environ.update({'HF_DATASETS_OFFLINE': "1"})

# Start a Weights & Biases run in the "sentiment-analysis" project.
wandb.init(project="sentiment-analysis")

[34m[1mwandb[0m: Currently logged in as: [33malbertocarot1[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Short alias used throughout the notebook -> positional arguments for `load_dataset`
# (dataset name, optionally followed by a configuration name).
dataset_sources = (
    ('sst', ("sst",)),
    ('ds_imdb', ("imdb",)),
    ('movrat', ("movie_rationales",)),
    ('tweet', ("tweet_eval", "sentiment")),
    ('rotten', ("rotten_tomatoes",)),
    ('amzpol', ("amazon_polarity",)),
)

# Filled one entry at a time, in the order listed above, so the datasets that
# loaded successfully stay available even if a later load raises.
# NOTE(review): the preprocessing cell indexes each value by split name, so the
# values behave like split-keyed containers rather than a single `Dataset` —
# confirm whether the annotation should be widened.
sa_datasets: Dict[str, Dataset] = {}
for alias, hub_args in dataset_sources:
    sa_datasets[alias] = load_dataset(*hub_args)

No config specified, defaulting to: sst/default
Found cached dataset sst (/home/alberto/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)
100%|██████████| 3/3 [00:00<00:00, 535.03it/s]
Found cached dataset imdb (/home/alberto/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
100%|██████████| 3/3 [00:00<00:00, 318.76it/s]
Downloading readme: 100%|██████████| 6.65k/6.65k [00:00<00:00, 4.82MB/s]


TypeError: can only concatenate str (not "int") to str

In [None]:

# Checkpoint that both the tokenizer and the model are initialised from.
starting_model = "xlm-roberta-base"  # "xlnet-large-cased"
# Sequence length used for the tokenizer limit here and for padding/truncation in `preprocess`.
max_num_tokens = 512

tokenizer = AutoTokenizer.from_pretrained(
    starting_model,
    model_max_length=max_num_tokens,
    truncation=True,
)

# A single output value, in line with the regression framing of this notebook.
model = AutoModelForSequenceClassification.from_pretrained(
    starting_model,
    num_labels=1,
)

In [None]:
def preprocess(samples, dataset_name):
    """Tokenize one example and attach its sentiment label as a float.

    Written for per-example use (``Dataset.map`` without ``batched=True``):
    ``samples["label"]`` is compared with scalars and passed to ``float``.

    Args:
        samples: Mapping holding at least a ``"text"`` entry and a ``"label"`` entry.
        dataset_name: Alias of the dataset the example comes from. Only
            ``"tweet"`` receives special treatment: label 1 becomes 0.5 and
            label 2 becomes 1, while every other label is kept as it is.

    Returns:
        The output of ``tokenizer`` (padded/truncated to ``max_num_tokens``)
        with an added ``"label"`` entry holding the label as a ``float``.
    """
    # Read the label before tokenizing: it lives on the input example, not on
    # the encoding returned by the tokenizer.
    label = samples["label"]

    if dataset_name == "tweet":
        # Rescale the three tweet classes onto [0, 1].
        # NOTE(review): presumably 0/1/2 mean negative/neutral/positive —
        # confirm against the dataset card.
        label = {1: 0.5, 2: 1}.get(label, label)

    encoded = tokenizer(samples["text"], truncation=True, padding="max_length", max_length=max_num_tokens)
    encoded["label"] = float(label)
    return encoded


# Bring every dataset to a common schema — a "text" column next to "label" —
# and then run `preprocess` over each example of the train/test/validation splits.
for ds_name, ds in sa_datasets.items():

    # Snapshot the split names first: entries of `ds` are reassigned below.
    for split in list(ds.keys()):
        if split not in ['train', 'test', 'validation']:
            continue

        # Work on a local handle and store it back only after every step has
        # succeeded, so a failure never leaves a half-converted split in `ds`.
        split_ds = ds[split]

        # Get all relevant text in 'text' column and remove useless columns
        if ds_name in ['sst', 'financ']:
            split_ds = split_ds.rename_column("sentence", "text")
            split_ds = split_ds.remove_columns(["tokens", "tree"])
        elif ds_name == 'movrat':
            split_ds = split_ds.rename_column("review", "text")
            split_ds = split_ds.remove_columns("evidences")
        elif ds_name == 'amzpol':
            # Build "text" as "<title><separator><content>" without touching
            # the original "title" column.
            reviews = split_ds.to_pandas()
            titles = reviews['title'].fillna('')
            bodies = reviews['content'].fillna('')

            # A title that already ends a sentence only needs a space; any
            # other title gets ". ". An empty or missing title gets no
            # separator, so the text does not start with a stray ". ".
            ends_sentence = titles.str.endswith(('.', '!', '?'))
            separators = ends_sentence.map({True: ' ', False: '. '}).mask(titles.eq(''), '')

            reviews['text'] = titles + separators + bodies
            split_ds = Dataset.from_pandas(reviews)

        ds[split] = split_ds.map(preprocess, fn_kwargs={'dataset_name': ds_name})

In [None]:
# def compute_metrics_for_regression(eval_pred):
#     logits, labels = eval_pred
#     labels = labels.reshape(-1, 1)
#
#     mse = mean_squared_error(labels, logits)
#     mae = mean_absolute_error(labels, logits)
#     r2 = r2_score(labels, logits)
#     single_squared_errors = ((logits - labels).flatten()**2).tolist()
#
#     # Compute accuracy
#     # The rounded prediction equals the true score only when the absolute error is < 0.5, i.e. the squared error is < 0.25
#     accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)
#
#     return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}