In [12]:
!pip install --quiet datasets

# Imports

In [13]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments
    )
from datasets import load_dataset
from sklearn.metrics import f1_score
from transformers import pipeline

# Data Preprocessing

In [14]:
dataset = load_dataset('emotion').rename_column('label', 'labels')

In [15]:
labels = dataset['train'].features['labels'].names
label2indx = {label: indx for indx, label in enumerate(labels)}
indx2label = {indx:label for indx, label in enumerate(labels)}

In [16]:
train_df = dataset['train'].to_pandas()

sample = train_df.head(10)
sample['labels'] = sample['labels'].apply(lambda x: indx2label[x])
sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['labels'] = sample['labels'].apply(lambda x: indx2label[x])


Unnamed: 0,text,labels
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
5,ive been feeling a little burdened lately wasn...,sadness
6,ive been taking or milligrams or times recomme...,surprise
7,i feel as confused about life as a teenager or...,fear
8,i have been with petronas for years i feel tha...,joy
9,i feel romantic too,love


In [17]:
checkpoint = 'FacebookAI/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [18]:
def tokenizer_wrapper(tokenizer, max_length, truncation=True):
  def inner(example):
    return tokenizer(example['text'], max_length=max_length, truncation=truncation)
  return inner

max_length = 512
truncation = True

wrapped_tokenizer = tokenizer_wrapper(tokenizer, max_length, truncation)
dataset = dataset.map(wrapped_tokenizer, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

# Loading the Pretrained Model

In [19]:
checkpoint = 'FacebookAI/roberta-base'
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    label2id=label2indx,
    id2label=indx2label
    ).to('cuda')


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

In [23]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average='weighted')
  return {"f1": f1}

In [24]:
train_config = dict(
  output_dir = '/results/models',
  num_train_epochs = 6,
  learning_rate = 2e-5,
  per_device_train_batch_size = 64,
  per_device_eval_batch_size = 512,
  evaluation_strategy = 'epoch',
  fp16 = True,
)
training_arguments = TrainingArguments(**train_config)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.183943,0.924923
2,0.173600,0.172448,0.928752
3,0.173600,0.161127,0.935228
4,0.108100,0.133779,0.943093
5,0.108100,0.143453,0.933861
6,0.074700,0.149766,0.936771


TrainOutput(global_step=1500, training_loss=0.11879023742675782, metrics={'train_runtime': 99.6779, 'train_samples_per_second': 963.102, 'train_steps_per_second': 15.048, 'total_flos': 2763982233070848.0, 'train_loss': 0.11879023742675782, 'epoch': 6.0})

In [33]:
checkpoint_dir = '/results/models/checkpoint-1500'
pipe = pipeline('text-classification', model=checkpoint_dir)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [77]:
def print_output(sentence, pipeline):
    label = pipeline(sentence)[0]['label']
    score = pipeline(sentence)[0]['score']
    print(f"input: {sentence} | label: {label} , score: {score: .2%}")

In [88]:
inputs = [
    "this is so good",
    "i feel so gloomy",
    "i love you with all my heart",
    "get the hell out of here",
    "im so scared help me please",
    "another beautiful day",
    "another shit day",
    "im so puzzled",
]

for input in inputs:
    print_output(input, pipe)

input: this is so good | label: joy , score:  99.94%
input: i feel so gloomy | label: sadness , score:  99.95%
input: i love you with all my heart | label: love , score:  95.11%
input: get the hell out of here | label: anger , score:  97.12%
input: im so scared help me please | label: fear , score:  99.88%
input: another beautiful day | label: joy , score:  96.81%
input: another shit day | label: anger , score:  96.70%
input: im so puzzled | label: surprise , score:  76.92%
