In [1]:
from datasets import load_dataset, Dataset, DatasetDict 
from transformers import AutoTokenizer, DataCollatorWithPadding
import pandas as pd

dataset = load_dataset('sentiment140')
dictionary = {}
mapping_dict = {0:0, 2: 1, 4: 2}

test_dataset = dataset["test"].to_pandas()
train_negative_df = test_dataset.query("sentiment == 0")[:50]
train_neutral_df  = test_dataset.query("sentiment == 2")[:50]
train_positive_df = test_dataset.query("sentiment == 4")[:50]

dictionary["train"] = pd.concat([train_negative_df, train_neutral_df, train_positive_df])

dictionary["train"]["sentiment"] = dictionary["train"]["sentiment"].map(mapping_dict)

validation_dataset = dataset["test"].to_pandas()
validation_negative_df = validation_dataset.query("sentiment == 0")[50:100]
validation_neutral_df  = validation_dataset.query("sentiment == 2")[50:100]
validation_positive_df = validation_dataset.query("sentiment == 4")[50:100]

dictionary["validation"] = pd.concat([validation_negative_df, validation_neutral_df, validation_positive_df])

dictionary["validation"]["sentiment"] = dictionary["validation"]["sentiment"].map(mapping_dict)

train_dataset = Dataset.from_dict(dictionary["train"])
validation_dataset = Dataset.from_dict(dictionary["validation"])
dataset = DatasetDict({"train":train_dataset, "validation":validation_dataset})

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], padding=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenized_datasets = tokenized_datasets.rename_column("sentiment", "label")

Map: 100%|██████████| 150/150 [00:00<00:00, 21647.65 examples/s]
Map: 100%|██████████| 150/150 [00:00<00:00, 26806.37 examples/s]


In [3]:
from transformers import TrainingArguments

output_dir = "./output"
training_args = TrainingArguments(
  output_dir=output_dir,
  num_train_epochs=3,  # Number of training epochs
  per_device_train_batch_size=8,  # Batch size per GPU
  per_device_eval_batch_size=8,   # Batch size for evaluation per GPU
  warmup_steps=400,   # Number of warmup steps for learning rate scheduler
  weight_decay=0.01,   # Strength of weight decay
  logging_dir="./logs",   # Directory for storing logs
  logging_steps=100,   # Log every N steps
  evaluation_strategy="steps",   # Evaluation strategy during training
  eval_steps=200,   # Run evaluation every N steps
  save_total_limit=2,   # Only save the last N checkpoints
  save_steps=200,   # Save checkpoint every N steps
  load_best_model_at_end=True,   # Load the best model at the end of training
  metric_for_best_model="accuracy",   # Metric to use for the best model
)


In [4]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import Trainer
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [6]:
trainer.train()

  0%|          | 0/57 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 57/57 [00:05<00:00, 10.60it/s]

{'train_runtime': 5.3767, 'train_samples_per_second': 83.695, 'train_steps_per_second': 10.601, 'train_loss': 1.0769747014631306, 'epoch': 3.0}





TrainOutput(global_step=57, training_loss=1.0769747014631306, metrics={'train_runtime': 5.3767, 'train_samples_per_second': 83.695, 'train_steps_per_second': 10.601, 'train_loss': 1.0769747014631306, 'epoch': 3.0})

In [7]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

  0%|          | 0/19 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:00<00:00, 59.16it/s]

(150, 3) (150,)





In [8]:
trainer.evaluate()

100%|██████████| 19/19 [00:00<00:00, 60.45it/s]


{'eval_loss': 0.998936653137207,
 'eval_accuracy': 0.5266666666666666,
 'eval_runtime': 0.3437,
 'eval_samples_per_second': 436.43,
 'eval_steps_per_second': 55.281,
 'epoch': 3.0}

In [9]:
trainer.save_model()