In [8]:
import pandas as pd
import numpy as np
import torch

from transformers import (
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    BertTokenizer,
    DataCollatorWithPadding,
)
from datasets import Dataset

In [15]:
# Maximum sequence length (in tokens) passed as `max_length` to the tokenizer
# calls in this notebook; with truncation enabled, longer inputs are cut to this size.
max_length = 200

In [2]:
# Pretrained BERT encoder with a freshly initialised 3-way classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
# `model_max_length` is the tokenizer-level length limit; a bare `max_length`
# keyword here is not interpreted as a limit by `from_pretrained`.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased", model_max_length=max_length
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def tokenize(examples):
    """Tokenize a batch of examples for the BERT model.

    Args:
        examples: A batch mapping that contains a ``"sentences"`` entry with
            the raw texts (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch, truncated to ``max_length`` tokens.

    No padding is applied here on purpose: padding is left to the
    ``DataCollatorWithPadding`` used by the Trainer, which pads each training
    batch only to its own longest sequence instead of padding every example
    to the longest sequence of a whole ``map`` batch.
    """
    return tokenizer(examples["sentences"], truncation=True, max_length=max_length)


# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Read each processed CSV split, wrap it in a `Dataset` and tokenize it in batches.
tokenized_splits = []
for d in ["train", "test"]:
    split_frame = pd.read_csv(f"./data/processed/{d}.csv")
    split_dataset = Dataset.from_pandas(split_frame)
    tokenized_splits.append(split_dataset.map(tokenize, batched=True))

trainset, testset = tokenized_splits

Map: 100%|██████████| 1413/1413 [00:00<00:00, 4379.79 examples/s]
Map: 100%|██████████| 354/354 [00:00<00:00, 4260.94 examples/s]


In [4]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from model logits.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` has one row of
            class scores per example; ``labels`` holds the integer class ids.

    Returns:
        dict with two float entries:
            ``"accuracy"``: fraction of examples whose argmax class equals the label.
            ``"f1"``: unweighted mean of the per-class F1 scores, taken over
            every class that appears in the labels or in the predictions.
        Both values are ``0.0`` when there are no examples.

    The metrics are computed directly with NumPy, so nothing has to be loaded
    on each call, and the F1 score is well defined for more than two classes.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels).ravel()
    if labels.size == 0:
        # Nothing to score; avoid argmax/mean over an empty array.
        return {"accuracy": 0.0, "f1": 0.0}

    predictions = np.argmax(logits, axis=-1).ravel()
    accuracy = float(np.mean(predictions == labels))

    per_class_f1 = []
    for cls in np.union1d(labels, predictions):
        predicted_cls = predictions == cls
        actual_cls = labels == cls
        true_pos = np.sum(predicted_cls & actual_cls)
        false_pos = np.sum(predicted_cls & ~actual_cls)
        false_neg = np.sum(~predicted_cls & actual_cls)
        # F1 = 2TP / (2TP + FP + FN); the denominator is non-zero because the
        # class occurs in the labels or the predictions, but guard regardless.
        denominator = 2 * true_pos + false_pos + false_neg
        per_class_f1.append(2 * true_pos / denominator if denominator else 0.0)

    f1 = float(np.mean(per_class_f1))
    return {"accuracy": accuracy, "f1": f1}

In [20]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="epoch",
    # Optimisation settings
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# NOTE(review): no evaluation strategy is configured and `trainer.evaluate()` is
# never called in this cell, so `compute_metrics` is presumably not exercised
# during this run -- confirm whether that is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=trainset,
    eval_dataset=testset,
)

# Run the fine-tuning loop
trainer.train()

# Persist the fine-tuned weights and the tokenizer files side by side so that
# both can be reloaded from the same directory.
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

Step,Training Loss


('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json')

In [26]:
# Reload the saved artifacts from disk so that inference uses exactly what was
# written to "./sentiment_model" rather than the in-memory training objects.
t_trained = BertTokenizer.from_pretrained("./sentiment_model")
m_trained = BertForSequenceClassification.from_pretrained("./sentiment_model")

In [27]:
# Define the sentence to classify
sentence = "My day is the very best i'm so happy."

# Tokenize the sentence
inputs = t_trained(
    sentence, padding=True, truncation=True, return_tensors="pt", max_length=max_length
)

# Get the model's prediction (regression score)
with torch.no_grad():
    outputs = m_trained(**inputs)
    print(outputs.logits)
    print(np.argmax(outputs.logits))

tensor([[-1.5591,  1.3360,  0.2557]])
tensor(1)
