<a href="https://colab.research.google.com/github/audreyemmely/ia-generativa/blob/main/RoBERTa_iag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## TRAINING

In [None]:
!pip install transformers datasets huggingface_hub tensorboard
!pip install accelerate -U
!sudo apt-get install git-lfs --yes

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login

In [None]:
# Show the Hugging Face login widget. The training cell later reads a token via
# HfFolder.get_token(), presumably the one saved by this login.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Pretrained checkpoint that is fine-tuned below (also used to load the tokenizer).
model_id = "roberta-base"
# Dataset name passed to `load_dataset`.
dataset_id = "ag_news"
# Used both as the Trainer's local output directory and as the Hub model id.
repository_id = "audreyvasconcelos/iag-class"  # Change this to your own repository

In [None]:
# Fetch every split of the dataset named by `dataset_id`.
dataset = load_dataset(dataset_id)

# The whole train split is used for fine-tuning.
train_dataset = dataset["train"]

# The original test split is cut into two shards:
# shard 0 becomes the test set, shard 1 becomes the validation set.
test_dataset, val_dataset = (
    dataset["test"].shard(num_shards=2, index=shard_idx) for shard_idx in (0, 1)
)

# Tokenizer that matches the pretrained checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch with the notebook-level tokenizer.

    Sequences are truncated to at most 256 tokens and padded to the longest
    sequence contained in the batch that is passed in.

    Args:
        batch: mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        The tokenizer output for ``batch["text"]``.
    """
    encoding_options = {"padding": True, "truncation": True, "max_length": 256}
    return tokenizer(batch["text"], **encoding_options)

In [None]:
# Each split is tokenized as ONE batch (batch_size equals the split length), so
# the `padding=True` inside `tokenize` pads all examples of a split to one length.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=train_dataset.num_rows)
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=val_dataset.num_rows)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=test_dataset.num_rows)

# Expose only the model inputs and the label, as torch tensors, on every split.
model_columns = ["input_ids", "attention_mask", "label"]
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format("torch", columns=model_columns)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

In [None]:
# Read the label metadata from the training split's `label` feature.
num_labels = dataset["train"].features["label"].num_classes
class_names = dataset["train"].features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Build BOTH directions of the label mapping. Overriding only `id2label` would
# leave the config's default `label2id` (LABEL_0/LABEL_1) in place, so the saved
# config.json would carry two mappings that disagree with each other.
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in id2label.items()}

# Start from the checkpoint's config and attach the task-specific label names.
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label, "label2id": label2id})

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [None]:
# Load the pretrained checkpoint with the label-aware config built earlier; the
# classifier head weights are newly initialized (see the warning this cell prints).
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Args:
        eval_pred: the ``EvalPrediction`` object the Trainer passes in, carrying
            the model logits (``predictions``) and the gold ids (``label_ids``).

    Returns:
        dict with a single ``"accuracy"`` entry in the range [0, 1].
    """
    logits = eval_pred.predictions
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted_ids = logits.argmax(axis=-1)
    return {"accuracy": float((predicted_ids == eval_pred.label_ids).mean())}


# Training configuration. Checkpoints, logs and the Hub clone all live under
# `repository_id`, which doubles as the Hub model id.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and save once per epoch; the two strategies match, which
    # `load_best_model_at_end=True` relies on.
    evaluation_strategy="epoch",
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    # Push to the Hub on every checkpoint save.
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# The validation shard is used for the per-epoch evaluation; `compute_metrics`
# adds accuracy next to the loss that was the only reported metric before.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/audreyvasconcelos/iag-class into local empty directory.


In [None]:
# Run fine-tuning with the settings configured on `trainer`; the returned
# TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.1363,0.27148


TrainOutput(global_step=15000, training_loss=0.34263842718203863, metrics={'train_runtime': 5589.3283, 'train_samples_per_second': 21.469, 'train_steps_per_second': 2.684, 'total_flos': 1.578694680576e+16, 'train_loss': 0.34263842718203863, 'epoch': 1.0})

In [None]:
# Score the held-out test shard. The validation shard was already evaluated at
# the end of the training epoch, so calling evaluate() without arguments would
# only repeat that number while the test shard stayed unused.
# Metrics are reported under the "test_" prefix to keep them apart from validation.
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

{'eval_loss': 0.27148041129112244,
 'eval_runtime': 53.7326,
 'eval_samples_per_second': 70.721,
 'eval_steps_per_second': 8.84,
 'epoch': 1.0}

In [None]:
# Write the tokenizer files into the `repository_id` directory (the Trainer's
# output directory), presumably so they are included in the upload below.
tokenizer.save_pretrained(repository_id)
# Generate the model card from the Trainer before uploading.
trainer.create_model_card()

# Upload to the Hub repository; the cell output shows the resulting commit URL.
trainer.push_to_hub()

Upload file logs/events.out.tfevents.1692123324.089f8db985e6.2587.0:  14%|#3        | 32.0k/235k [00:00<?, ?B/…

Upload file logs/events.out.tfevents.1692128967.089f8db985e6.2587.1: 100%|##########| 311/311 [00:00<?, ?B/s]

To https://huggingface.co/audreyvasconcelos/iag-class
   6e472bb..1d42001  main -> main

   6e472bb..1d42001  main -> main



'https://huggingface.co/audreyvasconcelos/iag-class/commit/1d420011a4faf03011cbf13715535124311d9c70'

## TESTING

In [None]:
#!pip install transformers xformers

In [None]:
from transformers import pipeline
import pandas as pd

# Hub repository that hosts the fine-tuned classifier.
repository_id = "audreyvasconcelos/iag-class"
# Text-classification pipeline backed by the model stored in that repository.
classifier = pipeline(task="text-classification", model=repository_id)

# (text, expected label) pairs used as a small manual check of the classifier.
# The expected labels use the dataset's class names:
# "World", "Sports", "Business", "Sci/Tech".
data_pairs = [
    ("Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his innocence and vowing: quot;After the crucifixion comes the resurrection. quot; ..", "Sports"),
    ("Diplomatic tensions escalate as leaders exchange heated words.", "World"),
    ("Tensions rise as diplomatic talks stall between neighboring countries over a disputed border region, raising concerns of a potential conflict escalation.", "World"),
    ("In an unexpected upset, the underdog team clinches victory in the championship match, leaving fans exhilarated and experts in awe of their remarkable performance.", "Sports"),
    ("Scientists make a groundbreaking discovery of a new exoplanet with Earth-like conditions, igniting discussions about the possibility of extraterrestrial life.", "Sci/Tech"),
    ("Humanitarian organizations rush to provide aid to regions devastated by a powerful earthquake, coordinating efforts to deliver essential supplies and medical assistance.", "World"),
    ("Celebrations ensue as a legendary athlete sets a new world record in track and field, solidifying their position as one of the greatest sportspeople of all time.", "Sports"),
    ("Startup secures substantial funding from venture capitalists for its innovative approach to sustainable packaging, promising to revolutionize the industry.", "Business"),
    ("United Nations convenes an emergency session to address the worsening humanitarian crisis in a conflict-stricken region, calling for international intervention and aid.", "World"),
    ("The upcoming international sports event faces uncertainty as concerns over athlete safety and global health precautions take center stage amidst a lingering pandemic.", "Sports"),
    ("E-commerce giant announces plans to acquire a leading grocery chain, signaling a major shift in the retail landscape and intensifying competition in the sector.", "Business"),
    ("Autonomous vehicles reach a significant milestone, logging millions of accident-free miles, bolstering confidence in their potential to reshape the future of transportation.", "Sci/Tech"),
    ("Climate summit concludes with participating nations pledging to accelerate efforts to reduce carbon emissions, though skepticism remains about the enforceability of these commitments.", "World"),
    ("In a surprising trade, a star player moves from a long-time team, leaving fans divided and analysts speculating about the impact on team dynamics.", "Sports"),
    ("As remote work becomes the norm, tech companies introduce innovative tools to enhance virtual collaboration, promising increased productivity and work-life balance.", "Business")
]

def _predict_row(text, true_label):
    """Classify one text and pair the first returned prediction with its expected label."""
    first_prediction = classifier(text)[0]
    return {
        "text": text,
        "true_label": true_label,
        "predicted": first_prediction["label"],
        "score": first_prediction["score"],
    }


# One record per (text, expected label) pair, in the order of `data_pairs`.
data = [_predict_row(text, true_label) for text, true_label in data_pairs]

# Tabulate the records; the frame is the cell's displayed result.
df = pd.DataFrame(data)
df

Unnamed: 0,text,true_label,predicted,score
0,Kederis proclaims innocence Olympic champion K...,Sports,Sports,0.999062
1,Diplomatic tensions escalate as leaders exchan...,World,World,0.988531
2,Tensions rise as diplomatic talks stall betwee...,World,World,0.98862
3,"In an unexpected upset, the underdog team clin...",Sports,Sports,0.997818
4,Scientists make a groundbreaking discovery of ...,Sci/Tech,Sci/Tech,0.985414
5,Humanitarian organizations rush to provide aid...,World,World,0.983708
6,Celebrations ensue as a legendary athlete sets...,Sports,Sports,0.994682
7,Startup secures substantial funding from ventu...,Business,Sci/Tech,0.927047
8,United Nations convenes an emergency session t...,World,World,0.997521
9,The upcoming international sports event faces ...,Sports,Sports,0.98438


## Referências

MORAITES, A. Fine-tuning RoBERTa for Topic Classification with Hugging Face Transformers and Datasets Library. Disponível em: https://medium.com/@achillesmoraites/fine-tuning-roberta-for-topic-classification-with-hugging-face-transformers-and-datasets-library-c6f8432d0820