In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset, Dataset
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score
from google.colab import files

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

##### **Load Datasets**

In [2]:
def upload_dataset_from_system():
  """Prompt for a parquet file through the Colab upload widget and load it.

  Only the first entry of the mapping returned by `files.upload()` is used;
  its key is handed to `pd.read_parquet` as the path to read.

  Returns:
    The result of `Dataset.from_pandas` applied to the parquet contents.

  Raises:
    ValueError: if the upload returned no files (e.g. the dialog was cancelled).
  """
  uploaded = files.upload()
  if not uploaded:
    # Without this guard an empty upload surfaces as an opaque IndexError.
    raise ValueError("No file was uploaded; expected one parquet file.")
  # Keys of `uploaded` are the names of the uploaded files; take the first one.
  file_name = next(iter(uploaded))
  return Dataset.from_pandas(pd.read_parquet(file_name))

# --- return dataset classes ---
def dataset_classes(dataset):
  """Return the distinct 'label' values found in `dataset`.

  Args:
    dataset: iterable of items that support `item['label']`.

  Returns:
    A list containing every distinct label exactly once. The list is sorted
    so the result does not depend on arbitrary set iteration order; if the
    labels cannot be compared with each other, they are returned unsorted.
  """
  unique_labels = {item['label'] for item in dataset}
  try:
    return sorted(unique_labels)
  except TypeError:
    # Mixed, non-comparable label types: keep working, just without ordering.
    return list(unique_labels)

# --- model name ---
# Checkpoint identifier shared by the tokenizer below and by every
# `AutoModelForSequenceClassification.from_pretrained(model_name, ...)` call.
model_name = "roberta-base"

# --- tokenizer ---
# Module-level tokenizer: `preproces_data` calls it to encode text and
# `collate_fn` reads `tokenizer.pad_token_id` from it, both as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [4]:
# --- load dataset 20-classes ---
# Each call triggers its own `files.upload()` prompt, so the parquet files
# must be supplied in this order: train, then validation, then test.
train_data = upload_dataset_from_system()
val_data = upload_dataset_from_system()
test_data = upload_dataset_from_system()  # max_length -> 40 (the max_len used when these splits are tokenized)

Saving train-00000-of-00001.parquet to train-00000-of-00001.parquet


Saving validation-00000-of-00001.parquet to validation-00000-of-00001.parquet


Saving test-00000-of-00001.parquet to test-00000-of-00001.parquet


In [3]:
# --- load sst5 ---
# The 'train', 'validation' and 'test' splits of this object are the ones
# tokenized later in the notebook.
sst = load_dataset("SetFit/sst5")  # max_length -> 60 (the max_len used when these splits are tokenized)

README.md:   0%|          | 0.00/421 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.jsonl: 0.00B [00:00, ?B/s]

dev.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/8544 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1101 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2210 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

##### **Preprocessing & Dataset Utilities**

In [5]:

def preproces_data(example,max_len,text ='text',label = 'label'):
  """Tokenize one example and bundle the encoded inputs with its label.

  Args:
    example: mapping that holds the raw text and the label of a single row.
    max_len: forwarded to the tokenizer as `max_length`; together with
      `truncation=True` and `padding="max_length"` it is both the truncation
      limit and the padding target.
    text: key under which `example` stores the text.
    label: key under which `example` stores the label.

  Returns:
    Dict with 'input_ids' and 'attention_mask' exactly as returned by the
    tokenizer (no tensor conversion happens here) and 'labels' holding the
    unchanged value of `example[label]`.
  """
  encoding = tokenizer(
      example[text],
      truncation=True,
      max_length=max_len,
      padding="max_length",
  )
  return dict(
      input_ids=encoding['input_ids'],
      attention_mask=encoding['attention_mask'],
      labels=example[label],
  )

In [6]:
# --- 20 classes dataset ---
# Tokenize the train / validation / test splits identically (max_len=40) and
# drop the raw columns once they have been encoded.
train_dataset, val_dataset, test_dataset = [
    split.map(preproces_data, fn_kwargs={'max_len': 40}, remove_columns=['text', 'label'])
    for split in (train_data, val_data, test_data)
]

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
# --- preprocess sst5 ---
# Same tokenization for every SST-5 split (max_len=60); the raw text and both
# label columns are removed after encoding.
sst_train, sst_val, sst_test = [
    sst[split_name].map(
        preproces_data,
        fn_kwargs={'max_len': 60},
        remove_columns=['text', 'label', 'label_text'],
    )
    for split_name in ('train', 'validation', 'test')
]

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [8]:
def collate_fn(batch):
    """Assemble a list of encoded examples into one batch of int64 tensors.

    Args:
        batch: sequence of items exposing 'input_ids', 'attention_mask' and
            'labels'; each value may already be a tensor or anything that
            `torch.tensor` accepts.

    Returns:
        Dict with 'input_ids' and 'attention_mask' padded to the longest
        sequence in the batch (filled with `tokenizer.pad_token_id` and 0
        respectively) and 'labels' stacked along a new leading batch axis.
    """
    def _as_long(value):
        # Existing tensors are cast to int64; anything else becomes a new int64 tensor.
        if isinstance(value, torch.Tensor):
            return value.long()
        return torch.tensor(value, dtype=torch.long)

    ids = [_as_long(item['input_ids']) for item in batch]
    masks = [_as_long(item['attention_mask']) for item in batch]
    targets = torch.stack([_as_long(item['labels']) for item in batch])

    padded_ids = pad_sequence(ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_masks = pad_sequence(masks, batch_first=True, padding_value=0)

    return {
        "input_ids": padded_ids,
        "attention_mask": padded_masks,
        "labels": targets,
    }

##### **Train & Test Methods**

In [9]:
def compute_metrics(pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        pred: object exposing `label_ids` and `predictions`. `predictions`
            is either the logits array or a tuple of arrays; in the tuple
            case the first element is taken as the logits (Hugging Face
            convention when a model returns extra outputs).

    Returns:
        Dict with a single 'accuracy' entry.
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        # `.argmax` does not exist on a tuple; unwrap the logits first.
        logits = logits[0]
    predicted_classes = logits.argmax(-1)
    return {
        'accuracy': accuracy_score(pred.label_ids, predicted_classes),
    }

In [10]:
def FineTune(model,train_data,val_data,num_epochs,batch_size,lr,
             output_dir='./results',metric_for_best_model=None):
  """Fine-tune `model` with the Hugging Face `Trainer` and return the trained model.

  Evaluation and checkpointing both run once per epoch, and
  `load_best_model_at_end=True` asks the trainer to restore the best
  checkpoint before this function returns.

  Args:
    model: model to train.
    train_data: dataset passed to the trainer as `train_dataset`.
    val_data: dataset passed to the trainer as `eval_dataset`.
    num_epochs: number of training epochs.
    batch_size: per-device batch size, used for both training and evaluation.
    lr: learning rate.
    output_dir: directory handed to `TrainingArguments`. Defaults to
      './results' (the previous hard-coded value); pass a distinct directory
      per experiment to keep their checkpoints and logs apart.
    metric_for_best_model: forwarded to `TrainingArguments`. None keeps the
      library default; pass e.g. 'accuracy' to pick the best checkpoint by
      the metric that `compute_metrics` reports.

  Returns:
    `trainer.model` after training has finished.
  """
  training_args = TrainingArguments(
      output_dir=output_dir,
      num_train_epochs=num_epochs,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      learning_rate=lr,
      weight_decay=0.01,
      eval_strategy="epoch",
      # The save strategy matches the eval strategy, as used here with
      # load_best_model_at_end=True.
      save_strategy="epoch",
      load_best_model_at_end=True,
      metric_for_best_model=metric_for_best_model,
      logging_steps=500,
      report_to = "tensorboard"
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_data,
      eval_dataset=val_data,
      compute_metrics=compute_metrics,
      data_collator=collate_fn
  )

  trainer.train()

  return trainer.model

In [11]:
def test_func(model, test_data, batch_size):
    model.eval()
    args = TrainingArguments(
        output_dir='./results',
        per_device_eval_batch_size=batch_size,
        logging_steps=500,
        report_to = "tensorboard"
    )
    trainer = Trainer(
        model=model,
        args=args,
        eval_dataset=test_data,
        compute_metrics=compute_metrics,
        data_collator=collate_fn
    )

    eval_results = trainer.evaluate()
    return eval_results

##### **RoBERTa & sst5**

In [12]:
# --- load model ---
# roberta-base with a 5-label classification head (SST-5), then moved to `device`.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
model = model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# --- test before fine-tune
# Baseline on the SST-5 test split, before any training of the new head.
eval_res = test_func(model=model, test_data=sst_test, batch_size=16)
print(eval_res)

{'eval_loss': 1.6069600582122803, 'eval_model_preparation_time': 0.0028, 'eval_accuracy': 0.23076923076923078, 'eval_runtime': 8.1455, 'eval_samples_per_second': 271.314, 'eval_steps_per_second': 17.065}


In [19]:
# --- fine tune
# 3 epochs on SST-5, batch size 32, learning rate 3e-5.
tuned_model = FineTune(
    model=model,
    train_data=sst_train,
    val_data=sst_val,
    num_epochs=3,
    batch_size=32,
    lr=3e-5,
)

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.48028,0.491371
2,0.655500,1.393746,0.500454
3,0.655500,1.434074,0.520436


In [20]:
# --- test after fine-tune
# Same SST-5 test split as the baseline, now with the fine-tuned model.
eval_res = test_func(model=tuned_model, test_data=sst_test, batch_size=16)
print(eval_res)

{'eval_loss': 1.2211315631866455, 'eval_model_preparation_time': 0.0024, 'eval_accuracy': 0.5285067873303168, 'eval_runtime': 7.4432, 'eval_samples_per_second': 296.913, 'eval_steps_per_second': 18.675}


##### **RoBERTa & 20-classes**

In [21]:
# --- load model ---
# roberta-base again, this time with a 20-label classification head; rebinding
# `model` replaces the SST-5 model used above.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=20)
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# --- test before fine-tune
# Baseline on the 20-class test split, before any training of the new head.
eval_res = test_func(model=model, test_data=test_dataset, batch_size=32)
print(eval_res)

{'eval_loss': 3.0298681259155273, 'eval_model_preparation_time': 0.0024, 'eval_accuracy': 0.02494, 'eval_runtime': 110.2237, 'eval_samples_per_second': 453.623, 'eval_steps_per_second': 14.18}


In [23]:
# --- fine tune
# 3 epochs on the 20-class data, batch size 32, learning rate 3e-5.
tuned_model = FineTune(
    model=model,
    train_data=train_dataset,
    val_data=val_dataset,
    num_epochs=3,
    batch_size=32,
    lr=3e-5,
)

Epoch,Training Loss,Validation Loss,Accuracy
1,1.9186,2.393428,0.2768
2,1.6861,2.299417,0.294
3,1.5201,2.280521,0.3088


In [24]:
# --- test after fine-tune
# Same 20-class test split as the baseline, now with the fine-tuned model.
eval_res = test_func(model=tuned_model, test_data=test_dataset, batch_size=32)
print(eval_res)

{'eval_loss': 1.7327722311019897, 'eval_model_preparation_time': 0.0024, 'eval_accuracy': 0.4726, 'eval_runtime': 108.2832, 'eval_samples_per_second': 461.752, 'eval_steps_per_second': 14.434}
