In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from datasets import Dataset
from transformers import (
    Trainer,
    TrainingArguments,
    XLMRobertaConfig,
    XLMRobertaForSequenceClassification,
)

In [2]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [3]:
import ast


def convert_to_int_list(row):
    return ast.literal_eval(row)[0]


df_train = pd.read_csv("../data/processed/processed_train_complete.csv")[:100]
df_test = pd.read_csv("../data/processed/processed_dev_complete.csv")[:100]


training = df_train[["input_ids", "attention_mask", "class_label"]].copy()
testing = df_test[["input_ids", "attention_mask", "class_label"]].copy()

training["label"] = training["class_label"].apply(lambda x: int(x))
testing["label"] = testing["class_label"].apply(lambda x: int(x))

training = training.drop(columns=["class_label"])
testing = testing.drop(columns=["class_label"])

training["input_ids"] = training["input_ids"].apply(convert_to_int_list)
training["attention_mask"] = training["attention_mask"].apply(convert_to_int_list)

training

Unnamed: 0,input_ids,attention_mask,label
0,"[0, 136, 221, 87, 3714, 450, 903, 80399, 765, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,"[0, 5036, 2633, 186, 40197, 70, 20334, 136, 59...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
2,"[0, 87, 2806, 1884, 47, 41392, 1632, 13580, 2,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",0
3,"[0, 87, 8110, 98911, 764, 70, 138235, 7, 765, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,"[0, 136, 47, 5646, 10, 18227, 38074, 5036, 186...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
...,...,...,...
95,"[0, 5299, 291, 140363, 18, 71407, 2685, 186, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
96,"[0, 642, 765, 10, 232432, 7565, 23, 218334, 27...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
97,"[0, 1284, 903, 86052, 1733, 136, 1733, 13438, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
98,"[0, 136, 87, 186, 9525, 642, 186, 738, 23, 70,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...",0


In [4]:
train_dataset = Dataset.from_pandas(training)
test_dataset = Dataset.from_pandas(testing)

In [5]:
torch.cuda.empty_cache()
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

In [10]:
config = XLMRobertaConfig.from_pretrained("xlm-roberta-large")
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-large", config=config
)

training_args = TrainingArguments(
    output_dir="./results",  # output directory
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=10,  # batch size per device during training
    per_device_eval_batch_size=10,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    use_cpu=True,
    logging_steps=5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
trainer.save_model("./results")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/39 [00:00<?, ?it/s]

{'loss': 0.743, 'grad_norm': 19.147266387939453, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.77}
{'loss': 0.7174, 'grad_norm': 10.972291946411133, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.54}
{'loss': 0.6994, 'grad_norm': 8.403948783874512, 'learning_rate': 3e-06, 'epoch': 2.31}
{'train_runtime': 167.1015, 'train_samples_per_second': 1.795, 'train_steps_per_second': 0.233, 'train_loss': 0.6773959428836138, 'epoch': 3.0}
