In [1]:
import torch
import pandas as pd
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
# Placeholder string; nothing in the visible cells reads `HF`.
# NOTE(review): presumably meant for a Hugging Face token — never paste a real token here.
HF = '...'
# Interactive Hub login, currently disabled.
# notebook_login()



In [None]:
# Uncomment on a fresh environment (%pip installs into the running kernel's environment):
# %pip install tqdm
# %pip install datasets

In [3]:
# Checkpoint identifier passed to the tokenizer/config/model `from_pretrained` calls below.
model_id = "FacebookAI/roberta-base"
# dataset_id = "Maradona"
# Replace the value with your own model repository, e.g. <hugging-face-user>/<model-name>.
repository_id = "YuvrajSingh9886/roberta-base_Maradona"

In [4]:

# The model itself is instantiated in a later cell, once the label config exists.
# model = RobertaForSequenceClassification.from_pretrained(model_id)
# Load the fast RoBERTa tokenizer for the checkpoint named by `model_id`.
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

In [5]:
# Spreadsheet holding the comments and their numeric labels.
labelled_xlsx_path = './data/Maradon Hand of God Labels with Llama3.1_8b_Instruct using Alpaca Prompt Fine Tuned (10).xlsx'
# Only the text column and the numeric label column are needed downstream.
kept_columns = ['Comments', 'labels_to_numbers_original']
df = pd.read_excel(labelled_xlsx_path)[kept_columns]

In [None]:
# Display the two-column frame as the cell output for a quick visual check.
df

In [7]:
from datasets import Dataset, DatasetDict

# Wrap the pandas frame in a Hugging Face Dataset.
train_ds = Dataset.from_pandas(df)

# Stage 1: hold out 20% of the rows (the remaining 80% is the training split).
split_dataset = train_ds.train_test_split(seed=42, test_size=0.2)

# Stage 2: cut the held-out 20% in half -> 10% validation, 10% test.
held_out = split_dataset["test"]
val_test_split = held_out.train_test_split(seed=42, test_size=0.5)

In [8]:
# Collect the three splits under the names used by the rest of the notebook.
final_dataset = DatasetDict()
final_dataset["train"] = split_dataset["train"]
# The "train" half of the second split serves as validation, its "test" half as test.
final_dataset["val"] = val_test_split["train"]
final_dataset["test"] = val_test_split["test"]

In [None]:
# Show the DatasetDict as the cell output to inspect the splits.
final_dataset

In [None]:
def tokenize(batch):
    """Tokenize the "Comments" entries of a batch.

    Args:
        batch: mapping with a "Comments" key holding the texts of the batch.

    Returns:
        The tokenizer output for the batch, padded to the longest sequence in
        the batch and truncated to at most 512 tokens.
    """
    # No return_tensors here: Dataset.map stores results in Arrow, so building
    # torch tensors would only add a tensor -> Arrow round trip. The torch view
    # is requested later through set_format.
    return tokenizer(batch["Comments"], padding=True, truncation=True, max_length=512)


def _tokenize_split(split_name):
    """Tokenize one split of `final_dataset` as a single batch."""
    split = final_dataset[split_name]
    # batch_size == len(split) makes `padding=True` pad every row of the split
    # to one common length.
    return split.map(tokenize, batched=True, batch_size=len(split))


train_dataset = _tokenize_split('train')
val_dataset = _tokenize_split('val')
test_dataset = _tokenize_split('test')

In [11]:
# train_dataset['label']

In [12]:

def _drop_text_and_rename_label(split):
    """Remove the raw "Comments" column and rename the label column to "label"."""
    without_text = split.remove_columns(["Comments"])
    return without_text.rename_column("labels_to_numbers_original", "label")


# Apply the same column clean-up to every split.
train_dataset = _drop_text_and_rename_label(train_dataset)
val_dataset = _drop_text_and_rename_label(val_dataset)
test_dataset = _drop_text_and_rename_label(test_dataset)


In [None]:
# Columns exposed to the model; they are returned as torch tensors on access.
_model_columns = ['input_ids', 'attention_mask', 'label']
for _split in (train_dataset, val_dataset, test_dataset):
    # set_format changes the split in place.
    _split.set_format('torch', columns=_model_columns)

# The splits are deliberately NOT mapped through `.to('cuda')`:
# Dataset.map writes its results back to Arrow storage, so a per-row CUDA copy
# is discarded again, costs a full pass over every split, and fails on machines
# without a GPU. Trainer moves each batch to the training device itself.

In [None]:
# Class names indexed by numeric label id.
# NOTE(review): assumes the ids in the label column follow this order — confirm against the data.
class_names = ['Favor', 'Against', 'Neutral', 'Irrelevant']
# Derived from the list so the count cannot drift from the names.
num_labels = len(class_names)
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create the id -> name mapping and its inverse.
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in id2label.items()}

# Update the model's configuration with both mappings. Setting only id2label
# would leave label2id at whatever the checkpoint config carried, so the two
# would disagree.
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label, "label2id": label2id})

In [None]:
# Pick the device once instead of hard-coding CUDA in two places.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the classifier with the label-aware config. No device_map: it needs
# accelerate and installs dispatch hooks, which the explicit .to() below makes
# redundant for a single-device run.
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)
model.to(device)

In [None]:
# TrainingArguments: hyperparameters plus logging / evaluation / checkpoint cadence.
training_args = TrainingArguments(
    output_dir='./outputs',
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # NOTE(review): newer transformers releases name this argument `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="steps",
    # logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=1,
    # NOTE(review): 2e-4 looks high for full fine-tuning — confirm it is intended.
    learning_rate=2e-4,
    weight_decay=0.01,
    eval_steps=10,
    # warmup_steps=20,
    warmup_ratio = 0.03,
    # No save_steps is given, so the library default interval applies.
    save_strategy="steps",
    # optim = 'adamw_torch',
    gradient_checkpointing=True,
    fp16=True,
    # load_best_model_at_end=True,
    # save_total_limit=2,
    # report_to="wandb",
    # push_to_hub=True,
    do_eval=True,
    # hub_strategy="every_save",
    # hub_model_id=repository_id,
    # Never hard-code an access token in the notebook; read it from the environment:
    # hub_token=os.environ["HF_TOKEN"],
    overwrite_output_dir = True
)

# Trainer: trains `model` on the training split and evaluates on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Debug check: print the device reported by the training split's input_ids.
print(train_dataset['input_ids'].device) 

In [None]:
# Debug check: print the device the Trainer's model is on.
print(trainer.model.device)


In [None]:
# Run fine-tuning with the arguments and datasets configured on `trainer`.
trainer.train()

In [None]:
# Report whether PyTorch can use CUDA.
# NOTE(review): this check sits after trainer.train(); it would be more useful before the GPU-dependent cells.
torch.cuda.is_available()