In [14]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

wandb.login()



True

In [15]:
'''data preparation'''
import os

df = pd.read_json(os.getcwd()+'/SubtaskA/subtaskA_dev_monolingual.jsonl', lines=True)
# Just interested so far in text and label
df = df[['text', 'label']]

'So testing doesnt takes too much time processing, 10k seems ok'
df=df.sample(1000)
# Split the data into training and validation datasets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
# test_df, val_df = train_test_split(val_df, test_size=0.5, random_state=42)

val_df = pd.read_json(os.getcwd()+'/SubtaskA/subtaskA_dev_monolingual.jsonl', lines=True)
val_df = val_df[['text', 'label']]
# Shuffles the data obtained
val_df = val_df.sample(frac=1).reset_index(drop=True)

from imblearn.under_sampling import RandomUnderSampler
# Initialize RandomUnderSampler

print(f'Dataset size before balancing: {train_df.shape}')
counts = train_df['label'].value_counts()

sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(train_df[['text']], train_df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {train_df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
train_df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(train_df['label'].value_counts())

Dataset size before balancing: (800, 2)
Dataset size after balancing: (796, 1)
Entried dropped: 4

Balanced DataFrame:
label
0    398
1    398
Name: count, dtype: int64


In [16]:
'''create custom dataset'''

from torch.utils.data import Dataset
import torch

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])  # Using .iloc for safer row access
        label = self.targets.iloc[index]   # Using .iloc for safer row access
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [17]:
'''Initialize model and tokenizer'''

MODEL_NAME = 'roberta-base'  # you can replace this with any model from Hugging Face Model Hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
from transformers import AutoConfig
config = AutoConfig.from_pretrained('roberta-base', num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,config=config)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
'''create dataset object'''
MAX_LEN = 256  # you can adjust this based on your specific dataset and model

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
# test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)


In [19]:
'''metrics'''
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }


In [20]:
'''Define training arguments and initialize trainer'''

from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1500,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    logging_first_step=False,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="wandb"  # enable logging to W&B
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)



In [21]:
'''test the model'''

results = trainer.evaluate()

# Print results
print(results)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 625/625 [00:20<00:00, 30.61it/s]

{'eval_loss': 0.6920899152755737, 'eval_accuracy': 0.5198, 'eval_f1': 0.6727545318249966, 'eval_auc': 0.5198, 'eval_precision': 0.5102336158776101, 'eval_recall': 0.9872, 'eval_runtime': 20.6281, 'eval_samples_per_second': 242.388, 'eval_steps_per_second': 30.298}





In [22]:
trainer.train()

                                                  
 10%|█         | 100/1000 [00:30<01:27, 10.23it/s]

{'eval_loss': 0.23567959666252136, 'eval_accuracy': 0.9236, 'eval_f1': 0.9276789095039758, 'eval_auc': 0.9236, 'eval_precision': 0.8806613946800863, 'eval_recall': 0.98, 'eval_runtime': 20.5866, 'eval_samples_per_second': 242.877, 'eval_steps_per_second': 30.36, 'epoch': 1.0}


                                                  
 20%|██        | 200/1000 [01:01<01:09, 11.59it/s]

{'eval_loss': 0.24740180373191833, 'eval_accuracy': 0.9514, 'eval_f1': 0.9535639212688707, 'eval_auc': 0.9514, 'eval_precision': 0.9129162092938163, 'eval_recall': 0.998, 'eval_runtime': 20.4246, 'eval_samples_per_second': 244.802, 'eval_steps_per_second': 30.6, 'epoch': 2.0}


                                                  
 30%|███       | 300/1000 [01:32<00:59, 11.68it/s]

{'eval_loss': 0.25485312938690186, 'eval_accuracy': 0.9594, 'eval_f1': 0.9609540296210809, 'eval_auc': 0.9593999999999999, 'eval_precision': 0.9255279733234532, 'eval_recall': 0.9992, 'eval_runtime': 20.6045, 'eval_samples_per_second': 242.665, 'eval_steps_per_second': 30.333, 'epoch': 3.0}


 30%|███       | 300/1000 [01:35<03:43,  3.13it/s]

{'train_runtime': 95.992, 'train_samples_per_second': 82.924, 'train_steps_per_second': 10.418, 'train_loss': 0.3094870249430339, 'epoch': 3.0}





TrainOutput(global_step=300, training_loss=0.3094870249430339, metrics={'train_runtime': 95.992, 'train_samples_per_second': 82.924, 'train_steps_per_second': 10.418, 'train_loss': 0.3094870249430339, 'epoch': 3.0})

In [23]:
results = trainer.evaluate()
print(results)

100%|██████████| 625/625 [00:20<00:00, 30.47it/s]

{'eval_loss': 0.23567959666252136, 'eval_accuracy': 0.9236, 'eval_f1': 0.9276789095039758, 'eval_auc': 0.9236, 'eval_precision': 0.8806613946800863, 'eval_recall': 0.98, 'eval_runtime': 20.5603, 'eval_samples_per_second': 243.187, 'eval_steps_per_second': 30.398, 'epoch': 3.0}





In [24]:
print(results)

{'eval_loss': 0.23567959666252136, 'eval_accuracy': 0.9236, 'eval_f1': 0.9276789095039758, 'eval_auc': 0.9236, 'eval_precision': 0.8806613946800863, 'eval_recall': 0.98, 'eval_runtime': 20.5603, 'eval_samples_per_second': 243.187, 'eval_steps_per_second': 30.398, 'epoch': 3.0}
