In [23]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np

wandb.login()



True

In [24]:
df = pd.read_json(os.getcwd()+'/SubtaskA/subtaskA_train_monolingual.jsonl', lines=True)
# Just interested so far in text and label
df = df[['text', 'label']]

'So testing doesnt takes too much time processing, 10k seems ok'
df=df.sample(500)
# Shuff;es
df = df.sample(frac=1).reset_index(drop=True)
# Split the data into training and validation datasets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
# test_df, val_df = train_test_split(val_df, test_size=0.5, random_state=42)

from imblearn.under_sampling import RandomUnderSampler
# Initialize RandomUnderSampler

print(f'Dataset size before balancing: {train_df.shape}')
counts = train_df['label'].value_counts()

sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(train_df[['text']], train_df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {train_df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
train_df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(train_df['label'].value_counts())

Dataset size before balancing: (400, 2)
Dataset size after balancing: (334, 1)
Entried dropped: 66

Balanced DataFrame:
label
0    167
1    167
Name: count, dtype: int64


In [25]:
'''create custom dataset'''

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])  # Using .iloc for safer row access
        label = self.targets.iloc[index]   # Using .iloc for safer row access
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [26]:
'''Hyperparameters'''

MODEL_NAME = 'roberta-base'
# MODEL_NAME = 'roberta-base-openai-detector'

N_LABELS=2
MAX_LEN = 256
EPOCHS=10
PATIENCE=3
LEARNING_RATE=.00005
WEIGHT_DECAY=.01
BATCH_SIZE=8
METRIC_FOR_BEST_MODEL='eval_loss'
if METRIC_FOR_BEST_MODEL=='eval_loss':
    GREATER_IS_BETTER = False
else:
    GREATER_IS_BETTER = True

In [27]:
'''Initialize model and tokenizer'''

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
from transformers import AutoConfig
config = AutoConfig.from_pretrained('roberta-base', num_labels=N_LABELS)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,config=config)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
'''create dataset object'''

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

In [29]:
'''metrics'''


def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

In [30]:
'''Define training arguments and initialize trainer'''

from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=500,
    weight_decay=WEIGHT_DECAY,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=GREATER_IS_BETTER,
    logging_dir='./logs',
    logging_steps=1500,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    logging_first_step=False,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="wandb"  # enable logging to W&B
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
)



In [31]:
trainer.train()

  0%|          | 0/420 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  _warn_prf(average, modifier, msg_start, len(result))

 10%|█         | 42/420 [00:04<00:36, 10.43it/s]

{'eval_loss': 0.683464527130127, 'eval_accuracy': 0.58, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.4326, 'eval_samples_per_second': 231.141, 'eval_steps_per_second': 30.048, 'epoch': 1.0}


 20%|██        | 84/420 [00:09<00:31, 10.83it/s]
 20%|██        | 84/420 [00:10<00:31, 10.83it/s]

{'eval_loss': 0.6444849967956543, 'eval_accuracy': 0.8, 'eval_f1': 0.7297297297297298, 'eval_auc': 0.7783251231527093, 'eval_precision': 0.84375, 'eval_recall': 0.6428571428571429, 'eval_runtime': 0.422, 'eval_samples_per_second': 236.939, 'eval_steps_per_second': 30.802, 'epoch': 2.0}


 30%|███       | 126/420 [00:15<00:26, 10.93it/s]
 30%|███       | 126/420 [00:16<00:26, 10.93it/s]

{'eval_loss': 0.38748547434806824, 'eval_accuracy': 0.85, 'eval_f1': 0.8484848484848484, 'eval_auc': 0.8706896551724138, 'eval_precision': 0.7368421052631579, 'eval_recall': 1.0, 'eval_runtime': 0.4221, 'eval_samples_per_second': 236.939, 'eval_steps_per_second': 30.802, 'epoch': 3.0}


 40%|███▉      | 167/420 [00:21<00:23, 10.58it/s]
 40%|████      | 168/420 [00:21<00:23, 10.58it/s]

{'eval_loss': 0.9636577367782593, 'eval_accuracy': 0.83, 'eval_f1': 0.8316831683168318, 'eval_auc': 0.853448275862069, 'eval_precision': 0.711864406779661, 'eval_recall': 1.0, 'eval_runtime': 0.418, 'eval_samples_per_second': 239.234, 'eval_steps_per_second': 31.1, 'epoch': 4.0}


 50%|████▉     | 209/420 [00:27<00:19, 10.62it/s]
 50%|█████     | 210/420 [00:27<00:19, 10.62it/s]

{'eval_loss': 0.7333021759986877, 'eval_accuracy': 0.88, 'eval_f1': 0.8750000000000001, 'eval_auc': 0.896551724137931, 'eval_precision': 0.7777777777777778, 'eval_recall': 1.0, 'eval_runtime': 0.4248, 'eval_samples_per_second': 235.431, 'eval_steps_per_second': 30.606, 'epoch': 5.0}


                                                 
 60%|██████    | 252/420 [00:33<00:16, 10.32it/s]

{'eval_loss': 0.611746072769165, 'eval_accuracy': 0.89, 'eval_f1': 0.8842105263157896, 'eval_auc': 0.9051724137931034, 'eval_precision': 0.7924528301886793, 'eval_recall': 1.0, 'eval_runtime': 0.423, 'eval_samples_per_second': 236.396, 'eval_steps_per_second': 30.731, 'epoch': 6.0}


 60%|██████    | 252/420 [00:35<00:23,  7.19it/s]

{'train_runtime': 35.0557, 'train_samples_per_second': 95.277, 'train_steps_per_second': 11.981, 'train_loss': 0.35268768431648373, 'epoch': 6.0}





TrainOutput(global_step=252, training_loss=0.35268768431648373, metrics={'train_runtime': 35.0557, 'train_samples_per_second': 95.277, 'train_steps_per_second': 11.981, 'train_loss': 0.35268768431648373, 'epoch': 6.0})

In [32]:
trainer.evaluate()

100%|██████████| 13/13 [00:00<00:00, 29.43it/s]


{'eval_loss': 0.38748547434806824,
 'eval_accuracy': 0.85,
 'eval_f1': 0.8484848484848484,
 'eval_auc': 0.8706896551724138,
 'eval_precision': 0.7368421052631579,
 'eval_recall': 1.0,
 'eval_runtime': 0.4997,
 'eval_samples_per_second': 200.111,
 'eval_steps_per_second': 26.014,
 'epoch': 6.0}