In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np

wandb.login()

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malberto-rodero557[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
df = pd.read_json(os.getcwd()+'/SubtaskA/subtaskA_train_monolingual.jsonl', lines=True)
# Just interested so far in text and label
df = df[['text', 'label']]

df=df.sample(1000)
# Shuff;es
df = df.sample(frac=1).reset_index(drop=True)
# Split the data into training and validation datasets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
# test_df, val_df = train_test_split(val_df, test_size=0.5, random_state=42)

from imblearn.under_sampling import RandomUnderSampler
# Initialize RandomUnderSampler

print(f'Dataset size before balancing: {train_df.shape}')
counts = train_df['label'].value_counts()

sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(train_df[['text']], train_df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {train_df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
train_df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(train_df['label'].value_counts())

Dataset size before balancing: (800, 2)
Dataset size after balancing: (728, 1)
Entried dropped: 72

Balanced DataFrame:
label
0    364
1    364
Name: count, dtype: int64


In [5]:
'''create custom dataset'''

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])  # Using .iloc for safer row access
        label = self.targets.iloc[index]   # Using .iloc for safer row access
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [51]:
'''Hyperparameters'''

# MODEL_NAME = 'roberta-base-openai-detector'
# MODEL_NAME = 'roberta-base'
# MODEL_NAME = 'bert-base-uncased'
# MODEL_NAME = 'distilbert-base-uncased'
# MODEL_NAME = 'microsoft/deberta-large'
# MODEL_NAME = 'microsoft/deberta-v3-base'
# MODEL_NAME = 'Hello-SimpleAI/chatgpt-detector-roberta'
MODEL_NAME = 'roberta-large'

N_LABELS=2
MAX_LEN = 256
EPOCHS=10
PATIENCE=3
LEARNING_RATE=.00005
WEIGHT_DECAY=.01
BATCH_SIZE=8
METRIC_FOR_BEST_MODEL='eval_loss'
if METRIC_FOR_BEST_MODEL=='eval_loss':
    GREATER_IS_BETTER = False
else:
    GREATER_IS_BETTER = True

In [52]:
'''Initialize model, tokenizer and dataset object'''

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
from transformers import AutoConfig
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=N_LABELS)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,config=config)

Downloading (…)okenizer_config.json: 100%|██████████| 1.39k/1.39k [00:00<00:00, 1.39MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 858k/858k [00:00<00:00, 8.88MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 516k/516k [00:00<00:00, 51.2MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 2.23M/2.23M [00:00<00:00, 10.5MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 957/957 [00:00<?, ?B/s] 
Downloading (…)lve/main/config.json: 100%|██████████| 614/614 [00:00<?, ?B/s] 
Downloading pytorch_model.bin: 100%|██████████| 1.42G/1.42G [00:18<00:00, 77.0MB/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-large-bne and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
'''create dataset object'''

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

In [54]:
'''metrics'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

In [55]:
'''Define training arguments and initialize trainer'''

from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=500,
    weight_decay=WEIGHT_DECAY,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=GREATER_IS_BETTER,
    logging_dir='./logs',
    logging_steps=1500,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    logging_first_step=False,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="wandb"  # enable logging to W&B
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
)



In [56]:
trainer.evaluate()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 25/25 [00:02<00:00, 11.79it/s]


{'eval_loss': 0.759347140789032,
 'eval_accuracy': 0.46,
 'eval_f1': 0.6301369863013699,
 'eval_auc': 0.5,
 'eval_precision': 0.46,
 'eval_recall': 1.0,
 'eval_runtime': 2.2607,
 'eval_samples_per_second': 88.469,
 'eval_steps_per_second': 11.059}

In [57]:
trainer.train()

 10%|█         | 91/910 [00:22<03:18,  4.12it/s]
 10%|█         | 91/910 [00:24<03:18,  4.12it/s]

{'eval_loss': 0.9338648915290833, 'eval_accuracy': 0.57, 'eval_f1': 0.12244897959183672, 'eval_auc': 0.532608695652174, 'eval_precision': 1.0, 'eval_recall': 0.06521739130434782, 'eval_runtime': 2.118, 'eval_samples_per_second': 94.43, 'eval_steps_per_second': 11.804, 'epoch': 1.0}


 20%|██        | 182/910 [00:50<02:58,  4.08it/s]
 20%|██        | 182/910 [00:52<02:58,  4.08it/s]

{'eval_loss': 0.22780947387218475, 'eval_accuracy': 0.905, 'eval_f1': 0.8901734104046243, 'eval_auc': 0.8999597423510467, 'eval_precision': 0.9506172839506173, 'eval_recall': 0.8369565217391305, 'eval_runtime': 2.1579, 'eval_samples_per_second': 92.683, 'eval_steps_per_second': 11.585, 'epoch': 2.0}


 30%|███       | 273/910 [01:18<02:39,  3.98it/s]
 30%|███       | 273/910 [01:21<02:39,  3.98it/s]

{'eval_loss': 0.3389382064342499, 'eval_accuracy': 0.91, 'eval_f1': 0.9010989010989011, 'eval_auc': 0.9086151368760065, 'eval_precision': 0.9111111111111111, 'eval_recall': 0.8913043478260869, 'eval_runtime': 2.1449, 'eval_samples_per_second': 93.244, 'eval_steps_per_second': 11.656, 'epoch': 3.0}


 40%|████      | 364/910 [01:47<02:14,  4.07it/s]
 40%|████      | 364/910 [01:49<02:14,  4.07it/s]

{'eval_loss': 0.2164173275232315, 'eval_accuracy': 0.945, 'eval_f1': 0.9424083769633508, 'eval_auc': 0.947463768115942, 'eval_precision': 0.9090909090909091, 'eval_recall': 0.9782608695652174, 'eval_runtime': 2.1597, 'eval_samples_per_second': 92.605, 'eval_steps_per_second': 11.576, 'epoch': 4.0}


 50%|█████     | 455/910 [02:15<01:48,  4.18it/s]
 50%|█████     | 455/910 [02:17<01:48,  4.18it/s]

{'eval_loss': 0.7881721258163452, 'eval_accuracy': 0.46, 'eval_f1': 0.6301369863013699, 'eval_auc': 0.5, 'eval_precision': 0.46, 'eval_recall': 1.0, 'eval_runtime': 2.1179, 'eval_samples_per_second': 94.433, 'eval_steps_per_second': 11.804, 'epoch': 5.0}


  _warn_prf(average, modifier, msg_start, len(result))

 60%|██████    | 546/910 [02:46<01:29,  4.08it/s]

{'eval_loss': 0.6911664009094238, 'eval_accuracy': 0.54, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 2.2815, 'eval_samples_per_second': 87.662, 'eval_steps_per_second': 10.958, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))

 70%|███████   | 637/910 [03:15<01:07,  4.03it/s]

{'eval_loss': 0.6905523538589478, 'eval_accuracy': 0.54, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 2.1177, 'eval_samples_per_second': 94.442, 'eval_steps_per_second': 11.805, 'epoch': 7.0}


 70%|███████   | 637/910 [03:19<01:25,  3.20it/s]

{'train_runtime': 199.2888, 'train_samples_per_second': 36.53, 'train_steps_per_second': 4.566, 'train_loss': 0.46886862091591447, 'epoch': 7.0}





TrainOutput(global_step=637, training_loss=0.46886862091591447, metrics={'train_runtime': 199.2888, 'train_samples_per_second': 36.53, 'train_steps_per_second': 4.566, 'train_loss': 0.46886862091591447, 'epoch': 7.0})

In [58]:
trainer.evaluate()

100%|██████████| 25/25 [00:02<00:00, 10.08it/s]


{'eval_loss': 0.2164173275232315,
 'eval_accuracy': 0.945,
 'eval_f1': 0.9424083769633508,
 'eval_auc': 0.947463768115942,
 'eval_precision': 0.9090909090909091,
 'eval_recall': 0.9782608695652174,
 'eval_runtime': 2.6474,
 'eval_samples_per_second': 75.546,
 'eval_steps_per_second': 9.443,
 'epoch': 7.0}