In [45]:
'''Import libraries'''
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
wandb.login()



True

In [46]:
'''Variables and parameters'''

# MODEL_NAME = 'roberta-base-openai-detector'
# MODEL_NAME = 'distilbert-base-uncased'
# MODEL_NAME = 'microsoft/deberta-v3-base'
# MODEL_NAME = 'Hello-SimpleAI/chatgpt-detector-roberta'

# MODEL_NAME = 'roberta-base'
# MODEL_NAME = 'roberta-large'
# MODEL_NAME = 'microsoft/deberta-large'
MODEL_NAME = 'bert-base-uncased'

SAMPLES_TO_TRAIN=5000

N_LABELS=2
MAX_LEN = 256
EPOCHS=10
PATIENCE=3
LEARNING_RATE=.00005
WEIGHT_DECAY=.01
BATCH_SIZE=8
METRIC_FOR_BEST_MODEL='eval_loss'
if METRIC_FOR_BEST_MODEL=='eval_loss':
    GREATER_IS_BETTER = False
else:
    GREATER_IS_BETTER = True

In [47]:
'''Preparing dataset'''

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_train_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

train_df=df.sample(round(SAMPLES_TO_TRAIN))
test_train_df=df.sample(round(SAMPLES_TO_TRAIN*.2))

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_dev_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

val_df= df.sample(round(SAMPLES_TO_TRAIN*.2))
test_dev_df= df.sample(round(SAMPLES_TO_TRAIN*.2))

# we balance the training set
print(f'Dataset size before balancing: {train_df.shape}')
counts = train_df['label'].value_counts()
sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(train_df[['text']], train_df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {train_df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
train_df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(train_df['label'].value_counts())

Dataset size before balancing: (5000, 2)
Dataset size after balancing: (4736, 1)
Entried dropped: 264

Balanced DataFrame:
label
0    2368
1    2368
Name: count, dtype: int64


In [48]:
'''create custom dataset'''

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        label = self.targets.iloc[index]
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [49]:
'''Initialize model, tokenizer and dataset object'''

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
from transformers import AutoConfig
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=N_LABELS)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,config=config)

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_train_dataset = CustomDataset(test_train_df, tokenizer, MAX_LEN)
test_dev_dataset = CustomDataset(test_dev_df, tokenizer, MAX_LEN)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
'''metrics'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

In [51]:
'''Define training arguments and initialize trainer'''

from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=500,
    weight_decay=WEIGHT_DECAY,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=GREATER_IS_BETTER,
    logging_dir='./logs',
    logging_steps=1500,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    logging_first_step=False,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_train_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
)

In [52]:
trainer.evaluate(val_dataset)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 125/125 [00:04<00:00, 28.85it/s]


{'eval_loss': 0.7044677138328552,
 'eval_accuracy': 0.525,
 'eval_f1': 0.5216515609264855,
 'eval_auc': 0.5254706711065574,
 'eval_precision': 0.5384615384615384,
 'eval_recall': 0.505859375,
 'eval_runtime': 4.4462,
 'eval_samples_per_second': 224.911,
 'eval_steps_per_second': 28.114}

In [53]:
trainer.evaluate(test_train_dataset)

100%|██████████| 125/125 [00:04<00:00, 27.45it/s]


{'eval_loss': 0.6720004677772522,
 'eval_accuracy': 0.58,
 'eval_f1': 0.5783132530120482,
 'eval_auc': 0.5811469626018346,
 'eval_precision': 0.5538461538461539,
 'eval_recall': 0.6050420168067226,
 'eval_runtime': 4.6094,
 'eval_samples_per_second': 216.948,
 'eval_steps_per_second': 27.118}

In [54]:
trainer.train()

 10%|▉         | 591/5920 [00:56<08:00, 11.09it/s]
 10%|█         | 592/5920 [01:00<08:00, 11.09it/s]

{'eval_loss': 0.39613446593284607, 'eval_accuracy': 0.886, 'eval_f1': 0.8853118712273643, 'eval_auc': 0.8877573930335493, 'eval_precision': 0.8494208494208494, 'eval_recall': 0.9243697478991597, 'eval_runtime': 4.545, 'eval_samples_per_second': 220.022, 'eval_steps_per_second': 27.503, 'epoch': 1.0}


 20%|█▉        | 1183/5920 [01:57<07:10, 11.02it/s] 
 20%|██        | 1184/5920 [02:02<07:09, 11.02it/s]

{'eval_loss': 0.4487890899181366, 'eval_accuracy': 0.877, 'eval_f1': 0.8825214899713467, 'eval_auc': 0.8812864840592726, 'eval_precision': 0.809106830122592, 'eval_recall': 0.9705882352941176, 'eval_runtime': 4.676, 'eval_samples_per_second': 213.856, 'eval_steps_per_second': 26.732, 'epoch': 2.0}


 25%|██▌       | 1502/5920 [02:33<06:59, 10.52it/s]  

{'loss': 0.2951, 'learning_rate': 4.077490774907749e-05, 'epoch': 2.53}


 30%|███       | 1776/5920 [02:58<06:06, 11.31it/s]
 30%|███       | 1776/5920 [03:03<06:06, 11.31it/s]

{'eval_loss': 0.3517392575740814, 'eval_accuracy': 0.923, 'eval_f1': 0.922300706357215, 'eval_auc': 0.9246985053563411, 'eval_precision': 0.887378640776699, 'eval_recall': 0.9600840336134454, 'eval_runtime': 4.4143, 'eval_samples_per_second': 226.539, 'eval_steps_per_second': 28.317, 'epoch': 3.0}


 40%|████      | 2368/5920 [03:59<05:16, 11.22it/s]  
 40%|████      | 2368/5920 [04:04<05:16, 11.22it/s]

{'eval_loss': 0.9765346646308899, 'eval_accuracy': 0.849, 'eval_f1': 0.8610855565777369, 'eval_auc': 0.8551462569760729, 'eval_precision': 0.7659574468085106, 'eval_recall': 0.9831932773109243, 'eval_runtime': 4.5487, 'eval_samples_per_second': 219.845, 'eval_steps_per_second': 27.481, 'epoch': 4.0}


 50%|█████     | 2960/5920 [04:59<04:26, 11.12it/s]
 50%|█████     | 2960/5920 [05:04<04:26, 11.12it/s]

{'eval_loss': 0.8035147786140442, 'eval_accuracy': 0.885, 'eval_f1': 0.8897411313518695, 'eval_auc': 0.8891125152351017, 'eval_precision': 0.818342151675485, 'eval_recall': 0.9747899159663865, 'eval_runtime': 4.4865, 'eval_samples_per_second': 222.893, 'eval_steps_per_second': 27.862, 'epoch': 5.0}


 51%|█████     | 3002/5920 [05:09<04:44, 10.24it/s]

{'loss': 0.0572, 'learning_rate': 2.693726937269373e-05, 'epoch': 5.07}


 60%|█████▉    | 3551/5920 [06:01<03:39, 10.79it/s]
 60%|██████    | 3552/5920 [06:05<03:39, 10.79it/s]

{'eval_loss': 0.9513245820999146, 'eval_accuracy': 0.889, 'eval_f1': 0.8935762224352828, 'eval_auc': 0.8931217525178011, 'eval_precision': 0.8218694885361552, 'eval_recall': 0.9789915966386554, 'eval_runtime': 4.5183, 'eval_samples_per_second': 221.323, 'eval_steps_per_second': 27.665, 'epoch': 6.0}


 60%|██████    | 3552/5920 [06:06<04:04,  9.68it/s]

{'train_runtime': 366.9975, 'train_samples_per_second': 129.047, 'train_steps_per_second': 16.131, 'train_loss': 0.15005580342567718, 'epoch': 6.0}





TrainOutput(global_step=3552, training_loss=0.15005580342567718, metrics={'train_runtime': 366.9975, 'train_samples_per_second': 129.047, 'train_steps_per_second': 16.131, 'train_loss': 0.15005580342567718, 'epoch': 6.0})

In [55]:
trainer.evaluate(val_dataset)

100%|██████████| 125/125 [00:04<00:00, 29.49it/s]


{'eval_loss': 1.5337541103363037,
 'eval_accuracy': 0.719,
 'eval_f1': 0.6795895096921323,
 'eval_auc': 0.7223680840163934,
 'eval_precision': 0.8164383561643835,
 'eval_recall': 0.58203125,
 'eval_runtime': 4.296,
 'eval_samples_per_second': 232.773,
 'eval_steps_per_second': 29.097,
 'epoch': 6.0}

In [56]:
trainer.evaluate(test_train_dataset)

100%|██████████| 125/125 [00:04<00:00, 27.23it/s]


{'eval_loss': 0.3517392575740814,
 'eval_accuracy': 0.923,
 'eval_f1': 0.922300706357215,
 'eval_auc': 0.9246985053563411,
 'eval_precision': 0.887378640776699,
 'eval_recall': 0.9600840336134454,
 'eval_runtime': 4.651,
 'eval_samples_per_second': 215.006,
 'eval_steps_per_second': 26.876,
 'epoch': 6.0}

In [57]:
trainer.evaluate(test_dev_dataset)

100%|██████████| 125/125 [00:04<00:00, 29.17it/s]


{'eval_loss': 1.589037299156189,
 'eval_accuracy': 0.714,
 'eval_f1': 0.6705069124423964,
 'eval_auc': 0.7153915391539154,
 'eval_precision': 0.8016528925619835,
 'eval_recall': 0.5762376237623762,
 'eval_runtime': 4.3305,
 'eval_samples_per_second': 230.92,
 'eval_steps_per_second': 28.865,
 'epoch': 6.0}

In [58]:
trainer.save_model('SavedModels/'+(MODEL_NAME.split('/')[-1])+str(round(SAMPLES_TO_TRAIN/1000))+'k')