In [15]:
'''Import libraries'''
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
wandb.login()



True

In [16]:
'''Variables and parameters'''

# MODEL_NAME = 'roberta-base-openai-detector'
# MODEL_NAME = 'distilbert-base-uncased'
# MODEL_NAME = 'microsoft/deberta-v3-base'
MODEL_NAME = 'Hello-SimpleAI/chatgpt-detector-roberta'

# MODEL_NAME = 'roberta-base'
# MODEL_NAME = 'roberta-large'
# MODEL_NAME = 'microsoft/deberta-large'
# MODEL_NAME = 'bert-base-uncased'

SAMPLES_TO_TRAIN=5000

N_LABELS=2
MAX_LEN = 256
EPOCHS=10
PATIENCE=3
LEARNING_RATE=.00005
WEIGHT_DECAY=.01
BATCH_SIZE=8
METRIC_FOR_BEST_MODEL='eval_loss'
if METRIC_FOR_BEST_MODEL=='eval_loss':
    GREATER_IS_BETTER = False
else:
    GREATER_IS_BETTER = True

In [17]:
'''Preparing dataset'''

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_train_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

train_df=df.sample(round(SAMPLES_TO_TRAIN))
test_train_df=df.sample(round(SAMPLES_TO_TRAIN*.2))

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_dev_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

val_df= df.sample(round(SAMPLES_TO_TRAIN*.2))
test_dev_df= df.sample(round(SAMPLES_TO_TRAIN*.2))

# we balance the training set
print(f'Dataset size before balancing: {train_df.shape}')
counts = train_df['label'].value_counts()
sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(train_df[['text']], train_df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {train_df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
train_df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(train_df['label'].value_counts())

Dataset size before balancing: (5000, 2)
Dataset size after balancing: (4736, 1)
Entried dropped: 264

Balanced DataFrame:
label
0    2368
1    2368
Name: count, dtype: int64


In [18]:
'''create custom dataset'''

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        label = self.targets.iloc[index]
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [19]:
'''Initialize model, tokenizer and dataset object'''

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
from transformers import AutoConfig
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=N_LABELS)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,config=config)

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_train_dataset = CustomDataset(test_train_df, tokenizer, MAX_LEN)
test_dev_dataset = CustomDataset(test_dev_df, tokenizer, MAX_LEN)

In [20]:
'''metrics'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

In [21]:
'''Define training arguments and initialize trainer'''

from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=500,
    weight_decay=WEIGHT_DECAY,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=GREATER_IS_BETTER,
    logging_dir='./logs',
    logging_steps=1500,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    logging_first_step=False,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_train_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
)

In [22]:
trainer.evaluate(val_dataset)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 125/125 [00:04<00:00, 28.23it/s]


{'eval_loss': 3.437922954559326,
 'eval_accuracy': 0.494,
 'eval_f1': 0.1275862068965517,
 'eval_auc': 0.5043705174180327,
 'eval_precision': 0.5441176470588235,
 'eval_recall': 0.072265625,
 'eval_runtime': 4.4795,
 'eval_samples_per_second': 223.241,
 'eval_steps_per_second': 27.905}

In [23]:
trainer.evaluate(test_train_dataset)

100%|██████████| 125/125 [00:04<00:00, 26.33it/s]


{'eval_loss': 1.0305818319320679,
 'eval_accuracy': 0.74,
 'eval_f1': 0.672544080604534,
 'eval_auc': 0.731798062736545,
 'eval_precision': 0.839622641509434,
 'eval_recall': 0.5609243697478992,
 'eval_runtime': 4.8103,
 'eval_samples_per_second': 207.886,
 'eval_steps_per_second': 25.986}

In [24]:
trainer.train()

                                                  
 10%|█         | 592/5920 [01:04<08:43, 10.17it/s]

{'eval_loss': 0.46244585514068604, 'eval_accuracy': 0.774, 'eval_f1': 0.807495741056218, 'eval_auc': 0.7841587016485982, 'eval_precision': 0.6790830945558739, 'eval_recall': 0.9957983193277311, 'eval_runtime': 4.7369, 'eval_samples_per_second': 211.111, 'eval_steps_per_second': 26.389, 'epoch': 1.0}


                                                    
 20%|██        | 1184/5920 [02:09<07:45, 10.17it/s]

{'eval_loss': 0.12385120987892151, 'eval_accuracy': 0.972, 'eval_f1': 0.9705263157894737, 'eval_auc': 0.9718391173263199, 'eval_precision': 0.9725738396624473, 'eval_recall': 0.9684873949579832, 'eval_runtime': 4.7456, 'eval_samples_per_second': 210.722, 'eval_steps_per_second': 26.34, 'epoch': 2.0}


 25%|██▌       | 1501/5920 [02:42<07:28,  9.85it/s]  

{'loss': 0.2762, 'learning_rate': 4.077490774907749e-05, 'epoch': 2.53}


                                                   
 30%|███       | 1776/5920 [03:14<06:46, 10.19it/s]

{'eval_loss': 0.1900913417339325, 'eval_accuracy': 0.96, 'eval_f1': 0.959349593495935, 'eval_auc': 0.9614471742895632, 'eval_precision': 0.9291338582677166, 'eval_recall': 0.9915966386554622, 'eval_runtime': 4.7381, 'eval_samples_per_second': 211.053, 'eval_steps_per_second': 26.382, 'epoch': 3.0}


                                                     
 40%|████      | 2368/5920 [04:19<05:47, 10.23it/s]

{'eval_loss': 0.24622778594493866, 'eval_accuracy': 0.95, 'eval_f1': 0.9497991967871486, 'eval_auc': 0.9520014112515236, 'eval_precision': 0.9096153846153846, 'eval_recall': 0.9936974789915967, 'eval_runtime': 4.7202, 'eval_samples_per_second': 211.855, 'eval_steps_per_second': 26.482, 'epoch': 4.0}


                                                     
 50%|█████     | 2960/5920 [05:24<04:47, 10.28it/s]

{'eval_loss': 0.09433721750974655, 'eval_accuracy': 0.979, 'eval_f1': 0.9780104712041885, 'eval_auc': 0.9790958368080057, 'eval_precision': 0.9749478079331941, 'eval_recall': 0.9810924369747899, 'eval_runtime': 4.7498, 'eval_samples_per_second': 210.536, 'eval_steps_per_second': 26.317, 'epoch': 5.0}


 51%|█████     | 3002/5920 [05:30<04:55,  9.88it/s]

{'loss': 0.102, 'learning_rate': 2.693726937269373e-05, 'epoch': 5.07}


                                                   
 60%|██████    | 3552/5920 [06:29<03:55, 10.07it/s]

{'eval_loss': 0.13166674971580505, 'eval_accuracy': 0.978, 'eval_f1': 0.9772727272727273, 'eval_auc': 0.978718968503432, 'eval_precision': 0.9613821138211383, 'eval_recall': 0.9936974789915967, 'eval_runtime': 4.7413, 'eval_samples_per_second': 210.911, 'eval_steps_per_second': 26.364, 'epoch': 6.0}


                                                   
 70%|███████   | 4144/5920 [07:34<02:47, 10.63it/s]

{'eval_loss': 0.10730905085802078, 'eval_accuracy': 0.984, 'eval_f1': 0.9833333333333334, 'eval_auc': 0.9843479376483417, 'eval_precision': 0.9752066115702479, 'eval_recall': 0.9915966386554622, 'eval_runtime': 4.6606, 'eval_samples_per_second': 214.567, 'eval_steps_per_second': 26.821, 'epoch': 7.0}


 76%|███████▌  | 4502/5920 [08:11<02:20, 10.11it/s]

{'loss': 0.0324, 'learning_rate': 1.3099630996309964e-05, 'epoch': 7.6}


                                                   
 80%|████████  | 4736/5920 [08:39<02:00,  9.86it/s]

{'eval_loss': 0.11233969032764435, 'eval_accuracy': 0.984, 'eval_f1': 0.9832985386221293, 'eval_auc': 0.984251715953557, 'eval_precision': 0.9771784232365145, 'eval_recall': 0.9894957983193278, 'eval_runtime': 4.7466, 'eval_samples_per_second': 210.678, 'eval_steps_per_second': 26.335, 'epoch': 8.0}


 80%|████████  | 4736/5920 [08:41<02:10,  9.09it/s]

{'train_runtime': 521.0124, 'train_samples_per_second': 90.9, 'train_steps_per_second': 11.362, 'train_loss': 0.1306033256790928, 'epoch': 8.0}





TrainOutput(global_step=4736, training_loss=0.1306033256790928, metrics={'train_runtime': 521.0124, 'train_samples_per_second': 90.9, 'train_steps_per_second': 11.362, 'train_loss': 0.1306033256790928, 'epoch': 8.0})

In [25]:
trainer.evaluate(val_dataset)

100%|██████████| 125/125 [00:04<00:00, 29.01it/s]


{'eval_loss': 1.7912003993988037,
 'eval_accuracy': 0.687,
 'eval_f1': 0.5919165580182529,
 'eval_auc': 0.6929911629098362,
 'eval_precision': 0.8901960784313725,
 'eval_recall': 0.443359375,
 'eval_runtime': 4.3715,
 'eval_samples_per_second': 228.753,
 'eval_steps_per_second': 28.594,
 'epoch': 8.0}

In [26]:
trainer.evaluate(test_train_dataset)

100%|██████████| 125/125 [00:04<00:00, 26.46it/s]


{'eval_loss': 0.09433721750974655,
 'eval_accuracy': 0.979,
 'eval_f1': 0.9780104712041885,
 'eval_auc': 0.9790958368080057,
 'eval_precision': 0.9749478079331941,
 'eval_recall': 0.9810924369747899,
 'eval_runtime': 4.7862,
 'eval_samples_per_second': 208.935,
 'eval_steps_per_second': 26.117,
 'epoch': 8.0}

In [27]:
trainer.evaluate(test_dev_dataset)

100%|██████████| 125/125 [00:04<00:00, 28.46it/s]


{'eval_loss': 1.6349717378616333,
 'eval_accuracy': 0.719,
 'eval_f1': 0.6420382165605096,
 'eval_auc': 0.7212221222122212,
 'eval_precision': 0.9,
 'eval_recall': 0.499009900990099,
 'eval_runtime': 4.4392,
 'eval_samples_per_second': 225.265,
 'eval_steps_per_second': 28.158,
 'epoch': 8.0}

In [28]:
trainer.save_model('SavedModels/'+(MODEL_NAME.split('/')[-1])+str(round(SAMPLES_TO_TRAIN/1000))+'k')