In [1]:
'''Import libraries'''
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
wandb.login()

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malberto-rodero557[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
'''Variables and parameters'''

# MODEL_NAME = 'roberta-base-openai-detector'
# MODEL_NAME = 'Hello-SimpleAI/chatgpt-detector-roberta'
# MODEL_NAME = 'roberta-base'
MODEL_NAME = 'roberta-large'
# MODEL_NAME = 'microsoft/deberta-large'
# MODEL_NAME = 'bert-base-uncased'

SAMPLES_TO_TRAIN=10000

N_LABELS=2
MAX_LEN = 256
EPOCHS=10
PATIENCE=3
LEARNING_RATE=.00005
WEIGHT_DECAY=.01
BATCH_SIZE=8
METRIC_FOR_BEST_MODEL='eval_f1'
if METRIC_FOR_BEST_MODEL=='eval_loss':
    GREATER_IS_BETTER = False
else:
    GREATER_IS_BETTER = True

In [3]:
'''Preparing dataset'''

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_train_monolingual.jsonl', lines=True)
print(f'Original dataset')
print(df.info())
print(f'''\n{df['label'].value_counts()}''')
print(f'''\n{df['model'].value_counts()}''')
print(f'''\n{df['source'].value_counts()}''')

df = df[['text', 'label']]

train_df=df.sample(round(SAMPLES_TO_TRAIN))
test_train_df=df.sample(round(SAMPLES_TO_TRAIN*.2))
val_train_df=df.sample(round(SAMPLES_TO_TRAIN*.2))

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_dev_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

test_dev_df= df

# we balance the training set
print(f'Dataset size before balancing: {train_df.shape}')
counts = train_df['label'].value_counts()
sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(train_df[['text']], train_df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {train_df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
train_df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(train_df['label'].value_counts())

print(train_df.sample(5))

Original dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119757 entries, 0 to 119756
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    119757 non-null  object
 1   label   119757 non-null  int64 
 2   model   119757 non-null  object
 3   source  119757 non-null  object
 4   id      119757 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 4.6+ MB
None

label
0    63351
1    56406
Name: count, dtype: int64

model
human      63351
davinci    14343
chatGPT    14339
dolly      14046
cohere     13678
Name: count, dtype: int64

source
reddit       27500
wikihow      27499
arxiv        27497
wikipedia    25530
peerread     11731
Name: count, dtype: int64
Dataset size before balancing: (10000, 2)
Dataset size after balancing: (9462, 1)
Entried dropped: 538

Balanced DataFrame:
label
0    4731
1    4731
Name: count, dtype: int64
                                                     text  label
37108    The Sta

In [4]:
'''create custom dataset'''

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        label = self.targets.iloc[index]
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [5]:
'''Initialize model, tokenizer and dataset object'''

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
from transformers import AutoConfig
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=N_LABELS)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,config=config)

print(f'Tokenizer vocab size: {tokenizer.vocab_size}')

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
val_train_dataset = CustomDataset(val_train_df, tokenizer, MAX_LEN)
test_train_dataset = CustomDataset(test_train_df, tokenizer, MAX_LEN)
test_dev_dataset = CustomDataset(test_dev_df, tokenizer, MAX_LEN)

print(train_df.shape,val_train_df.shape)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer vocab size: 50265
(9462, 2) (2000, 2)


In [6]:
'''metrics'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

In [7]:
'''Define training arguments and initialize trainer'''

from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=500,
    weight_decay=WEIGHT_DECAY,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=GREATER_IS_BETTER,
    logging_dir='./logs',
    logging_steps=1500,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    logging_first_step=False,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_train_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
)

In [8]:
trainer.train()

  0%|          | 0/11830 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
                                                     
 10%|█         | 1183/11830 [05:58<47:54,  3.70it/s]

{'eval_loss': 0.638405442237854, 'eval_accuracy': 0.6225, 'eval_f1': 0.3617920540997464, 'eval_auc': 0.6071314102564103, 'eval_precision': 0.9596412556053812, 'eval_recall': 0.22291666666666668, 'eval_runtime': 20.7556, 'eval_samples_per_second': 96.36, 'eval_steps_per_second': 12.045, 'epoch': 1.0}


 13%|█▎        | 1501/11830 [07:31<48:15,  3.57it/s]   

{'loss': 0.5254, 'learning_rate': 4.558693733451015e-05, 'epoch': 1.27}


                                                    
 20%|██        | 2366/11830 [11:51<40:26,  3.90it/s]

{'eval_loss': 0.6955238580703735, 'eval_accuracy': 0.48, 'eval_f1': 0.6486486486486487, 'eval_auc': 0.5, 'eval_precision': 0.48, 'eval_recall': 1.0, 'eval_runtime': 20.7727, 'eval_samples_per_second': 96.28, 'eval_steps_per_second': 12.035, 'epoch': 2.0}


 25%|██▌       | 3001/11830 [14:53<39:15,  3.75it/s]   

{'loss': 0.7061, 'learning_rate': 3.896734333627537e-05, 'epoch': 2.54}


                                                    
 30%|███       | 3549/11830 [17:41<34:02,  4.05it/s]

{'eval_loss': 0.6941173076629639, 'eval_accuracy': 0.48, 'eval_f1': 0.6486486486486487, 'eval_auc': 0.5, 'eval_precision': 0.48, 'eval_recall': 1.0, 'eval_runtime': 20.5758, 'eval_samples_per_second': 97.202, 'eval_steps_per_second': 12.15, 'epoch': 3.0}


 38%|███▊      | 4501/11830 [21:58<31:46,  3.84it/s]   

{'loss': 0.7039, 'learning_rate': 3.23477493380406e-05, 'epoch': 3.8}


  _warn_prf(average, modifier, msg_start, len(result))
                                                    
 40%|████      | 4732/11830 [23:20<28:40,  4.13it/s]

{'eval_loss': 0.6938462257385254, 'eval_accuracy': 0.52, 'eval_f1': 0.0, 'eval_auc': 0.5, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 19.8993, 'eval_samples_per_second': 100.506, 'eval_steps_per_second': 12.563, 'epoch': 4.0}


 48%|████▊     | 5658/11830 [27:27<28:40,  3.59it/s]   

KeyboardInterrupt: 

In [None]:
trainer.evaluate()

100%|██████████| 13/13 [00:00<00:00, 29.08it/s]


{'eval_loss': 3.9832382202148438,
 'eval_accuracy': 0.56,
 'eval_f1': 0.37142857142857144,
 'eval_auc': 0.5729166666666667,
 'eval_precision': 0.7222222222222222,
 'eval_recall': 0.25,
 'eval_runtime': 0.5096,
 'eval_samples_per_second': 196.232,
 'eval_steps_per_second': 25.51,
 'epoch': 10.0}

In [None]:
trainer.evaluate(test_train_dataset)

  0%|          | 0/13 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:00<00:00, 27.90it/s]


{'eval_loss': 0.6377905011177063,
 'eval_accuracy': 0.92,
 'eval_f1': 0.9285714285714286,
 'eval_auc': 0.9188311688311689,
 'eval_precision': 0.9285714285714286,
 'eval_recall': 0.9285714285714286,
 'eval_runtime': 0.5367,
 'eval_samples_per_second': 186.316,
 'eval_steps_per_second': 24.221,
 'epoch': 10.0}

In [None]:
trainer.evaluate(test_dev_dataset)

100%|██████████| 125/125 [00:04<00:00, 28.46it/s]


{'eval_loss': 1.6349717378616333,
 'eval_accuracy': 0.719,
 'eval_f1': 0.6420382165605096,
 'eval_auc': 0.7212221222122212,
 'eval_precision': 0.9,
 'eval_recall': 0.499009900990099,
 'eval_runtime': 4.4392,
 'eval_samples_per_second': 225.265,
 'eval_steps_per_second': 28.158,
 'epoch': 8.0}

In [None]:
trainer.save_model('SavedModels/'+(MODEL_NAME.split('/')[-1])+str(round(SAMPLES_TO_TRAIN/1000))+'k')