In [1]:
'''Import libraries'''
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from datasets import Dataset
import numpy as np
wandb.login()

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malberto-rodero557[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [38]:
'''Variables and parameters'''

# MODEL_NAME = 'roberta-base-openai-detector'
# MODEL_NAME = 'Hello-SimpleAI/chatgpt-detector-roberta'
MODEL_NAME = 'roberta-base'
# MODEL_NAME = 'roberta-large'
# MODEL_NAME = 'microsoft/deberta-large'
# MODEL_NAME = 'bert-base-uncased'

SAMPLES_TO_TRAIN=5000

N_LABELS=2
MAX_LEN = 256
EPOCHS=5
PATIENCE=2
LEARNING_RATE=2e-5
WEIGHT_DECAY=.01
BATCH_SIZE=16
METRIC_FOR_BEST_MODEL='eval_f1'
if METRIC_FOR_BEST_MODEL=='eval_loss':
    GREATER_IS_BETTER = False
else:
    GREATER_IS_BETTER = True

In [14]:
'''Preparing dataset'''

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_train_monolingual.jsonl', lines=True)
print(f'Original dataset')
print(df.info())
print(f'''\n{df['label'].value_counts()}''')
print(f'''\n{df['model'].value_counts()}''')
print(f'''\n{df['source'].value_counts()}''')

df = df[['text', 'label']]

train_df=df.sample(round(SAMPLES_TO_TRAIN))
test_train_df=df.sample(round(SAMPLES_TO_TRAIN*.2))
val_train_df=df.sample(round(SAMPLES_TO_TRAIN*.2))

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_dev_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

test_dev_df= df

# we balance the training set
print(f'Dataset size before balancing: {train_df.shape}')
counts = train_df['label'].value_counts()
sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(train_df[['text']], train_df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {train_df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
train_df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(train_df['label'].value_counts())

print(train_df.sample(5))

Original dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119757 entries, 0 to 119756
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    119757 non-null  object
 1   label   119757 non-null  int64 
 2   model   119757 non-null  object
 3   source  119757 non-null  object
 4   id      119757 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 4.6+ MB
None

label
0    63351
1    56406
Name: count, dtype: int64

model
human      63351
davinci    14343
chatGPT    14339
dolly      14046
cohere     13678
Name: count, dtype: int64

source
reddit       27500
wikihow      27499
arxiv        27497
wikipedia    25530
peerread     11731
Name: count, dtype: int64
Dataset size before balancing: (5000, 2)
Dataset size after balancing: (4736, 1)
Entried dropped: 264

Balanced DataFrame:
label
0    2368
1    2368
Name: count, dtype: int64
                                                     text  label
50319   The paper

In [15]:
'''create custom dataset'''

def createDataset(indf, tokenizer, maxLen):
    # Rename 'label' to 'labels' in the original DataFrame if necessary
    if 'label' in indf.columns:
        indf = indf.rename(columns={'label': 'labels'})

    # Convert the input DataFrame to a Dataset
    tmpDataset = Dataset.from_pandas(indf)

    # Define a function to tokenize the data and structure the output features
    def tokenize_and_structure_output(entry):
        # Tokenize the text
        tokenized_inputs = tokenizer(entry['text'], truncation=True, max_length=maxLen, padding='max_length')
        # Construct the features dictionary
        return {
            'labels': entry['labels'],
            'input_ids': tokenized_inputs['input_ids'],
            'attention_mask': tokenized_inputs['attention_mask']
        }

    # Tokenize the data and structure the output in one step
    outDataset = tmpDataset.map(
        tokenize_and_structure_output,
        batched=True,
        remove_columns=[column for column in tmpDataset.column_names if column not in ['labels']]
    )
    
    return outDataset


In [39]:
'''Initialize model, tokenizer and dataset object'''

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
from transformers import AutoConfig
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=N_LABELS)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,config=config)

print(f'Tokenizer vocab size: {tokenizer.vocab_size}')

train_dataset = createDataset(train_df, tokenizer, MAX_LEN)
val_train_dataset = createDataset(val_train_df, tokenizer, MAX_LEN)
test_train_dataset = createDataset(test_train_df, tokenizer, MAX_LEN)
test_dev_dataset = createDataset(test_dev_df, tokenizer, MAX_LEN)

print(train_dataset,val_train_dataset)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer vocab size: 50265


Map: 100%|██████████| 4736/4736 [00:01<00:00, 3721.29 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2866.31 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 4067.20 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 5177.35 examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 4736
}) Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1000
})





In [17]:
'''metrics'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    f1 = precision_recall_fscore_support(labels, preds, average='macro')[2]
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    cm = confusion_matrix(labels, preds)

    return {
        'f1': f1,
        'confusion_matrix': cm.tolist(),
        'accuracy': acc,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }


In [40]:
'''Define training arguments and initialize trainer'''

from transformers import EarlyStoppingCallback,DataCollatorWithPadding

data_collator=DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    # warmup_steps=500,
    # weight_decay=WEIGHT_DECAY,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=GREATER_IS_BETTER,
    # logging_dir='./logs',
    # logging_steps=1500,
    # do_train=True,
    # do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # push_to_hub=False,
    # logging_first_step=False,
    load_best_model_at_end=True,
    # save_total_limit=2,
    # report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_train_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
    tokenizer = tokenizer,
    data_collator=data_collator
)

In [41]:
trainer.train()

  0%|          | 0/1184 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 25%|██▌       | 296/1184 [00:41<02:01,  7.29it/s]Trainer is attempting to log a value of "[[478, 53], [1, 468]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.

 25%|██▌       | 296/1184 [00:44<02:01,  7.29it/s]

{'eval_loss': 0.27311497926712036, 'eval_f1': 0.9459945994599459, 'eval_confusion_matrix': [[478, 53], [1, 468]], 'eval_accuracy': 0.946, 'eval_auc': 0.9490280638775451, 'eval_precision': 0.8982725527831094, 'eval_recall': 0.997867803837953, 'eval_runtime': 2.8367, 'eval_samples_per_second': 352.526, 'eval_steps_per_second': 22.209, 'epoch': 1.0}


 42%|████▏     | 501/1184 [01:14<01:34,  7.20it/s]

{'loss': 0.1571, 'learning_rate': 1.1554054054054056e-05, 'epoch': 1.69}


 50%|█████     | 592/1184 [01:27<01:21,  7.23it/s]Trainer is attempting to log a value of "[[490, 41], [1, 468]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.

 50%|█████     | 592/1184 [01:30<01:21,  7.23it/s]

{'eval_loss': 0.22254396975040436, 'eval_f1': 0.9579796621564838, 'eval_confusion_matrix': [[490, 41], [1, 468]], 'eval_accuracy': 0.958, 'eval_auc': 0.9603274989057938, 'eval_precision': 0.9194499017681729, 'eval_recall': 0.997867803837953, 'eval_runtime': 2.8531, 'eval_samples_per_second': 350.491, 'eval_steps_per_second': 22.081, 'epoch': 2.0}


 75%|███████▌  | 888/1184 [02:13<00:40,  7.35it/s]Trainer is attempting to log a value of "[[511, 20], [1, 468]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.

 75%|███████▌  | 888/1184 [02:15<00:40,  7.35it/s]

{'eval_loss': 0.13911165297031403, 'eval_f1': 0.9789610990721844, 'eval_confusion_matrix': [[511, 20], [1, 468]], 'eval_accuracy': 0.979, 'eval_auc': 0.980101510205229, 'eval_precision': 0.9590163934426229, 'eval_recall': 0.997867803837953, 'eval_runtime': 2.8394, 'eval_samples_per_second': 352.185, 'eval_steps_per_second': 22.188, 'epoch': 3.0}


 85%|████████▍ | 1001/1184 [02:33<00:25,  7.30it/s]

{'loss': 0.0209, 'learning_rate': 3.1081081081081082e-06, 'epoch': 3.38}


100%|██████████| 1184/1184 [02:58<00:00,  7.38it/s]Trainer is attempting to log a value of "[[516, 15], [0, 469]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.

100%|██████████| 1184/1184 [03:01<00:00,  7.38it/s]

{'eval_loss': 0.09601246565580368, 'eval_f1': 0.9849667916427388, 'eval_confusion_matrix': [[516, 15], [0, 469]], 'eval_accuracy': 0.985, 'eval_auc': 0.9858757062146893, 'eval_precision': 0.96900826446281, 'eval_recall': 1.0, 'eval_runtime': 2.8007, 'eval_samples_per_second': 357.055, 'eval_steps_per_second': 22.494, 'epoch': 4.0}


100%|██████████| 1184/1184 [03:02<00:00,  6.49it/s]

{'train_runtime': 182.5734, 'train_samples_per_second': 103.761, 'train_steps_per_second': 6.485, 'train_loss': 0.07523875393451669, 'epoch': 4.0}





TrainOutput(global_step=1184, training_loss=0.07523875393451669, metrics={'train_runtime': 182.5734, 'train_samples_per_second': 103.761, 'train_steps_per_second': 6.485, 'train_loss': 0.07523875393451669, 'epoch': 4.0})

In [42]:
trainer.evaluate(test_dev_dataset)

100%|█████████▉| 312/313 [00:14<00:00, 21.74it/s]Trainer is attempting to log a value of "[[2389, 111], [1488, 1012]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
100%|██████████| 313/313 [00:14<00:00, 21.25it/s]


{'eval_loss': 2.4735047817230225,
 'eval_f1': 0.6539540928011587,
 'eval_confusion_matrix': [[2389, 111], [1488, 1012]],
 'eval_accuracy': 0.6802,
 'eval_auc': 0.6802,
 'eval_precision': 0.9011576135351737,
 'eval_recall': 0.4048,
 'eval_runtime': 14.8103,
 'eval_samples_per_second': 337.603,
 'eval_steps_per_second': 21.134,
 'epoch': 4.0}

In [22]:
print('''
      5k
      4:0.6539540928011587
      3:0.7636741989099447
      2:0.6891655718619156
      1:0.6123717083608671
      ''')

100%|██████████| 313/313 [00:14<00:00, 21.43it/s]


{'0': {'precision': 0.604384653712008,
  'recall': 0.9704,
  'f1-score': 0.7448572305802886,
  'support': 2500.0},
 '1': {'precision': 0.9249492900608519,
  'recall': 0.3648,
  'f1-score': 0.5232358003442341,
  'support': 2500.0},
 'accuracy': 0.6676,
 'macro avg': {'precision': 0.76466697188643,
  'recall': 0.6676,
  'f1-score': 0.6340465154622614,
  'support': 5000.0},
 'weighted avg': {'precision': 0.76466697188643,
  'recall': 0.6676,
  'f1-score': 0.6340465154622614,
  'support': 5000.0}}

In [41]:
trainer.save_model('SavedModels/'+(MODEL_NAME.split('/')[-1])+str(round(SAMPLES_TO_TRAIN/1000))+'k')

In [42]:
print('''roberta base openai detector 20k
train - dev
{'eval_loss': 0.048573993146419525 - 3.720285654067993
 'eval_accuracy': 0.99425 - 0.63425
 'eval_f1': 0.9940614510715208 - 0.4388185654008438
 'eval_confusion_matrix': [[2052, 14], [9, 1925]] - [[1965, 29], [1434, 572]]
 'eval_runtime': 12.1499,
 'eval_samples_per_second': 329.221,
 'eval_steps_per_second': 41.153,
 'epoch': 8.0}''')


roberta base openai detector 20k
train - dev
{'eval_loss': 0.048573993146419525 - 3.720285654067993
 'eval_accuracy': 0.99425 - 0.63425
 'eval_f1': 0.9940614510715208 - 0.4388185654008438
 'eval_confusion_matrix': [[2052, 14], [9, 1925]] - [[1965, 29], [1434, 572]]
 'eval_runtime': 12.1499,
 'eval_samples_per_second': 329.221,
 'eval_steps_per_second': 41.153,
 'epoch': 8.0}
