In [8]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import EarlyStoppingCallback,AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed
import os
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import argparse
import logging
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from datasets import Dataset
import numpy as np

In [9]:
'''Variables'''

MODEL_NAME = 'roberta-base'
# MODEL_NAME = 'roberta-base-openai-detector'
# MODEL_NAME = 'Hello-SimpleAI/chatgpt-detector-roberta'
# MODEL_NAME = 'bert-base-uncased'
# MODEL_NAME = 'distilbert-base-uncased'
# MODEL_NAME = 'google/electra-base-discriminator'

SAMPLES_TO_TRAIN=1000

random_seed=0
LEARNING_RATE=2e-5
BATCH_SIZE=16
EPOCHS=3
WEIGHT_DECAY=0.01

PATIENCE=2
id2label = {0: "human", 1: "machine"}
label2id = {"human": 0, "machine": 1}

In [10]:
'''Preparing Data'''

df= pd.read_json('datasets/subtaskA_train_monolingual.jsonl', lines=True)

print(f'Original dataset')
print(df.info())
print(f'''\n{df['label'].value_counts()}''')
print(f'''\n{df['model'].value_counts()}''')
print(f'''\n{df['source'].value_counts()}''')

if SAMPLES_TO_TRAIN>0:
    df=df.sample(SAMPLES_TO_TRAIN)
train_df, val_df = train_test_split(df, test_size=0.4, stratify=df['label'], random_state=random_seed)
val_df, test_df= train_test_split(val_df, test_size=0.5, stratify=val_df['label'], random_state=random_seed)
dev_df= pd.read_json('datasets/subtaskA_dev_monolingual.jsonl', lines=True)

print("\nTrain DataFrame:")
print(train_df['label'].value_counts())

Original dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119757 entries, 0 to 119756
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    119757 non-null  object
 1   label   119757 non-null  int64 
 2   model   119757 non-null  object
 3   source  119757 non-null  object
 4   id      119757 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 4.6+ MB
None

label
0    63351
1    56406
Name: count, dtype: int64

model
human      63351
davinci    14343
chatGPT    14339
dolly      14046
cohere     13678
Name: count, dtype: int64

source
reddit       27500
wikihow      27499
arxiv        27497
wikipedia    25530
peerread     11731
Name: count, dtype: int64

Train DataFrame:
label
0    318
1    282
Name: count, dtype: int64


In [11]:
'''Preparing data'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    f1 = precision_recall_fscore_support(labels, preds, average='macro')[2]
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    cm = confusion_matrix(labels, preds)

    return {
        'f1': f1,
        'confusion_matrix': cm.tolist(),
        'accuracy': acc,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)
dev_dataset = Dataset.from_pandas(dev_df)

def preprocess_function(examples, **fn_kwargs):
    return fn_kwargs['tokenizer'](examples["text"], truncation=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)     # put your model here
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(label2id), id2label=id2label, label2id=label2id    # put your model here
)

# tokenize data for train/valid
train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
val_dataset = val_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
test_dataset = test_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
dev_dataset = dev_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
)

print(train_dataset)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 600/600 [00:00<00:00, 4211.85 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 3059.34 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 4261.27 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 5690.24 examples/s]


Dataset({
    features: ['text', 'label', 'model', 'source', 'id', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 600
})


In [12]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malberto-rodero557[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/114 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 33%|███▎      | 38/114 [00:10<00:15,  5.04it/s]Trainer is attempting to log a value of "[[101, 5], [18, 76]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                
 33%|███▎      | 38/114 [00:11<00:15,  5.04it/s]

{'eval_loss': 0.28595981001853943, 'eval_f1': 0.8831746031746032, 'eval_confusion_matrix': [[101, 5], [18, 76]], 'eval_accuracy': 0.885, 'eval_auc': 0.8806704134885589, 'eval_precision': 0.9382716049382716, 'eval_recall': 0.8085106382978723, 'eval_runtime': 1.0244, 'eval_samples_per_second': 195.228, 'eval_steps_per_second': 12.69, 'epoch': 1.0}


 67%|██████▋   | 76/114 [00:22<00:07,  5.15it/s]Trainer is attempting to log a value of "[[98, 8], [8, 86]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                
 67%|██████▋   | 76/114 [00:23<00:07,  5.15it/s]

{'eval_loss': 0.17878346145153046, 'eval_f1': 0.9197109594540345, 'eval_confusion_matrix': [[98, 8], [8, 86]], 'eval_accuracy': 0.92, 'eval_auc': 0.9197109594540345, 'eval_precision': 0.9148936170212766, 'eval_recall': 0.9148936170212766, 'eval_runtime': 1.0041, 'eval_samples_per_second': 199.191, 'eval_steps_per_second': 12.947, 'epoch': 2.0}


100%|██████████| 114/114 [00:33<00:00,  5.26it/s]Trainer is attempting to log a value of "[[92, 14], [2, 92]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                 
100%|██████████| 114/114 [00:34<00:00,  5.26it/s]

{'eval_loss': 0.3038085699081421, 'eval_f1': 0.9199999999999999, 'eval_confusion_matrix': [[92, 14], [2, 92]], 'eval_accuracy': 0.92, 'eval_auc': 0.9233239662786029, 'eval_precision': 0.8679245283018868, 'eval_recall': 0.9787234042553191, 'eval_runtime': 0.999, 'eval_samples_per_second': 200.192, 'eval_steps_per_second': 13.012, 'epoch': 3.0}


100%|██████████| 114/114 [00:36<00:00,  3.14it/s]

{'train_runtime': 38.2047, 'train_samples_per_second': 47.115, 'train_steps_per_second': 2.984, 'train_loss': 0.262399623268529, 'epoch': 3.0}





TrainOutput(global_step=114, training_loss=0.262399623268529, metrics={'train_runtime': 38.2047, 'train_samples_per_second': 47.115, 'train_steps_per_second': 2.984, 'train_loss': 0.262399623268529, 'epoch': 3.0})

In [13]:
evaluation=trainer.evaluate(dev_dataset)
evaluation

100%|█████████▉| 312/313 [00:26<00:00, 10.77it/s]Trainer is attempting to log a value of "[[2234, 266], [1410, 1090]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
100%|██████████| 313/313 [00:27<00:00, 11.57it/s]


{'eval_loss': 1.2136058807373047,
 'eval_f1': 0.6462831193810512,
 'eval_confusion_matrix': [[2234, 266], [1410, 1090]],
 'eval_accuracy': 0.6648,
 'eval_auc': 0.6648,
 'eval_precision': 0.803834808259587,
 'eval_recall': 0.436,
 'eval_runtime': 27.1934,
 'eval_samples_per_second': 183.868,
 'eval_steps_per_second': 11.51,
 'epoch': 3.0}

In [14]:
# '''Save'''
# import json
# trainer.save_model('SavedModels/'+(MODEL_NAME.split('/')[-1])+str(round(SAMPLES_TO_TRAIN/1000))+'k')
# with open('SavedModels/'+(MODEL_NAME.split('/')[-1])+str(round(SAMPLES_TO_TRAIN/1000))+'k/metrics.json', 'w') as file:
#     json.dump(evaluation, file)