In [1]:
from datasets import Dataset
from collections import Counter
import pandas as pd
import evaluate
import numpy as np
from transformers import EarlyStoppingCallback,AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed
import os
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import argparse
import logging
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from datasets import Dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
'''Variables'''

MODEL_NAME = 'bert-base-multilingual-cased'
MODEL_NAME = 'xlm-roberta-base'

SAMPLES_TO_TRAIN=5000

random_seed=0
LEARNING_RATE=2e-5
BATCH_SIZE=16
EPOCHS=3
WEIGHT_DECAY=0.01

PATIENCE=2
id2label = {0: "human", 1: "machine"}
label2id = {"human": 0, "machine": 1}

In [17]:
'''Preparing Data'''

df= pd.read_json('datasets/subtaskA_train_multilingual.jsonl', lines=True)

# We convert source to languaje
df = df.rename(columns={'source': 'language'})
non_language_sources = ['wikihow', 'wikipedia', 'reddit', 'arxiv', 'peerread']
df['language'] = df['language'].replace(non_language_sources, 'english')

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(df.drop('language', axis=1), df['language'])
df = pd.concat([X_resampled, y_resampled], axis=1)

print(f'Original dataset')
print(df.info())
print(f'''\n{df['label'].value_counts()}''')
print(f'''\n{df['model'].value_counts()}''')
print(f'''\n{df['language'].value_counts()}''')

print(df.sample(5))

if SAMPLES_TO_TRAIN>0:
    df=df.sample(SAMPLES_TO_TRAIN)
train_df, val_df = train_test_split(df, test_size=0.4, stratify=df['label'], random_state=random_seed)
val_df, test_df= train_test_split(val_df, test_size=0.5, stratify=val_df['label'], random_state=random_seed)
dev_df= pd.read_json('datasets/subtaskA_dev_multilingual.jsonl', lines=True)

print("\nTrain DataFrame:")
print(train_df['label'].value_counts())
print(f'Total entries {train_df.shape}')

Original dataset
<class 'pandas.core.frame.DataFrame'>
Index: 29495 entries, 44964 to 92357
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      29495 non-null  object
 1   label     29495 non-null  int64 
 2   model     29495 non-null  object
 3   id        29495 non-null  int64 
 4   language  29495 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.4+ MB
None

label
0    14764
1    14731
Name: count, dtype: int64

model
human      14764
chatGPT     9392
davinci     3499
cohere       638
bloomz       622
dolly        580
Name: count, dtype: int64

language
bulgarian     5899
chinese       5899
english       5899
indonesian    5899
urdu          5899
Name: count, dtype: int64
                                                     text  label    model  \
34391   This article provides an analysis of the conve...      1  davinci   
168430    The combined all-electron and two-step appro...      0    human   
534

In [18]:
'''Preparing data'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    f1 = precision_recall_fscore_support(labels, preds, average='macro')[2]
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    cm = confusion_matrix(labels, preds)

    return {
        'f1': f1,
        'confusion_matrix': cm.tolist(),
        'accuracy': acc,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)
dev_dataset = Dataset.from_pandas(dev_df)

def preprocess_function(examples, **fn_kwargs):
    return fn_kwargs['tokenizer'](examples["text"], truncation=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)     # put your model here
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(label2id), id2label=id2label, label2id=label2id    # put your model here
)

# tokenize data for train/valid
train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
val_dataset = val_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
test_dataset = test_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
dev_dataset = dev_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
)

print(train_dataset)

Downloading (…)okenizer_config.json: 100%|██████████| 502/502 [00:00<?, ?B/s] 
Downloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 9.34MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 9.08M/9.08M [00:00<00:00, 12.3MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<?, ?B/s] 
Downloading (…)lve/main/config.json: 100%|██████████| 1.42k/1.42k [00:00<?, ?B/s]
Downloading pytorch_model.bin: 100%|██████████| 1.11G/1.11G [00:16<00:00, 65.8MB/s]


RuntimeError: Error(s) in loading state_dict for XLMRobertaForSequenceClassification:
	size mismatch for classifier.out_proj.weight: copying a param with shape torch.Size([20, 768]) from checkpoint, the shape in current model is torch.Size([2, 768]).
	size mismatch for classifier.out_proj.bias: copying a param with shape torch.Size([20]) from checkpoint, the shape in current model is torch.Size([2]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [None]:
trainer.train()

  0%|          | 0/564 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 33%|███▎      | 188/564 [00:46<01:21,  4.61it/s]Trainer is attempting to log a value of "[[322, 176], [9, 493]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                 
 33%|███▎      | 188/564 [00:52<01:21,  4.61it/s]

{'eval_loss': 0.5861284136772156, 'eval_f1': 0.8094274686096137, 'eval_confusion_matrix': [[322, 176], [9, 493]], 'eval_accuracy': 0.815, 'eval_auc': 0.8143290292644683, 'eval_precision': 0.7369207772795217, 'eval_recall': 0.9820717131474104, 'eval_runtime': 5.659, 'eval_samples_per_second': 176.71, 'eval_steps_per_second': 11.133, 'epoch': 1.0}


 67%|██████▋   | 376/564 [01:42<00:39,  4.75it/s]Trainer is attempting to log a value of "[[449, 49], [16, 486]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                 
 67%|██████▋   | 376/564 [01:47<00:39,  4.75it/s]

{'eval_loss': 0.25484809279441833, 'eval_f1': 0.9349108930125342, 'eval_confusion_matrix': [[449, 49], [16, 486]], 'eval_accuracy': 0.935, 'eval_auc': 0.9348669578713259, 'eval_precision': 0.908411214953271, 'eval_recall': 0.9681274900398407, 'eval_runtime': 5.5, 'eval_samples_per_second': 181.817, 'eval_steps_per_second': 11.454, 'epoch': 2.0}


 89%|████████▊ | 500/564 [02:22<00:15,  4.23it/s]

{'loss': 0.2422, 'learning_rate': 2.269503546099291e-06, 'epoch': 2.66}


100%|██████████| 564/564 [02:37<00:00,  4.70it/s]Trainer is attempting to log a value of "[[389, 109], [1, 501]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                 
100%|██████████| 564/564 [02:43<00:00,  4.70it/s]

{'eval_loss': 0.5217993259429932, 'eval_f1': 0.8886026314083868, 'eval_confusion_matrix': [[389, 109], [1, 501]], 'eval_accuracy': 0.89, 'eval_auc': 0.8895662330597289, 'eval_precision': 0.8213114754098361, 'eval_recall': 0.99800796812749, 'eval_runtime': 5.6099, 'eval_samples_per_second': 178.257, 'eval_steps_per_second': 11.23, 'epoch': 3.0}


100%|██████████| 564/564 [02:47<00:00,  3.37it/s]

{'train_runtime': 167.3923, 'train_samples_per_second': 53.766, 'train_steps_per_second': 3.369, 'train_loss': 0.22287737308664524, 'epoch': 3.0}





TrainOutput(global_step=564, training_loss=0.22287737308664524, metrics={'train_runtime': 167.3923, 'train_samples_per_second': 53.766, 'train_steps_per_second': 3.369, 'train_loss': 0.22287737308664524, 'epoch': 3.0})

In [14]:
evaluation=trainer.evaluate(dev_dataset)
evaluation

100%|█████████▉| 249/250 [00:21<00:00,  9.65it/s]Trainer is attempting to log a value of "[[898, 1102], [323, 1677]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
100%|██████████| 250/250 [00:21<00:00, 11.74it/s]


{'eval_loss': 1.6140904426574707,
 'eval_f1': 0.6297056374198434,
 'eval_confusion_matrix': [[898, 1102], [323, 1677]],
 'eval_accuracy': 0.64375,
 'eval_auc': 0.64375,
 'eval_precision': 0.6034544800287873,
 'eval_recall': 0.8385,
 'eval_runtime': 21.458,
 'eval_samples_per_second': 186.411,
 'eval_steps_per_second': 11.651,
 'epoch': 3.0}

In [15]:
trainer.evaluate(test_dataset)

100%|██████████| 63/63 [00:06<00:00, 11.27it/s]Trainer is attempting to log a value of "[[463, 34], [21, 482]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
100%|██████████| 63/63 [00:06<00:00,  9.87it/s]


{'eval_loss': 0.22549313306808472,
 'eval_f1': 0.9449801378297567,
 'eval_confusion_matrix': [[463, 34], [21, 482]],
 'eval_accuracy': 0.945,
 'eval_auc': 0.9449200171206164,
 'eval_precision': 0.9341085271317829,
 'eval_recall': 0.9582504970178927,
 'eval_runtime': 6.5911,
 'eval_samples_per_second': 151.719,
 'eval_steps_per_second': 9.558,
 'epoch': 3.0}

In [14]:
# '''Save'''
# import json
# trainer.save_model('SavedModels/'+(MODEL_NAME.split('/')[-1])+str(round(SAMPLES_TO_TRAIN/1000))+'k')
# with open('SavedModels/'+(MODEL_NAME.split('/')[-1])+str(round(SAMPLES_TO_TRAIN/1000))+'k/metrics.json', 'w') as file:
#     json.dump(evaluation, file)