In [20]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed
import os
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import argparse
import logging
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from datasets import Dataset
import numpy as np
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

import pandas as pd,os
import torch
from statistics import mode
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer
from datasets import Dataset, load_metric
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import pickle

In [9]:
def preprocess_function(examples, **fn_kwargs):
    return fn_kwargs['tokenizer'](examples["text"], truncation=True)


def get_data(train_path, test_path, random_seed):
    """
    function to read dataframe with columns
    """

    train_df = pd.read_json(train_path, lines=True)
    test_df = pd.read_json(test_path, lines=True)
    
    train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=random_seed)

    return train_df, val_df, test_df

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    cm = confusion_matrix(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': cm.tolist()
    }


def fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model):

    # pandas dataframe to huggingface Dataset
    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)
    
    # get tokenizer and model from huggingface
    tokenizer = AutoTokenizer.from_pretrained(model)     # put your model here
    model = AutoModelForSequenceClassification.from_pretrained(
       model, num_labels=len(label2id), id2label=id2label, label2id=label2id    # put your model here
    )
    
    # tokenize data for train/valid
    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
    

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


    # create Trainer 
    training_args = TrainingArguments(
        output_dir=checkpoints_path,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_valid_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # save best model
    best_model_path = checkpoints_path+'/best/'
    
    if not os.path.exists(best_model_path):
        os.makedirs(best_model_path)
    

    trainer.save_model(best_model_path)


def test(test_df, model_path, id2label, label2id):
    
    # load tokenizer from saved model 
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # load best model
    model = AutoModelForSequenceClassification.from_pretrained(
       model_path, num_labels=len(label2id), id2label=id2label, label2id=label2id
    )
            
    test_dataset = Dataset.from_pandas(test_df)

    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # create Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # get logits from predictions and evaluate results using classification report
    predictions = trainer.predict(tokenized_test_dataset)
    prob_pred = softmax(predictions.predictions, axis=-1)
    preds = np.argmax(predictions.predictions, axis=-1)
    metric = evaluate.load("bstrai/classification_report")
    results = metric.compute(predictions=preds, references=predictions.label_ids)
    
    # return dictionary of classification report
    return results, preds

In [10]:
random_seed = 0

df= pd.read_json('../../../SubtaskA/datasets/subtaskA_train_multilingual.jsonl', lines=True)
df = df.rename(columns={'source': 'language'})
non_language_sources = ['wikihow', 'wikipedia', 'reddit', 'arxiv', 'peerread']
df['language'] = df['language'].replace(non_language_sources, 'english')
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(df.drop('language', axis=1), df['language'])
df = pd.concat([X_resampled, y_resampled], axis=1)
# df=df.sample(4000)

df.to_json('reducedTrainDataFrame.jsonl', orient='records', lines=True)

train_path =  'reducedTrainDataFrame.jsonl'
test_path =  '../../../SubtaskA/datasets/subtaskA_dev_multilingual.jsonl'

model = 'xlm-roberta-base'

subtask =  'A'
prediction_path = 'reducedPredictedDataFrame.jsonl'

if not os.path.exists(train_path):
    logging.error("File doesnt exists: {}".format(train_path))
    raise ValueError("File doesnt exists: {}".format(train_path))

if not os.path.exists(test_path):
    logging.error("File doesnt exists: {}".format(train_path))
    raise ValueError("File doesnt exists: {}".format(train_path))

if subtask == 'A':
    id2label = {0: "human", 1: "machine"}
    label2id = {"human": 0, "machine": 1}
elif subtask == 'B':
    id2label = {0: 'human', 1: 'chatGPT', 2: 'cohere', 3: 'davinci', 4: 'bloomz', 5: 'dolly'}
    label2id = {'human': 0, 'chatGPT': 1,'cohere': 2, 'davinci': 3, 'bloomz': 4, 'dolly': 5}
else:
    logging.error("Wrong subtask: {}. It should be A or B".format(train_path))
    raise ValueError("Wrong subtask: {}. It should be A or B".format(train_path))

set_seed(random_seed)

train_df, valid_df, test_df = get_data(train_path, test_path, random_seed)

fine_tune(train_df, valid_df, f"{model}/subtask{subtask}/{random_seed}", id2label, label2id, model)

results, predictions = test(test_df, f"{model}/subtask{subtask}/{random_seed}/best/", id2label, label2id)
print(results['macro avg']['f1-score'])

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 23596/23596 [00:06<00:00, 3635.26 examples/s]
Map: 100%|██████████| 5899/5899 [00:01<00:00, 3745.12 examples/s]
  0%|          | 0/4425 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 11%|█▏        | 500/4425 [01:57<15:37,  4.19it/s]

{'loss': 0.2914, 'learning_rate': 1.7740112994350286e-05, 'epoch': 0.34}


 23%|██▎       | 1000/4425 [03:55<13:21,  4.27it/s]

{'loss': 0.152, 'learning_rate': 1.5480225988700566e-05, 'epoch': 0.68}


 33%|███▎      | 1475/4425 [05:46<10:49,  4.54it/s]Trainer is attempting to log a value of "[[2665, 288], [17, 2929]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                   
 33%|███▎      | 1475/4425 [06:22<10:49,  4.54it/s]

{'eval_loss': 0.24676579236984253, 'eval_accuracy': 0.9482963214104085, 'eval_f1': 0.9505111147168587, 'eval_auc': 0.9483507629945407, 'eval_precision': 0.9104755983835872, 'eval_recall': 0.9942294636795656, 'eval_confusion_matrix': [[2665, 288], [17, 2929]], 'eval_runtime': 35.9069, 'eval_samples_per_second': 164.286, 'eval_steps_per_second': 10.277, 'epoch': 1.0}


 34%|███▍      | 1500/4425 [06:32<11:37,  4.19it/s]  

{'loss': 0.1116, 'learning_rate': 1.3220338983050848e-05, 'epoch': 1.02}


 45%|████▌     | 2001/4425 [08:29<09:28,  4.27it/s]

{'loss': 0.0685, 'learning_rate': 1.096045197740113e-05, 'epoch': 1.36}


 56%|█████▋    | 2500/4425 [10:26<07:30,  4.27it/s]

{'loss': 0.0623, 'learning_rate': 8.700564971751413e-06, 'epoch': 1.69}


 67%|██████▋   | 2950/4425 [12:13<05:26,  4.52it/s]Trainer is attempting to log a value of "[[2783, 170], [17, 2929]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                   
 67%|██████▋   | 2950/4425 [12:49<05:26,  4.52it/s]

{'eval_loss': 0.15709815919399261, 'eval_accuracy': 0.968299711815562, 'eval_f1': 0.9690653432588916, 'eval_auc': 0.9683304446741885, 'eval_precision': 0.9451435947079703, 'eval_recall': 0.9942294636795656, 'eval_confusion_matrix': [[2783, 170], [17, 2929]], 'eval_runtime': 36.4753, 'eval_samples_per_second': 161.726, 'eval_steps_per_second': 10.116, 'epoch': 2.0}


 68%|██████▊   | 3000/4425 [13:05<05:32,  4.28it/s]  

{'loss': 0.0596, 'learning_rate': 6.440677966101695e-06, 'epoch': 2.03}


 79%|███████▉  | 3501/4425 [15:03<03:36,  4.27it/s]

{'loss': 0.0239, 'learning_rate': 4.180790960451978e-06, 'epoch': 2.37}


 90%|█████████ | 4000/4425 [17:00<01:39,  4.27it/s]

{'loss': 0.0223, 'learning_rate': 1.92090395480226e-06, 'epoch': 2.71}


100%|██████████| 4425/4425 [18:41<00:00,  4.52it/s]Trainer is attempting to log a value of "[[2688, 265], [5, 2941]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                   
100%|██████████| 4425/4425 [19:17<00:00,  4.52it/s]

{'eval_loss': 0.3114318549633026, 'eval_accuracy': 0.9542295304288863, 'eval_f1': 0.9561118335500651, 'eval_auc': 0.9542817676065096, 'eval_precision': 0.9173424828446662, 'eval_recall': 0.9983027834351663, 'eval_confusion_matrix': [[2688, 265], [5, 2941]], 'eval_runtime': 36.4737, 'eval_samples_per_second': 161.733, 'eval_steps_per_second': 10.117, 'epoch': 3.0}


100%|██████████| 4425/4425 [19:21<00:00,  3.81it/s]


{'train_runtime': 1161.7878, 'train_samples_per_second': 60.93, 'train_steps_per_second': 3.809, 'train_loss': 0.09160447589421676, 'epoch': 3.0}


Map: 100%|██████████| 4000/4000 [00:01<00:00, 3382.30 examples/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 500/500 [00:21<00:00, 22.73it/s]


0.7522601995214531


In [12]:
results, predictions

({'0': {'precision': 0.8186366479049406,
   'recall': 0.6545,
   'f1-score': 0.7274242845234787,
   'support': 2000.0},
  '1': {'precision': 0.7122032486463973,
   'recall': 0.855,
   'f1-score': 0.7770961145194273,
   'support': 2000.0},
  'accuracy': 0.75475,
  'macro avg': {'precision': 0.765419948275669,
   'recall': 0.75475,
   'f1-score': 0.7522601995214531,
   'support': 4000.0},
  'weighted avg': {'precision': 0.765419948275669,
   'recall': 0.75475,
   'f1-score': 0.752260199521453,
   'support': 4000.0}},
 array([0, 0, 1, ..., 0, 1, 1], dtype=int64))

In [21]:
def getMetrics(predicted_labels, true_labels):
    # Ensure the labels are numpy arrays
    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = precision_recall_fscore_support(true_labels, predicted_labels, average='macro')[2]
    precision = precision_score(true_labels, predicted_labels, average='binary')
    recall = recall_score(true_labels, predicted_labels, average='binary')
    auc = roc_auc_score(true_labels, predicted_labels)
    cm = confusion_matrix(true_labels, predicted_labels)

    # Create a dictionary of metrics
    metrics = {
        'f1': f1,
        'confusion_matrix': cm.tolist(),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'auc': auc,
    }

    return metrics
# test_df['label'].tolist()
# list(predictions)
getMetrics(list(predictions),test_df['label'].tolist())

{'f1': 0.7522601995214531,
 'confusion_matrix': [[1309, 691], [290, 1710]],
 'accuracy': 0.75475,
 'precision': 0.7122032486463973,
 'recall': 0.855,
 'auc': 0.75475}