Code to fine-tune the RoBERTa OpenAI detector model with one tenth of the training data from task A. It is assumed that this code is run in Google Colab.

# Dataset

The code below was used to generate a split of the dataset. The dataset is available at XXXXXX. 

```python
import pandas as pd

def get_random_samples(data, model, size, random_state=None):
    """Draw ``size`` random rows of ``data`` whose "model" column equals ``model``.

    Args:
        data: DataFrame with a "model" column.
        model: Value of the "model" column to select.
        size: Number of rows to draw, without replacement.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (the default) keeps the draw
            unseeded.

    Returns:
        DataFrame holding the sampled rows; the original index is kept.

    Raises:
        ValueError: If fewer than ``size`` rows match ``model``.
    """
    matching_rows = data[data["model"] == model]
    # Fail with a message that names the offending model instead of relying
    # on the generic pandas error for an oversized sample.
    if size > len(matching_rows):
        raise ValueError(
            f"Requested {size} samples for model {model!r}, "
            f"but only {len(matching_rows)} rows are available."
        )
    return matching_rows.sample(n=size, random_state=random_state)

data_path = "subtaskA_train_monolingual.jsonl"
data = pd.read_json(path_or_buf=data_path, lines=True)
unique_model = set(data["model"])

# Draw 2000 rows for every distinct value of the "model" column.
random_samples_2000_per_model = [
    get_random_samples(data, model, 2000) for model in unique_model
]

sample_set_10000 = pd.concat(random_samples_2000_per_model, ignore_index=True)
# The output is JSON Lines; the ".jsonl" name matches the file name that the
# fine-tuning section reads as its training data.
sample_set_10000.to_json("sample_set_10000.jsonl", orient="records", lines=True)
```

# Fine-tuning
A Hugging Face account and access token are required to execute the code below. In your account, go to Settings > Access Tokens to retrieve the token.

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook; the training
# arguments used later set push_to_hub=True, so the token is needed.
notebook_login()

In [None]:
# Install the libraries imported by the cells below.
!pip install datasets
!pip install pandas
!pip install evaluate
!pip install numpy
!pip install transformers
!pip install -U scikit-learn
!pip install scipy
!pip install tensorflow==2.14 # Pin TensorFlow 2.14, newer than the version preinstalled on Colab (the original note gave that as "12.0", presumably 2.12 - TODO confirm)
!pip install accelerate -U # Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`

In [None]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    AutoTokenizer,
    set_seed,
)
import os
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import argparse
import logging

In [1]:
# Base checkpoint and data locations (Google Drive paths as seen from Colab).
model_name = "roberta-base-openai-detector"
train_file_path = "/content/drive/MyDrive/test folder/sample_set_10000.jsonl"
test_file_path = "/content/drive/MyDrive/test folder/subtaskA_dev_monolingual.jsonl"

# Label mapping in both directions; label2id is the inverse of id2label.
id2label = {0: "human", 1: "machine"}
label2id = {label_name: label_id for label_id, label_name in id2label.items()}
random_seed = 0

# Local directories, only relevant when saving to disk rather than to the
# Hugging Face Hub.
checkpoints_path = ""
best_model_path = ""
# Handed to the trainer as its output directory.
output_path = "artificially-natural-roberta"

In [2]:
def preprocess_function(examples, **fn_kwargs):
    """Tokenize the "text" field of ``examples`` with truncation enabled.

    Args:
        examples: Mapping with a "text" entry (a batch when used with
            ``Dataset.map(batched=True)``).
        **fn_kwargs: Must contain "tokenizer", the callable applied to the text.

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    tokenize = fn_kwargs["tokenizer"]
    texts = examples["text"]
    return tokenize(texts, truncation=True)


def get_data(train_path, test_path, random_seed):
    """Load the train and test JSON Lines files and split off a validation set.

    Args:
        train_path: Path of the JSON Lines training file; needs a "label" column.
        test_path: Path of the JSON Lines test file.
        random_seed: Seed used for the train/validation split.

    Returns:
        Tuple ``(train_df, val_df, test_df)``; the validation frame is a 20%
        split of the training file, stratified on "label".
    """
    full_train_df = pd.read_json(train_path, lines=True)
    test_df = pd.read_json(test_path, lines=True)

    split_options = {
        "test_size": 0.2,
        "stratify": full_train_df["label"],
        "random_state": random_seed,
    }
    train_part, val_part = train_test_split(full_train_df, **split_options)
    return train_part, val_part, test_df


def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax over axis 1 of ``predictions``.

    Returns:
        Dict with the entries produced by the "f1" metric.
    """
    scorer = evaluate.load("f1")

    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    f1_scores = scorer.compute(
        predictions=predicted_ids, references=references, average="micro"
    )
    # Hand back a fresh dict rather than the metric's own object.
    return dict(f1_scores)


def get_tokenizer_and_model(model_name, id2label, label2id):
    """Load the tokenizer and sequence-classification model for ``model_name``.

    Args:
        model_name: Model id or path given to ``from_pretrained``.
        id2label: Mapping from class id to label name.
        label2id: Mapping from label name to class id; its length sets the
            number of labels.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    label_config = {
        "num_labels": len(label2id),
        "id2label": id2label,
        "label2id": label2id,
    }
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name, **label_config
    )
    return tokenizer, classifier


def get_data_collator(tokenizer):
    """Return a ``DataCollatorWithPadding`` built around ``tokenizer``."""
    return DataCollatorWithPadding(tokenizer=tokenizer)


def prepare_data_for_training(train_df, valid_df, tokenizer):
    """Convert both frames to ``Dataset`` objects and tokenize them.

    Args:
        train_df: Training DataFrame with a "text" column.
        valid_df: Validation DataFrame with a "text" column.
        tokenizer: Tokenizer handed to ``preprocess_function``.

    Returns:
        Tuple ``(tokenized_train_dataset, tokenized_valid_dataset)``.
    """

    def tokenize(dataset):
        # Batched map; the tokenizer reaches preprocess_function via fn_kwargs.
        return dataset.map(
            preprocess_function, batched=True, fn_kwargs={"tokenizer": tokenizer}
        )

    raw_train = Dataset.from_pandas(train_df)
    raw_valid = Dataset.from_pandas(valid_df)
    tokenized_train = tokenize(raw_train)
    tokenized_valid = tokenize(raw_valid)
    return tokenized_train, tokenized_valid


def create_trainer(
    model,
    tokenizer,
    data_collator,
    tokenized_train_dataset,
    tokenized_valid_dataset,
    output_dir_path,
):
    """Build the ``Trainer`` used for fine-tuning.

    Evaluation and checkpoint saving both happen once per epoch, the best
    checkpoint is reloaded when training ends, and pushing to the Hub is
    enabled.

    Args:
        model: Model to fine-tune.
        tokenizer: Tokenizer passed on to the trainer.
        data_collator: Collator used to batch examples.
        tokenized_train_dataset: Tokenized training split.
        tokenized_valid_dataset: Tokenized validation split, used for evaluation.
        output_dir_path: Output directory for the training run.

    Returns:
        The configured ``Trainer``.
    """
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` (and the Trainer's `tokenizer` argument) - confirm
    # against the installed version before upgrading.
    # `hub_model_id` is left unset.
    hyperparameters = {
        "output_dir": output_dir_path,
        "learning_rate": 2e-5,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "num_train_epochs": 3,
        "weight_decay": 0.01,
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "push_to_hub": True,
    }
    training_args = TrainingArguments(**hyperparameters)

    trainer_components = {
        "model": model,
        "args": training_args,
        "train_dataset": tokenized_train_dataset,
        "eval_dataset": tokenized_valid_dataset,
        "tokenizer": tokenizer,
        "data_collator": data_collator,
        "compute_metrics": compute_metrics,
    }
    return Trainer(**trainer_components)


def finetune_and_save_best(trainer):
    """Run training, then push the result to the Hub.

    Args:
        trainer: Object exposing ``train()`` and ``push_to_hub(message)``.
    """
    commit_message = "End of training."
    trainer.train()
    trainer.push_to_hub(commit_message)
    # To keep a local copy instead, call trainer.save_model(best_model_path).


def test(test_df, model_path, id2label, label2id):
    """Evaluate the checkpoint at ``model_path`` on ``test_df``.

    Args:
        test_df: DataFrame with a "text" column; the reference labels are
            taken from the ``label_ids`` of the prediction output.
        model_path: Model id or path of the checkpoint to load.
        id2label: Mapping from class id to label name.
        label2id: Mapping from label name to class id.

    Returns:
        Tuple ``(results, preds)``: the output of the
        "bstrai/classification_report" metric and the predicted class ids
        (argmax over the last axis of the model outputs).
    """
    tokenizer, model = get_tokenizer_and_model(model_path, id2label, label2id)

    test_dataset = Dataset.from_pandas(test_df)
    tokenized_test_dataset = test_dataset.map(
        preprocess_function, batched=True, fn_kwargs={"tokenizer": tokenizer}
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # The trainer is only used for batched inference here; no training
    # arguments are supplied.
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    predictions = trainer.predict(tokenized_test_dataset)
    # Only the argmax is needed, so the unused softmax computation that used
    # to sit here has been dropped.
    preds = np.argmax(predictions.predictions, axis=-1)

    metric = evaluate.load("bstrai/classification_report")
    results = metric.compute(predictions=preds, references=predictions.label_ids)
    return results, preds

# Training

In [None]:
# Fix the random seed before any data or model is loaded.
set_seed(random_seed)

# Data: train/validation split plus the held-out test frame.
train_df, valid_df, test_df = get_data(
    train_path=train_file_path,
    test_path=test_file_path,
    random_seed=random_seed,
)

# Model side: tokenizer, classifier and padding collator.
tokenizer, model = get_tokenizer_and_model(
    model_name=model_name, id2label=id2label, label2id=label2id
)
data_collator = get_data_collator(tokenizer=tokenizer)
tokenized_train_dataset, tokenized_valid_dataset = prepare_data_for_training(
    train_df=train_df, valid_df=valid_df, tokenizer=tokenizer
)

trainer = create_trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    tokenized_train_dataset=tokenized_train_dataset,
    tokenized_valid_dataset=tokenized_valid_dataset,
    output_dir_path=output_path,
)

# Train, then push the result to the Hub.
finetune_and_save_best(trainer=trainer)

# Validation and Testing

In [None]:
# JSON Lines file that is evaluated in the next cell.
t_test_file_path = "subtaskA_dev_monolingual.jsonl"
# Model id or path handed to test() as the checkpoint to load.
# NOTE(review): differs from output_path ("artificially-natural-roberta") used
# for training - confirm this is the intended checkpoint.
checkpoint_for_testing = "artificially-natural-roberta-redone"
# Destination of the JSON Lines file with the predicted labels.
prediction_file_path = "new_predictions.jsonl"

In [None]:
# Evaluate the chosen checkpoint on the dev file, then show the metric
# report together with the raw predicted labels.
t_test_df = pd.read_json(t_test_file_path, lines=True)
results, predictions = test(
    test_df=t_test_df,
    model_path=checkpoint_for_testing,
    id2label=id2label,
    label2id=label2id,
)
print(results, predictions)

In [None]:
# Save the predictions to a file in this notebook's working directory.
# The ids are read from t_test_df, the frame the predictions were computed
# on, so ids and labels are guaranteed to line up (and this cell no longer
# depends on the training cell having been run).
predictions_df = pd.DataFrame({"id": t_test_df["id"], "label": predictions})
predictions_df.to_json(prediction_file_path, lines=True, orient="records")

# Submission predictions from test 

In [None]:
# Input file, fine-tuned model id and output file for the submission run.
submission_test_file_path = "test_subtaskA_monolingual.jsonl"
finetuned_model_id = "artificially-natural-roberta"
submission_prediction_file_path = "subtask_a_monolingual.jsonl"

# Label mapping in both directions; the second dict is the inverse of the first.
submission_id2label = {0: "human", 1: "machine"}
submission_label2id = {
    label_name: label_id for label_id, label_name in submission_id2label.items()
}

In [None]:
def submission_get_and_prepare_data(test_path, tokenizer, model):
    """Read the submission test file and tokenize it.

    Args:
        test_path: Path of the JSON Lines test file; needs a "text" column.
        tokenizer: Tokenizer handed to ``preprocess_function``.
        model: Accepted for signature compatibility; not used here.

    Returns:
        The tokenized ``Dataset``.
    """
    raw_dataset = Dataset.from_pandas(pd.read_json(test_path, lines=True))
    return raw_dataset.map(
        preprocess_function, batched=True, fn_kwargs={"tokenizer": tokenizer}
    )


def submission_predict(tokenized_test_df, tokenizer, model, prediction_file_path):
    """Run the model over the tokenized test set and return its predictions.

    Args:
        tokenized_test_df: Tokenized dataset to predict on.
        tokenizer: Tokenizer used for the padding collator and the trainer.
        model: Fine-tuned model used for inference.
        prediction_file_path: Accepted for signature compatibility; nothing
            is written by this function.

    Returns:
        Tuple ``(predictions, preds)``: the raw output of ``Trainer.predict``
        and the predicted class ids (argmax over the last axis).
    """
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # The trainer is only used for batched inference here.
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    predictions = trainer.predict(tokenized_test_df)
    # Only the argmax is needed, so the unused softmax computation that used
    # to sit here has been dropped.
    preds = np.argmax(predictions.predictions, axis=-1)
    return predictions, preds


# Load the fine-tuned checkpoint together with its tokenizer.
submission_tokenizer, submission_model = get_tokenizer_and_model(
    model_name=finetuned_model_id,
    id2label=submission_id2label,
    label2id=submission_label2id,
)

In [None]:
# Tokenize the submission test file, then predict a label for every row.
submission_input = submission_get_and_prepare_data(
    test_path=submission_test_file_path,
    tokenizer=submission_tokenizer,
    model=submission_model,
)
sub_predictions, sub_preds = submission_predict(
    tokenized_test_df=submission_input,
    tokenizer=submission_tokenizer,
    model=submission_model,
    prediction_file_path=submission_prediction_file_path,
)

In [None]:
# Pair each test id with its predicted label and write one JSON record per line.
predictions_df = pd.DataFrame(
    {
        "id": submission_input["id"],
        "label": sub_preds,
    }
)
predictions_df.to_json(
    path_or_buf=submission_prediction_file_path, orient="records", lines=True
)