This notebook trains the models required to support the data presented in the paper.  
The code is organized as an iterative process in which datasets are fetched, models are trained, and results are stored locally.
This notebook needs access to a GPU environment, which can be obtained in Google Colab.

In [3]:
!pip install -q datasets
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
import math
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from tqdm import tqdm

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


### Auxiliary functions

In [5]:
def encode_label_function(examples):
    """Translate a batch of category names into integer label ids.

    Every entry of ``examples["category"]`` is looked up in the module-level
    ``category_to_label`` dict, which has to be defined before this function
    is called. A category missing from that dict raises ``KeyError``.

    Returns a dict with a single ``labels`` key holding the ids in input order.
    """
    label_ids = []
    for category in examples["category"]:
        label_ids.append(category_to_label[category])
    return {'labels': label_ids}

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

def compute_metrics(eval_pred):
    """Compute classification metrics from a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax of the logits over the last
    axis. Returns a dict with macro and micro F1 plus macro-averaged
    precision and recall.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    scores = {}
    scores['f1_macro'] = f1_score(labels, predicted, average='macro')
    scores['f1_micro'] = f1_score(labels, predicted, average='micro')
    scores['precision'] = precision_score(labels, predicted, average='macro')
    scores['recall'] = recall_score(labels, predicted, average='macro')
    return scores
    
def create_baseline(dataset, pct, version):
    """Create a baseline dataset by subsampling the training split per class.

    For every distinct value of the ``category`` column,
    ``ceil(len(group) * pct / 100)`` rows are drawn with ``DataFrame.sample``,
    so for ``pct > 0`` every category keeps at least one example.

    Args:
        dataset: Mapping with a ``"train"`` entry that ``pd.DataFrame`` can
            consume and that provides a ``category`` column.
        pct: Percentage of each category to keep.
        version: Value passed as ``random_state`` to the sampler, so the same
            ``(pct, version)`` pair always selects the same rows.

    Returns:
        A ``Dataset`` built from the sampled rows.
    """
    df = pd.DataFrame(dataset["train"])

    # Extract a sample per class
    samples = [
        group.sample(math.ceil(len(group) * pct / 100.0), random_state=version)
        for _, group in df.groupby('category')
    ]
    df_sample = pd.concat(samples)

    # The sampled frame keeps the row positions of the original frame as its
    # index. Without preserve_index=False that index would be stored in the
    # dataset as an extra "__index_level_0__" column.
    dataset_artificial = Dataset.from_pandas(df_sample, preserve_index=False)
    print(f"Dataset size at {pct} pct {len(df_sample)}")
    return dataset_artificial


<a id='additional-resources'></a>

In [None]:
# Download the tokenizer that matches the base model used for every run
huggingface_model = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(huggingface_model)

# List of available datasets
DATASET_LIST = ['atis', 'massive', 'banking', 'snips']

# Method used to generate the training dataset
# full indicates to use the original dataset
# baseline indicates to generate and use a baseline dataset with a predefined random seed
# Any other value names a pre-generated dataset that is fetched from the repo
TYPE_LIST = ['llama2', 'chatgpt', 'traditional', 'full', 'baseline']

# Pct of sampled data. If used with the baseline type, this is the percentage used
# If used with full, it will be ignored
# If used with any other type, it will be used to fetch the dataset from the repo.
# Currently only 1, 3 and 5 pct are available
PCT_LIST = [1, 3, 5]

# Collects one dict of metrics per training run
df_results = []

# Train and evaluate one classifier for every (dataset, type, version, pct)
# combination. Each run appends its metrics to df_results and writes the
# per-example predictions on the test split to "<RUN_NAME>.csv".
for DATASET_NAME in DATASET_LIST:
    DATASET_ID = f"benayas/{DATASET_NAME}"

    # Download dataset. Everything in this section depends only on the
    # dataset, so it is done once here instead of once per training run.
    dataset = load_dataset(DATASET_ID)
    dataset_original_train = dataset['train']
    dataset_original_test = dataset['test']

    # Label ids come from the categories of the original training split.
    # encode_label_function reads category_to_label as a module-level name.
    labelencoder = LabelEncoder().fit(dataset_original_train['category'])
    category_to_label = {c: i for i, c in enumerate(labelencoder.classes_)}
    mapping = {v: k for k, v in category_to_label.items()}

    # Tokenized test split, shared by all runs on this dataset
    tokenized_datasets = dataset_original_test.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.map(encode_label_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(["text", "category"])
    tokenized_datasets.set_format("torch")
    eval_dataset = tokenized_datasets

    for TYPE in TYPE_LIST:
        for VERSION in [0, 1, 2]:
            for PCT in PCT_LIST:
                DATASET_ARTIFICIAL = f"benayas/llm-augmented_{DATASET_NAME}_{TYPE}_{PCT}pct_v{VERSION}"
                RUN_NAME = f'{DATASET_NAME}_{TYPE}_{PCT}pct_v{VERSION}'

                if TYPE == "baseline":
                    # Calculate a baseline dataset
                    dataset_artificial = create_baseline(dataset, PCT, VERSION)
                elif TYPE == "full":
                    # Use the regular dataset
                    dataset_artificial = dataset_original_train
                else:
                    # Cases with dataset in Huggingface
                    dataset_artificial = load_dataset(DATASET_ARTIFICIAL)['train']

                # Prepare the training data
                tokenized_datasets_artificial = dataset_artificial.map(tokenize_function, batched=True)
                tokenized_datasets_artificial = tokenized_datasets_artificial.map(encode_label_function, batched=True)
                tokenized_datasets_artificial = tokenized_datasets_artificial.remove_columns(["text", "category"])
                tokenized_datasets_artificial.set_format("torch")
                train_dataset = tokenized_datasets_artificial.shuffle(seed=142)

                # Model: a fresh pretrained encoder with one output per category
                model = AutoModelForSequenceClassification.from_pretrained(
                    huggingface_model, num_labels=len(mapping)
                )

                # Training Args
                # NOTE(review): eval_dataset is the test split, so
                # load_best_model_at_end selects the checkpoint by its test
                # f1_macro. Confirm this is the intended protocol.
                training_args = TrainingArguments(
                    output_dir="test_trainer",
                    learning_rate=0.00001,
                    num_train_epochs=15,
                    per_device_train_batch_size=16,
                    per_device_eval_batch_size=16,
                    load_best_model_at_end=True,
                    save_strategy="epoch",
                    evaluation_strategy="epoch",
                    metric_for_best_model="f1_macro",
                    greater_is_better=True,
                    logging_strategy="epoch",
                    # A checkpoint is written every epoch of every run into the
                    # same directory; without a limit they pile up until the
                    # disk is full. The best checkpoint is still retained.
                    save_total_limit=1,
                )

                # Train
                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=train_dataset,
                    eval_dataset=eval_dataset,
                    compute_metrics=compute_metrics,
                )
                trainer.train()

                # Evaluate the model on the test split
                device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
                eval_dataloader = DataLoader(eval_dataset, batch_size=16)

                all_outputs = []
                all_labels = []
                all_logits = []

                model.eval()
                model.to(device)
                for batch in tqdm(eval_dataloader):
                    # Only forward the tensors the model needs
                    batch = {'input_ids': batch['input_ids'].to(device),
                             'attention_mask': batch['attention_mask'].to(device),
                             'labels': batch['labels'].to(device)}
                    with torch.no_grad():
                        outputs = model(**batch)

                    logits = outputs.logits
                    all_logits.append(logits.detach().cpu().numpy())
                    predictions = torch.argmax(logits, dim=-1)
                    all_outputs += predictions.detach().cpu().numpy().tolist()
                    all_labels += batch["labels"].detach().cpu().numpy().tolist()

                all_labels = np.array(all_labels)
                all_outputs = np.array(all_outputs)
                all_logits = np.concatenate(all_logits, axis=0)

                # Calculate metrics. Precision and recall are averaged over the
                # labels that actually occur in the test split.
                f1_score_macro = f1_score(all_labels, all_outputs, average='macro')
                f1_score_micro = f1_score(all_labels, all_outputs, average='micro')
                precision = precision_score(all_labels, all_outputs, average='macro', labels=np.unique(all_labels))
                recall = recall_score(all_labels, all_outputs, average='macro', labels=np.unique(all_labels))
                print(f'f1_score_macro: {f1_score_macro} | f1_score_micro: {f1_score_micro} | precision: {precision} | recall: {recall}')
                df_results.append({'run': RUN_NAME,
                                   'f1-macro': f1_score_macro,
                                   'f1-micro': f1_score_micro,
                                   'precision': precision,
                                   'recall': recall})

                # Generate CSV with full results and predictions
                df_outputs = pd.DataFrame({
                    'text': np.array(dataset_original_test['text']),
                    'label': np.array([category_to_label[value] for value in np.array(dataset_original_test['category'])]),
                    'prediction': all_outputs,
                    'intent': np.array(dataset_original_test['category']),
                })
                df_outputs = df_outputs[['text', 'intent', 'label', 'prediction']]
                df_outputs.to_csv(f'{RUN_NAME}.csv', index=False)

# Summary table with one row per run (displayed as the cell output)
df_results = pd.DataFrame(df_results)
df_results