In [None]:
import os
import re
import numpy as np
from pathlib import Path

from datasets import load_dataset
import torch 
import json 

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

## Configuration

In [None]:
# Expose only GPU index 2 to CUDA for this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Pretrained checkpoint to fine-tune, and the directory that receives
# checkpoints and prediction files.
MODEL_NAME = "bert-base-cased"
OUT_PATH = "../output/"


In [None]:
import sys
# Make ../src/ importable so the project-local `utils.utils` module resolves;
# this must run before the import on the next line.
sys.path.append('../src/')
from utils.utils import set_seed
# Fix the seed up front. set_seed is a project helper — presumably it seeds the
# RNGs used during training (TODO confirm in utils/utils.py).
set_seed(42)

## Load Dataset

In [None]:
# Paths of the train/dev/test TSV files, keyed by split name.
data_files = dict(
    train="../counterfactually-augmented-data/sentiment/orig/train.tsv",
    dev="../counterfactually-augmented-data/sentiment/orig/dev.tsv",
    test="../counterfactually-augmented-data/sentiment/orig/test.tsv",
)

# The "csv" loader reads tab-separated files once the separator is overridden.
data = load_dataset("csv", data_files=data_files, sep="\t")

# Fit the label -> integer id mapping on the training labels only, then show
# the classes it learned.
LABEL_ENCODER = LabelEncoder().fit(data["train"]["Sentiment"])
LABEL_ENCODER.classes_

## Preprocessing

In [None]:
# Tokenizer matching MODEL_NAME; lower-casing is explicitly disabled.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)

def preprocess(row):
    """Map one raw dataset row to the fields used for training.

    Args:
        row: Mapping with a 'Text' entry and a 'Sentiment' entry; the
            sentiment must be a label that LABEL_ENCODER can transform.

    Returns:
        dict with 'input' (the text, unchanged) and 'label' (the integer id
        LABEL_ENCODER assigns to the sentiment).
    """
    text = row['Text']
    # transform() works on sequences: wrap the single label, then unwrap it.
    label_id = LABEL_ENCODER.transform([row['Sentiment']])[0]
    return {'input': text, 'label': label_id}

def tokenize(examples):
    """Tokenize a batch of examples with the module-level tokenizer.

    Args:
        examples: Batch mapping whose 'input' entry holds the texts to
            encode (the function is mapped with batched=True).

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens.
        Padding is not requested here.
    """
    return tokenizer(
        examples['input'],
        truncation=True,
        max_length=512
    )

# The dataset is mapped through preprocess/tokenize in the next cell; the
# calls are not repeated here so the work is done only once on Run All.


In [None]:
# Add the 'input'/'label' columns row by row, then tokenize in batches.
data = data.map(preprocess).map(tokenize, batched=True)

In [None]:
# Sanity check: decode the first training example's token ids back to text.
tokenizer.decode(data["train"][0]["input_ids"])

## Train Model

In [None]:
def get_model():
    """Build a fresh sequence-classification model from MODEL_NAME.

    Returns:
        The loaded model, configured with one output label per class in
        LABEL_ENCODER and with its token embeddings resized to
        len(tokenizer).
    """
    n_labels = len(LABEL_ENCODER.classes_)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=n_labels
    )
    # Keep the embedding matrix in sync with the tokenizer's vocabulary size.
    classifier.resize_token_embeddings(len(tokenizer))
    return classifier

In [None]:
import ray
from ray import tune
from ray.tune import CLIReporter

from ray.tune.schedulers import PopulationBasedTraining


In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score as acc

def compute_metrics(pred):
    """Compute macro precision/recall/F1 and accuracy for an evaluation pass.

    Args:
        pred: Object with `label_ids` (gold labels) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as
            the predicted label).

    Returns:
        dict with keys 'f1', 'precision', 'recall' (macro-averaged) and 'acc'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    macro_p, macro_r, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_hat, average='macro'
    )
    return {
        'f1': macro_f1,
        'precision': macro_p,
        'recall': macro_r,
        'acc': acc(y_true, y_hat),
    }

In [None]:
import os

# Turn off parallelism in the tokenizers library for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# When True, the search below is cut down to a single step / iteration per
# trial (see `max_steps` in tune_config and `stop` in the search call).
smoke_test = True

# Baseline training arguments; the hyperparameter search further down
# overrides some of these values per trial.
training_args = TrainingArguments(
    output_dir=OUT_PATH,
    # Optimisation defaults.
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0,
    warmup_ratio=0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=512,
    # Log, evaluate and checkpoint on the same 20-step cadence.
    logging_strategy="steps",
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=20,
    eval_steps=20,
    save_steps=20,
    # Keep at most two checkpoints and reload the best one (by eval_acc)
    # when training finishes.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_acc",
)

# A model factory (model_init) is passed instead of a model instance, so the
# Trainer can build a fresh model itself.
trainer = Trainer(
    args=training_args,
    model_init=get_model,
    tokenizer=tokenizer,
    train_dataset=data["train"],
    eval_dataset=data["dev"],
    compute_metrics=compute_metrics,
)

# Search space handed to the hyperparameter search: only the train batch size
# is sampled; the remaining entries are fixed for every trial.
tune_config = dict(
    per_device_train_batch_size=tune.choice([16, 32, 64]),
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    # A single optimisation step in smoke-test mode; -1 otherwise.
    max_steps=1 if smoke_test else -1,
)

# Population Based Training scheduler that maximises eval_acc, with a
# perturbation interval of one training iteration.
scheduler = PopulationBasedTraining(
    metric="eval_acc",
    mode="max",
    time_attr="training_iteration",
    perturbation_interval=1,
    # Hyperparameters PBT may mutate: sampling distributions or explicit
    # lists of candidate values.
    hyperparam_mutations=dict(
        weight_decay=tune.uniform(0.0, 0.3),
        learning_rate=tune.uniform(1e-5, 5e-5),
        per_device_train_batch_size=[16, 32, 64],
        warmup_ratio=[0.1, 0.2, 0.5],
    ),
)

# Console progress table: which metrics to show, and which hyperparameters to
# show under which short column name.
reporter = CLIReporter(
    metric_columns=["eval_f1", "eval_loss", "epoch", "training_iteration"],
    parameter_columns=dict(
        learning_rate="lr",
        per_device_train_batch_size="train_bs/gpu",
        num_train_epochs="num_epochs",
        warmup_ratio="warmup_ratio",
    ),
)


# Run the Ray Tune search: 10 trials, one CPU and one GPU each.
best_run = trainer.hyperparameter_search(
    backend="ray",
    direction="maximize",
    # hp_space must be a callable; it ignores its argument and returns the
    # search space defined above.
    hp_space=lambda _: tune_config,
    n_trials=10,
    scheduler=scheduler,
    progress_reporter=reporter,
    resources_per_trial={"cpu": 1, "gpu": 1},
    # In smoke-test mode every trial stops after one training iteration.
    stop={"training_iteration": 1} if smoke_test else None,
    keep_checkpoints_num=1,
    checkpoint_score_attr="training_iteration",
    local_dir="./ray_results/",
    name="tune_transformer_pbt",
    log_to_file=True,
)

In [None]:
# Display the result returned by the hyperparameter search.
best_run 

In [None]:
# Copy the selected hyperparameters onto the trainer's arguments, then run the
# final training with them.
# NOTE(review): with smoke_test=True the search space sets max_steps=1; if that
# key is part of best_run.hyperparameters, this run is also limited to one
# step — confirm before relying on the resulting model.
for hp_name, hp_value in best_run.hyperparameters.items():
    setattr(trainer.args, hp_name, hp_value)

trainer.train()



## Inference on train/dev/test

In [None]:
def predict(trainer, data, split_name: str, out_path):
    """Run inference on one split, print a report and save the predictions.

    Args:
        trainer: Object exposing `predict(dataset)`; the result must provide
            `predictions` (per-class logits) and `label_ids` (gold labels).
        data: Mapping from split name to dataset.
        split_name: Key of the split inside `data`; also used in file names.
        out_path: Directory for the output files. It is created (including
            parents) when it does not exist yet.

    Side effects:
        Prints a classification report and writes three files to `out_path`:
        `y_true_<split>.txt` and `y_pred_<split>.txt` (one label id per line)
        and `y_pred_proba_<split>.jsonl` (one JSON list of class
        probabilities per line).
    """
    out_path = Path(out_path)
    # Create the target directory before the (expensive) inference pass so a
    # missing directory cannot make the writes below fail afterwards.
    out_path.mkdir(parents=True, exist_ok=True)

    preds = trainer.predict(data[split_name])
    y_true = preds.label_ids
    y_pred = np.argmax(preds.predictions, axis=-1)
    # Softmax over the class axis turns the logits into probabilities.
    logits = torch.tensor(preds.predictions)
    y_pred_proba = torch.nn.functional.softmax(logits, dim=-1)

    print(f"Evaluate {split_name}\n")
    print(classification_report(y_true, y_pred, digits=3))

    with open(out_path / f"y_true_{split_name}.txt", "w") as fout:
        fout.writelines(str(label) + "\n" for label in y_true)

    with open(out_path / f"y_pred_{split_name}.txt", "w") as fout:
        fout.writelines(str(label) + "\n" for label in y_pred)

    with open(out_path / f"y_pred_proba_{split_name}.jsonl", "w") as fout:
        for probs in y_pred_proba.tolist():
            json.dump(probs, fout)
            fout.write('\n')

In [None]:
# Report on and save predictions for the training split.
predict(trainer, data, "train", out_path=OUT_PATH)

In [None]:
# Report on and save predictions for the dev split.
predict(trainer, data, "dev", out_path=OUT_PATH)