In [None]:
!pip install transformers datasets
!apt install git-lfs
!git lfs install
!pip install optuna
!pip install wandb

In [2]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
from huggingface_hub.hf_api import HfApi, HfFolder
import subprocess
from huggingface_hub import notebook_login
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer
import torch
import numpy as np
from datasets import load_metric
import requests
import wandb

from huggingface_hub.constants import ENDPOINT
# Username written to the git credential store together with the access token
# (hf_login passes it to write_to_credential_store as the username).
USERNAME_PLACEHOLDER = "hf_user"

import os
# Set in the import cell, ahead of the cells that build tokenizers.
# NOTE(review): presumably silences the tokenizers fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


Hugging Face login support functions

In [3]:
def write_to_credential_store(username: str, password: str):
    """Store a username/password pair for ENDPOINT via `git credential-store store`.

    The username is lower-cased before it is written. The credential
    description (url, username, password, terminated by a blank line) is sent
    to the git process on its standard input; the process output is captured
    and not inspected.

    Args:
        username: Account name to store.
        password: Password or access token to store.
    """
    credential_lines = [
        f"url={ENDPOINT}",
        f"username={username.lower()}",
        f"password={password}",
        "",
        "",
    ]
    payload = "\n".join(credential_lines).encode("utf-8")

    command = ["git", "credential-store", "store"]
    with subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    ) as git_store:
        # Leaving the `with` block closes the pipes and waits for git to exit.
        git_store.stdin.write(payload)
        git_store.stdin.flush()


def hf_login(hf_api, username=None, password=None, token=None):
    """Log in to the Hugging Face Hub with an access token.

    The token is written to the git credential store (under
    USERNAME_PLACEHOLDER) and saved through HfFolder. Afterwards the configured
    git credential helpers are inspected and a hint is printed when the "store"
    helper is not among them, because git may then not pick up the stored
    credential when pushing.

    Args:
        hf_api: Accepted for interface compatibility; not used.
        username: Accepted for interface compatibility; not used.
        password: Accepted for interface compatibility; not used.
        token: Hub access token. When None nothing is stored and a message is
            printed instead.

    Returns:
        None.
    """
    if token is None:
        # Only token-based login is implemented; say so instead of silently
        # doing nothing and failing much later at push time.
        print("[!] A token is required: pass token=<your Hugging Face access token>")
        return

    write_to_credential_store(USERNAME_PLACEHOLDER, token)
    HfFolder.save_token(token)
    print("Login successful")
    print("Your token has been saved to", HfFolder.path_token)

    helpers = currently_setup_credential_helpers()
    # A helper entry may carry options (e.g. "store --file=..."), so compare
    # only its first word.
    uses_store_helper = any(helper.split()[:1] == ["store"] for helper in helpers)
    if not uses_store_helper:
        print(
            "Authenticated through git-credential store but this isn't the helper defined on your machine.\n"
            "You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command "
            "in your terminal in case you want to set this credential helper as the default\n\n"
            "git config --global credential.helper store"
        )

def currently_setup_credential_helpers(directory=None):
    """Return the values of every `credential.helper` entry git reports.

    Runs `git config --list` (in `directory` when given) and collects the
    value of each line whose key is exactly `credential.helper`.

    Args:
        directory: Working directory for the git command; None uses the
            current working directory.

    Returns:
        A list of helper strings in the order git printed them. Each value is
        kept whole, including options such as `cache --timeout=3600`.

    Raises:
        EnvironmentError: If the git command exits with a non-zero status; the
            message is git's stderr output.
    """
    try:
        config_lines = subprocess.run(
            ["git", "config", "--list"],
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            encoding="utf-8",
            check=True,
            cwd=directory,
        ).stdout.splitlines()
    except subprocess.CalledProcessError as exc:
        raise EnvironmentError(exc.stderr) from exc

    current_credential_helpers = []
    for line in config_lines:
        # Split on the FIRST "=" only: the value itself may contain "=".
        key, separator, value = line.partition("=")
        if separator and key == "credential.helper":
            current_credential_helpers.append(value)

    return current_credential_helpers


In [None]:
#

Hugging Face and Weights & Biases (wandb) login

In [4]:
# Credentials are never written into the notebook: the Hugging Face token comes
# from the HF_TOKEN environment variable, falling back to a hidden prompt.
from getpass import getpass

hf_login(HfApi(), token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

# When WANDB_API_KEY is not set, key=None is passed and wandb falls back to its
# own credential lookup / prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb.init(project="ATML", entity="aXhyra")

Login successful
Your token has been saved to /root/.huggingface/token
Authenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store


[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpietrotrope[0m (use `wandb login --relogin` to force relogin)


In [6]:
# Finish the active W&B run (the login cell called wandb.init) before the
# training cells start their own runs.
wandb.finish()
# Environment variables for the rest of the session.
# NOTE(review): presumably read by the Trainer's wandb integration to choose the
# project and to upload model checkpoints — confirm.
%env WANDB_PROJECT=tweeteval_sentiment
%env WANDB_LOG_MODEL=true

In [7]:
class Dataset:
  """A tweet_eval task together with its tokenizer, tokenized splits and label names.

  Attributes set by __init__:
    dataset_name: Always "tweet_eval".
    task: tweet_eval configuration name (e.g. "sentiment").
    s_key1, s_key2: Column names of the first and optional second text field.
    tokenizer_name, tokenizer: Checkpoint name and the tokenizer loaded from it.
    dataset: The dataset returned by load_dataset.
    tokenized_dataset: `dataset` mapped through preprocess_function.
    n_classes: Largest label id in the validation split plus one.
    labels: Dict mapping label id (as a string, e.g. "0") to label name.
  """
  dataset = None
  tokenized_dataset = None

  # Seconds to wait for the label-mapping download before giving up.
  LABEL_REQUEST_TIMEOUT = 30

  def __init__(self, task, tokenizer_name, s_key1="text", s_key2 = None):
    """Load the tokenizer and dataset, tokenize every split and fetch label names.

    Args:
      task: tweet_eval configuration name.
      tokenizer_name: Checkpoint passed to AutoTokenizer.from_pretrained.
      s_key1: Column holding the (first) text.
      s_key2: Column holding the second text for sentence-pair tasks, or None.
    """
    self.dataset_name = "tweet_eval"
    self.task = task
    self.s_key1 = s_key1
    self.s_key2 = s_key2
    self.tokenizer_name = tokenizer_name
    self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
    self.dataset = load_dataset(self.dataset_name, task)
    self.tokenized_dataset = self.dataset.map(self.preprocess_function, batched=True)
    # Plain int rather than a numpy scalar, so the value can be stored or
    # serialized anywhere a Python int is expected.
    self.n_classes = int(np.max(self.dataset["validation"]["label"])) + 1
    self.retrieve_labels()

  def retrieve_labels(self):
    """Download the task's mapping.txt and fill self.labels (id string -> name).

    Each useful line has the form "<id>\\t<name>"; lines without a tab or with
    an empty id are ignored.

    Raises:
      requests.HTTPError: If the server answers with an error status.
      requests.RequestException: On connection problems or timeout.
    """
    self.labels = {}

    url = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/"+self.task+"/mapping.txt"
    # requests has no default timeout; without one a stalled connection would
    # hang the notebook indefinitely.
    response = requests.get(url, timeout=self.LABEL_REQUEST_TIMEOUT)
    # Fail loudly on e.g. a 404 instead of trying to parse the error page.
    response.raise_for_status()

    # splitlines() also strips "\r", so CRLF files do not leak into label names.
    for line in response.text.splitlines():
      fields = line.split("\t")
      if len(fields) < 2 or not fields[0].strip():
        continue
      self.labels[fields[0]] = fields[1]

  def preprocess_function(self, dataset):
    """Tokenize a batch with truncation; uses text pairs when s_key2 is set.

    Args:
      dataset: Batch (mapping of column name to values) supplied by `map`.

    Returns:
      Whatever the tokenizer returns for the selected column(s).
    """
    if self.s_key2 is None:
        return self.tokenizer(dataset[self.s_key1], truncation=True)
    return self.tokenizer(dataset[self.s_key1], dataset[self.s_key2], truncation=True)

In [8]:
class Engine:
  """Hyperparameter search, training, evaluation and inference around a HF Trainer.

  Attributes:
    model: Model used by predict(); set by load_model() or after train().
    args: TrainingArguments used to build the Trainer.
    trainer: The current Trainer, or None before load_trainer() ran.
    best_run: Result of the last hyperparameter search, or None.
    dataset: The Dataset wrapper (tokenizer, tokenized splits, labels, n_classes).
    results: Return value of the last Trainer.train() call.
    device: Device used by load_model() and as fallback input device in predict().
  """

  model_checkpoint = "distilbert-base-uncased"

  # F1 metric object, loaded once on first use and reused for every evaluation.
  _f1_metric = None

  @staticmethod
  def compute_metrics(eval_pred):
    """Compute macro-averaged F1 from a (predictions, labels) pair.

    Args:
      eval_pred: Pair of per-class scores and reference labels; the predicted
        class is the argmax over the last axis of the scores.

    Returns:
      The result of the metric's compute() call.
    """
    if Engine._f1_metric is None:
      # Loading the metric is comparatively expensive; do it once instead of
      # on every evaluation.
      Engine._f1_metric = load_metric("f1")
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)
    return Engine._f1_metric.compute(predictions=predicted, references=references, average="macro")

  @staticmethod
  def my_hp_space(trial):
    """Search space: learning rate (log scale), epoch count and train batch size."""
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 5),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32, 64]),
    }

  def __init__(self, data, args, device="cuda:0"):
    """Store the dataset wrapper, the training arguments and the target device."""
    self.model = None 
    self.args = args
    self.trainer = None
    self.best_run = None
    self.dataset = data
    self.results = None
    self.device = device

  def model_init(self):
    """Build a fresh classifier from Engine.model_checkpoint with the dataset's labels."""
    return AutoModelForSequenceClassification.from_pretrained(Engine.model_checkpoint, id2label = self.dataset.labels, num_labels=self.dataset.n_classes, return_dict=True)

  def load_trainer(self, use_init=False):
    """Create self.trainer from the current args and tokenized dataset.

    Args:
      use_init: When True the Trainer receives `model_init` (a fresh model per
        run); otherwise it receives the already loaded `self.model`.
    """
    # The two variants differ only in how the model is supplied.
    model_source = {"model_init": self.model_init} if use_init else {"model": self.model}
    self.trainer = Trainer(
      args=self.args,
      train_dataset=self.dataset.tokenized_dataset['train'],
      eval_dataset=self.dataset.tokenized_dataset['validation'],
      tokenizer=self.dataset.tokenizer,
      compute_metrics=Engine.compute_metrics,
      **model_source,
    )
    
  def load_train_args(self, opt_name = "test", lr = 2e-5, epochs=4, batch_size=16, push_to_hub=False, seed=0):
    """Replace self.args with TrainingArguments for a full training run.

    Args:
      opt_name: Output directory and run name.
      lr: Learning rate.
      epochs: Number of training epochs.
      batch_size: Per-device batch size for both training and evaluation.
      push_to_hub: Forwarded to TrainingArguments.
      seed: Random seed.
    """
    self.args = TrainingArguments(
      opt_name,
      seed = seed,
      evaluation_strategy = "epoch",
      save_strategy = "epoch",
      learning_rate=lr,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      num_train_epochs=epochs,
      weight_decay=0.01,
      load_best_model_at_end=True,
      push_to_hub = push_to_hub,
      metric_for_best_model="f1",
      report_to="wandb",
      save_total_limit=1,
      run_name=opt_name,
    )


  def train(self, epochs, seed=0, opt_name="test"):
    """Train with the best learning rate and batch size found by the search.

    Stores the training output in self.results, keeps the trained model in
    self.model so predict() works afterwards, pushes to the Hub and finishes
    the wandb run.

    Args:
      epochs: Number of training epochs (the searched epoch count is not used).
      seed: Random seed.
      opt_name: Output directory and run name.
    """
    if self.best_run is None:
      print("[!] Hyperparameter search required")
      return
    best_params = self.best_run.hyperparameters
    self.load_train_args(
      opt_name=opt_name,
      lr=best_params["learning_rate"],
      epochs=epochs,
      batch_size=best_params["per_device_train_batch_size"],
      push_to_hub=True,
      seed=seed,
    )
    self.load_trainer(True)
    
    self.results = self.trainer.train()
    # The Trainer built its own model through model_init; keep a reference so
    # predict() can be used right after training (before any push failure).
    self.model = self.trainer.model
    self.trainer.push_to_hub()
    wandb.finish()


  def evaluate(self):
    """Evaluate with the current trainer.

    Returns:
      The metrics returned by Trainer.evaluate(), or None when no trainer exists.
    """
    if self.trainer is None:
      print("[!] Training required")
      return
    return self.trainer.evaluate()


  def hyperparameter_search(self, n_trials = 5):
    """Run a maximizing hyperparameter search over my_hp_space and store it in best_run.

    Args:
      n_trials: Number of trials.

    Returns:
      -1 when self.args is missing, otherwise None.
    """
    if self.args is None:
      print("[!] TraininArgument object is required")
      return -1
    self.load_trainer(True)
    self.best_run = self.trainer.hyperparameter_search(n_trials=n_trials, direction="maximize", hp_space=self.my_hp_space)
    wandb.finish()


  def load_model(self, model_path):
    """Load model and tokenizer from model_path, move the model to self.device."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
    self.dataset.tokenizer = tokenizer
    self.model.to(self.device)


  def predict(self, input_text):    
    """Classify one text or a list of texts.

    Args:
      input_text: A single string, or a list of strings.

    Returns:
      The label name for a single string, a list of label names for a list of
      strings, or None when no model is available.
    """
    if self.model is None:
      print("[!] Model required: call train() or load_model() first")
      return

    encoded = self.dataset.tokenizer(input_text, truncation=True, padding=True, return_tensors="pt")
    # Inputs must live where the model lives; fall back to the configured
    # device when the model does not expose one.
    target_device = getattr(self.model, "device", None)
    if target_device is None:
      target_device = self.device
    if target_device is not None:
      encoded = encoded.to(target_device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
      logits = self.model(**encoded).logits

    # Softmax is monotonic, so the per-row argmax of the logits is the
    # predicted class; taking it per row keeps batched inputs correct.
    class_ids = logits.argmax(dim=-1).tolist()
    predicted_labels = [self.dataset.labels[str(class_id)] for class_id in class_ids]

    if isinstance(input_text, str):
      return predicted_labels[0]
    return predicted_labels
  


In [None]:
import os

# --- Experiment configuration -------------------------------------------------
task = "sentiment"
batch_size = 16
metric_name = "f1"

# Output directory / run name used during the hyperparameter search.
name = f"{task}"

# Arguments for the search phase: evaluate every epoch, keep no checkpoints.
args = TrainingArguments(
  name,
  run_name=name,
  report_to="wandb",
  push_to_hub = False,
  # optimisation
  learning_rate=2e-5,
  weight_decay=0.01,
  num_train_epochs=4,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  # evaluation / checkpointing
  evaluation_strategy = "epoch",
  metric_for_best_model=metric_name,
  save_strategy = "no",
  save_total_limit=1,
  load_best_model_at_end=False,
)

# --- Hyperparameter search ----------------------------------------------------
dataset = Dataset(task, "distilbert-base-uncased")
engine = Engine(dataset, args)

engine.hyperparameter_search(10)

# --- Final training with the best hyperparameters -----------------------------
name = f"{task}_trained"
print(f"\n\n [+] Training model: {name}")
engine.train(4, seed=0, opt_name=name)
# The output directory `name` is left on disk; remove it manually if it is not needed.


In [None]:
# Show the GPU, driver/CUDA versions and current memory usage of the runtime.
!nvidia-smi

Fri Dec 10 09:15:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Classify a single example sentence; the returned label name is the cell output.
engine.predict("I'm so happy you have cancer")

'positive'