In [1]:
# Transformers installation
! pip install transformers datasets
! pip install seqeval
! pip install evaluate
! pip install wandb
! pip install pyyaml h5py

import numpy as np

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m101.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading sa

In [59]:
# model_name = "distilroberta-base" # https://huggingface.co/distilroberta-base
#model_name = "haisongzhang/roberta-tiny-cased" # https://github.com/haisongzhang/roberta-tiny-cased
# model_name = "distilbert-base-uncased" # https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation
# model_name = "roberta-base" # https://huggingface.co/roberta-base
model_name = "bert-base-uncased" # https://huggingface.co/bert-base-uncased

# Preprocess Data


## Load Tokenizer and Dataset

Inspec Dataset has 3 datasets: train, test, and validation

### Data Types

*   train: [*sample*]
*   *sample*: {
  document: [string],
  doc_bio_tags: [int]
  }

In [60]:
from datasets import load_dataset

inspec = load_dataset("midas/inspec")

Repo card metadata block was not found. Setting CardData to empty.


In [61]:
example = inspec["train"][0]

In [62]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Preprocess

We need to


*   Convert words to tokens
*   Realign the tokens and labels, since a single word may correspond to multiple tokens.
*   Ignore the special tokens [CLS] and [SEP] in the loss function.
*   Only label the first token of a given word, and assign -100 to the others.
*   Convert doc_bio_tags to integers for classificatioon



In [63]:
label_list = np.unique(example["doc_bio_tags"])

id2label = {i: label for i, label in enumerate(label_list)}
label2id = {v: k for k, v in id2label.items()}

print('Mapping doc_bio_tag to integer:\n\n',label2id)
print('\nMapping integer to doc_bio_tag:\n\n',id2label)

Mapping doc_bio_tag to integer:

 {'B': 0, 'I': 1, 'O': 2}

Mapping integer to doc_bio_tag:

 {0: 'B', 1: 'I', 2: 'O'}


In [64]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["document"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"doc_bio_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label2id[label[word_idx]]) # Convert BIO to integers for classification
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [65]:
# This is our tokenized dataset (a data_dict which we will convert to a TF Dataset)
tokenized_inspec = inspec.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

## Utils for Training

In [66]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [67]:
import evaluate

seqeval = evaluate.load("seqeval")

In [68]:
labels = example[f"doc_bio_tags"]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # return metrics
    all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
    del all_metrics['_']
    print(all_metrics)
    wandb.log(all_metrics)

    return all_metrics

## Experiment Tracking with W&B

In [69]:
import wandb

wandb.login()

True

## These variables will be used to define a custom loss with weighted cross entropy

In [70]:
from collections import Counter

#counting how many beginning keywords, middle keywords, and non-keywords there are
count_0s = 0
count_1s = 0
count_2s = 0

for listt in tokenized_inspec["train"]["labels"]:
    count_dict = Counter(listt)
    count_0s += count_dict[0]
    count_1s += count_dict[1]
    count_2s += count_dict[2]

#getting weights for weighted cross_entropy
max_ = max(count_0s,count_1s,count_2s)
weights = [max_/count_0s, max_/count_1s, max_/count_2s]

## Train with multi-GPU
Reference: https://saturncloud.io/docs/examples/python/tensorflow/qs-multi-gpu-tensorflow/

In [71]:
import tensorflow as tf
import keras
import time
from transformers import TFAutoModelForTokenClassification
from transformers.keras_callbacks import KerasMetricCallback
from transformers import create_optimizer

def train_multigpu(n_epochs, base_lr, batchsize):
    num_train_steps = (len(tokenized_inspec["train"]) // batchsize) * n_epochs
    num_labels=3

    # Set up for multi-GPU training
    strategy = tf.distribute.MirroredStrategy()
    print("Number of devices: %d" % strategy.num_replicas_in_sync)

    # Initialize W&B run
    run = wandb.init(entity="ac215-ppp", project="ppp-keyword-extraction", name=f"{model_name}-trained")

    with strategy.scope():
        model = TFAutoModelForTokenClassification.from_pretrained(
                  model_name, num_labels=num_labels, id2label=id2label, label2id=label2id)

        # We define our own optimizer (and lr_schedule which we do not use)
        optimizer, lr_schedule = create_optimizer(
                                                  init_lr=2e-5,
                                                  num_train_steps=num_train_steps,
                                                  weight_decay_rate=0.01,
                                                  num_warmup_steps=0,
                                                 )

        # Compute custom loss (CrossEntropyLoss with weights)
        def loss_fn(y_true, y_pred):
          loss = tf.nn.weighted_cross_entropy_with_logits(
              labels=tf.one_hot(y_true, depth=num_labels),
              logits=y_pred,
              pos_weight=tf.constant(weights)
          )
          loss = tf.reduce_mean(loss)
          return loss

        # The model is ready for training
        model.compile(loss=loss_fn, optimizer=optimizer)

    # Load in our data as TF Datasets (with data_collator applied)
    train_ds = model.prepare_tf_dataset(
                                        tokenized_inspec["train"],
                                        shuffle=True,
                                        batch_size=batchsize,
                                        collate_fn=data_collator,
                                       ).prefetch(2).cache().shuffle(1000)

    valid_ds = model.prepare_tf_dataset(
                                        tokenized_inspec["validation"],
                                        shuffle=False,
                                        batch_size=batchsize,
                                        collate_fn=data_collator,
                                       ).prefetch(2)


    # Set up callback for end of each epoch
    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=valid_ds)
    callbacks = [metric_callback]


    # Run training
    start = time.time()

    history = model.fit(x=train_ds, validation_data=valid_ds, epochs=n_epochs, callbacks=callbacks)

    end = time.time() - start
    print("model training time", end)
    wandb.config.update({"execution_time": end})

    # Log validation loss to Weights & Biases
    wandb.define_metric("epochs")
    wandb.define_metric("validation_loss", step_metric="epochs")
    for epoch, val_loss in enumerate(history.history["val_loss"]):
      wandb.log({"epochs" : epoch, "validation_loss": val_loss})

    # Create a W&B artifact to save the model
    trained_model_artifact = wandb.Artifact("trained_model", type="model")
    # Save the model to a specified directory (adjust the path)
    directory = "model_directory"
    model.save_pretrained(directory, saved_model=True)
    # Add the saved model to the artifact
    trained_model_artifact.add_dir("model_directory")
    # Log the artifact
    run.log_artifact(trained_model_artifact)

    # Close the W&B run
    wandb.run.finish()

    return model

In [72]:
# Call the training function with specified parameters
model_params = {
    "n_epochs": 10,
    "base_lr": 2e-5,
    "batchsize": 16
}

tester_plain = train_multigpu(**model_params)

Number of devices: 1


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
model training time 333.33925771713257


[34m[1mwandb[0m: Adding directory to artifact (./model_directory)... Done. 1.3s


0,1
epochs,▁▂▃▃▄▅▆▆▇█
overall_accuracy,▁▄▅▆▇▇█▇▇█
overall_f1,▁▅▆▇▇██▇██
overall_precision,▁▄▅▆▇▇█▇▇█
overall_recall,▄▄█▆▄▅▁▄▅▂
validation_loss,█▂▁▁▁▁▂▂▂▃

0,1
epochs,9.0
overall_accuracy,0.88703
overall_f1,0.53736
overall_precision,0.42547
overall_recall,0.72911
validation_loss,0.15423
