## Google Colab setup

In [None]:
# Detect whether the `google.colab` package can be imported and record the
# answer in IS_COLAB so later cells can branch on it.
try:
  import google.colab  # imported only to probe availability
  IS_COLAB = True
except ImportError:
  # Catch only the "module not installed" case. A bare `except:` would also
  # swallow KeyboardInterrupt/SystemExit and hide unrelated failures.
  IS_COLAB = False

In [None]:
# Install the third-party packages this notebook depends on, but only when
# IS_COLAB is true (it is set by the detection cell above).
# NOTE(review): versions are not pinned, so a fresh run may resolve different
# releases than the ones this notebook was developed against.
if IS_COLAB:
  !pip install tensorflow_addons
  !pip install unidecode
  !pip install transformers
  !pip install nlpaug

In [None]:
# Mount Google Drive at /content/drive. Guarded by IS_COLAB because the
# `google.colab` package cannot be imported elsewhere; without the guard this
# cell fails on any non-Colab "Restart & Run All".
if IS_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp /content/drive/MyDrive/SemiSupervised/*.csv .

## Imports and setup

In [1]:
%%capture
import tensorflow as tf
import tensorflow_addons as tfa
import pandas as pd
import re
import time
import spacy
import numpy as np
from transformers import AutoTokenizer, TFAutoModel
from unidecode import unidecode
from utils import prepare_train_ds, prepare_ds

2023-04-29 12:10:44.714897: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-29 12:10:45.236063: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.2/lib64:/usr/local/cuda/extras/CUPTI/lib64::/usr/local/cuda-10.1/lib64:/usr/local/cuda-10.2/lib64:/usr/local/cuda/extras/CUPTI/lib64:
2023-04-29 12:10:45.236109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object 

In [2]:
# Restrict TensorFlow to the first physical GPU when at least one is present.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  tf.config.set_visible_devices(gpus[0], 'GPU')
else:
  # Indexing gpus[0] on an empty list raised IndexError on CPU-only machines;
  # leave the device configuration untouched instead.
  print("No GPU detected; running on the default device.")

## Prepare data splits

In [3]:
# Build the datasets used below. prepare_train_ds returns a pair that is
# unpacked into labeled_ds / unlabeled_ds; prepare_ds returns one dataset.
# NOTE(review): batch_sizel / batch_sizeu are presumably the labeled and
# unlabeled batch sizes — confirm against utils.prepare_train_ds.
labeled_ds, unlabeled_ds = prepare_train_ds(
    "train_ner.csv",
    batch_sizel=16,
    batch_sizeu=16,
)
test_ds = prepare_ds("test_ner.csv")
val_ds = prepare_ds("validation_internal_ner.csv")

2023-04-29 12:10:51.941819: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-29 12:10:51.942459: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-29 12:10:51.942805: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-29 12:10:51.943086: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

## Model definition and declaration

In [4]:
class FreeMatchTune(tf.keras.Model):
  """Pretrained encoder followed by a dense softmax head, producing two views.

  The encoder's pooled output is perturbed by two GaussianNoise layers
  (stddev 0.5 for the "weak" view, stddev 5 for the "strong" view) and both
  perturbed copies are classified by the same head. The encoder's `trainable`
  flag is left at whatever `from_pretrained` returns.
  """

  def __init__(
      self,
      encoder_name="readerbench/RoBERT-base",
      num_classes=4,
      **kwargs
  ):
    """Load the encoder and create the noise layers and classification head.

    Args:
      encoder_name: identifier handed to TFAutoModel.from_pretrained.
      num_classes: number of units in the final softmax layer.
      **kwargs: forwarded to tf.keras.Model.__init__.
    """
    super().__init__(**kwargs)

    self.bert = TFAutoModel.from_pretrained(encoder_name)
    self.num_classes = num_classes
    self.weak_augment = tf.keras.layers.GaussianNoise(stddev=0.5)
    self.strong_augment = tf.keras.layers.GaussianNoise(stddev=5)

    # Head: 256 -> dropout -> 64 -> num_classes probabilities.
    head_layers = [
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(64, activation="relu"),
      tf.keras.layers.Dense(self.num_classes, activation="softmax"),
    ]
    self.cls_head = tf.keras.Sequential(head_layers)

  def call(self, inputs, training):
    """Return `(weak_preds, strong_preds)` for a pair `(ids, mask)`.

    Args:
      inputs: two-element sequence unpacked as (input ids, attention mask).
      training: forwarded to the encoder, both noise layers and the head.
    """
    input_ids, attention_mask = inputs

    encoder_out = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        training=training,
    )
    pooled = encoder_out.pooler_output

    # The strong branch is evaluated before the weak one at each stage.
    strong_view = self.strong_augment(pooled, training=training)
    weak_view = self.weak_augment(pooled, training=training)

    strong_preds = self.cls_head(strong_view, training=training)
    weak_preds = self.cls_head(weak_view, training=training)

    return weak_preds, strong_preds
  
class FreeMatch(tf.keras.Model):
  """Pretrained encoder followed by a bidirectional-LSTM softmax head.

  The encoder's per-token hidden states are perturbed by two GaussianNoise
  layers (stddev 0.5 for the "weak" view, stddev 5 for the "strong" view) and
  both perturbed sequences are classified by the same head.
  """

  def __init__(
      self,
      encoder_name="readerbench/RoBERT-base",
      num_classes=4,
      encoder_train=False,
      **kwargs
  ):
    """Load the encoder and create the noise layers and classification head.

    Args:
      encoder_name: identifier handed to TFAutoModel.from_pretrained.
      num_classes: number of units in the final softmax layer.
      encoder_train: value assigned to the encoder's `trainable` flag.
      **kwargs: forwarded to tf.keras.Model.__init__.
    """
    super().__init__(**kwargs)

    self.bert = TFAutoModel.from_pretrained(encoder_name)
    self.bert.trainable = encoder_train
    self.num_classes = num_classes
    self.weak_augment = tf.keras.layers.GaussianNoise(stddev=0.5)
    self.strong_augment = tf.keras.layers.GaussianNoise(stddev=5)

    # Head: BiLSTM (10 units, forward/backward outputs averaged) -> softmax.
    recurrent = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(10),
        merge_mode="ave",
    )
    classifier = tf.keras.layers.Dense(
        units=self.num_classes,
        activation="softmax",
    )
    self.cls_head = tf.keras.Sequential([recurrent, classifier])

  def call(self, inputs, training):
    """Return `(weak_preds, strong_preds)` for a pair `(ids, mask)`.

    Args:
      inputs: two-element sequence unpacked as (input ids, attention mask).
      training: forwarded explicitly to the two noise layers only.
    """
    input_ids, attention_mask = inputs

    # NOTE(review): `training` is not passed explicitly to the encoder or the
    # head here, unlike in FreeMatchTune.call — confirm this is intended.
    encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    token_states = encoder_out.last_hidden_state

    strong_view = self.strong_augment(token_states, training=training)
    weak_view = self.weak_augment(token_states, training=training)

    # NOTE(review): the attention mask is not handed to the LSTM head, so
    # every position of the sequence is fed to it — confirm.
    strong_preds = self.cls_head(strong_view)
    weak_preds = self.cls_head(weak_view)

    return weak_preds, strong_preds



In [5]:
# Choose the model variant to train.
# FreeMatch() sets the encoder's `trainable` flag from `encoder_train`
# (default False); FreeMatchTune() does not touch that flag.
# model = FreeMatch()
model = FreeMatchTune()

## Train and evaluate

In [6]:
# Optimizers. train_step applies `optim` to every trainable weight;
# train_step_tune applies `optim` to the classification head and `optim2`
# to the encoder.
optim = tfa.optimizers.AdamW(
    weight_decay=0.001,
    learning_rate=0.005,
)
optim2 = tfa.optimizers.AdamW(
    weight_decay=0.0,
    learning_rate=0.00001,
)

# Loss on integer labels versus predicted class probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

# Evaluation metrics, updated in test_step and reset after each evaluation.
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_prec_metric = tf.keras.metrics.Precision(name="precision")
val_recall_metric = tf.keras.metrics.Recall(name="recall")
f1_metric_micro = tfa.metrics.F1Score(
    num_classes=4,
    threshold=0.5,
    average='micro',
    name='f1_micro',
)
f1_metric_macro = tfa.metrics.F1Score(
    num_classes=4,
    threshold=0.5,
    average='macro',
    name='f1_macro',
)

In [7]:
# Hyperparameters read by the training cells below.
# NOTE(review): `thresh` is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
thresh = 0.9
# Multiplier of the unlabeled (pseudo-label) loss term in the train steps.
unsup_weight = 1.0
# Length of the running per-class statistics created in the next cell.
num_classes = 4
# Multiplier of the fairness loss term in the train steps.
fairness_weight = 0.01
# Small constant used to initialise hist_t and eps_t.
eps = 1e-7
# Number of training batches consumed per epoch by the training loop.
steps_per_epoch = 600

In [8]:
# Running statistics updated in place by the train steps.
# tau: scalar, starts at 1/num_classes.
tau = tf.Variable(initial_value=tf.constant(value=1/num_classes))
# p_thresh: one entry per class, each starting at tau's initial value.
p_thresh = tf.Variable(
    initial_value=tau * tf.ones(shape=(num_classes,), dtype=tf.float32)
)
# hist_t: one entry per class, each starting at eps.
hist_t = tf.Variable(
    initial_value=eps * tf.ones(shape=(num_classes,), dtype=tf.float32)
)
# eps_t: per-class vector of eps, added to a denominator in the train steps.
eps_t = tf.Variable(
    initial_value=eps * tf.ones(shape=(num_classes,), dtype=tf.float32)
)

# Decay factor handed to the train steps for the moving-average updates.
ema_decay = tf.constant(value=0.999)

In [9]:
@tf.function
def sumnorm(x):
  """Divide `x` by the sum of all of its elements."""
  total = tf.reduce_sum(x)
  return x / total

@tf.function
def maxnorm(x):
  """Divide `x` by its largest element."""
  largest = tf.reduce_max(x)
  return x / largest


@tf.function
def train_step(x,y,xu,ema_decay):
  """Run one training step, updating all trainable weights with `optim`.

  Side effects: updates the module-level variables `tau`, `p_thresh` and
  `hist_t` as exponential moving averages and applies gradients to
  `model.trainable_weights`.

  Args:
    x: labeled batch, indexed with "input_ids" and "attention_mask".
    y: labels for `x`, passed to `loss_fn` together with the weak predictions.
    xu: unlabeled batch, indexed with the same two keys.
    ema_decay: decay factor of the moving averages.

  Returns:
    The combined loss (supervised + weighted unsupervised + weighted fairness).
  """
  with tf.GradientTape() as tape:
    wl,_ = model([x["input_ids"],x["attention_mask"]],training=True)
    wu,su = model([xu["input_ids"],xu["attention_mask"]],training=True)

    # Supervised loss on the weak view of the labeled batch.
    ls = loss_fn(y, wl)

    # Moving averages: mean top confidence (tau), mean per-class probability
    # (p_thresh) and per-class count of argmax predictions (hist_t).
    local_thresh_update = tf.reduce_mean(tf.reduce_max(wu,axis=1))
    tau.assign(ema_decay * tau + (1-ema_decay) * local_thresh_update)
    p_thresh.assign(ema_decay * p_thresh + (1-ema_decay) * tf.reduce_mean(wu,axis=0))

    # The one-hot depth must equal the length of hist_t, i.e. num_classes.
    hist_t.assign(ema_decay * hist_t + (1-ema_decay) * tf.reduce_sum(tf.one_hot(tf.argmax(wu,axis=1),num_classes),axis=0))

    # Per-class threshold; keep only unlabeled samples whose top weak
    # probability reaches the threshold of their predicted class.
    sat = tau * maxnorm(p_thresh)
    mask = tf.reduce_max(wu,axis=1)>=tf.gather(sat,tf.argmax(wu,axis=1))

    su = su[mask]
    wu = wu[mask]

    # Unsupervised loss: strong predictions versus weak-view pseudo-labels.
    lu = loss_fn(tf.argmax(wu,axis=1),su)

    # Fairness term. Skipped when no sample passed the mask: the mean over an
    # empty tensor is NaN and would poison the loss and the gradients
    # (same guard as in train_step_tune).
    lf = 0.0
    if tf.shape(su)[0] != 0:
      p_avg = tf.reduce_mean(su,axis=0)
      h_avg = tf.reduce_sum(tf.one_hot(tf.argmax(su,axis=1),num_classes),axis=0)

      lf = -tf.keras.metrics.categorical_crossentropy(sumnorm(p_thresh/hist_t),sumnorm(p_avg/(eps_t+h_avg)))

    loss = ls + unsup_weight * lu + fairness_weight * lf

  grads = tape.gradient(loss, model.trainable_weights)
  optim.apply_gradients(zip(grads, model.trainable_weights))

  return loss

@tf.function
def train_step_tune(x,y,xu,ema_decay):
  """Run one training step with separate optimizers for head and encoder.

  Side effects: updates the module-level variables `tau`, `p_thresh` and
  `hist_t` as exponential moving averages, then applies gradients to
  `model.cls_head` with `optim` and to `model.bert` with `optim2`.

  Args:
    x: labeled batch, indexed with "input_ids" and "attention_mask".
    y: labels for `x`, passed to `loss_fn` together with the weak predictions.
    xu: unlabeled batch, indexed with the same two keys.
    ema_decay: decay factor of the moving averages.

  Returns:
    The combined loss (supervised + weighted unsupervised + weighted fairness).
  """
  with tf.GradientTape() as tape:
    wl,_ = model([x["input_ids"],x["attention_mask"]],training=True)
    wu,su = model([xu["input_ids"],xu["attention_mask"]],training=True)

    # Supervised loss on the weak view of the labeled batch.
    ls = loss_fn(y, wl)

    # Moving averages: mean top confidence (tau), mean per-class probability
    # (p_thresh) and per-class count of argmax predictions (hist_t).
    local_thresh_update = tf.reduce_mean(tf.reduce_max(wu,axis=1))
    tau.assign(ema_decay * tau + (1-ema_decay) * local_thresh_update)
    p_thresh.assign(ema_decay * p_thresh + (1-ema_decay) * tf.reduce_mean(wu,axis=0))

    # The one-hot depth must equal the length of hist_t, i.e. num_classes
    # (it was previously hard-coded to 4).
    hist_t.assign(ema_decay * hist_t + (1-ema_decay) * tf.reduce_sum(tf.one_hot(tf.argmax(wu,axis=1),num_classes),axis=0))

    # Per-class threshold; keep only unlabeled samples whose top weak
    # probability reaches the threshold of their predicted class.
    sat = tau * maxnorm(p_thresh)
    mask = tf.reduce_max(wu,axis=1)>=tf.gather(sat,tf.argmax(wu,axis=1))

    su = su[mask]
    wu = wu[mask]

    # Unsupervised loss: strong predictions versus weak-view pseudo-labels.
    lu = loss_fn(tf.argmax(wu,axis=1),su)

    # Fairness term, skipped when no unlabeled sample passed the mask.
    lf = 0.0
    if tf.shape(su)[0] != 0:
      p_avg = tf.reduce_mean(su,axis=0)
      h_avg = tf.reduce_sum(tf.one_hot(tf.argmax(su,axis=1),num_classes),axis=0)

      lf = -tf.keras.metrics.categorical_crossentropy(sumnorm(p_thresh/hist_t),sumnorm(p_avg/(eps_t+h_avg)))

    loss = ls + unsup_weight * lu + fairness_weight * lf

  # Gradients come back in the same nested structure as the sources:
  # grads[0] for the head, grads[1] for the encoder.
  grads = tape.gradient(loss, [model.cls_head.trainable_weights, model.bert.trainable_weights])
  optim.apply_gradients(zip(grads[0], model.cls_head.trainable_weights))
  optim2.apply_gradients(zip(grads[1], model.bert.trainable_weights))

  return loss

@tf.function
def test_step(x, y):
  """Accumulate the evaluation metrics for one batch.

  Args:
    x: batch indexed with "input_ids" and "attention_mask".
    y: integer labels; used directly for accuracy and one-hot encoded
      (depth 4) for precision, recall and both F1 metrics.
  """
  weak_probs, _ = model([x["input_ids"], x["attention_mask"]], training=False)
  val_acc_metric.update_state(y, weak_probs)

  y_onehot = tf.one_hot(y, 4)
  onehot_metrics = (
      val_prec_metric,
      val_recall_metric,
      f1_metric_micro,
      f1_metric_macro,
  )
  # Plain Python loop over a fixed tuple; the metrics are updated in the
  # order listed above.
  for metric in onehot_metrics:
    metric.update_state(y_onehot, weak_probs)

In [10]:
EPOCHS=15
max_val_acc = 0.0
best_weights = None

# The iterators are created once, outside the epoch loop, so every "epoch"
# consumes the next `steps_per_epoch` batches of each stream.
# NOTE(review): next() raises StopIteration if either dataset yields fewer
# than EPOCHS * steps_per_epoch batches — confirm prepare_train_ds repeats.
l_iter = iter(labeled_ds)
u_iter = iter(unlabeled_ds)
for epoch in range(EPOCHS):
  print(f"Epoch {epoch}")
  start_time = time.time()

  for step in range(steps_per_epoch):
    x, y = next(l_iter)
    xu = next(u_iter)

    loss = train_step_tune(x,y,xu,ema_decay)
    if step % 50 == 0:
      print(
        "Training loss (for one batch) at step %d: %.4f"
        % (step, float(loss))
      )

  # Validation pass: accumulate metrics over the whole validation set.
  for x_batch_val, y_batch_val in val_ds:
    test_step(x_batch_val, y_batch_val)

  acc = float(val_acc_metric.result())
  prec = float(val_prec_metric.result())
  recall = float(val_recall_metric.result())
  micro = float(f1_metric_micro.result())
  macro = float(f1_metric_macro.result())

  # Reset so the next evaluation starts from empty accumulators.
  val_acc_metric.reset_states()
  val_prec_metric.reset_states()
  val_recall_metric.reset_states()
  f1_metric_micro.reset_states()
  f1_metric_macro.reset_states()

  # Keep a copy of the weights from the best epoch by validation accuracy.
  if acc > max_val_acc:
    max_val_acc = acc
    best_weights = model.get_weights()
  print(f"Validation acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")
  print("Time taken: %.2fs" % (time.time() - start_time))

# best_weights stays None when no epoch beat the initial max_val_acc of 0.0;
# model.set_weights(None) would then fail, so only restore a real snapshot.
if best_weights is not None:
  print("Restoring best weights relative to validation accuracy...")
  model.set_weights(best_weights)
else:
  print("No epoch improved validation accuracy; keeping the current weights.")

Epoch 0
Training loss (for one batch) at step 0: 1.5229
Training loss (for one batch) at step 50: 2.1581
Training loss (for one batch) at step 100: 2.6760
Training loss (for one batch) at step 150: 1.3524
Training loss (for one batch) at step 200: 1.2020
Training loss (for one batch) at step 250: 1.4142
Training loss (for one batch) at step 300: 1.7221
Training loss (for one batch) at step 350: 1.8700
Training loss (for one batch) at step 400: 1.4413
Training loss (for one batch) at step 450: 2.1481
Training loss (for one batch) at step 500: 1.8101
Training loss (for one batch) at step 550: 1.4955
Validation acc: 0.459 precision: 0.692 recall: 0.295 f1_micro: 0.414 f1_macro: 0.187
Time taken: 230.80s
Epoch 1
Training loss (for one batch) at step 0: 1.2285
Training loss (for one batch) at step 50: 2.1055
Training loss (for one batch) at step 100: 4.0407
Training loss (for one batch) at step 150: 1.2502
Training loss (for one batch) at step 200: 0.7201
Training loss (for one batch) at st

KeyboardInterrupt: 

In [11]:
# Evaluate the current model on the test set, reusing the metric objects
# that the validation pass uses.
for x_batch_val, y_batch_val in test_ds:
    test_step(x_batch_val, y_batch_val)

_eval_metrics = (
    val_acc_metric,
    val_prec_metric,
    val_recall_metric,
    f1_metric_micro,
    f1_metric_macro,
)
# Read every result first, then reset every accumulator.
acc, prec, recall, micro, macro = (
    float(_metric.result()) for _metric in _eval_metrics
)
for _metric in _eval_metrics:
    _metric.reset_states()

print(f"Test acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")

Test acc: 0.562 precision: 0.610 recall: 0.485 f1_micro: 0.540 f1_macro: 0.402


In [12]:
# Save the weights of the classification head only; the encoder (model.bert)
# is not written by this call.
# NOTE(review): the checkpoint is named "freematch_notune" although the model
# cell above instantiates FreeMatchTune — confirm the intended file name.
model.cls_head.save_weights("./checkpoints/freematch_notune")