## Google colab setup

In [None]:
try:
  import google.colab
  IS_COLAB = True
except:
  IS_COLAB = False

In [None]:
if IS_COLAB:
  !pip install tensorflow_addons
  !pip install unidecode
  !pip install transformers
  !pip install nlpaug

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cp /content/drive/MyDrive/SemiSupervised/*.csv .

## Imports and setup

In [1]:
%%capture
import tensorflow as tf
import tensorflow_addons as tfa
import pandas as pd
import re
import time
import spacy
import numpy as np
from transformers import AutoTokenizer, TFAutoModel
from unidecode import unidecode

2023-05-12 09:44:53.411320: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-12 09:44:53.921629: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.2/lib64:/usr/local/cuda/extras/CUPTI/lib64::/usr/local/cuda-10.1/lib64:/usr/local/cuda-10.2/lib64:/usr/local/cuda/extras/CUPTI/lib64:
2023-05-12 09:44:53.921675: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object 

In [2]:
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(gpus[1], 'GPU')

## Utils

In [3]:
data = pd.read_csv("train_ner.csv")

In [4]:
def preprocess(x):
    s = unidecode(x)
    s = str.lower(s)
    s = re.sub(r"\[[a-z]+\]","", s)
    s = re.sub(r"\*","", s)
    s = re.sub(r"[^a-zA-Z0-9]+"," ",s)
    s = re.sub(r" +"," ",s)
    s = re.sub(r"(.)\1+",r"\1",s)

    return s

In [5]:
label_ids = {label_name:i for i, label_name in enumerate(sorted(set(data["label"])))}
label_ids

{'ABUSE': 0, 'INSULT': 1, 'OTHER': 2, 'PROFANITY': 3}

In [6]:
data["preprocessed"] = data["text"].apply(preprocess)
data.head()

Unnamed: 0,id,text,label,preprocessed
0,32674,da de unde stii u mai [ORG] ca banii au fost p...,ABUSE,da de unde sti u mai ca bani au fost pt si nu ...
1,16514,m*uie [PERS] m*uie [PERS] ... m*uie\nbai kakat...,INSULT,muie muie muie bai kakatule stai in banca ta d...
2,32556,PT ALA CARE ARE TREABA CU [PERS]!!ESTI UNUL CA...,OTHER,pt ala care are treaba cu esti unul care nu ar...
3,23861,sunt bucuros ca [PERS] nu a mai venit la [ORG]...,OTHER,sunt bucuros ca nu a mai venit la jucator de d...
4,21811,[PERS] esti....PE..N\n ES..CU..LI..BI.L!!! te ...,INSULT,esti pe n es cu li bi l te asemeni cu de la c...


In [7]:
data["class"] = data["label"].map(lambda x: label_ids[x])
data.head(5)

Unnamed: 0,id,text,label,preprocessed,class
0,32674,da de unde stii u mai [ORG] ca banii au fost p...,ABUSE,da de unde sti u mai ca bani au fost pt si nu ...,0
1,16514,m*uie [PERS] m*uie [PERS] ... m*uie\nbai kakat...,INSULT,muie muie muie bai kakatule stai in banca ta d...,1
2,32556,PT ALA CARE ARE TREABA CU [PERS]!!ESTI UNUL CA...,OTHER,pt ala care are treaba cu esti unul care nu ar...,2
3,23861,sunt bucuros ca [PERS] nu a mai venit la [ORG]...,OTHER,sunt bucuros ca nu a mai venit la jucator de d...,2
4,21811,[PERS] esti....PE..N\n ES..CU..LI..BI.L!!! te ...,INSULT,esti pe n es cu li bi l te asemeni cu de la c...,1


In [8]:
data["label"].value_counts()

OTHER        3649
ABUSE        2768
INSULT       2242
PROFANITY    1294
Name: label, dtype: int64

## Prepare data splits

In [9]:
tok_robert = AutoTokenizer.from_pretrained("readerbench/RoBERT-base")
# robert = TFAutoModel.from_pretrained("readerbench/RoBERT-base")

In [10]:
def split_ssl_data(ids_array,mask_array,labels,num_classes,label_percent):
  labeled = None
  unlabeled = None

  for class_idx in range(num_classes):
    class_ids = ids_array[labels==class_idx]
    class_mask = mask_array[labels==class_idx]
    sz = int(label_percent * class_ids.shape[0])

    labels_reduced = labels[labels==class_idx][:sz]
    labeled_ids, unlabeled_ids = class_ids[:sz], class_ids[sz:]
    labeled_mask, unlabeled_mask = class_mask[:sz], class_mask[sz:]

    if not labeled:
      labeled = (labeled_ids, labeled_mask, labels_reduced)
      unlabeled = (unlabeled_ids, unlabeled_mask)
    else:
      labeled = (
          np.concatenate([labeled[0],labeled_ids]),
          np.concatenate([labeled[1],labeled_mask]),
          np.concatenate([labeled[2],labels_reduced])
      )
      unlabeled = (
          np.concatenate([unlabeled[0],unlabeled_ids]),
          np.concatenate([unlabeled[1],unlabeled_mask]),
      )

  return labeled, unlabeled


In [11]:
def preprocess_robert(x):
  t = tok_robert(x,padding="max_length",max_length=96,truncation=True,return_tensors='np')
  return t["input_ids"], t["attention_mask"]

def map_func(input_ids, masks, labels):
  return {'input_ids': input_ids, 'attention_mask': masks}, labels

def map_func2(input_ids, masks):
  return {'input_ids': input_ids, 'attention_mask': masks}

def prepare_ds(filename,batch_size=64):
  df = pd.read_csv(filename)
  X_id_mask = df['text'].map(preprocess).apply(preprocess_robert).apply(pd.Series)

  X_id_mask.columns = ["input_ids","attention_mask"]

  ids_array = np.squeeze(np.stack(X_id_mask.input_ids.values), axis=1)
  mask_array = np.squeeze(np.stack(X_id_mask.attention_mask.values), axis=1)
  labels = df["label"].map(lambda x: label_ids[x]).values

  res_ds = tf.data.Dataset.from_tensor_slices((ids_array, mask_array, labels)).map(map_func).shuffle(len(df)).batch(batch_size)
  return res_ds

In [12]:
def prepare_train_ds(filename,batch_size=16,label_percent=0.2):
  df = pd.read_csv(filename)
  X_id_mask = df['text'].map(preprocess).apply(preprocess_robert).apply(pd.Series)

  X_id_mask.columns = ["input_ids","attention_mask"]

  ids_array = np.squeeze(np.stack(X_id_mask.input_ids.values), axis=1)
  mask_array = np.squeeze(np.stack(X_id_mask.attention_mask.values), axis=1)
  labels = df["label"].map(lambda x: label_ids[x]).values

  labeled, unlabeled = split_ssl_data(ids_array,mask_array,labels,len(label_ids),label_percent)
  labeled_ds = tf.data.Dataset.from_tensor_slices(labeled)
  labeled_ds = labeled_ds.map(map_func).shuffle(len(labeled_ds)).batch(4*batch_size).repeat()
  unlabeled_ds = tf.data.Dataset.from_tensor_slices(unlabeled)
  unlabeled_ds = unlabeled_ds.map(map_func2).shuffle(len(unlabeled_ds)).batch(4*batch_size).repeat()
  
  return labeled_ds, unlabeled_ds

In [13]:
labeled_ds, unlabeled_ds = prepare_train_ds("train_ner.csv")
test_ds = prepare_ds("test_ner.csv")
val_ds = prepare_ds("validation_internal_ner.csv")

2023-05-12 09:45:07.295978: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-12 09:45:07.296758: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-12 09:45:07.297110: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-12 09:45:07.297392: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

## Model definition and declaration

In [14]:
class TeacherModel(tf.keras.Model):
  def __init__(self,encoder_name="readerbench/RoBERT-base",num_classes=4,**kwargs):
    super(TeacherModel,self).__init__(**kwargs)
    self.bert = TFAutoModel.from_pretrained(encoder_name)
    self.bert.trainable=False
    self.num_classes = num_classes
    self.noise = tf.keras.layers.GaussianNoise(stddev=1)

    self.cls_head = tf.keras.Sequential([
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(10,recurrent_dropout=0.2),
            merge_mode="ave"
        ),
        tf.keras.layers.Dense(units=self.num_classes,activation="softmax")
    ])


  def call(self, inputs, training):
    ids, mask = inputs
    
    embeds = self.bert(input_ids=ids, attention_mask=mask).last_hidden_state
    embeds = self.noise(embeds,training=training)
    out = self.cls_head(embeds,training=training)

    return out
  
class StudentModel(tf.keras.Model):
  def __init__(self,encoder_name="readerbench/RoBERT-base",num_classes=4,**kwargs):
    super(StudentModel,self).__init__(**kwargs)
    self.bert = TFAutoModel.from_pretrained(encoder_name)
    self.bert.trainable=False
    self.num_classes = num_classes
    self.noise = tf.keras.layers.GaussianNoise(stddev=1)

    self.cls_head = tf.keras.Sequential([
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(10),
            merge_mode="ave"
        ),
        tf.keras.layers.Dense(units=self.num_classes,activation="softmax")
    ])


  def call(self, inputs, training):
    ids, mask = inputs
    
    embeds = self.bert(input_ids=ids, attention_mask=mask).last_hidden_state
    out = self.cls_head(embeds,training=training)

    return out


In [15]:
student = StudentModel()
teacher = TeacherModel()

## Train and evaluate

In [16]:
optim = tfa.optimizers.AdamW(weight_decay=0.001,learning_rate=0.005)
optim2 = tfa.optimizers.AdamW(weight_decay=0.001,learning_rate=0.005)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_prec_metric = tf.keras.metrics.Precision(name="precision")
val_recall_metric = tf.keras.metrics.Recall(name="recall")
f1_metric_micro = tfa.metrics.F1Score(num_classes=4, threshold=0.5, average='micro', name='f1_micro')
f1_metric_macro = tfa.metrics.F1Score(num_classes=4, threshold=0.5, average='macro', name='f1_macro')

In [17]:
epochs = 20
iterations = 3
steps_per_epoch = 125

In [18]:
@tf.function
def teacher_step(x,y):

  with tf.GradientTape() as tape:
      pred_teacher = teacher([x["input_ids"],x["attention_mask"]],training=True)
      ls = loss_fn(y, pred_teacher)
      
  grads = tape.gradient(ls, teacher.trainable_weights)
  optim.apply_gradients(zip(grads, teacher.trainable_weights))

  return ls



@tf.function
def train_step(x,y,xu):


  pseudo_labels = tf.argmax(teacher([xu["input_ids"],xu["attention_mask"]],training=False),axis=1)
  targets = tf.concat([y,pseudo_labels],axis=0)
  ids = tf.concat([x["input_ids"],xu["input_ids"]],axis=0)
  mask = tf.concat([x["attention_mask"],xu["attention_mask"]],axis=0)
  with tf.GradientTape() as tape:
      pred_stud = student([ids,mask],training=True)
      loss = loss_fn(targets, pred_stud)
      
  grads = tape.gradient(loss, student.trainable_weights)
  optim2.apply_gradients(zip(grads, student.trainable_weights))

  return loss

@tf.function
def test_step_stud(x, y):
    wpred = student([x["input_ids"],x["attention_mask"]], training=False)
    val_acc_metric.update_state(y, wpred)
    true_hot = tf.one_hot(y, 4)
    val_prec_metric.update_state(true_hot, wpred)
    val_recall_metric.update_state(true_hot, wpred)
    f1_metric_micro.update_state(true_hot, wpred)
    f1_metric_macro.update_state(true_hot, wpred)


@tf.function
def test_step_teacher(x, y):
    wpred = teacher([x["input_ids"],x["attention_mask"]], training=False)
    val_acc_metric.update_state(y, wpred)
    true_hot = tf.one_hot(y, 4)
    val_prec_metric.update_state(true_hot, wpred)
    val_recall_metric.update_state(true_hot, wpred)
    f1_metric_micro.update_state(true_hot, wpred)
    f1_metric_macro.update_state(true_hot, wpred)

In [19]:
max_val_acc = 0.0
best_weights = None
l_iter = iter(labeled_ds)
u_iter = iter(unlabeled_ds)

# step 1 from paper
for epoch in range(epochs):
  print(f"Teacher Epoch {epoch}")
  start_time = time.time()

  for step in range(steps_per_epoch):
    x, y = next(l_iter)

    loss = teacher_step(x,y)

    if step % 50 == 0:
      print(
        "Training loss (for one batch) at step %d: %.4f"
        % (step, float(loss))
      )

  for x_batch_val, y_batch_val in val_ds:
      test_step_teacher(x_batch_val, y_batch_val)

  acc = float(val_acc_metric.result())
  prec = float(val_prec_metric.result())
  recall = float(val_recall_metric.result())
  micro = float(f1_metric_micro.result())
  macro = float(f1_metric_macro.result())

  val_acc_metric.reset_states()
  val_prec_metric.reset_states()
  val_recall_metric.reset_states()
  f1_metric_micro.reset_states()
  f1_metric_macro.reset_states()

  if acc > max_val_acc:
    max_val_acc = acc
    best_weights = teacher.get_weights()

  print(f"Validation acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")
  print("Time taken: %.2fs" % (time.time() - start_time))

teacher.set_weights(best_weights)

#2 - #4 iterative training
for _ in range(iterations):
  max_val_acc = 0.0
  best_weights = None
  l_iter = iter(labeled_ds)
  u_iter = iter(unlabeled_ds)
  for epoch in range(epochs):
    print(f"Epoch {epoch}")
    start_time = time.time()

    for step in range(steps_per_epoch):
      x, y = next(l_iter)
      xu = next(u_iter)

      loss = train_step(x,y,xu)

      if step % 50 == 0:
        print(
          "Training loss (for one batch) at step %d: %.4f"
          % (step, float(loss))
        )

    for x_batch_val, y_batch_val in val_ds:
      test_step_stud(x_batch_val, y_batch_val)

    acc = float(val_acc_metric.result())
    prec = float(val_prec_metric.result())
    recall = float(val_recall_metric.result())
    micro = float(f1_metric_micro.result())
    macro = float(f1_metric_macro.result())

    val_acc_metric.reset_states()
    val_prec_metric.reset_states()
    val_recall_metric.reset_states()
    f1_metric_micro.reset_states()
    f1_metric_macro.reset_states()

    if acc > max_val_acc:
      max_val_acc = acc
      best_weights = student.get_weights()

    print(f"Validation acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")
    print("Time taken: %.2fs" % (time.time() - start_time))
  teacher.set_weights(best_weights)
  for l in student.layers:
      if l.trainable:
        if hasattr(l,"kernel_initializer"):
            l.kernel.assign(l.kernel_initializer(tf.shape(l.kernel)))
        if hasattr(l,"bias_initializer"):
            l.bias.assign(l.bias_initializer(tf.shape(l.bias)))
        if hasattr(l,"recurrent_initializer"):
            l.recurrent_kernel.assign(l.recurrent_initializer(tf.shape(l.recurrent_kernel)))


Teacher Epoch 0
Training loss (for one batch) at step 0: 1.3880
Training loss (for one batch) at step 50: 1.1989
Training loss (for one batch) at step 100: 1.1540
Validation acc: 0.525 precision: 0.630 recall: 0.372 f1_micro: 0.468 f1_macro: 0.273
Time taken: 59.46s
Teacher Epoch 1
Training loss (for one batch) at step 0: 1.2021
Training loss (for one batch) at step 50: 1.1260
Training loss (for one batch) at step 100: 1.0727
Validation acc: 0.542 precision: 0.624 recall: 0.406 f1_micro: 0.492 f1_macro: 0.322
Time taken: 46.84s
Teacher Epoch 2
Training loss (for one batch) at step 0: 1.1573
Training loss (for one batch) at step 50: 1.0989
Training loss (for one batch) at step 100: 0.8967
Validation acc: 0.555 precision: 0.727 recall: 0.317 f1_micro: 0.442 f1_macro: 0.293
Time taken: 46.90s
Teacher Epoch 3
Training loss (for one batch) at step 0: 1.0613
Training loss (for one batch) at step 50: 0.9847
Training loss (for one batch) at step 100: 0.9327
Validation acc: 0.570 precision: 0.6

2023-05-12 10:02:12.520807: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8100


Training loss (for one batch) at step 0: 1.3876
Training loss (for one batch) at step 50: 1.0554
Training loss (for one batch) at step 100: 0.9298
Validation acc: 0.558 precision: 0.623 recall: 0.470 f1_micro: 0.535 f1_macro: 0.326
Time taken: 107.38s
Epoch 1
Training loss (for one batch) at step 0: 0.8365
Training loss (for one batch) at step 50: 0.8268
Training loss (for one batch) at step 100: 0.7479
Validation acc: 0.626 precision: 0.679 recall: 0.517 f1_micro: 0.587 f1_macro: 0.424
Time taken: 89.72s
Epoch 2
Training loss (for one batch) at step 0: 0.6651
Training loss (for one batch) at step 50: 0.7852
Training loss (for one batch) at step 100: 0.7315
Validation acc: 0.644 precision: 0.692 recall: 0.563 f1_micro: 0.621 f1_macro: 0.497
Time taken: 89.97s
Epoch 3
Training loss (for one batch) at step 0: 0.8003
Training loss (for one batch) at step 50: 0.6020
Training loss (for one batch) at step 100: 0.6791
Validation acc: 0.654 precision: 0.700 recall: 0.605 f1_micro: 0.649 f1_mac

In [20]:
for x_batch_val, y_batch_val in test_ds:
    test_step_teacher(x_batch_val, y_batch_val)
acc = float(val_acc_metric.result())
prec = float(val_prec_metric.result())
recall = float(val_recall_metric.result())
micro = float(f1_metric_micro.result())
macro = float(f1_metric_macro.result())

val_acc_metric.reset_states()
val_prec_metric.reset_states()
val_recall_metric.reset_states()
f1_metric_micro.reset_states()
f1_metric_macro.reset_states()
print(f"Test acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")

Test acc: 0.661 precision: 0.679 recall: 0.637 f1_micro: 0.657 f1_macro: 0.611
