## Google Colab setup

In [None]:
# Detect whether the notebook runs inside Google Colab by probing for the
# `google.colab` package; later cells branch on IS_COLAB.
try:
  import google.colab  # noqa: F401 -- imported only to test availability
  IS_COLAB = True
except ImportError:
  # Only a missing module means "not on Colab". A bare `except:` would also
  # swallow unrelated errors (including KeyboardInterrupt) and hide them.
  IS_COLAB = False

In [None]:
if IS_COLAB:
  # Install the third-party packages imported by the "Imports and setup" cell
  # (tensorflow_addons, unidecode, transformers) when running on Colab.
  !pip install tensorflow_addons
  !pip install unidecode
  !pip install transformers
  # nlpaug is left disabled; no visible cell imports it.
  # !pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (591 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m591.0/591.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.20.0 typeguard-2.13.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6
Looking in indexes: ht

In [None]:
# Mount Google Drive so the CSV files can be copied into the working directory.
# `google.colab` only exists on Colab, so guard with the IS_COLAB flag computed
# at the top of the notebook instead of failing with ImportError elsewhere.
if IS_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy every CSV from the mounted Drive folder into the current directory;
# later cells read train_ner.csv, test_ner.csv and validation_internal_ner.csv
# by relative path.
!cp /content/drive/MyDrive/SemiSupervised/*.csv .

## Imports and setup

In [None]:
%%capture
import tensorflow as tf
import tensorflow_addons as tfa
import pandas as pd
import re
import time
import spacy
import numpy as np
from transformers import AutoTokenizer, TFAutoModel
from unidecode import unidecode

In [None]:
# Restrict TensorFlow to the first GPU when one is present. On a CPU-only
# machine the list is empty and indexing gpus[0] would raise IndexError, so
# the call is skipped and TensorFlow keeps its default device placement.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  tf.config.set_visible_devices(gpus[0], 'GPU')

## Utils

In [None]:
# Training split; used below to build the label vocabulary and to preview
# the effect of the text preprocessing.
data = pd.read_csv("train_ner.csv")

In [None]:
def preprocess(x):
    """Normalise one raw text string before tokenization.

    The text is transliterated with ``unidecode`` and lower-cased, then a
    fixed sequence of regex substitutions is applied in order:
    bracketed lowercase tags such as ``[org]`` are dropped, ``*`` is dropped,
    every run of non-alphanumeric characters becomes one space, runs of
    spaces are collapsed, and any character repeated consecutively is
    reduced to a single occurrence.

    Args:
        x: the raw text.

    Returns:
        The normalised string (leading/trailing spaces are not stripped).
    """
    # Order matters: tags must be removed before the brackets are turned
    # into spaces, and repeats are collapsed last.
    substitutions = (
        (r"\[[a-z]+\]", ""),
        (r"\*", ""),
        (r"[^a-zA-Z0-9]+", " "),
        (r" +", " "),
        (r"(.)\1+", r"\1"),
    )

    text = unidecode(x).lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
# Map each distinct label string to an integer id; ids follow the sorted
# order of the label names so the mapping is stable across runs.
_label_names = sorted(set(data["label"]))
label_ids = dict(zip(_label_names, range(len(_label_names))))
label_ids

{'ABUSE': 0, 'INSULT': 1, 'OTHER': 2, 'PROFANITY': 3}

In [None]:
# Preview the normalisation: store the cleaned text next to the raw text.
data["preprocessed"] = data["text"].map(preprocess)
data.head()

Unnamed: 0,id,text,label,preprocessed
0,32674,da de unde stii u mai [ORG] ca banii au fost p...,ABUSE,da de unde sti u mai ca bani au fost pt si nu ...
1,16514,m*uie [PERS] m*uie [PERS] ... m*uie\nbai kakat...,INSULT,muie muie muie bai kakatule stai in banca ta d...
2,32556,PT ALA CARE ARE TREABA CU [PERS]!!ESTI UNUL CA...,OTHER,pt ala care are treaba cu esti unul care nu ar...
3,23861,sunt bucuros ca [PERS] nu a mai venit la [ORG]...,OTHER,sunt bucuros ca nu a mai venit la jucator de d...
4,21811,[PERS] esti....PE..N\n ES..CU..LI..BI.L!!! te ...,INSULT,esti pe n es cu li bi l te asemeni cu de la c...


In [None]:
# Integer class id for every row, looked up in label_ids (a label missing
# from the mapping raises KeyError).
data["class"] = data["label"].map(label_ids.__getitem__)
data.head()

Unnamed: 0,id,text,label,preprocessed,class
0,32674,da de unde stii u mai [ORG] ca banii au fost p...,ABUSE,da de unde sti u mai ca bani au fost pt si nu ...,0
1,16514,m*uie [PERS] m*uie [PERS] ... m*uie\nbai kakat...,INSULT,muie muie muie bai kakatule stai in banca ta d...,1
2,32556,PT ALA CARE ARE TREABA CU [PERS]!!ESTI UNUL CA...,OTHER,pt ala care are treaba cu esti unul care nu ar...,2
3,23861,sunt bucuros ca [PERS] nu a mai venit la [ORG]...,OTHER,sunt bucuros ca nu a mai venit la jucator de d...,2
4,21811,[PERS] esti....PE..N\n ES..CU..LI..BI.L!!! te ...,INSULT,esti pe n es cu li bi l te asemeni cu de la c...,1


In [None]:
# Class distribution of the training split (number of rows per label).
data["label"].value_counts()

OTHER        3649
ABUSE        2768
INSULT       2242
PROFANITY    1294
Name: label, dtype: int64

## Prepare data splits

In [None]:
# Tokenizer for the "readerbench/RoBERT-base" checkpoint; used by
# preprocess_robert to produce input ids and attention masks.
tok_robert = AutoTokenizer.from_pretrained("readerbench/RoBERT-base")
# The encoder is not loaded here: MixMatch.__init__ calls
# TFAutoModel.from_pretrained itself.

Downloading (…)okenizer_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/245k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def split_ssl_data(ids_array,mask_array,labels,num_classes,label_percent):
  """Split tokenized data into a labeled and an unlabeled part, per class.

  For every class index in ``range(num_classes)`` the rows whose label equals
  that index are selected; the first ``int(label_percent * n)`` of them keep
  their labels and the rest go to the unlabeled pool. Per-class pieces are
  concatenated in class order.

  Args:
    ids_array: array of token ids, indexed by row along axis 0.
    mask_array: array of attention masks, row-aligned with ``ids_array``.
    labels: array of integer class ids, row-aligned with ``ids_array``.
    num_classes: number of class indices to iterate over.
    label_percent: fraction of each class that stays labeled.

  Returns:
    ``(labeled, unlabeled)`` where ``labeled`` is a tuple
    ``(ids, mask, labels)`` and ``unlabeled`` is a tuple ``(ids, mask)``.
    Both are ``None`` when ``num_classes`` is 0.
  """
  kept_ids, kept_mask, kept_labels = [], [], []
  pool_ids, pool_mask = [], []

  for class_idx in range(num_classes):
    selector = labels == class_idx
    ids_c = ids_array[selector]
    mask_c = mask_array[selector]
    labels_c = labels[selector]

    # Number of rows of this class that keep their label.
    cut = int(label_percent * ids_c.shape[0])

    kept_ids.append(ids_c[:cut])
    kept_mask.append(mask_c[:cut])
    kept_labels.append(labels_c[:cut])
    pool_ids.append(ids_c[cut:])
    pool_mask.append(mask_c[cut:])

  if not kept_ids:
    # No class was visited, so there is nothing to concatenate.
    return None, None

  labeled = tuple(np.concatenate(parts) for parts in (kept_ids, kept_mask, kept_labels))
  unlabeled = tuple(np.concatenate(parts) for parts in (pool_ids, pool_mask))
  return labeled, unlabeled


In [None]:
def preprocess_robert(x):
  """Tokenize ``x`` with ``tok_robert``, padded/truncated to 96 tokens.

  Returns:
    A pair ``(input_ids, attention_mask)`` taken from the tokenizer output,
    which is requested as NumPy arrays.
  """
  encoded = tok_robert(
      x,
      padding="max_length",
      max_length=96,
      truncation=True,
      return_tensors='np',
  )
  return encoded["input_ids"], encoded["attention_mask"]

def map_func(input_ids, masks, labels):
  """Pack a labeled example as ``(features_dict, labels)``.

  The features dict uses the keys ``'input_ids'`` and ``'attention_mask'``.
  """
  features = dict(input_ids=input_ids, attention_mask=masks)
  return features, labels

def map_func2(input_ids, masks):
  """Pack an unlabeled example as a features dict.

  The dict uses the keys ``'input_ids'`` and ``'attention_mask'``.
  """
  features = dict(input_ids=input_ids, attention_mask=masks)
  return features

def prepare_ds(filename,batch_size=64,label_percent=0.8):
  """Build a shuffled, batched, fully labeled dataset from a CSV file.

  Args:
    filename: CSV path; the ``text`` and ``label`` columns are read.
    batch_size: number of examples per batch.
    label_percent: accepted for signature compatibility; not used here.

  Returns:
    A ``tf.data.Dataset`` yielding ``(features_dict, labels)`` batches.
  """
  frame = pd.read_csv(filename)

  # One (input_ids, attention_mask) pair per row, each array shaped by the
  # tokenizer for a single text.
  encoded = frame['text'].map(preprocess).apply(preprocess_robert)

  # Joining the per-row arrays along axis 0 yields one row per example.
  ids_array = np.concatenate([pair[0] for pair in encoded], axis=0)
  mask_array = np.concatenate([pair[1] for pair in encoded], axis=0)
  labels = frame["label"].map(lambda name: label_ids[name]).values

  dataset = tf.data.Dataset.from_tensor_slices((ids_array, mask_array, labels))
  dataset = dataset.map(map_func)
  dataset = dataset.shuffle(len(frame))
  return dataset.batch(batch_size)

In [None]:
def prepare_train_ds(filename,batch_size=64,label_percent=0.05):
  """Build the labeled and unlabeled training datasets from a CSV file.

  Rows are shuffled, tokenized, and split per class by ``split_ssl_data``:
  ``label_percent`` of each class keeps its labels, the rest is unlabeled.

  Args:
    filename: CSV path; the ``text`` and ``label`` columns are read.
    batch_size: number of examples per batch in both datasets.
    label_percent: fraction of each class kept as labeled data.

  Returns:
    ``(labeled_ds, unlabeled_ds)``; both are shuffled, batched and repeated
    indefinitely. The labeled one yields ``(features_dict, labels)``, the
    unlabeled one yields ``features_dict`` only.
  """
  # Random row order, so which rows end up labeled differs between calls.
  frame = pd.read_csv(filename).sample(frac=1)

  encoded = frame['text'].map(preprocess).apply(preprocess_robert)
  ids_array = np.concatenate([pair[0] for pair in encoded], axis=0)
  mask_array = np.concatenate([pair[1] for pair in encoded], axis=0)
  labels = frame["label"].map(lambda name: label_ids[name]).values

  labeled, unlabeled = split_ssl_data(
      ids_array, mask_array, labels, len(label_ids), label_percent
  )

  # Shuffle buffers cover the whole of each split.
  n_labeled = labeled[0].shape[0]
  n_unlabeled = unlabeled[0].shape[0]

  labeled_ds = (
      tf.data.Dataset.from_tensor_slices(labeled)
      .map(map_func)
      .shuffle(n_labeled)
      .batch(batch_size)
      .repeat()
  )
  unlabeled_ds = (
      tf.data.Dataset.from_tensor_slices(unlabeled)
      .map(map_func2)
      .shuffle(n_unlabeled)
      .batch(batch_size)
      .repeat()
  )

  return labeled_ds, unlabeled_ds

In [None]:
labeled_ds, unlabeled_ds= prepare_train_ds("train_ner.csv")
test_ds = prepare_ds("test_ner.csv")
val_ds = prepare_ds("validation_internal_ner.csv")

## Model definition and declaration

In [None]:
class MixMatch(tf.keras.Model):
  """Pretrained transformer encoder with a noise layer and a dense classifier.

  Attributes:
    bert: encoder loaded with ``TFAutoModel.from_pretrained``.
    num_classes: size of the softmax output.
    cls_head: Dense(256, relu) -> Dropout(0.2) -> Dense(64, relu) ->
      Dense(num_classes, softmax).
    augment: ``GaussianNoise(stddev=2)`` applied to the pooled embedding.
  """

  def __init__(self,bert_model="readerbench/RoBERT-base",num_classes=4,**kwargs):
    """Load the encoder named ``bert_model`` and build the classifier head."""
    super().__init__(**kwargs)
    self.bert = TFAutoModel.from_pretrained(bert_model)
    self.num_classes = num_classes

    head_layers = [
      tf.keras.layers.Dense(256,activation="relu"),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(64,activation="relu"),
      tf.keras.layers.Dense(self.num_classes, activation="softmax"),
    ]
    self.cls_head = tf.keras.Sequential(head_layers)

    self.augment = tf.keras.layers.GaussianNoise(stddev=2)

  def call(self, inputs, training):
    """Return class probabilities for ``inputs = (input_ids, attention_mask)``.

    ``training`` is forwarded to the encoder, the noise layer and the head.
    """
    input_ids, attention_mask = inputs

    encoder_out = self.bert(
        input_ids=input_ids, attention_mask=attention_mask, training=training
    )
    noisy = self.augment(encoder_out.pooler_output, training=training)

    return self.cls_head(noisy, training=training)

In [None]:
# Instantiate with the defaults: "readerbench/RoBERT-base" encoder, 4 classes.
# The train/test step functions below reference this global directly.
model = MixMatch()

## Train and evaluate

In [None]:
# Two optimizers with different learning rates: `optim` updates the
# classifier head, `optim2` updates the encoder (see train_step).
optim = tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=0.001)
optim2 = tfa.optimizers.AdamW(learning_rate=0.00001, weight_decay=0.0)

# Losses: cross-entropy for the labeled branch, MSE for the unlabeled branch.
entropy = tf.keras.losses.CategoricalCrossentropy()
mse = tf.keras.losses.MeanSquaredError()

# Evaluation metrics, shared by validation and test (reset between uses).
# Accuracy takes integer labels; the others take one-hot labels.
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_prec_metric = tf.keras.metrics.Precision(name="precision")
val_recall_metric = tf.keras.metrics.Recall(name="recall")
f1_metric_micro = tfa.metrics.F1Score(
    num_classes=4, threshold=0.5, average='micro', name='f1_micro'
)
f1_metric_macro = tfa.metrics.F1Score(
    num_classes=4, threshold=0.5, average='macro', name='f1_macro'
)

In [None]:
def weight_schedule(t, init_period, final_period, max_period):
  """Ramp-up / plateau / ramp-down multiplier for the unsupervised loss.

  * ``t < init_period``: rises as ``exp(-5 * (1 - t/init_period)**2)``.
  * plateau: ``1.0`` until ``max_period - final_period``.
  * last ``final_period`` steps: falls as ``1 - exp(-5 * (x - 1)**2)`` with
    ``x`` going from 0 to 1, reaching ``0.0`` at ``t == max_period``.

  Args:
    t: current global step (0-based).
    init_period: number of ramp-up steps.
    final_period: number of ramp-down steps; ``<= 0`` disables the ramp-down.
    max_period: total number of steps.

  Returns:
    The multiplier as a Python float in ``[0, 1]``.
  """
  if t < init_period:
    x = t / init_period
    # Use the exact exponential rather than the literal 2.71 as a stand-in for e.
    return float(np.exp(-5 * (x - 1) ** 2))

  # No ramp-down configured: stay on the plateau (also avoids dividing by 0).
  if final_period <= 0 or t < max_period - final_period:
    return 1.0

  # Clamp progress at 1 so the weight stays at 0 past max_period instead of
  # climbing back up the other side of the Gaussian.
  x = min((t - max_period + final_period) / final_period, 1.0)
  return float(1 - np.exp(-5 * (x - 1) ** 2))

In [None]:
# Training hyperparameters read by train_step and by the training loop.
epochs = 10  # iterations of the outer training loop
K = 2  # repeat factor train_step applies to the unlabeled batch
steps_per_epoch = 150  # optimisation steps per epoch
max_period = epochs * steps_per_epoch  # total number of training steps
alpha = 10  # both parameters of the Beta distribution the blend factor is drawn from
unsup_weight = tf.Variable(1.0)  # unsupervised-loss weight; reassigned every step by the loop
init_period = 300  # ramp-up length passed to weight_schedule
final_period = 300  # ramp-down length passed to weight_schedule
blend = tf.constant(1.0)  # initial blend factor; the loop overwrites it every step

In [None]:
@tf.function
def sharpen(x,T=0.5):
  """Raise each row of ``x`` to the power ``1/T`` and renormalise it.

  Args:
    x: tensor whose axis 1 is summed for the normalisation.
    T: temperature; the exponent applied is ``1/T``.

  Returns:
    ``x ** (1/T)`` divided by its sum over axis 1.
  """
  powered = tf.pow(x, 1/T)
  # Restore a trailing axis so the division broadcasts across each row.
  row_sums = tf.reduce_sum(powered, axis=1)[..., tf.newaxis]
  return powered / row_sums

@tf.function
def train_step(x,y,xu,unsup_weight,blend):
  """Run one MixMatch-style optimisation step.

  Args:
    x: dict with "input_ids" and "attention_mask" for the labeled batch.
    y: integer class ids for the labeled batch.
    xu: dict with the same two keys for the unlabeled batch.
    unsup_weight: scalar multiplier of the unsupervised (MSE) loss term.
    blend: mixup coefficient; the weight kept by the original sample when it
      is mixed with its shuffled partner.

  Returns:
    The scalar training loss of this step.
  """
  num_classes = model.num_classes

  with tf.GradientTape() as tape:
    # Labeled branch: pooled embedding plus Gaussian-noise augmentation.
    x_emb = model.bert([x["input_ids"],x["attention_mask"]],training=True).pooler_output
    x_aug = model.augment(x_emb,training=True)
    cnt = tf.shape(x_aug)[0]

    # Unlabeled branch: every example is encoded K times, and every
    # embedding is then repeated K times before noise is added.
    # NOTE(review): this yields K*K rows per unlabeled example, and guesses
    # are averaged per encoder copy, not per example -- confirm intent.
    r_ids = tf.repeat(xu["input_ids"],K,axis=0)
    r_mask = tf.repeat(xu["attention_mask"],K,axis=0)
    xu_emb = model.bert([r_ids,r_mask],training=True).pooler_output
    r_xu = tf.repeat(xu_emb,K,axis=0)
    xu_aug = model.augment(r_xu, training=True)

    # Guess labels: average the K noisy predictions, sharpen, and repeat so
    # the guesses line up row-for-row with xu_aug.
    guess = model.cls_head(xu_aug,training=False)
    guess = tf.reshape(guess,(-1, K, num_classes))
    guess = sharpen(tf.reduce_mean(guess,axis=1))
    # Guessed labels are training targets: block gradients so the model is
    # not optimised by moving the targets towards its own predictions.
    pred_unsup = tf.stop_gradient(tf.repeat(guess,K,axis=0))

    y_ext = tf.one_hot(y,num_classes)

    # Shuffle features and targets with ONE shared permutation. Two separate
    # tf.random.shuffle calls with equal seeds only stay aligned in graph
    # mode; gathering with a single index tensor is aligned in any mode.
    all_x = tf.concat([x_aug,xu_aug],axis=0)
    all_y = tf.concat([y_ext, pred_unsup],axis=0)
    perm = tf.random.shuffle(tf.range(tf.shape(all_x)[0]),seed=1)
    x_total = tf.gather(all_x,perm,axis=0)
    y_total = tf.gather(all_y,perm,axis=0)

    # First `cnt` shuffled rows pair with the labeled batch, the rest with
    # the unlabeled batch.
    first, second = x_total[:cnt], x_total[cnt:]
    first_labels, second_labels = y_total[:cnt], y_total[cnt:]

    a = blend * x_aug + (1-blend) * first
    a_target = blend * y_ext + (1-blend) * first_labels

    b = blend * xu_aug + (1-blend) * second
    b_target = blend * pred_unsup + (1-blend) * second_labels

    a_pred = model.cls_head(a,training=True)
    b_pred = model.cls_head(b,training=True)

    # The training loop scales unsup_weight by 4.0, which this division undoes.
    loss = entropy(a_target, a_pred) + unsup_weight * mse(b_target, b_pred) / 4

  # Head and encoder are updated by separate optimizers.
  grads = tape.gradient(loss, [model.cls_head.trainable_weights, model.bert.trainable_weights])
  optim.apply_gradients(zip(grads[0], model.cls_head.trainable_weights))
  optim2.apply_gradients(zip(grads[1], model.bert.trainable_weights))

  return loss


@tf.function
def test_step(x, y):
    """Update every evaluation metric with the predictions for one batch.

    Args:
        x: dict with "input_ids" and "attention_mask".
        y: integer class ids for the batch.
    """
    probs = model([x["input_ids"],x["attention_mask"]], training=False)

    # Accuracy consumes integer labels directly.
    val_acc_metric.update_state(y, probs)

    # The remaining metrics are fed one-hot labels.
    one_hot_y = tf.one_hot(y, 4)
    for metric in (val_prec_metric, val_recall_metric, f1_metric_micro, f1_metric_macro):
        metric.update_state(one_hot_y, probs)

In [None]:
# Train for `epochs` epochs of `steps_per_epoch` steps, validate after every
# epoch, and remember the weights of the epoch with the best validation accuracy.
max_val_acc = 0.0
best_weights = None
# Both training datasets repeat indefinitely, so pull batches from iterators.
l_iter = iter(labeled_ds)
u_iter = iter(unlabeled_ds)
t_step = 0  # global step, drives the unsupervised weight schedule

for epoch in range(epochs):
  print(f"Epoch {epoch}")
  start_time = time.time()

  for step in range(steps_per_epoch):
    x, y = next(l_iter)
    xu = next(u_iter)

    # Mixup coefficient: taking max(lam, 1 - lam) keeps the mixed sample
    # closer to the original than to its shuffled partner.
    lam = np.random.beta(alpha, alpha)
    lam = max(lam, 1-lam)
    blend = tf.constant(lam)

    unsup_val = weight_schedule(t_step, init_period, final_period, max_period)
    unsup_weight.assign(4.0 * unsup_val)

    loss = train_step(x,y,xu,unsup_weight,blend)

    if step % 50 == 0:
      print(
        "Training loss (for one batch) at step %d: %.4f"
        % (step, float(loss))
      )
    t_step+=1

  for x_batch_val, y_batch_val in val_ds:
      test_step(x_batch_val, y_batch_val)

  acc = float(val_acc_metric.result())
  prec = float(val_prec_metric.result())
  recall = float(val_recall_metric.result())
  micro = float(f1_metric_micro.result())
  macro = float(f1_metric_macro.result())

  # Metric objects are shared, so clear them before the next evaluation.
  val_acc_metric.reset_states()
  val_prec_metric.reset_states()
  val_recall_metric.reset_states()
  f1_metric_micro.reset_states()
  f1_metric_macro.reset_states()

  if acc > max_val_acc:
    max_val_acc = acc
    best_weights = model.get_weights()

  print(f"Validation acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")
  print("Time taken: %.2fs" % (time.time() - start_time))

# Restore the best checkpoint. best_weights stays None when no epoch beat the
# initial threshold (or epochs == 0); set_weights(None) would then raise.
if best_weights is not None:
  model.set_weights(best_weights)


Epoch 0
Training loss (for one batch) at step 0: 1.8698
Training loss (for one batch) at step 50: 1.1936
Training loss (for one batch) at step 100: 1.1717
Validation acc: 0.564 precision: 0.643 recall: 0.482 f1_micro: 0.551 f1_macro: 0.343
Time taken: 101.36s
Epoch 1
Training loss (for one batch) at step 0: 0.9691
Training loss (for one batch) at step 50: 1.0239
Training loss (for one batch) at step 100: 0.8700
Validation acc: 0.614 precision: 0.631 recall: 0.598 f1_micro: 0.614 f1_macro: 0.573
Time taken: 64.09s
Epoch 2
Training loss (for one batch) at step 0: 0.7011
Training loss (for one batch) at step 50: 0.6337
Training loss (for one batch) at step 100: 0.6170
Validation acc: 0.655 precision: 0.675 recall: 0.641 f1_micro: 0.657 f1_macro: 0.616
Time taken: 51.52s
Epoch 3
Training loss (for one batch) at step 0: 0.6247
Training loss (for one batch) at step 50: 0.6072
Training loss (for one batch) at step 100: 0.6432
Validation acc: 0.650 precision: 0.660 recall: 0.636 f1_micro: 0.64

In [None]:
# Evaluate on the test split with the same metric objects used for validation.
for x_batch_val, y_batch_val in test_ds:
    test_step(x_batch_val, y_batch_val)

_test_metrics = (
    val_acc_metric,
    val_prec_metric,
    val_recall_metric,
    f1_metric_micro,
    f1_metric_macro,
)

# Read every result first, then clear the metrics for any later evaluation.
acc, prec, recall, micro, macro = (float(_metric.result()) for _metric in _test_metrics)

for _metric in _test_metrics:
    _metric.reset_states()

print(f"Test acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")

Test acc: 0.679 precision: 0.684 recall: 0.672 f1_micro: 0.678 f1_macro: 0.649


In [None]:
from huggingface_hub import notebook_login

In [None]:
# Interactive Hugging Face login widget; the push_to_hub call below
# presumably relies on this login -- verify.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload only the encoder to the Hub; the classifier head is not part of
# model.bert and is saved separately.
model.bert.push_to_hub("andrei-saceleanu/ro-offense-mixmatch")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

tf_model.h5:   0%|          | 0.00/461M [00:00<?, ?B/s]

In [None]:
# Save the classifier head's weights locally under the ./checkpoint/mixmatch prefix.
model.cls_head.save_weights("./checkpoint/mixmatch")