## Google colab setup

In [None]:
try:
  import google.colab
  IS_COLAB = True
except:
  IS_COLAB = False

In [None]:
if IS_COLAB:
  !pip install tensorflow_addons
  !pip install unidecode
  !pip install transformers
  !pip install nlpaug

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp /content/drive/MyDrive/SemiSupervised/*.csv .

## Imports and setup

In [1]:
%%capture
import tensorflow as tf
import tensorflow_addons as tfa
import pandas as pd
import re
import time
import spacy
import numpy as np
from transformers import AutoTokenizer, TFAutoModel
from unidecode import unidecode
from sklearn.neighbors import NearestNeighbors
import scipy

2023-05-12 08:56:15.577206: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-12 08:56:16.085856: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.2/lib64:/usr/local/cuda/extras/CUPTI/lib64::/usr/local/cuda-10.1/lib64:/usr/local/cuda-10.2/lib64:/usr/local/cuda/extras/CUPTI/lib64:
2023-05-12 08:56:16.085901: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object 

In [2]:
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(gpus[0], 'GPU')

## Utils

In [3]:
data = pd.read_csv("train_ner.csv")

In [4]:
def preprocess(x):
    s = unidecode(x)
    s = str.lower(s)
    s = re.sub(r"\[[a-z]+\]","", s)
    s = re.sub(r"\*","", s)
    s = re.sub(r"[^a-zA-Z0-9]+"," ",s)
    s = re.sub(r" +"," ",s)
    s = re.sub(r"(.)\1+",r"\1",s)

    return s

In [5]:
label_ids = {label_name:i for i, label_name in enumerate(sorted(set(data["label"])))}
label_ids

{'ABUSE': 0, 'INSULT': 1, 'OTHER': 2, 'PROFANITY': 3}

In [6]:
data["preprocessed"] = data["text"].apply(preprocess)
data.head()

Unnamed: 0,id,text,label,preprocessed
0,32674,da de unde stii u mai [ORG] ca banii au fost p...,ABUSE,da de unde sti u mai ca bani au fost pt si nu ...
1,16514,m*uie [PERS] m*uie [PERS] ... m*uie\nbai kakat...,INSULT,muie muie muie bai kakatule stai in banca ta d...
2,32556,PT ALA CARE ARE TREABA CU [PERS]!!ESTI UNUL CA...,OTHER,pt ala care are treaba cu esti unul care nu ar...
3,23861,sunt bucuros ca [PERS] nu a mai venit la [ORG]...,OTHER,sunt bucuros ca nu a mai venit la jucator de d...
4,21811,[PERS] esti....PE..N\n ES..CU..LI..BI.L!!! te ...,INSULT,esti pe n es cu li bi l te asemeni cu de la c...


In [7]:
data["class"] = data["label"].map(lambda x: label_ids[x])
data.head(5)

Unnamed: 0,id,text,label,preprocessed,class
0,32674,da de unde stii u mai [ORG] ca banii au fost p...,ABUSE,da de unde sti u mai ca bani au fost pt si nu ...,0
1,16514,m*uie [PERS] m*uie [PERS] ... m*uie\nbai kakat...,INSULT,muie muie muie bai kakatule stai in banca ta d...,1
2,32556,PT ALA CARE ARE TREABA CU [PERS]!!ESTI UNUL CA...,OTHER,pt ala care are treaba cu esti unul care nu ar...,2
3,23861,sunt bucuros ca [PERS] nu a mai venit la [ORG]...,OTHER,sunt bucuros ca nu a mai venit la jucator de d...,2
4,21811,[PERS] esti....PE..N\n ES..CU..LI..BI.L!!! te ...,INSULT,esti pe n es cu li bi l te asemeni cu de la c...,1


In [8]:
data["label"].value_counts()

OTHER        3649
ABUSE        2768
INSULT       2242
PROFANITY    1294
Name: label, dtype: int64

In [9]:
import plotly.graph_objects as go
lengths = []
for elem in data.preprocessed:
    cnt = tok_robert(elem, return_tensors="np")["input_ids"].shape[1]
    lengths.append(cnt)

NameError: name 'tok_robert' is not defined

In [23]:
print(
    np.percentile(lengths,90),
    np.percentile(lengths,50),
    np.mean(lengths),
    np.std(lengths)
)


95.0 41.0 48.83190997689139 29.82889984375574


In [26]:
fig = go.Figure(data=[go.Histogram(x=lengths)])
fig.update_layout(
    title="Token count distribution for training set ",
    xaxis_title="Token count",
    yaxis_title="No. of samples",
    legend_title="Legend",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()

## Prepare data splits

In [9]:
tok_robert = AutoTokenizer.from_pretrained("readerbench/RoBERT-base")
robert = TFAutoModel.from_pretrained("readerbench/RoBERT-base")

2023-05-12 08:56:57.275266: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-12 08:56:57.275733: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-12 08:56:57.276087: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-12 08:56:57.276384: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least on

In [10]:
def split_ssl_data(ids_array,mask_array,labels,num_classes,label_percent):
  labeled = None
  unlabeled = None

  for class_idx in range(num_classes):
    class_ids = ids_array[labels==class_idx]
    class_mask = mask_array[labels==class_idx]
    sz = int(label_percent * class_ids.shape[0])

    labels_reduced = labels[labels==class_idx][:sz]
    labeled_ids, unlabeled_ids = class_ids[:sz], class_ids[sz:]
    labeled_mask, unlabeled_mask = class_mask[:sz], class_mask[sz:]

    if not labeled:
      labeled = (labeled_ids, labeled_mask, labels_reduced)
      unlabeled = (unlabeled_ids, unlabeled_mask)
    else:
      labeled = (
          np.concatenate([labeled[0],labeled_ids]),
          np.concatenate([labeled[1],labeled_mask]),
          np.concatenate([labeled[2],labels_reduced])
      )
      unlabeled = (
          np.concatenate([unlabeled[0],unlabeled_ids]),
          np.concatenate([unlabeled[1],unlabeled_mask]),
      )

  return labeled, unlabeled


In [11]:
def preprocess_robert(x):
  t = tok_robert(x,padding="max_length",max_length=96,truncation=True,return_tensors='np')
  return t["input_ids"], t["attention_mask"]

def map_func(input_ids, masks, labels):
  return {'input_ids': input_ids, 'attention_mask': masks}, labels

def map_func2(input_ids, masks):
  return {'input_ids': input_ids, 'attention_mask': masks}

def prepare_ds(filename,batch_size=64):
  df = pd.read_csv(filename)
  X_id_mask = df['text'].map(preprocess).apply(preprocess_robert).apply(pd.Series)

  X_id_mask.columns = ["input_ids","attention_mask"]

  ids_array = np.squeeze(np.stack(X_id_mask.input_ids.values), axis=1)
  mask_array = np.squeeze(np.stack(X_id_mask.attention_mask.values), axis=1)
  labels = df["label"].map(lambda x: label_ids[x]).values

  res_ds = tf.data.Dataset.from_tensor_slices((ids_array, mask_array, labels)).map(map_func).shuffle(len(df)).batch(batch_size)
  return res_ds

In [12]:
def prepare_train_ds(filename,batch_size=16,label_percent=0.2):
  df = pd.read_csv(filename)
  df = df.sample(frac=1)
  X_id_mask = df['text'].map(preprocess).apply(preprocess_robert).apply(pd.Series)

  X_id_mask.columns = ["input_ids","attention_mask"]

  ids_array = np.squeeze(np.stack(X_id_mask.input_ids.values), axis=1)
  mask_array = np.squeeze(np.stack(X_id_mask.attention_mask.values), axis=1)
  labels = df["label"].map(lambda x: label_ids[x]).values

  labeled, unlabeled = split_ssl_data(ids_array,mask_array,labels,len(label_ids),label_percent)

  labeled_ds = tf.data.Dataset.from_tensor_slices(labeled)
  labeled_ds = labeled_ds.map(map_func).shuffle(len(labeled_ds)).batch(batch_size).repeat()

  unlabeled_ds = tf.data.Dataset.from_tensor_slices(unlabeled)
  unlabeled_ds = unlabeled_ds.map(map_func2).shuffle(len(unlabeled_ds)).batch(batch_size).repeat()
  
  return labeled_ds, unlabeled_ds, labeled, unlabeled

In [13]:
labeled_ds, unlabeled_ds, labeled, unlabeled = prepare_train_ds("train_ner.csv")
test_ds = prepare_ds("test_ner.csv")
val_ds = prepare_ds("validation_internal_ner.csv")

## Model definition and declaration

In [14]:
class LPModel(tf.keras.Model):
  def __init__(self,bert_model,num_classes=4,**kwargs):
    super(LPModel,self).__init__(**kwargs)
    self.bert = bert_model
    self.num_classes = num_classes
    
    self.cls_head = tf.keras.Sequential([
      tf.keras.layers.Dense(256,activation="relu"),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Dense(64,activation="relu"),
      tf.keras.layers.Dense(self.num_classes, activation="softmax")
    ])

  def call(self, inputs, training):
    ids, mask = inputs
    
    embeds = self.bert(input_ids=ids, attention_mask=mask,training=training).pooler_output

    return self.cls_head(embeds, training=training)



In [15]:
model = LPModel(bert_model=robert)

## Train and evaluate

In [16]:
optim = tfa.optimizers.AdamW(weight_decay=0.001,learning_rate=0.005)
optim2 = tfa.optimizers.AdamW(weight_decay=0.0,learning_rate=0.00001)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_prec_metric = tf.keras.metrics.Precision(name="precision")
val_recall_metric = tf.keras.metrics.Recall(name="recall")
f1_metric_micro = tfa.metrics.F1Score(num_classes=4, threshold=0.5, average='micro', name='f1_micro')
f1_metric_macro = tfa.metrics.F1Score(num_classes=4, threshold=0.5, average='macro', name='f1_macro')

In [17]:
@tf.function
def train_step_1(x,y):

  with tf.GradientTape() as tape:
      preds = model([x["input_ids"],x["attention_mask"]],training=True)
      loss = loss_fn(y, preds)

  grads = tape.gradient(loss, [model.cls_head.trainable_weights, model.bert.trainable_weights])
  optim.apply_gradients(zip(grads[0], model.cls_head.trainable_weights))
  optim2.apply_gradients(zip(grads[1], model.bert.trainable_weights))

  return loss


@tf.function
def train_step_2(ids, mask, target, w1, w2,idsu, masku, targetu, w1u, w2u):

  with tf.GradientTape() as tape:
      preds = model([ids,mask],training=True)
      predsu = model([idsu,masku],training=True)
      loss_s = loss_fn(target, preds,sample_weight=w1*w2)
      loss_u = loss_fn(targetu,predsu, sample_weight=w1u*w2u)
      loss = loss_s + loss_u
  
  grads = tape.gradient(loss, [model.cls_head.trainable_weights, model.bert.trainable_weights])
  optim.apply_gradients(zip(grads[0], model.cls_head.trainable_weights))
  optim2.apply_gradients(zip(grads[1], model.bert.trainable_weights))

  return loss


@tf.function
def test_step(x, y):
    wpred = model([x["input_ids"],x["attention_mask"]], training=False)
    val_acc_metric.update_state(y, wpred)
    true_hot = tf.one_hot(y, 4)
    val_prec_metric.update_state(true_hot, wpred)
    val_recall_metric.update_state(true_hot, wpred)
    f1_metric_micro.update_state(true_hot, wpred)
    f1_metric_macro.update_state(true_hot, wpred)

### Phase 1

In [18]:
EPOCHS_PHASE1 = 5
max_val_acc = 0.0
best_weights = None
steps_per_epoch1 = 125

for epoch in range(EPOCHS_PHASE1):
  print(f"Epoch {epoch}")
  start_time = time.time()
  step = 0
  for x,y in labeled_ds:
    
    loss = train_step_1(x,y)
    if step % 150 == 0:
      print(
        "Training loss (for one batch) at step %d: %.4f"
        % (step, float(loss))
      )
    step += 1
    if steps_per_epoch1 == step:
      break
  for x_batch_val, y_batch_val in val_ds:
      test_step(x_batch_val, y_batch_val)

  acc = float(val_acc_metric.result())
  prec = float(val_prec_metric.result())
  recall = float(val_recall_metric.result())
  micro = float(f1_metric_micro.result())
  macro = float(f1_metric_macro.result())

  val_acc_metric.reset_states()
  val_prec_metric.reset_states()
  val_recall_metric.reset_states()
  f1_metric_micro.reset_states()
  f1_metric_macro.reset_states()

  if acc > max_val_acc:
    max_val_acc = acc
    best_weights = model.get_weights()
  print(f"Validation acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")
  print("Time taken: %.2fs" % (time.time() - start_time))
print("Restoring best weights relative to validation accuracy...")
model.set_weights(best_weights)

Epoch 0
Training loss (for one batch) at step 0: 1.4383
Validation acc: 0.565 precision: 0.677 recall: 0.395 f1_micro: 0.499 f1_macro: 0.320
Time taken: 41.65s
Epoch 1
Training loss (for one batch) at step 0: 1.0990
Validation acc: 0.696 precision: 0.811 recall: 0.550 f1_micro: 0.656 f1_macro: 0.563
Time taken: 28.29s
Epoch 2
Training loss (for one batch) at step 0: 0.4435
Validation acc: 0.716 precision: 0.793 recall: 0.624 f1_micro: 0.698 f1_macro: 0.611
Time taken: 28.36s
Epoch 3
Training loss (for one batch) at step 0: 0.4123
Validation acc: 0.723 precision: 0.774 recall: 0.681 f1_micro: 0.725 f1_macro: 0.697
Time taken: 28.45s
Epoch 4
Training loss (for one batch) at step 0: 0.2975
Validation acc: 0.786 precision: 0.812 recall: 0.763 f1_micro: 0.787 f1_macro: 0.779
Time taken: 28.49s
Restoring best weights relative to validation accuracy...


### Phase 2

In [19]:
EPOCHS_PHASE2 = 5
k = 50
alpha = 0.99
gamma = 3
batch_size = 16
steps_per_epoch = 500
num_classes = len(label_ids)

In [20]:
max_val_acc = 0.0
best_weights = None
for epoch in range(EPOCHS_PHASE2):
    start_time = time.time()
    init_time = start_time
    embeds = None
    for i in range(0,len(labeled[0]),200):
        end = min(i+200,len(labeled[0]))
        curr = tf.stop_gradient(robert(input_ids=labeled[0][i:end],attention_mask=labeled[1][i:end],training=False).pooler_output).numpy()
        if i == 0:
            embeds = curr
        else:
            embeds = np.concatenate([embeds, curr], axis=0)
    for i in range(0,len(unlabeled[0]),200):
        end = min(i+200,len(unlabeled[0]))
        curr = tf.stop_gradient(robert(input_ids=unlabeled[0][i:end],attention_mask=unlabeled[1][i:end],training=False).pooler_output).numpy()
        embeds = np.concatenate([embeds, curr], axis=0)

    
    print(f"Features computation time {time.time()-start_time}")
    start_time = time.time()

    embeds = embeds / np.linalg.norm(embeds, axis=1).reshape(-1,1)

    nbrs = NearestNeighbors(n_neighbors=k+1, algorithm="auto").fit(embeds)
    adj = nbrs.kneighbors_graph(embeds).astype(np.int8).toarray()
    idx = np.arange(adj.shape[0])
    adj[idx,idx] = 0
    affinity = np.power(embeds @ embeds.T, gamma)
    A = affinity * adj
    a_sparse = scipy.sparse.csr_matrix(A)
    W = a_sparse + a_sparse.T

    D = W.sum(axis=1)
    dx = np.array(1/np.sqrt(D))
    D = scipy.sparse.diags(dx.reshape(-1))

    W = D @ W @ D
    W = scipy.sparse.eye(W.shape[0]) - alpha * W

    Z = np.zeros((W.shape[0], num_classes),dtype=np.float32)
    
    for i in range(num_classes):
        y = np.zeros((W.shape[0],))
        mask = labeled[2]==i
        cnt = sum(mask)
        y[np.where(mask)] = 1/cnt
        sol, _ = scipy.sparse.linalg.cg(W, y,tol=1e-6,maxiter=20)
        Z[:,i] = sol

    Z = Z / np.linalg.norm(Z,axis=1).reshape(-1,1)
    pseudo_labels = np.argmax(Z[len(labeled[2]):],axis=1)

    h = scipy.stats.entropy(Z,axis=1)
    omega = 1 - h / np.log(num_classes)
    omega = omega / np.max(omega)
    omega[:len(labeled[2])] = 1.0
    omega = omega.astype(np.float32)

    class_weights = {}
    for i in range(num_classes):
        class_weights[i] = 9953/(num_classes*(sum(labeled[2]==i)+sum(pseudo_labels==i)))


    all_labels = np.concatenate([labeled[2],pseudo_labels])
    instance_weights = np.array(list(map(lambda x:class_weights[x], all_labels)),dtype=np.float32)
    labeled_ds = tf.data.Dataset.from_tensor_slices(labeled + (h[:len(labeled[2])], instance_weights[:len(labeled[2])]))
    labeled_ds = labeled_ds.shuffle(len(labeled_ds)).batch(batch_size).repeat()
    unlabeled_ds = tf.data.Dataset.from_tensor_slices(unlabeled + (pseudo_labels, h[len(labeled[2]):], instance_weights[len(labeled[2]):]))
    unlabeled_ds = unlabeled_ds.shuffle(len(unlabeled_ds)).batch(batch_size).repeat()

    print(f"Graph computation time {time.time()-start_time}")
    
    step = 0
    l_iter = iter(labeled_ds)
    u_iter = iter(unlabeled_ds)
    for step in range(steps_per_epoch):
        ids, mask, target, w1, w2 = next(l_iter)
        idsu, masku, targetu, w1u, w2u = next(u_iter)
        loss = train_step_2(ids, mask, target, w1, w2,idsu, masku, targetu, w1u, w2u)
        if step % 200 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss))
            )
        step += 1
    for x_batch_val, y_batch_val in val_ds:
        test_step(x_batch_val, y_batch_val)

    acc = float(val_acc_metric.result())
    prec = float(val_prec_metric.result())
    recall = float(val_recall_metric.result())
    micro = float(f1_metric_micro.result())
    macro = float(f1_metric_macro.result())

    val_acc_metric.reset_states()
    val_prec_metric.reset_states()
    val_recall_metric.reset_states()
    f1_metric_micro.reset_states()
    f1_metric_macro.reset_states()

    if acc > max_val_acc:
        max_val_acc = acc
        best_weights = model.get_weights()
    print(f"Validation acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")
    print("Total epoch time: %.2fs" % (time.time() - init_time))
print("Restoring best weights relative to validation accuracy...")
model.set_weights(best_weights)

Features computation time 31.120615482330322
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Graph computation time 3.074552059173584
Training loss (for one batch) at step 0: 0.8524
Training loss (for one batch) at step 200: 0.9660
Training loss (for one batch) at step 400: 1.3327
Validation acc: 0.782 precision: 0.799 recall: 0.761 f1_micro: 0.780 f1_macro: 0.776
Total epoch time: 228.19s
Features computation time 31.290034770965576
Graph computation time 2.955547332763672
Training loss (for one batch) at step 0: 0.5112
Training loss (for one batch) at step 200: 0.4124
Training loss (for one batch) at step 400: 0.2238
Validation acc: 0.803 precision: 0.810 recall: 0.798 f1_micro: 0.804 f1_macro: 0.795
Total epoch time: 209.10s
Features computation time 3

In [21]:
for x_batch_val, y_batch_val in test_ds:
    test_step(x_batch_val, y_batch_val)
acc = float(val_acc_metric.result())
prec = float(val_prec_metric.result())
recall = float(val_recall_metric.result())
micro = float(f1_metric_micro.result())
macro = float(f1_metric_macro.result())

val_acc_metric.reset_states()
val_prec_metric.reset_states()
val_recall_metric.reset_states()
f1_metric_micro.reset_states()
f1_metric_macro.reset_states()
print(f"Test acc: {acc:.3f} precision: {prec:.3f} recall: {recall:.3f} f1_micro: {micro:.3f} f1_macro: {macro:.3f}")

Test acc: 0.758 precision: 0.761 recall: 0.756 f1_micro: 0.758 f1_macro: 0.746


In [23]:
model.save_weights("./checkpoints/label_prop")