In [None]:
import pandas as pd
import numpy as np
np.random.seed(seed=2021)


from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.layers import GRU, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.callbacks import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint
from keras.utils import plot_model
from keras.metrics import BinaryAccuracy, Precision, Recall, AUC


### Loading the processed data to feed the model

Tokens from each text are tagged according to the BIO format.

In [None]:
from ast import literal_eval
TRAIN_FILE="train.csv"
VAL_FILE="val.csv"
TEST_FILE="test.csv"

# Columns that are stored in the CSV files as Python-literal strings and must
# be parsed back into Python objects after reading.
LITERAL_COLUMNS = ("spans", "labels", "offset_mapping", "tokens", "BIO_tags")


def load_split(path, literal_columns=LITERAL_COLUMNS):
    """Read one data split from CSV and parse its literal-encoded columns.

    :param path: path of the CSV file to read
    :param literal_columns: names of the columns whose cells are
        Python-literal strings; each cell is parsed with ``ast.literal_eval``
    :return: a DataFrame whose ``literal_columns`` hold the parsed objects
    :raises KeyError: if one of ``literal_columns`` is missing from the file
    """
    df = pd.read_csv(path)
    for column in literal_columns:
        # literal_eval only evaluates literals, so it is safe on file content.
        df[column] = df[column].apply(literal_eval)
    return df


train_df = load_split(TRAIN_FILE)
val_df = load_split(VAL_FILE)
test_df = load_split(TEST_FILE)

### Implementing suggested f1 metric.

In [None]:
def f1(predictions, gold):
    """
    Set-overlap F1 (DICE coefficient) between predicted and gold offsets.

    Duplicate offsets are ignored because both inputs are compared as sets.
    >>> assert f1([0, 1, 4, 5], [0, 1, 6]) == 0.5714285714285714
    :param predictions: a list of predicted offsets
    :param gold: a list of offsets serving as the ground truth
    :return: a score between 0 and 1; when ``gold`` is empty the score is 1
        for an empty prediction and 0 otherwise
    """
    # Nothing to find: only an empty prediction counts as correct.
    if len(gold) == 0:
        if len(predictions) == 0:
            return 1
        return 0

    predicted_offsets, gold_offsets = set(predictions), set(gold)
    shared = predicted_offsets & gold_offsets
    return 2 * len(shared) / (len(predicted_offsets) + len(gold_offsets))

### Implementing a baseline BiLSTM for token-based classification for toxic span detection.

In [None]:
np.random.seed(seed=2021)
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.layers import GRU, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau
from keras.utils import plot_model
from keras.metrics import BinaryAccuracy, Precision, Recall, AUC,Accuracy
from collections import Counter

class RNNSL:
    """Bidirectional-LSTM sequence labeller for token-level toxic span detection.

    Each token is classified as ``B-TOX``, ``I-TOX`` or ``O``. Token-level
    probabilities are then thresholded (see ``tune_threshold``) into the
    integer labels ``TOXIC_LABEL`` / ``NOT_TOXIC_LABEL`` and mapped back to
    character offsets (see ``fine_tuning``).
    """

    # Integer labels produced by ``tune_threshold``.
    TOXIC_LABEL = 2
    NOT_TOXIC_LABEL = 1

    def __init__(self, maxlen=128, batch_size=32, w_embed_size=200, h_embed_size=200, dropout=0.1, patience=1, plot=True, max_epochs=100):
        """Store the hyper-parameters; the network itself is built in ``fit``.

        :param maxlen: length every token sequence is padded/truncated to
        :param batch_size: training batch size
        :param w_embed_size: dimension of the word embeddings
        :param h_embed_size: number of units of each LSTM direction
        :param dropout: rate used for embedding dropout and recurrent dropout
        :param patience: early-stopping patience, in epochs
        :param plot: when true, ``fit`` prints the model summary and writes a
            plot of the architecture to disk
        :param max_epochs: upper bound on the number of training epochs
        """
        self.maxlen = maxlen
        self.METRICS = [Precision(name='precision'), Recall(name='recall'), AUC(name='auc')]
        self.w_embed_size = w_embed_size
        self.h_embed_size = h_embed_size
        self.dropout = dropout
        self.patience = patience
        self.model = None
        self.epochs = max_epochs
        self.batch_size = batch_size
        self.show_the_model = plot
        # Default decision threshold; the methods below take `th` explicitly.
        self.threshold = 0.2
        self.word_to_index = {}
        self.class_weights = {}

    def set_up_preprocessing(self, tokenized_texts, labels):
        """Build the word/tag vocabularies and the tag frequencies.

        :param tokenized_texts: iterable of token lists
        :param labels: iterable of tag lists, using the tags ``B-TOX``,
            ``I-TOX`` and ``O``
        :raises KeyError: if ``labels`` contains a tag outside those three
        """
        # Sorted so that the word -> index mapping is identical on every run;
        # iterating a plain set of strings is not order-stable across runs.
        self.vocab = sorted({w for s in tokenized_texts for w in s})
        self.vocab_size = len(self.vocab)

        # Fixed tag inventory; its order defines the columns of the model output.
        tags = ['B-TOX', 'I-TOX', 'O']
        self.n_tags = len(tags)

        # Indices 0 and 1 are reserved for padding and unknown words.
        self.word_to_index = {w: i + 2 for i, w in enumerate(self.vocab)}
        self.word_to_index["UNK"] = 1
        self.word_to_index["PAD"] = 0

        self.tag_to_index = {t: i for i, t in enumerate(tags)}

        self.idx2word = {i: w for w, i in self.word_to_index.items()}
        self.idx2tag = {i: w for w, i in self.tag_to_index.items()}

        # Relative frequency of each tag in the training labels, keyed by tag
        # index. Rebuilt from scratch so a refit does not keep stale entries.
        # NOTE(review): these are frequencies, not inverse-frequency weights,
        # and `fit` does not pass them to Keras.
        total_tags = [t for l in labels for t in l]
        self.class_weights = {
            self.tag_to_index[tag]: count / len(total_tags)
            for tag, count in Counter(total_tags).items()
        }

    def X_to_sequences(self, tokenized_texts):
        """Encode token lists as a padded integer matrix.

        :param tokenized_texts: iterable of token lists
        :return: array of shape ``(n_texts, maxlen)`` holding word indices;
            unknown words map to ``UNK`` and padding uses ``PAD``
        """
        unk = self.word_to_index["UNK"]
        X = [[self.word_to_index.get(w, unk) for w in s] for s in tokenized_texts]
        # Truncate at the end (like the padding) so that position j of a row
        # always corresponds to token j of the text, which is what the
        # offset lookup in `fine_tuning` relies on.
        X = pad_sequences(maxlen=self.maxlen, sequences=X, padding="post",
                          truncating="post", value=self.word_to_index["PAD"])
        return X

    def y_to_sequences(self, labels):
        """Encode tag lists as a padded one-hot tensor.

        :param labels: iterable of tag lists
        :return: array of shape ``(n_texts, maxlen, n_tags)``; padded
            positions are labelled ``O``
        """
        from keras.utils import to_categorical

        y = [[self.tag_to_index[l_i] for l_i in l] for l in labels]
        # Same truncation side as X_to_sequences so tokens and tags stay aligned.
        y = pad_sequences(maxlen=self.maxlen, sequences=y, padding="post",
                          truncating="post", value=self.tag_to_index["O"])

        # One-hot encode each row, then stack the rows into a single array.
        return np.array([to_categorical(row, num_classes=self.n_tags) for row in y])

    def build(self):
        """Create the (uncompiled) embedding -> dropout -> BiLSTM -> softmax model.

        :return: a Keras ``Model`` mapping ``(maxlen,)`` word indices to
            ``(maxlen, n_tags)`` tag probabilities
        """
        inputs = Input(shape=(self.maxlen,))
        # +2 accounts for the reserved PAD and UNK indices; index 0 is masked.
        hidden = Embedding(input_dim=self.vocab_size+2, output_dim=self.w_embed_size, input_length=self.maxlen, mask_zero=True)(inputs)
        hidden = Dropout(self.dropout)(hidden)
        hidden = Bidirectional(LSTM(units=self.h_embed_size, return_sequences=True, recurrent_dropout=self.dropout))(hidden)
        output = TimeDistributed(Dense(self.n_tags, activation="softmax"))(hidden)
        return Model(inputs, output)

    def fit(self, tokenized_texts, labels, validation_data=None, monitor="val_auc"):
        """Build the vocabulary and the model, then train it.

        :param tokenized_texts: iterable of token lists used for training
        :param labels: iterable of tag lists aligned with ``tokenized_texts``
        :param validation_data: optional ``(tokenized_texts, labels)`` pair;
            when omitted, 10% of the training data is held out instead
        :param monitor: metric watched by early stopping and checkpointing;
            it is minimised when its name contains ``"loss"``, maximised
            otherwise
        :return: ``(model, history)`` where ``history`` is a DataFrame with
            one row per epoch
        """
        # Set up the vocabulary and encode the training data.
        self.set_up_preprocessing(tokenized_texts, labels)
        X, y = self.X_to_sequences(tokenized_texts), self.y_to_sequences(labels)
        print("X shape {},y shape {}".format(X.shape,y.shape))

        # Build the model and compile it.
        self.model = self.build()
        if self.show_the_model:
            # summary() prints by itself and returns None, so it is not wrapped in print().
            self.model.summary()
            plot_model(self.model, show_shapes=True, to_file="neural_sequence_labeler.model.png")
        self.model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=self.METRICS)

        mode = "min" if "loss" in monitor else "max"
        early = EarlyStopping(monitor=monitor,
                              mode=mode,
                              patience=self.patience,
                              verbose=1,
                              min_delta=0.0001,
                              restore_best_weights=True)

        checkpointer = ModelCheckpoint(filepath='model.h5',
                                       verbose=0,
                                       mode=mode,
                                       save_best_only=True,
                                       monitor=monitor)

        # Arguments shared by both training variants below.
        fit_kwargs = dict(batch_size=self.batch_size,
                          epochs=self.epochs,
                          verbose=1,
                          callbacks=[early, checkpointer])

        # Start training.
        if validation_data is not None:
            assert len(validation_data) == 2
            vX, vy = self.X_to_sequences(validation_data[0]), self.y_to_sequences(validation_data[1])
            history = self.model.fit(X, y, validation_data=(vX, vy), **fit_kwargs)
        else:
            history = self.model.fit(X, y, validation_split=0.1, **fit_kwargs)

        return self.model, pd.DataFrame(history.history)

    def tune_threshold(self, pred_label_proba, len_true_labels, th=0.2):
        """Threshold one text's per-token tag probabilities into labels.

        :param pred_label_proba: per-token rows of tag probabilities, ordered
            as ``B-TOX``, ``I-TOX``, ``O``
        :param len_true_labels: number of leading positions to keep (the
            remaining positions are padding)
        :param th: a token is toxic when its ``B-TOX`` or ``I-TOX``
            probability is at least this value
        :return: list of ``TOXIC_LABEL`` / ``NOT_TOXIC_LABEL`` values, also
            stored in ``self.label_preds``
        """
        self.label_preds = [
            self.TOXIC_LABEL if (tag_pred[0] >= th or tag_pred[1] >= th) else self.NOT_TOXIC_LABEL
            for tag_pred in pred_label_proba[:len_true_labels]
        ]
        return self.label_preds

    def fine_tuning(self, tokenized_texts, BIO_tags, offset_mappings, spans):
        """Sweep the decision threshold and report the mean span F1 for each value.

        No training happens here: the fitted model predicts once and the
        thresholds 0.15, 0.16, ... (step 0.01, below 0.5) are evaluated on
        those predictions.

        :param tokenized_texts: iterable of token lists
        :param BIO_tags: per-text tag lists; only their lengths are used
        :param offset_mappings: per-text lists of ``(start, end)`` character
            offsets, one per token
        :param spans: per-text lists of gold toxic character offsets
        :return: ``(mean_f1_scores, thresholds)`` as two parallel lists
        """
        # Materialise as lists so access is positional even when the caller
        # passes pandas Series whose index is not 0..n-1.
        BIO_tags = list(BIO_tags)
        offset_mappings = list(offset_mappings)
        spans = list(spans)

        pred_proba = self.model.predict(self.X_to_sequences(tokenized_texts))

        mean_f1_scores = []
        thresholds = []

        for th in np.arange(0.15,0.5,0.01):
            print(f"Tuning for threshold {th:.2f} ...")

            f1_scores = []
            for proba, tags, offsets, gold in zip(pred_proba, BIO_tags, offset_mappings, spans):
                token_labels = self.tune_threshold(proba, len(tags), th)

                # Expand every toxic token into the character offsets it covers.
                curr_pred_span = []
                for j, pred_labl in enumerate(token_labels):
                    if pred_labl == self.TOXIC_LABEL:
                        curr_pred_span.extend(range(offsets[j][0], offsets[j][1]))

                f1_scores.append(f1(curr_pred_span, gold))

            mean_f1_scores.append(np.mean(f1_scores))
            thresholds.append(th)

            print(f"|__ mean f1 scores {np.mean(f1_scores):.3f}")
            print(f"|__ threshold {th:.2f}")
            print(20*'=')

        return mean_f1_scores,thresholds

### Instantiating and training the model.

In [None]:
# Sequence labeller: inputs padded to 192 tokens, at most 10 epochs,
# early stopping after one epoch without improvement.
clf = RNNSL(
    maxlen=192,
    max_epochs=10,
    patience=1,
)

In [None]:
# Train on the full training split; no explicit validation set is passed, so
# `fit` falls back to its internal validation split.
train_tokens = train_df.tokens.values
train_tags = train_df.BIO_tags.values
model, hist = clf.fit(train_tokens, train_tags)

X shape (7939, 192),y shape (7939, 192, 3)
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 192)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 192, 200)          4715200   
_________________________________________________________________
dropout_2 (Dropout)          (None, 192, 200)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 192, 400)          641600    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 192, 3)            1203      
Total params: 5,358,003
Trainable params: 5,358,003
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Restoring model weights

### Tuning threshold to determine toxic/non-toxic tokens on validation data.

In [None]:
# Sweep the decision threshold on the validation split; the cell displays the
# returned (mean F1 scores, thresholds) pair.
clf.fine_tuning(
    tokenized_texts=val_df.tokens,
    BIO_tags=val_df.BIO_tags,
    offset_mappings=val_df.offset_mapping,
    spans=val_df.spans,
)

Tuning for threshold 0.15 ...
|__ mean f1 scores 0.592
|__ threshold 0.15
Tuning for threshold 0.16 ...
|__ mean f1 scores 0.595
|__ threshold 0.16
Tuning for threshold 0.17 ...
|__ mean f1 scores 0.594
|__ threshold 0.17
Tuning for threshold 0.18 ...
|__ mean f1 scores 0.593
|__ threshold 0.18
Tuning for threshold 0.19 ...
|__ mean f1 scores 0.594
|__ threshold 0.19
Tuning for threshold 0.20 ...
|__ mean f1 scores 0.591
|__ threshold 0.20
Tuning for threshold 0.21 ...
|__ mean f1 scores 0.591
|__ threshold 0.21
Tuning for threshold 0.22 ...
|__ mean f1 scores 0.585
|__ threshold 0.22
Tuning for threshold 0.23 ...
|__ mean f1 scores 0.583
|__ threshold 0.23
Tuning for threshold 0.24 ...
|__ mean f1 scores 0.583
|__ threshold 0.24
Tuning for threshold 0.25 ...
|__ mean f1 scores 0.582
|__ threshold 0.25
Tuning for threshold 0.26 ...
|__ mean f1 scores 0.584
|__ threshold 0.26
Tuning for threshold 0.27 ...
|__ mean f1 scores 0.581
|__ threshold 0.27
Tuning for threshold 0.28 ...
|__ mean

([0.5920065998413606,
  0.5949025041860077,
  0.593892304526475,
  0.592716169958767,
  0.5942245728718716,
  0.590915612760518,
  0.5907866378478577,
  0.5854721245540991,
  0.5828515460688904,
  0.5834104193578284,
  0.5824188571290647,
  0.5840018189376568,
  0.5809186835032177,
  0.579614815532352,
  0.5800507114541814,
  0.5780872002408833,
  0.5739749772650997,
  0.5710387536347706,
  0.5646187582916177,
  0.5614494085984384,
  0.5553500085984732,
  0.5513080158136425,
  0.5439091412138684,
  0.5343662708448437,
  0.5338516807650362,
  0.5319904827640713,
  0.5243868476443498,
  0.5226072272702343,
  0.5196603328040225,
  0.5182684990337214,
  0.5107123131320972,
  0.5056067885886659,
  0.5045438057257391,
  0.49249137926942893,
  0.4900593718054521],
 [0.15,
  0.16,
  0.17,
  0.18000000000000002,
  0.19000000000000003,
  0.20000000000000004,
  0.21000000000000005,
  0.22000000000000006,
  0.23000000000000007,
  0.24000000000000007,
  0.2500000000000001,
  0.2600000000000001,
  0

### Retraining on validation data

### Predicting toxic span sequences on test data.

In [None]:
# Encode the test tokens with the vocabulary learned during training.
test_tokens = test_df.tokens.to_numpy()
X_test = clf.X_to_sequences(test_tokens)

In [None]:
# Sanity check: one row per test text, each padded to the model's input length.
X_test.shape

(2000, 192)

In [None]:
# Per-token tag probabilities for every test text, from the model returned by clf.fit.
preds=model.predict(X_test)

In [None]:
# Threshold each test text's token probabilities into toxic (2) / non-toxic (1)
# labels, keeping only as many positions as the text has entries in `labels`.
# NOTE(review): the validation sweep used the length of `BIO_tags` instead;
# presumably both columns have one entry per token - confirm.
label_predictions = [
    clf.tune_threshold(p, len(test_df.iloc[i].labels), th=0.23)
    for i, p in enumerate(preds)
]

In [None]:
# Turn the token-level labels into character offsets and score every test text.
span_predictions = []  # predicted toxic character offsets, one list per text
f1_scores = []         # span F1 of each text against its gold offsets

# Enumerate positionally: `label_predictions` is a plain list, so it must not be
# indexed with the DataFrame's index labels.
for pos, (_, row) in enumerate(test_df.iterrows()):
    curr_pred_span = []

    for j, pred_labl in enumerate(label_predictions[pos]):
        if pred_labl == 2:  # 2 marks a toxic token (see RNNSL.tune_threshold)
            start, end = row.offset_mapping[j][0], row.offset_mapping[j][1]
            curr_pred_span.extend(range(start, end))

    span_predictions.append(curr_pred_span)
    f1_scores.append(f1(curr_pred_span, row.spans))

In [None]:
# Mean span-level F1 over all test texts.
np.mean(f1_scores)

0.6305217774547157