# Predicting and evaluating models for Toxic Spans Detection

This notebook contains the code for predicting toxic spans and evaluating the predictions. It is designed for use in Google Colab and requires you to grant access to Google Drive. Run the notebook `fine_tune_TSD` first to fine-tune the models and save them to Google Drive.

In [42]:
# Install dependencies. transformers is pinned to 4.28.1; sacremoses is installed unpinned.
!pip install transformers==4.28.1
!pip install sacremoses



In [43]:
#import modules
from transformers import AutoTokenizer, AutoModelForTokenClassification, BatchEncoding, Trainer
import os
import torch
import numpy as np
import ast
import pandas as pd
import itertools
import json
from statistics import mean

In [35]:
# Connect Google Drive: a pop-up will ask you to grant access to your Drive.
# The Drive is mounted at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
# Clone the repositories used below: DALC holds the lexicon file (GROF_LEX/groflex.tsv)
# and Cross_lingual_TSD holds the annotated test set.
# When a repository was already cloned, git reports that the destination path already exists.
!git clone https://github.com/tommasoc80/DALC
!git clone https://github.com/basjems/Cross_lingual_TSD

Cloning into 'DALC'...
remote: Enumerating objects: 210, done.[K
remote: Counting objects: 100% (46/46), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 210 (delta 12), reused 28 (delta 4), pack-reused 164[K
Receiving objects: 100% (210/210), 3.53 MiB | 10.71 MiB/s, done.
Resolving deltas: 100% (92/92), done.
fatal: destination path 'Cross_lingual_TSD' already exists and is not an empty directory.


# PREPROCESS DATA

Run the cells below to define functions for data preprocessing.

In [68]:
class TSDdataset(torch.utils.data.Dataset):
    """Torch dataset that pairs tokenizer encodings with per-token labels.

    Inspired by https://huggingface.co/transformers/v4.1.1/custom_datasets.html

    Param encodings: a mapping from feature name to a sequence holding that feature for every example.
    Param labels: a sequence holding one list of token labels per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples equals the number of label lists.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding feature plus the labels, each converted to a tensor.
        example = {}
        for feature, values in self.encodings.items():
            example[feature] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

def align_tokens_and_annotation_labels(tokenized, annotations, max_len, pad_token):
  """Aligns tokens with annotation labels (I/O), given one tokenized text and its toxic character offsets.

  Param tokenized: the encoding of a single text as produced by a fast tokenizer. It must expose
    `ids` (one entry per token) and `char_to_token(char_index)`; preprocess passes the elements
    of `tokenizer(...).encodings`.
  Param annotations: the character indices that are toxic, either as a list of ints or as the
    string representation of such a list (e.g. "[0, 1, 2]").
  Param max_len: the length of the returned label list.
  Param pad_token: the label value used to fill the positions after the last token.
  Returns: a list of exactly max_len labels: 1 for tokens that cover a toxic character, 0 for all
    other tokens and pad_token for the padding positions.
  """
  # One label per entry in tokenized.ids, all 'O' (0) to start with.
  aligned_labels = [0] * len(tokenized.ids)

  # Annotations read from a .csv file arrive as strings; lists are used as they are.
  spanlist = ast.literal_eval(annotations) if isinstance(annotations, str) else annotations

  for char_ix in spanlist:
    # Find the token that covers this character and mark it as 'I' (1).
    token_ix = tokenized.char_to_token(char_ix)
    if token_ix is not None: # White spaces have no token and will return None
      aligned_labels[token_ix] = 1

  # Cut off anything beyond max_len so the labels stay in step with truncated encodings, then pad.
  aligned_labels = aligned_labels[:max_len]
  aligned_labels += [pad_token] * (max_len - len(aligned_labels))

  return aligned_labels

def preprocess(file_dict, tokenizer, pad_token=-100, max_len=512):
  """Preprocesses a dict of .csv files into a dictionary of TSDdataset objects.

  Param file_dict: a dict holding paths to .csv files to be preprocessed. Keys should be 'train', 'dev'
    and/or 'test', values should be paths to the respective data file. Every file needs a 'text'
    and a 'spans' column.
  Param tokenizer: the tokenizer with which we want to preprocess the data; its output must expose
    `.encodings` (fast tokenizers do).
  Param pad_token: the label assigned to padding positions.
  Param max_len: the number of tokens every text is padded or truncated to.
  Returns: a dict that maps every key of file_dict to a TSDdataset.
  """
  TSD_datasetdict = {}

  for data_type, file in file_dict.items():

    df = pd.read_csv(file)
    spans = list(df['spans'])

    # NOTE(review): deleting characters moves everything behind them to the left, while the
    # offsets in 'spans' are used unchanged -- presumably they refer to the text as stored in
    # the file. Confirm that the data holds no '#' or 'URL' in front of a toxic span.
    texts = [text.replace('#', '').replace('URL', '') for text in df['text']]

    encodings = tokenizer(texts, truncation = True, max_length = max_len, padding = 'max_length')

    # Tokenize once more without padding so that only real tokens receive a 0/1 label. The
    # truncation settings match the call above; otherwise a long text would get more labels
    # than it has (truncated) input ids.
    unpadded = tokenizer(texts, truncation = True, max_length = max_len)
    labels = [align_tokens_and_annotation_labels(tokenized, annotation, max_len, pad_token)
              for tokenized, annotation in zip(unpadded.encodings, spans)]

    TSD_datasetdict[data_type] = TSDdataset(encodings, labels)

  return TSD_datasetdict



# Helper functions
Run the cells below to define the functions needed for evaluating, converting predictions to spans, and converting spans to words.

In [69]:
def divide_list_into_consecutive_sequences(char_list):
  """Divides a list of ints into sublists of consecutive sequences.

  Param char_list: the ints, in the order in which the runs should be detected.
  Returns: a list of lists of all consecutive sequences in the char_list; an empty input gives [].
  """
  sequences = []
  for value in char_list:
    # Extend the current run when value directly follows its last element, otherwise start a new run.
    if sequences and sequences[-1][-1] == value - 1:
      sequences[-1].append(value)
    else:
      sequences.append([value])
  return sequences

def spans_to_words(spans, text):
  """Converts lists of character indices (spans) to text.

  Param spans: one entry per tweet; each entry is a list of character indices or the string
    representation of such a list.
  Param text: a list of tweet texts, in the same order as spans.
  Returns: a list that holds, per tweet, the list of strings formed by every run of consecutive
    character indices.
  """
  all_words = []
  for span_list, tweet in zip(spans, text):
    if isinstance(span_list, str):
      span_list = ast.literal_eval(span_list)

    # Sorting first puts neighbouring indices next to each other, so every run is one stretch of text.
    runs = divide_list_into_consecutive_sequences(sorted(span_list))
    all_words.append([''.join(tweet[ix] for ix in run) for run in runs])

  return all_words


def convert_token_predictions_to_spans(binary_predictions, test_dataset, text_list, pad_token=-100):
  """Converts predictions on a token level to predictions on a span level.

  Param binary_predictions: a list of lists of binary token predictions.
  Param test_dataset: a TSDdataset that holds the tokenized data; its `labels` mark the padding
    positions and its `encodings` must offer `token_to_chars(tweet_index, token_index)`.
  Param text_list: a list of texts in the data, in the same order as the predictions.
  Param pad_token: the pad_token (as assigned in the preprocess function)
  Returns: a list of lists of predicted toxic spans (character indices, spaces excluded).
  """
  # Index the texts by position, whatever container they arrive in.
  texts = list(text_list)

  all_spans = []
  for tweet_idx, predictions in enumerate(binary_predictions):
    cur_tweet = texts[tweet_idx]
    token_labels = test_dataset.labels[tweet_idx]
    tweet_spans = []
    for token_idx, pred_token in enumerate(predictions):
      # Everything from the first pad label onwards is padding, so the tweet is finished.
      if token_labels[token_idx] == pad_token:
        break
      if pred_token != 1:
        continue

      token_span = test_dataset.encodings.token_to_chars(tweet_idx, token_idx)
      # Tokens without a character span (presumably special tokens such as <s>) cannot be
      # mapped back to the text and are skipped.
      if token_span is None:
        continue
      tweet_spans += [idx for idx in range(token_span.start, token_span.end) if cur_tweet[idx] != ' ']

    all_spans.append(tweet_spans)

  return all_spans


def calculate_evaluation_metrics(gold, pred):
  """Calculates averaged f1, precision and recall for TSD, via the metric defined by Pavlopoulos et al's "SemEval-2021 Task 5: Toxic Spans Detection" (2021)

  Param gold: the gold labels (list of lists of character indices; string representations of lists are parsed)
  Param pred: system predictions (list of lists of character indices; string representations of lists are parsed)
  Returns: a dictionary holding precision, recall and f1, each averaged over the tweets
  Raises: statistics.StatisticsError when there are no tweets to average over.
  """
  all_precision = []
  all_recall = []
  all_f1 = []

  for tweet_gold, tweet_pred in zip(gold, pred):

    if isinstance(tweet_gold, str):
      tweet_gold = ast.literal_eval(tweet_gold)
    if isinstance(tweet_pred, str):
      tweet_pred = ast.literal_eval(tweet_pred)

    gold_chars, pred_chars = set(tweet_gold), set(tweet_pred)

    if not gold_chars and not pred_chars:
      # Nothing to find and nothing predicted counts as a perfect score.
      precision, recall, f1 = 1, 1, 1
    elif not gold_chars or not pred_chars:
      # Exactly one of the two is empty: every prediction is wrong or everything was missed.
      precision, recall, f1 = 0, 0, 0
    else:
      n_correct = len(gold_chars.intersection(pred_chars))
      precision = n_correct/len(pred_chars)
      recall = n_correct/len(gold_chars)
      # Without overlap precision + recall is 0; f1 is defined as 0 in that case.
      f1 = 2*(precision*recall)/(precision+recall) if n_correct else 0

    all_precision.append(precision)
    all_recall.append(recall)
    all_f1.append(f1)

  return {
          'precision': mean(all_precision),
          'recall': mean(all_recall),
          'f1': mean(all_f1),
          }

def _locate_best_checkpoint(destination_folder):
  """Finds the checkpoint of the epoch with the highest eval_f1 in a fine-tuning output folder.

  The look-up relies on one checkpoint per epoch at a constant step interval: the folder of
  epoch k is taken to be checkpoint-{k * smallest checkpoint number}.
  Param destination_folder: the folder that holds the 'checkpoint-<step>' sub-folders.
  Returns: a tuple (best_epoch, path to the checkpoint folder of that epoch).
  Raises: ValueError if the folder holds no checkpoints or the trainer state holds no eval_f1 scores.
  """
  checkpoint_numbers = [int(file.split('-')[1]) for file in os.listdir(destination_folder) if file.startswith('checkpoint')]
  if not checkpoint_numbers:
    raise ValueError(f"no checkpoint folders found in {destination_folder}")
  latest_checkpoint = max(checkpoint_numbers)
  checkpoint_size = min(checkpoint_numbers)

  # The trainer state of the latest checkpoint holds the evaluation log of all epochs so far.
  with open(f"{destination_folder}/checkpoint-{latest_checkpoint}/trainer_state.json", 'r') as infile:
    trainer_state = json.load(infile)
  all_epoch_f1_scores = [float(epoch['eval_f1']) for epoch in trainer_state['log_history'] if 'eval_f1' in epoch]
  if not all_epoch_f1_scores:
    raise ValueError(f"no eval_f1 scores found in the trainer state of checkpoint-{latest_checkpoint}")

  best_epoch = all_epoch_f1_scores.index(max(all_epoch_f1_scores))+1
  return best_epoch, f"{destination_folder}/checkpoint-{checkpoint_size*best_epoch}"


def predict_TSD(model_names, model_folder, gold_file, find_best_checkpoint = True, manual_checkpoint = None):
  """Predicts toxic spans on a gold file with one or more fine-tuned models.

  Param model_names: the names of the pretrained models that were fine-tuned (also used to load the tokenizer).
  Param model_folder: the folder that holds, per model, a sub-folder '<model name with / replaced by _>_trained_for_TSD'.
  Param gold_file: the path to a .csv file with a 'text' and a 'spans' column.
  Param find_best_checkpoint: if True, the checkpoint of the epoch with the highest eval_f1 is loaded.
  Param manual_checkpoint: the checkpoint number to load when find_best_checkpoint is False.
  Returns: the dataframe read from gold_file, extended with a 'gold words' column and one
    'predictions <model name>' column per model.
  Raises: ValueError if find_best_checkpoint is False and no manual_checkpoint is given.
  """
  if not find_best_checkpoint and manual_checkpoint is None:
    raise ValueError("manual_checkpoint must be given when find_best_checkpoint is False")

  df = pd.read_csv(gold_file)
  texts = df['text']
  gold_spans = [ast.literal_eval(span) for span in list(df['spans'])]
  df["gold words"] = spans_to_words(gold_spans, df['text'])

  for chkpoint in model_names:

    print(f'model: {chkpoint}')

    #preprocess data
    print('downloading tokenizer...')
    tokenizer = AutoTokenizer.from_pretrained(chkpoint)
    print('processing data...')
    datasetdict = preprocess({'test':gold_file}, tokenizer)

    destination_folder = f"{model_folder}/{chkpoint.replace('/', '_')}_trained_for_TSD"
    if find_best_checkpoint:
      best_epoch, best_model = _locate_best_checkpoint(destination_folder)
      print(f'best epoch: {best_epoch}')
    else:
      best_model = f"{destination_folder}/checkpoint-{manual_checkpoint}"

    print('loading best model...')
    trained_model = AutoModelForTokenClassification.from_pretrained(best_model, num_labels = 2)

    #predict data
    test_trainer = Trainer(trained_model)
    print("predicting...")
    raw_pred, _, _ = test_trainer.predict(datasetdict['test'])
    # The highest scoring of the two labels is the prediction for each token.
    binary_predictions = np.argmax(raw_pred, axis=-1)
    predicted_spans = convert_token_predictions_to_spans(binary_predictions, datasetdict['test'], texts)

    #add data to the df
    df[f"predictions {chkpoint}"] = predicted_spans
    print("predictions finished\n")

  return df


# Making predictions
Fill in the model checkpoints for which you want to make predictions below and run the cell.

In [73]:
# Names of the models whose fine-tuned versions should make predictions.
model_checkpoints =  ['facebook/xlm-v-base', 'xlm-roberta-base', 'bert-base-multilingual-cased', 'distilbert-base-multilingual-cased']
# Folder on Google Drive that holds the fine-tuned models.
model_folder = f"/content/drive/MyDrive/models_trained_for_TSD"
# Test set with the gold spans.
gold_file = "/content/Cross_lingual_TSD/data/annotated_test_set_hashtags_removed.csv"

# To load a specific checkpoint instead of the best one, pass e.g. find_best_checkpoint=False, manual_checkpoint=28.
TSD_df = predict_TSD(model_checkpoints, model_folder, gold_file)


model: facebook/xlm-v-base
downloading tokenizer...
processing data...
best epoch: 3
loading best model...
predicting...
dict_items([('test', <__main__.TSDdataset object at 0x7fe3696b3820>)])


predictions finished

model: xlm-roberta-base
downloading tokenizer...
processing data...
best epoch: 4
loading best model...
predicting...
dict_items([('test', <__main__.TSDdataset object at 0x7fe36c308cd0>)])


predictions finished

model: bert-base-multilingual-cased
downloading tokenizer...
processing data...
best epoch: 3
loading best model...
predicting...
dict_items([('test', <__main__.TSDdataset object at 0x7fe36c321fc0>)])


predictions finished

model: distilbert-base-multilingual-cased
downloading tokenizer...
processing data...
best epoch: 4
loading best model...
predicting...
dict_items([('test', <__main__.TSDdataset object at 0x7fe3696b3820>)])


predictions finished



# Predictions are finished.
TSD_df now holds predictions for all language models.
The next function calculates the evaluation metrics for each model and prints them.

In [52]:
def print_eval_metrics(model_checkpoints, df):
  """Prints and returns the evaluation metrics for every model in the output of predict_TSD.

  Param model_checkpoints: the model names; df must hold a column 'predictions <name>' for each of them.
  Param df: the dataframe that is the output of predict_TSD; its 'spans' column holds the gold spans.
  Returns: a dict that maps every model name to its dict of metrics.
  """
  gold = df['spans']
  model_f1_dict = {}
  for chkpoint in model_checkpoints:
    print(f'checkpoint: {chkpoint}')
    metrics = calculate_evaluation_metrics(gold, df[f'predictions {chkpoint}'])
    # Print every metric except an 'f1_dict' entry, should the metrics contain one.
    printable = {name: value for name, value in metrics.items() if name != 'f1_dict'}
    for name, value in printable.items():
      print(f'{name} : {value}')
    model_f1_dict[chkpoint] = metrics
    print()

  return model_f1_dict

# NOTE: this binds the module-level name `eval`, which shadows Python's builtin eval() from here on.
eval = print_eval_metrics(model_checkpoints, TSD_df)

checkpoint: facebook/xlm-v-base
precision : 0.7127841383348249
recall : 0.6917749659067799
f1 : 0.6583874396831174

checkpoint: xlm-roberta-base
precision : 0.6966266610784485
recall : 0.6261557998625287
f1 : 0.6202935622472588

checkpoint: bert-base-multilingual-cased
precision : 0.47067908584936446
recall : 0.414969253774076
f1 : 0.40377840431918266

checkpoint: distilbert-base-multilingual-cased
precision : 0.43779374399438864
recall : 0.3863564265044897
f1 : 0.37917760044139975



# Lexicon system

Run the following cell to run a lexicon look-up system and add the results to the TSD_df.

In [77]:
# Install stanza (unpinned) and download the Dutch ('nl') models.
!pip install stanza
import stanza
stanza.download('nl')
# Module-level pipeline; lexicon_baseline reads this global `nlp`.
nlp=stanza.Pipeline(lang='nl')

def lexicon_baseline(lexicon_file, tweet_file):
  """A lexicon baseline system that predicts toxic spans in tweets.

  Every word whose lemma, as assigned by the global stanza pipeline `nlp`, occurs in the lexicon
  is marked as toxic.
  Param lexicon_file: the path to the tab-separated lexicon file; its 'lemma' column is used.
  Param tweet_file: the path to the .csv file that holds the to be predicted tweets in its 'text' column.
  Returns: a list of lists that hold the predicted spans (character indices) for each tweet.
  """
  tweets = list(pd.read_csv(tweet_file)['text'])
  # A set makes the look-up independent of the lexicon size and ensures that a lemma listed
  # more than once in the lexicon marks a word only once.
  lexicon_lemmas = set(pd.read_csv(lexicon_file, sep='\t')['lemma'])

  lexicon_spans = []
  for tweet in tweets:
    doc = nlp(tweet)
    spans = []
    for sent in doc.sentences:
      for word in sent.words:
        if word.lemma not in lexicon_lemmas:
          continue
        start = int(word.start_char)
        end = int(word.end_char)
        # Leave the leading '#' of a hashtag out of the span.
        if word.text.startswith('#'):
          start += 1
        spans.extend(range(start, end))

    lexicon_spans.append(spans)

  return lexicon_spans

# Run the lexicon baseline on the gold file and store its predictions in TSD_df.
lexicon_spans = lexicon_baseline('/content/DALC/GROF_LEX/groflex.tsv',gold_file)
TSD_df['predictions lexicon'] = lexicon_spans





Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: nl (Dutch) ...
INFO:stanza:File exists: /root/stanza_resources/nl/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| pos       | alpino  |
| lemma     | alpino  |
| depparse  | alpino  |
| ner       | conll02 |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


847


# Voting ensembles

Run the following cell to create a majority voting ensemble. By changing the 'df' and
'n' parameters, we can change the ensemble.

In [78]:
def ensemble(df, n=None):
  """Creates an ensemble given a df with predictions of multiple systems.

  Param df: a pandas dataframe that only contains columns with predictions we want to include in the
    ensemble. Every cell holds a list of character indices (or the string representation of one).
  Param n: the number of systems that should have predicted a character index to make it into the
    vote. Defaults to the number of systems / 2, rounded up.
  Returns: a list of ensemble voting predictions: per row the sorted character indices with at least n votes.
  """
  if n is None:
    # Half of the systems when their number is even, the strict majority when it is uneven.
    n = (len(df.columns) + 1) // 2

  ensemble_predictions = []
  for _, row in df.iterrows():
    votes = {}
    for pred in row:
      if isinstance(pred, str):
        pred = ast.literal_eval(pred)
      # A system has one vote per character index, however often it lists that index.
      for idx in set(pred):
        votes[idx] = votes.get(idx, 0) + 1

    ensemble_predictions.append(sorted(idx for idx, count in votes.items() if count >= n))

  return ensemble_predictions

# Vote over xlm-v, xlm-roberta and the lexicon system; with three systems the default n is 2.
TSD_df['predictions best ensemble'] = ensemble(TSD_df[['predictions facebook/xlm-v-base', 'predictions xlm-roberta-base', 'predictions lexicon']])

# Save to file
Enter the file path to save the predictions to.

In [80]:
# Replace the placeholder with the path the predictions should be written to.
predictions_file = "/content/[YOUR_FILE_HERE].csv"
TSD_df.to_csv(predictions_file)