In [None]:
!pip install datasets
!pip install lime
!pip install jsonlines



In [None]:
import pandas as pd
import numpy as np
import re
import jsonlines
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
import lime
import json
from lime.lime_text import LimeTextExplainer
from datasets import load_dataset
from sklearn.metrics import f1_score
torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True

In [None]:
class RNN_model(nn.Module):
  """(Bi)LSTM text classifier: embedding -> stacked LSTM -> dropout -> linear head.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    dropout_rate: dropout applied between LSTM layers and before the linear head.
    lstm_units: hidden size of each LSTM direction.
    lstm_layers: number of stacked LSTM layers.
    bidirectional: whether the LSTM also reads the sequence right-to-left.
    pad_idx: vocabulary index of the padding token (its embedding gets no gradient).
    output_dim: number of output logits per example.
  """
  def __init__(self, vocab_size, embedding_dim, dropout_rate, lstm_units, lstm_layers, bidirectional, pad_idx, output_dim):
    super(RNN_model, self).__init__()

    self.lstm_layers = lstm_layers
    self.num_directions = 2 if bidirectional else 1
    self.lstm_units = lstm_units

    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
    self.dropout = nn.Dropout(dropout_rate)
    # Inter-layer dropout only exists when there is more than one layer; passing 0
    # for a single layer gives the same network without PyTorch's warning.
    inter_layer_dropout = dropout_rate if lstm_layers > 1 else 0
    self.bilstm = nn.LSTM(embedding_dim, lstm_units, lstm_layers, bidirectional = bidirectional, dropout = inter_layer_dropout)
    self.linear = nn.Linear(lstm_units * self.num_directions, output_dim)

  def forward(self, text, text_lengths):
    """Return raw logits of shape (batch, output_dim).

    Args:
      text: int64 token ids laid out as (seq_len, batch), i.e. not batch-first.
      text_lengths: true length of every sequence in the batch, sorted in
        decreasing order (pack_padded_sequence is used with its default
        enforce_sorted behaviour).
    """
    x = self.embeddings(text)
    # pack_padded_sequence wants the lengths on the CPU, whatever device the
    # inputs live on; accept tensors (any device) as well as plain lists.
    lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
    x = nn.utils.rnn.pack_padded_sequence(x, lengths)
    _, (h_n, _) = self.bilstm(x)

    if self.num_directions == 2:
      # Last layer: h_n[-2] is the forward direction, h_n[-1] the backward one.
      final_state = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
    else:
      # Unidirectional: only the last layer's single final state exists, which is
      # also what the linear head is sized for.
      final_state = h_n[-1,:,:]

    out = self.linear(self.dropout(final_state))
    return(out)

In [None]:
# Fetch the training split of the Civil Comments corpus from the Hugging Face hub
# (the download is cached, so later runs reuse the local copy).
dataset = load_dataset("civil_comments", split='train')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2445.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1377.0, style=ProgressStyle(description…

Using custom data configuration default



Downloading and preparing dataset civil_comments/default (download: 395.73 MiB, generated: 630.60 MiB, post-processed: Unknown size, total: 1.00 GiB) to /root/.cache/huggingface/datasets/civil_comments/default/0.9.0/98bdc73fc77a117cf5d17c9977e278c8023c64177a3ed9e0c49f7a5bdf10a47b...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=414947977.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset civil_comments downloaded and prepared to /root/.cache/huggingface/datasets/civil_comments/default/0.9.0/98bdc73fc77a117cf5d17c9977e278c8023c64177a3ed9e0c49f7a5bdf10a47b. Subsequent calls will reuse this data.


In [None]:
# Look at one record to see which fields (text plus per-category scores) are available.
dataset[0]

{'identity_attack': 0.0,
 'insult': 0.0,
 'obscene': 0.0,
 'severe_toxicity': 0.0,
 'sexual_explicit': 0.0,
 'text': "This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",
 'threat': 0.0,
 'toxicity': 0.0}

In [None]:
# Partition the corpus by annotator scores:
#   * non-toxic: both `toxicity` and `severe_toxicity` are exactly 0
#   * toxic:     either score is above 0.5
# Comments that fall in between are left out of both lists.
non_toxic_comments, toxic_comments = [], []

for comment in dataset:
  toxicity, severe = comment['toxicity'], comment['severe_toxicity']
  if toxicity == 0 and severe == 0:
    non_toxic_comments.append(comment['text'])
    continue
  if toxicity > 0.5 or severe > 0.5:
    toxic_comments.append(comment['text'])

In [None]:
# Class sizes after filtering: non-toxic first, then toxic.
print(len(non_toxic_comments))
print(len(toxic_comments))

1248717
106438


In [None]:
# Texts of the ToxicSpan splits; any Civil Comments entry that also appears there
# is dropped so the classifier never trains on the span-evaluation data.
toxic_data_train = pd.read_csv('/content/drive/My Drive/ToxicSpan_CS669V/train_split.csv')
toxic_data_val = pd.read_csv('/content/drive/My Drive/ToxicSpan_CS669V/val_split.csv')
toxic_data_test = pd.read_csv('/content/drive/My Drive/ToxicSpan_CS669V/test_split.csv')

toxic_test = toxic_data_test['text'].to_list()

# A set gives O(1) membership tests instead of scanning three lists per comment.
held_out_texts = set(toxic_data_train['text']) | set(toxic_data_val['text']) | set(toxic_test)

# Build the filtered list in one pass. Calling list.remove() while iterating the
# same list skips the element that follows every removal, so overlapping comments
# could survive the filter.
toxic_comments[:] = [comment for comment in toxic_comments if comment not in held_out_texts]

# 80/20 train/validation split of the toxic comments ...
np.random.shuffle(toxic_comments)
split_at = int(0.8*len(toxic_comments))
toxic_train, toxic_val = toxic_comments[:split_at], toxic_comments[split_at:]

# ... and an equally sized, non-overlapping sample of non-toxic comments for each
# split, so both classes are balanced.
np.random.shuffle(non_toxic_comments)
non_toxic_train = non_toxic_comments[:len(toxic_train)]
non_toxic_val = non_toxic_comments[len(toxic_train):len(toxic_train)+len(toxic_val)]

In [None]:
# Sanity check: the toxic and non-toxic halves of each split should have equal sizes.
print(len(toxic_train))
print(len(toxic_val))
print(len(non_toxic_train))
print(len(non_toxic_val))

78948
19737
78948
19737


In [None]:
# Float labels: 1.0 for every toxic comment followed by 0.0 for every non-toxic one.
Y_train = [1.0] * len(toxic_train) + [0.0] * len(non_toxic_train)
Y_val = [1.0] * len(toxic_val) + [0.0] * len(non_toxic_val)

In [None]:
# Texts for each split: toxic comments first, then the non-toxic ones.
X_train = [*toxic_train, *non_toxic_train]
X_val = [*toxic_val, *non_toxic_val]

In [None]:
def deEmojify(text):
    """Replace every run of consecutive emoji characters in ``text`` with ``[emoji]``.

    Only the four Unicode ranges listed below are treated as emoji.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
    )
    emoji_run = re.compile("[" + "".join(emoji_ranges) + "]+", flags = re.UNICODE)
    return emoji_run.sub(r'[emoji]', text)

from string import punctuation
def strip_punctuation(text):
    """Return ``text`` without any character that appears in ``string.punctuation``."""
    kept = []
    for ch in text:
        if ch in punctuation:
            continue
        kept.append(ch)
    return ''.join(kept)

import unicodedata
def convert_accented_chars(text):
    """Fold ``text`` to plain ASCII.

    The text is NFKD-decomposed so accented letters split into a base letter plus
    combining marks; every character that is still non-ASCII afterwards (the
    marks, but also e.g. emoji) is then silently dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def minimize_space(text):
  """Collapse each run of consecutive space characters in ``text`` into one space.

  Only the literal space character is affected; tabs and newlines are untouched.
  """
  space_run = re.compile(' +')
  collapsed = space_run.sub(' ', text)
  return collapsed

def preprocess(text):
  """Clean every string of the list ``text`` in place and return the list.

  Each entry is folded to ASCII, has emoji runs replaced, and has punctuation
  removed, in that order.

  Args:
    text: mutable sequence of strings; it is modified in place.

  Returns:
    The same sequence object, fully processed. An empty input is returned unchanged.
  """
  for i, raw in enumerate(text):
    cleaned = convert_accented_chars(raw)
    cleaned = deEmojify(cleaned)
    text[i] = strip_punctuation(cleaned)
  # The return must sit outside the loop: returning from inside it stopped after
  # the first element and produced None for an empty list.
  return(text)

def _clean_and_tokenize(comment):
  """Normalise one raw comment and split it into lower-cased tokens on single spaces."""
  comment = convert_accented_chars(comment)
  comment = strip_punctuation(comment)
  comment = deEmojify(comment)
  comment = minimize_space(comment)
  return comment.lower().split(' ')

# Both splits go through the same pipeline; slice assignment keeps the original
# list objects and replaces each string with its token list.
for split_texts in (X_train, X_val):
  split_texts[:] = [_clean_and_tokenize(comment) for comment in split_texts]

In [None]:
# Spot-check the tokenised form of the first training comment.
print(X_train[0])

['tabby', 'is', 'a', 'vacuous', 'twit', 'much', 'like', 'many', 'of', 'the', 'other', 'intolerant', 'feminists', 'nursing', 'leah', 'the', 'globe', 'publishes']


In [None]:
# Serialise both splits as JSON-lines files: one {"text": [...tokens], "label": [y]}
# object per line.
train_data = [{"text" : X_train[i], "label" : [Y_train[i]]} for i in range(len(Y_train))]
val_data = [{"text" : X_val[i], "label" : [Y_val[i]]} for i in range(len(Y_val))]

with jsonlines.open('/content/drive/My Drive/ToxicSpan_CS669V/LIME_Baseline/train_new.json', mode='w') as f:
  f.write_all(train_data)

with jsonlines.open('/content/drive/My Drive/ToxicSpan_CS669V/LIME_Baseline/val_new.json', mode='w') as f:
  f.write_all(val_data)

In [None]:
# TEXT batches carry the sequence lengths next to the padded token ids
# (the training code unpacks `batch.text` into a (text, text_lengths) pair).
TEXT = data.Field(include_lengths=True)
# Labels need neither an unknown token nor padding.
LABEL = data.Field(unk_token=None, pad_token=None)

In [None]:
# Map each JSON key to (attribute name on the example, Field that processes it).
fields = {'text' : ('text', TEXT), 'label' : ('label', LABEL)}

In [None]:
# Read back the JSON-lines files written by the export cell above
# (train_new.json / val_new.json), so a fresh Restart & Run All is self-consistent.
# `skip_header` is deliberately not set: JSON-lines files have no header row, and
# with format='json' that flag simply discards the first record of every file.
train_data, val_data = data.TabularDataset.splits(
                              path = '/content/drive/My Drive/ToxicSpan_CS669V/LIME_Baseline',
                              train = 'train_new.json',
                              validation = 'val_new.json',
                              format = 'json',
                              fields = fields
)

In [None]:
# Cap on the number of distinct tokens kept in the vocabulary.
MAX_VOCAB_SIZE = 50000
# Build the token vocabulary from the training split only, attach 100-d GloVe
# vectors, and initialise tokens without a pretrained vector via Tensor.normal_.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, vectors='glove.6B.100d', unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [None]:
# Vocabulary size (the recorded output shows MAX_VOCAB_SIZE plus two extra entries)
# and the label -> index mapping.
print(len(TEXT.vocab))
print(LABEL.vocab.stoi)

50002
defaultdict(<function _default_unk_index at 0x7f691bceb400>, {0.0: 0, 1.0: 1})


In [None]:
# Number of examples per batch.
BATCH_SIZE = 500

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batches group examples of similar length. Each batch is additionally sorted by
# length because RNN_model.forward packs the sequences with pack_padded_sequence
# without disabling its sorted-input requirement.
train_iterator, val_iterator = data.BucketIterator.splits(
    (train_data, val_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    sort_key = lambda x: len(x.text),
    device = device
)

In [None]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)      # one embedding row per vocabulary entry
EMBEDDING_DIM = 100              # must match the 100-d GloVe vectors loaded into the vocab
HIDDEN_DIM = 256                 # LSTM hidden size per direction
OUTPUT_DIM = 1                   # a single logit (binary toxic / non-toxic)
LSTM_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT_RATE = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # index of the padding token in the vocabulary

model = RNN_model(INPUT_DIM, EMBEDDING_DIM, DROPOUT_RATE, HIDDEN_DIM, LSTM_LAYERS, BIDIRECTIONAL, PAD_IDX, OUTPUT_DIM)

In [None]:
# Show the layer structure of the freshly built model.
print(model)

RNN_model(
  (embeddings): Embedding(50002, 100, padding_idx=1)
  (dropout): Dropout(p=0.5, inplace=False)
  (bilstm): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=512, out_features=1, bias=True)
)


In [None]:
def count_parameters(model):
  """Return how many scalar values the model's trainable parameters hold in total.

  Parameters whose ``requires_grad`` flag is off are not counted.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

# Number of trainable parameters in the model built above.
print(count_parameters(model))

7310857


In [None]:
# Embedding matrix aligned with the vocabulary (one row per vocabulary entry).
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([50002, 100])


In [None]:
# Overwrite the randomly initialised embedding table with the vocabulary's pretrained vectors.
model.embeddings.weight.data.copy_(pretrained_embeddings)

tensor([[-1.4220e+00,  9.9518e-01, -2.1328e-01,  ...,  6.9932e-01,
          1.8198e-01, -9.1527e-01],
        [ 1.0569e+00, -1.8136e+00,  1.0866e+00,  ..., -1.0926e+00,
          1.6550e-03,  1.4007e+00],
        [-3.8194e-02, -2.4487e-01,  7.2812e-01,  ..., -1.4590e-01,
          8.2780e-01,  2.7062e-01],
        ...,
        [ 4.6875e-01, -2.7269e-01,  2.5069e-01,  ..., -1.1505e+00,
          3.4400e-02, -7.2481e-01],
        [-8.6831e-02,  3.0148e-02, -1.6625e-01,  ...,  7.4492e-02,
         -4.7161e-02, -3.0928e-01],
        [ 4.1129e-01, -4.0361e-01, -5.6629e-02,  ...,  4.4429e-01,
         -2.9886e-01,  4.1527e-02]])

In [None]:
# Index of the unknown-word token in the vocabulary.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Start the unknown and padding rows at all zeros instead of the values copied
# from the vocabulary vectors.
model.embeddings.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embeddings.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [None]:
import torch.optim as optim

# Adam with default settings; BCEWithLogitsLoss applies the sigmoid internally,
# so the model is expected to emit raw logits.
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

# Follow `device` (which falls back to the CPU) instead of calling .cuda()
# unconditionally, which fails on a machine without a GPU.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def bin_acc(preds, y):
  """Count how many predictions match the targets.

  ``preds`` are raw logits: they are squashed with a sigmoid and rounded to 0/1
  before being compared element-wise with ``y``. The result is a tensor holding
  the number of matches (a count, not a ratio).
  """
  hard_labels = torch.sigmoid(preds).round()
  n_correct = hard_labels.eq(y).sum()
  return n_correct

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one training epoch and return ``(mean batch loss, accuracy)``.

  Args:
    model: module called as ``model(text, text_lengths)`` that returns logits of
      shape (batch, output_dim).
    iterator: iterable of batches exposing ``batch.text`` (a ``(text, lengths)``
      pair with ``text`` laid out as (seq_len, batch)) and ``batch.label``.
    optimizer: optimiser stepping ``model``'s parameters.
    criterion: loss taking ``(predictions, float labels)``.

  Returns:
    Tuple of the loss averaged over batches and the fraction of correctly
    classified examples; ``(0.0, 0.0)`` when the iterator yields no batch.
  """
  epoch_loss = 0
  epoch_acc = 0

  model.train()
  count = 0
  num = 0
  for batch in iterator:
    optimizer.zero_grad()
    count += 1
    text, text_lengths = batch.text
    # (batch, output_dim) -> (output_dim, batch)
    # assumes batch.label is laid out as (1, batch) — TODO confirm
    predictions = torch.transpose(model(text, text_lengths), 0, 1)
    # Cast the labels once, on whatever device the predictions live on, instead
    # of round-tripping through a CPU FloatTensor and a hard-coded .cuda().
    labels = batch.label.to(device=predictions.device, dtype=torch.float)

    loss = criterion(predictions, labels)
    acc = bin_acc(predictions, labels)

    loss.backward()
    optimizer.step()

    epoch_loss += float(loss.item())
    epoch_acc += acc.item()
    num += text.shape[1]

  if count == 0 or num == 0:
    # Nothing was processed; avoid dividing by zero.
    return(0.0, 0.0)
  return(epoch_loss/count, epoch_acc/num)

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating it.

    Args:
        model: module called as ``model(text, text_lengths)`` that returns logits
            of shape (batch, output_dim).
        iterator: iterable of batches exposing ``batch.text`` (a
            ``(text, lengths)`` pair with ``text`` laid out as (seq_len, batch))
            and ``batch.label``.
        criterion: loss taking ``(predictions, float labels)``.

    Returns:
        Tuple of the loss averaged over batches and the fraction of correctly
        classified examples; ``(0.0, 0.0)`` when the iterator yields no batch.
    """
    epoch_loss = 0
    epoch_acc = 0
    num = 0
    count = 0
    model.eval()

    with torch.no_grad():

        for batch in iterator:
            count += 1
            text, text_lengths = batch.text
            # (batch, output_dim) -> (output_dim, batch)
            # assumes batch.label is laid out as (1, batch) — TODO confirm
            predictions = torch.transpose(model(text, text_lengths), 0, 1)
            # Cast the labels once, on the predictions' device, instead of
            # round-tripping through a CPU FloatTensor and a hard-coded .cuda().
            labels = batch.label.to(device=predictions.device, dtype=torch.float)

            loss = criterion(predictions, labels)
            acc = bin_acc(predictions, labels)

            epoch_loss += float(loss.item())
            epoch_acc += acc.item()
            num += text.shape[1]

    if count == 0 or num == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0
    # Batches are counted while iterating, so iterables without __len__ also work.
    return epoch_loss / count, epoch_acc / num

In [None]:
# Upper bound on the number of passes over the training data.
EPOCHS = 20
# Best validation loss seen so far; starts at a value any real loss will beat.
val_loss_min = 100000

for epoch in range(EPOCHS):

  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  val_loss, val_acc = evaluate(model, val_iterator, criterion)

  print(f"Epoch: {epoch+1} \n Train Loss:{train_loss} | Train Acc:{train_acc} \n Val Loss:{val_loss} | Val Acc:{val_acc}")
  
  # Checkpoint only when the validation loss improves, so the file on disk always
  # holds the best model so far. The whole model object is pickled, not just its
  # state_dict.
  if(val_loss < val_loss_min):
    torch.save(model, '/content/drive/My Drive/model9.pt')
    val_loss_min = val_loss
    print("Model Checkpoint Saved")

Epoch: 1 
 Train Loss:0.42024407351884663 | Train Acc:0.8089743183761361 
 Val Loss:0.3002803448257567 | Val Acc:0.8712537683986522
Model Checkpoint Saved
Epoch: 2 
 Train Loss:0.24127172947495798 | Train Acc:0.904455492574179 
 Val Loss:0.21355608426317385 | Val Acc:0.9183492513870241
Model Checkpoint Saved
Epoch: 3 
 Train Loss:0.18367187650520592 | Train Acc:0.9310871148548086 
 Val Loss:0.19782874384258367 | Val Acc:0.924657360727586
Model Checkpoint Saved
Epoch: 4 
 Train Loss:0.15725760730101337 | Train Acc:0.9419867633553944 
 Val Loss:0.2005571162776102 | Val Acc:0.9242520203683531
Epoch: 5 
 Train Loss:0.13926357761779917 | Train Acc:0.9487887520187467 
 Val Loss:0.20865662188469608 | Val Acc:0.9220226483925721
Epoch: 6 
 Train Loss:0.11891520183674897 | Train Acc:0.9574590709015485 
 Val Loss:0.20752696645788 | Val Acc:0.9245813594102298
Epoch: 7 
 Train Loss:0.10261420134596433 | Train Acc:0.9637163938060104 
 Val Loss:0.22642314151118073 | Val Acc:0.9209332961771337
Epoch: 

KeyboardInterrupt: ignored

In [None]:
#The following code block has been changed for PyTorch model, originally taken from https://github.com/ipavlopoulos/toxic_spans/blob/master/baselines/models.py

class InputErasure:
    def __init__(self,
                 classifier,
                 text,
                 one_by_one=False,
                 tokenise=lambda txt: txt.split(),
                 class_names=[0, 1],
                 mask=u"[mask]",
                 threshold=0.2,
                 reshape_predictions=True):
        """
        Given a classifier, InputErasure masks one token at a time and flags the tokens whose removal makes the
        toxicity score drop the most.
        This implementation is based on the paper "Understanding Neural Networks through Representation Erasure" by
        Li et al.
        :param classifier: module called as ``classifier(token_ids, lengths)`` with ``token_ids`` laid out as
                           (seq_len, batch); it must return one toxicity logit per text
        :param text: the already tokenised input as a sequence of integer vocabulary ids (not a raw string)
        :param one_by_one: stored for interface compatibility; not used by this implementation
        :param tokenise: accepted for interface compatibility; not used because ``text`` is already tokenised
        :param class_names: by default "toxic" is represented by 1 and "civil" by 0
        :param mask: the vocabulary id written over the erased token (also used by get_mitigated_text)
        :param threshold: a token is flagged when erasing it lowers the toxic probability by more than this
                          fraction of the initial probability (default 0.2)
        :param reshape_predictions: stored for interface compatibility; not used by this implementation
        """
        self.class_names = class_names
        self.classifier = classifier
        self.mask = mask
        self.one_by_one = one_by_one
        self.reshape_predictions = reshape_predictions
        self.texts = text
        self.words = text
        self.threshold = threshold
        # Small constant keeping the relative decrease finite when the initial score is ~0.
        self.e = 10e-05
        self.ablations, self.indices = self.create_ablations()
        if len(self.words) == 0:
            # Nothing to erase: produce empty results instead of crashing in the classifier.
            self.initial_score = 0.0
            self.scores = torch.empty(0)
        else:
            self.initial_score = self.clf_predict(self.texts, False).item()
            self.scores = self.clf_predict(self.ablations)
        # Relative drop of the toxic probability caused by erasing each token, as plain floats.
        self.scores_decrease = [(self.initial_score - s) / (self.initial_score + self.e)
                                for s in self.scores.tolist()]
        self.black_list = self.get_black_list()

    def _classifier_device(self):
        """Return the device holding the classifier's parameters (CPU when it has none)."""
        try:
            return next(self.classifier.parameters()).device
        except (AttributeError, StopIteration):
            return torch.device('cpu')

    def clf_predict(self, texts, batch=True):
        """
        Score one text (``batch=False``) or a list of equally long texts (``batch=True``).
        :param texts: a sequence of token ids, or a list of such sequences when ``batch`` is True
        :param batch: whether ``texts`` holds several texts
        :return: 1-D tensor with the toxic probability of every text
        """
        self.classifier.eval()
        if len(texts) == 0:
            return torch.empty(0)
        token_ids = torch.tensor(texts, dtype=torch.int64, device=self._classifier_device())
        if batch:
            # Rows are texts, but the classifier wants (seq_len, batch). A transpose is required here:
            # reshaping with view() would feed the columns of the ablation matrix as if they were texts.
            token_ids = token_ids.t()
        else:
            token_ids = token_ids.unsqueeze(1)
        seq_len, n_texts = token_ids.shape
        # Every text has the full length; lengths stay on the CPU, which pack_padded_sequence accepts
        # for inputs on any device.
        text_lengths = torch.full((n_texts,), seq_len, dtype=torch.int64)
        with torch.no_grad():
            logits = self.classifier(token_ids, text_lengths)
        # The classifier emits logits; the relative-decrease rule is only meaningful on probabilities.
        return torch.sigmoid(logits).reshape(-1)

    def create_ablations(self):
        """Return (ablations, indices): one copy of the text per token, with that token replaced by the mask."""
        ablations, indices = [], []
        for i in range(len(self.words)):
            words_copy = list(self.words)
            words_copy[i] = self.mask
            ablations.append(words_copy)
            indices.append(i)
        return ablations, indices

    def get_black_list(self):
        """Return the indices of the tokens whose erasure drops the score by more than the threshold."""
        return [self.indices[i] for i, s in enumerate(self.scores_decrease) if s > self.threshold]

    def get_toxic_offsets(self):
        """Return a 0/1 flag per token, 1 marking the tokens on the black list."""
        flagged = set(self.black_list)
        return [1 if i in flagged else 0 for i in range(len(self.words))]

    def get_mitigated_text(self):
        """Return the text as a space-joined string with the black-listed tokens replaced by the mask."""
        flagged = set(self.black_list)
        # str() lets this work for integer token ids as well as for string tokens.
        return " ".join(str(self.mask) if i in flagged else str(w) for i, w in enumerate(self.words))

    def get_as_pandas(self):
        """Return a DataFrame of tokens, their indices and score decreases, sorted by increasing decrease."""
        scores_pd = pd.DataFrame({"word": self.words, "indices": self.indices, "score_dec": self.scores_decrease})
        scores_pd = scores_pd.sort_values(by=["score_dec"])
        return scores_pd


class LimeUsd(InputErasure):

    def __init__(self,
                 classifier,
                 text,
                 one_by_one=False,
                 tokenise=lambda txt: txt.split(' '),
                 class_names=[0, 1],
                 mask=u"[mask]",
                 threshold=0.2,
                 reshape_predictions=True):
        """
        Given a classifier, LimeUsd flags the tokens to which LIME assigns a weight above the threshold.
        This implementation is based on LIME.
        :param classifier: module called as ``classifier(token_ids, lengths)`` with ``token_ids`` laid out as
                           (seq_len, batch); it must return one toxicity logit per text
        :param text: the already tokenised input as a sequence of integer vocabulary ids (not a raw string)
        :param one_by_one: stored for interface compatibility; not used by this implementation
        :param tokenise: the splitter handed to LIME; by default splits on single spaces
        :param class_names: by default "toxic" is represented by 1 and "civil" by 0
        :param mask: the vocabulary id standing in for an erased token
        :param threshold: a token is flagged when its LIME weight exceeds this value (default 0.2)
        :param reshape_predictions: stored for interface compatibility; not used by this implementation
        """
        self.class_names = class_names
        self.classifier = classifier
        self.mask = mask
        self.one_by_one = one_by_one
        self.reshape_predictions = reshape_predictions
        self.texts = text
        self.words = text
        self.threshold = threshold
        self.ablations, self.indices = self.create_ablations()
        self.explainer = LimeTextExplainer(class_names=self.class_names, split_expression=tokenise)
        if len(self.words) == 0:
            # Nothing to explain: produce empty results instead of crashing in the classifier.
            self.initial_score = None
            self.scores = []
            self.scores_decrease = []
        else:
            self.initial_score = self.clf_predict(self.texts, False)
            self.scores = self.clf_predict(self.ablations)
            self.scores_decrease = self.lime_explain(self.words)
        self.black_list = self.get_black_list()

    def predictor_func(self, text):
        """
        Score the perturbed samples generated by LIME.
        :param text: list of strings, each a space-separated sequence of token ids; an empty field marks a
                     token that LIME removed
        :return: tensor of shape (n_samples, 1) holding the toxic probability of every sample
        """
        # Removed tokens and padding are replaced by the mask id (0 when the mask is not an integer id).
        fill_id = self.mask if isinstance(self.mask, int) else 0
        rows = [[int(tok) if tok != '' else fill_id for tok in sample.split(' ')] for sample in text]
        if not rows:
            return torch.empty((0, 1))
        width = max(len(row) for row in rows)
        for row in rows:
            row.extend([fill_id] * (width - len(row)))

        try:
            device = next(self.classifier.parameters()).device
        except (AttributeError, StopIteration):
            device = torch.device('cpu')

        # Rows are samples, but the classifier wants (seq_len, batch). A transpose is required here:
        # reshaping with view() would interleave tokens that belong to different samples.
        token_ids = torch.tensor(rows, dtype=torch.int64, device=device).t()
        # Lengths stay on the CPU, which pack_padded_sequence accepts for inputs on any device.
        text_lengths = torch.full((len(rows),), width, dtype=torch.int64)
        self.classifier.eval()
        with torch.no_grad():
            logits = self.classifier(token_ids, text_lengths)
        # The classifier emits logits; LIME expects probabilities.
        return torch.sigmoid(logits).reshape(-1, 1)

    def lime_explain(self, words):
        """
        Run LIME on the token-id sequence and return one weight per token, in input order.
        :param words: sequence of integer vocabulary ids
        :return: list with the LIME weight (towards the toxic class) of every token
        """
        num_of_feats = len(words)
        string = ' '.join(str(w) for w in words)

        def predictor(texts):
            # LIME needs an (n_samples, n_classes) array of class probabilities on the CPU.
            toxic = self.predictor_func(texts).reshape(-1).cpu().numpy()
            return np.column_stack((1.0 - toxic, toxic))

        explain = self.explainer.explain_instance(string, predictor, num_features=num_of_feats)
        word2score = dict(explain.as_list())
        # Repeated ids share one LIME feature; a token LIME did not report gets a neutral weight.
        return [word2score.get(str(w), 0.0) for w in words]


In [None]:
# model_load = RNN_model(INPUT_DIM, EMBEDDING_DIM, DROPOUT_RATE, HIDDEN_DIM, LSTM_LAYERS, BIDIRECTIONAL, PAD_IDX, OUTPUT_DIM)
# Restore the best checkpoint written by the training loop. The file holds the whole
# pickled model object, so the RNN_model class must already be defined in this kernel.
# NOTE(review): unpickling runs arbitrary code — only load checkpoints you trust.
model_load = torch.load(('/content/drive/My Drive/model9.pt'))
# Switch to inference mode (disables dropout).
model_load.eval()

RNN_model(
  (embeddings): Embedding(50002, 100, padding_idx=1)
  (dropout): Dropout(p=0.5, inplace=False)
  (bilstm): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=512, out_features=1, bias=True)
)

In [None]:
# Token -> vocabulary index lookup built from the training data.
stoi = TEXT.vocab.stoi

In [None]:
def tokenize_num(stoi, text):
  """Map every token of ``text`` to its index in ``stoi``.

  Tokens missing from ``stoi`` are mapped to the index of ``TEXT.unk_token``.
  The input is not modified; a new list is returned.
  """
  known_tokens = stoi.keys()
  return [stoi[tok] if tok in known_tokens else stoi[TEXT.unk_token] for tok in text]

In [None]:
filepath = '/content/drive/My Drive/ToxicSpan_CS669V/processed/finaldev.pkl'
# The context manager closes the file even if unpickling fails.
# NOTE(review): pickle.load runs arbitrary code from the file — only load data you trust.
with open(filepath, 'rb') as f:
  data_df = pickle.load(f)

In [None]:
# Columns of the processed dev set used below.
X, spans_list, spans = data_df['token_final'], data_df['span_final'], data_df['spans']

# Flatten the per-text label sequences into one long list.
targets = [label for row in data_df['target_final'] for label in row]

In [None]:
# Input-erasure baseline: one 0/1 flag per token, concatenated over all texts.
# Note that the LIME cell further down rebuilds `predictions` from scratch.
predictions = []

for text in X:
  erasure = InputErasure(model_load, tokenize_num(stoi, text), mask=stoi[TEXT.unk_token])
  predictions += erasure.get_toxic_offsets()

In [None]:
# LIME baseline: one 0/1 flag per token, concatenated over all texts.
predictions = []

for text in X:
  erasure = LimeUsd(model_load, tokenize_num(stoi, text), mask=stoi[TEXT.unk_token])
  predictions += erasure.get_toxic_offsets()

In [None]:
# Token-level F1 of the most recently computed `predictions` against the gold labels.
# scikit-learn's signature is f1_score(y_true, y_pred); keyword arguments make the
# roles explicit. Binary F1 is symmetric in its two arguments, so the value is the
# same as with the swapped positional call.
F1score = f1_score(y_true=targets, y_pred=predictions)
F1score

0.09212103599840168