In [1]:
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GRU, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Conv1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
import gc
from sklearn.model_selection import train_test_split
from keras.models import load_model
import tensorflow as tf
from keras.models import model_from_json
from keras.utils.vis_utils import plot_model
import keras.backend
import unidecode
import json
import regex as re
import pickle

Using TensorFlow backend.


In [2]:
# Parameters
# Hyperparameters shared by the cells below.
# Width of each word vector: Embedding output dimension and embedding_matrix column count.
EMBEDSIZE = 50
# Vocabulary cap: Embedding input dimension; word indices >= this value are skipped
# when the embedding matrix is filled.
MAXFEATURES = 2000
# Fixed sequence length: shape of the model Input and the pad_sequences maxlen.
MAXLEN = 200
# NOTE(review): batch_size and epochs are not referenced in the visible cells —
# presumably left over from the training notebook; confirm before removing.
batch_size = 64
epochs = 3

In [3]:
#Data
# Load the test comments and their labels; the two files are assumed to be
# row-aligned (same number of rows, same order) — the shapes printed below
# make that easy to eyeball.
test = pd.read_csv('data/test.csv')
test_labels = pd.read_csv('data/test_labels.csv')
# EMBEDDING_FILE = f'glove-twitter-27B/glove.twitter.27B.50d.txt'

print(test_labels.shape)
print(test.shape)

# Keep only rows whose 'toxic' label is not -1 (presumably the marker for
# rows without a usable label — confirm against the dataset description).
# One boolean mask filters both frames; `.values` applies it to `test` by
# position, so the result does not depend on the two indexes carrying the
# same labels. Original index labels are preserved, which later label-based
# lookups into `test` rely on.
is_scored = test_labels['toxic'] != -1
test_labels = test_labels[is_scored]
test = test[is_scored.values]
print(test_labels.shape)
print(test.shape)

(153164, 7)
(153164, 2)
(63978, 7)
(63978, 2)


In [4]:
def glove_preprocess(text):
    """
    Replace URLs, user links, numbers, emoticons and repeated characters in
    ``text`` with GloVe-Twitter style placeholder tokens.

    adapted from https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb

    The substitutions run in a fixed order and each one sees the output of
    the previous one, so the order is part of the behaviour.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The text with placeholders such as ``<URL>``, ``<USER>``, ``<HEART>``,
        ``<NUMBER>``, ``<SMILE>``, ``<LOLFACE>``, ``<SADFACE>``,
        ``<NEUTRALFACE>``, ``<REPEAT>`` and ``<ELONG>`` inserted.
    """
    # Raw strings throughout: the previous plain literals contained escapes
    # such as "\-", "\[" and "\d" that Python itself does not recognise, which
    # triggers a warning on recent interpreters. The regex text is unchanged.

    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # NOTE(review): the two URL patterns are kept exactly as they were. The
    # first only matches "http"/"https", optional colons and a space, so a
    # typical "http://..." link is not replaced; the second is greedy up to the
    # last space. They are left alone because the pickled tokenizer and saved
    # weights were presumably produced with this exact preprocessing — confirm
    # before changing them.
    substitutions = [
        (r"https?:* ", "<URL>"),
        (r"www.* ", "<URL>"),
        (r"\[\[User(.*)\|", "<USER>"),
        (r"<3", "<HEART>"),
        # Runs before the emoticon rules, so every digit (including an "8"
        # used as eyes) is already gone by the time they are applied.
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", "<NUMBER>"),
        (eyes + nose + r"[Dd)]", "<SMILE>"),
        (r"[(d]" + nose + eyes, "<SMILE>"),
        (eyes + nose + r"p", "<LOLFACE>"),
        (eyes + nose + r"\(", "<SADFACE>"),
        (r"\)" + nose + eyes, "<SADFACE>"),
        (eyes + nose + r"[/|l*]", "<NEUTRALFACE>"),
        # Force splitting of words joined by slashes.
        (r"/", " / "),
        # A second <NUMBER> pass used to sit here; it could never match because
        # no digit survives the first pass and no replacement introduces one.
        (r"([!]){2,}", "! <REPEAT>"),
        (r"([?]){2,}", "? <REPEAT>"),
        (r"([.]){2,}", ". <REPEAT>"),
        # Any character repeated three or more times collapses to one + marker.
        (r"(.)\1{2,}", r"\1 <ELONG>"),
    ]
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

In [5]:
def get_model(embedding_matrix, dropout = 0.2):
    """Build and compile the Conv1D + bidirectional-GRU classifier.

    The network takes integer sequences of length MAXLEN, embeds them with
    ``embedding_matrix`` as the initial Embedding weights, and ends in six
    sigmoid units (one independent probability per output).

    Parameters
    ----------
    embedding_matrix : array
        Initial weights for the Embedding layer, which is declared with
        MAXFEATURES rows and EMBEDSIZE columns.
    dropout : float
        Rate used both for the GRU input dropout and for the Dropout layer
        before the output; the GRU recurrent dropout is fixed at 0.2.

    Returns
    -------
    Model
        Compiled with binary cross-entropy loss, the Adam optimizer and the
        accuracy metric.
    """
    sequence_input = Input(shape=(MAXLEN,))

    # Layers are created in the same order in which they are applied.
    layer_stack = [
        Embedding(MAXFEATURES, EMBEDSIZE, weights=[embedding_matrix]),
        Conv1D(filters=100, kernel_size=4, padding='same', activation='relu'),
        MaxPooling1D(pool_size=4),
        Bidirectional(
            GRU(60, return_sequences=True, dropout=dropout, recurrent_dropout=0.2)
        ),
        GlobalMaxPool1D(),
        Dense(50, activation="relu"),
        Dropout(dropout),
        Dense(6, activation="sigmoid"),
    ]

    hidden = sequence_input
    for layer in layer_stack:
        hidden = layer(hidden)

    classifier = Model(inputs=sequence_input, outputs=hidden)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [6]:
# Restore the fitted tokenizer and the word -> vector lookup from disk.
# SECURITY NOTE: pickle.load can execute arbitrary code while loading; only
# use pickle files produced by a trusted run of the training notebook.
# `with` guarantees each file handle is closed even if unpickling fails.
with open('tokenizer.pickle', 'rb') as tok_file:
    tokenizer = pickle.load(tok_file)
print('Loaded Tokenizer')

with open('embedding_index.pickle', 'rb') as emb_idx_file:
    embeddings_index = pickle.load(emb_idx_file)
print('Loaded Embedding Index')

Loaded Tokenizer
Loaded Embedding Index


In [7]:
# Build the initial Embedding weights: row i holds the pretrained vector of the
# word whose tokenizer index is i; words without a pretrained vector keep zeros.
word_index = tokenizer.word_index
# Keras Tokenizer indices start at 1 (row 0 stays all-zero), so a vocabulary
# of V words needs V + 1 rows. Without the + 1 the highest index overflowed
# the matrix whenever the vocabulary was smaller than MAXFEATURES.
num_words = min(MAXFEATURES, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDSIZE))
for word, i in word_index.items():
    # Bound by the actual number of rows rather than by MAXFEATURES.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# NOTE(review): get_model declares its Embedding with MAXFEATURES rows, so a
# vocabulary smaller than MAXFEATURES would still need those two sizes to be
# reconciled; with a larger vocabulary num_words == MAXFEATURES as before.

In [8]:
# Path of the saved weights restored into the freshly built network.
wtFile = "weights.best.hdf5"

# Recreate the architecture, then report it as text and as a diagram.
model = get_model(embedding_matrix, dropout=0.2)
model.summary()
plot_model(
    model,
    to_file='model_plot.png',
    show_shapes=True,
    show_layer_names=True,
)

# Replace the initial weights with the saved ones.
model.load_weights(wtFile)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 50)           100000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 200, 100)          20100     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 50, 100)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 120)           57960     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6050      
__________

In [9]:
class bcolors:
    # ANSI escape sequences used to style printed text. Usage pattern in this
    # notebook: <style> + text + ENDC, so the style stops after the text.
    HEADER = '\033[95m'     # bright magenta foreground
    OKBLUE = '\033[94m'     # bright blue foreground
    OKGREEN = '\033[92m'    # bright green foreground
    WARNING = '\033[93m'    # bright yellow foreground; marks highlighted words
    FAIL = '\033[91m'       # bright red foreground; used for the 'Highlighted' label
    ENDC = '\033[0m'        # reset all styling
    BOLD = '\033[1m'        # bold; used for the 'Original' label
    UNDERLINE = '\033[4m'   # underline

In [13]:
# Size of the contiguous word window used by the explanation helpers:
# text_representation removes this many consecutive words per step and
# highlight_sentence colours this many words starting at each flagged index.
NUM_OF_WORDS = 1

In [14]:
def highlight_sentence(toxic_text, words_to_highlight):
    """Print ``toxic_text`` with the flagged word windows coloured.

    Parameters
    ----------
    toxic_text : str
        The text to print; it is split on whitespace into words.
    words_to_highlight : iterable of int
        Start positions (indices into the whitespace-split words). Each start
        marks a window of NUM_OF_WORDS consecutive words.

    Returns
    -------
    None
        The result is printed as ``Highlighted  -  <text>``, with the words
        re-joined by single spaces.
    """
    words = toxic_text.split()

    # Expand every valid window start into the word positions it covers.
    # Using a set means overlapping windows colour a word only once (the
    # previous in-place wrapping nested the colour codes) and membership
    # checks are O(1). Positions past the end are simply never matched.
    marked = {
        start + offset
        for start in words_to_highlight
        if 0 <= start < len(words)
        for offset in range(NUM_OF_WORDS)
    }

    out_str = ' '.join(
        bcolors.WARNING + word + bcolors.ENDC if position in marked else word
        for position, word in enumerate(words)
    )
    highlight_txt = bcolors.FAIL + 'Highlighted' + bcolors.ENDC
    print(highlight_txt, ' - ', out_str)

def predict_toxicity(toxic_text, word):
    """Run the loaded model on a single piece of text.

    The text is passed through glove_preprocess, converted to token ids with
    the loaded tokenizer, padded to MAXLEN and fed to ``model.predict``.

    Parameters
    ----------
    toxic_text : str
        Text to score.
    word : str
        Not used by the body; callers pass a label describing what is being
        scored (it only ever fed debug output).

    Returns
    -------
    The value returned by ``model.predict`` for a batch containing this one
    text; callers read it as ``result[0][k]``.
    """
    cleaned = glove_preprocess(toxic_text)
    token_ids = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(token_ids, maxlen=MAXLEN)
    return model.predict(padded)

def text_representation(toxic_text):
    """Print ``toxic_text`` with the words that drive its score highlighted.

    The full text is scored once as a baseline. Then, for every window of
    NUM_OF_WORDS consecutive words, the text is re-scored with that window
    removed; if the first model output drops below the baseline, the window's
    start index is flagged. The flagged indices are passed to
    highlight_sentence, which prints the result.

    Parameters
    ----------
    toxic_text : str
        Text to explain; words are obtained by splitting on whitespace.

    Returns
    -------
    None
    """
    # Split once; the word list is the same for every window.
    words = toxic_text.split()
    baseline = predict_toxicity(toxic_text, 'BASELINE')

    words_to_highlight = []
    # One iteration per window start; empty when there are fewer words than
    # NUM_OF_WORDS.
    for start in range(len(words) - (NUM_OF_WORDS - 1)):
        stop = start + NUM_OF_WORDS
        # Label for the removed window; sized by NUM_OF_WORDS (it used to be
        # hard-coded to two words regardless of the window size).
        removed = ' '.join(words[start:stop])
        remaining_text = ' '.join(words[:start] + words[stop:])
        score = predict_toxicity(remaining_text, removed)
        # Only output column 0 is compared — presumably the 'toxic' label;
        # confirm against the training label order.
        if score[0][0] < baseline[0][0]:
            words_to_highlight.append(start)

    highlight_sentence(toxic_text, words_to_highlight)
    

In [15]:
# Examples
# Index labels of the comments labelled toxic (toxic == 1).
idx = test_labels.index[test_labels['toxic'] == 1].tolist()
print('Number of Toxic Comments - ', len(idx))

# The label is the same for every example, so build it once.
original_txt = bcolors.BOLD + 'Original   ' + bcolors.ENDC

# Slicing shows up to 10 examples and, unlike indexing with range(10), does
# not raise when fewer than 10 toxic comments are available.
for toxic_idx in idx[:10]:
    # Explicit label-based lookup: toxic_idx is an index label, not a position.
    toxic_text = test.loc[toxic_idx, 'comment_text']
    print(original_txt, ' - ', toxic_text)
    text_representation(toxic_text)


Number of Toxic Comments -  6090
[1mOriginal   [0m  -  == Arabs are committing genocide in Iraq, but no protests in Europe. == 

 May Europe also burn in hell.
[91mHighlighted[0m  -  [93m==[0m [93m[93mArabs[0m[0m [93mare[0m committing genocide [93min[0m [93mIraq,[0m [93mbut[0m [93mno[0m protests in Europe. [93m==[0m [93m[93mMay[0m[0m [93m[93mEurope[0m[0m [93m[93malso[0m[0m [93mburn[0m [93min[0m [93mhell.[0m
[1mOriginal   [0m  -  DJ Robinson is gay as hell! he sucks his dick so much!!!!!
[91mHighlighted[0m  -  DJ Robinson [93mis[0m [93m[93mgay[0m[0m [93m[93mas[0m[0m [93m[93mhell![0m[0m [93m[93mhe[0m[0m [93m[93msucks[0m[0m [93m[93mhis[0m[0m [93m[93mdick[0m[0m [93mso[0m much!!!!!
[1mOriginal   [0m  -  :Fuck off, you anti-semitic cunt.  |
[91mHighlighted[0m  -  [93m:Fuck[0m [93m[93moff,[0m[0m [93myou[0m [93manti-semitic[0m [93m[93mcunt.[0m[0m [93m|[0m
[1mOriginal   [0m  -  How dare you vandali