In [1]:
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GRU, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Conv1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
import gc
from sklearn.model_selection import train_test_split
from keras.models import load_model
import tensorflow as tf
from keras.models import model_from_json
from keras.utils.vis_utils import plot_model
import keras.backend
import unidecode
import json
import regex as re
import pickle

Using TensorFlow backend.


In [2]:
# Parameters
EMBEDSIZE = 50
MAXFEATURES = 2000
MAXLEN = 200
batch_size = 64
epochs = 3


In [3]:
train = pd.read_csv('data/train.csv')
test  = pd.read_csv('data/test.csv')
test_labels = pd.read_csv('data/test_labels.csv')
EMBEDDING_FILE = f'glove-twitter-27B/glove.twitter.27B.50d.txt'

print(test_labels.shape)
print(test.shape)

(153164, 7)
(153164, 2)


In [4]:
idx = test_labels.index[test_labels['toxic'] == -1].tolist()
np.array(idx).shape
test_labels = test_labels.drop(test_labels.index[idx])
test = test.drop(test.index[idx])
print(test_labels.shape)
print(test.shape)

(63978, 7)
(63978, 2)


In [5]:
def glove_preprocess(text):
    """
    adapted from https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb

    """
    # Different regex parts for smiley faces
    eyes = "[8:=;]"
    nose = "['`\-]?"
    text = re.sub("https?:* ", "<URL>", text)
    text = re.sub("www.* ", "<URL>", text)
    text = re.sub("\[\[User(.*)\|", '<USER>', text)
    text = re.sub("<3", '<HEART>', text)
    text = re.sub("[-+]?[.\d]*[\d]+[:,.\d]*", "<NUMBER>", text)
    text = re.sub(eyes + nose + "[Dd)]", '<SMILE>', text)
    text = re.sub("[(d]" + nose + eyes, '<SMILE>', text)
    text = re.sub(eyes + nose + "p", '<LOLFACE>', text)
    text = re.sub(eyes + nose + "\(", '<SADFACE>', text)
    text = re.sub("\)" + nose + eyes, '<SADFACE>', text)
    text = re.sub(eyes + nose + "[/|l*]", '<NEUTRALFACE>', text)
    text = re.sub("/", " / ", text)
    text = re.sub("[-+]?[.\d]*[\d]+[:,.\d]*", "<NUMBER>", text)
    text = re.sub("([!]){2,}", "! <REPEAT>", text)
    text = re.sub("([?]){2,}", "? <REPEAT>", text)
    text = re.sub("([.]){2,}", ". <REPEAT>", text)
    pattern = re.compile(r"(.)\1{2,}")
    text = pattern.sub(r"\1" + " <ELONG>", text)

    return text

In [11]:
X_train, X_test, y_train, y_test = train_test_split(train,
                                        train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]],
                                        test_size = 0.10, random_state = 42)
list_sentences_train = X_train["comment_text"].apply(glove_preprocess)
list_sentences_test = X_test["comment_text"].apply(glove_preprocess)
list_sentences_final_test = test["comment_text"].apply(glove_preprocess)

In [12]:
tokenizer = Tokenizer(num_words=MAXFEATURES,char_level=True)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_sentences_test = tokenizer.texts_to_sequences(list_sentences_test)
list_sentences_final_test = tokenizer.texts_to_sequences(list_sentences_final_test)

X_t = pad_sequences(list_tokenized_train, maxlen=MAXLEN)
X_te = pad_sequences(list_sentences_test, maxlen=MAXLEN)
X_test = pad_sequences(list_sentences_final_test, maxlen = MAXLEN)

In [13]:
def get_model(embedding_matrix, dropout = 0.2):
    inp = Input(shape=(MAXLEN,))
    x = Embedding(MAXFEATURES, EMBEDSIZE, weights=[ embedding_matrix])(inp)
    x = Conv1D(filters = 100, kernel_size = 4, padding = 'same', activation = 'relu' )(x)
    x = MaxPooling1D(pool_size =4)(x)
    x = Bidirectional(GRU(60, return_sequences=True, dropout=dropout, recurrent_dropout=0.2))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(dropout)(x)
    x = Dense(6, activation = "sigmoid")(x)
    model = Model(inputs= inp, outputs = x)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [14]:
def get_LSTM_model(embedding_matrix, dropout = 0.2):
    inp = Input(shape=(MAXLEN,))
    x = Embedding(MAXFEATURES, EMBEDSIZE, weights=[ embedding_matrix])(inp)
    x = Conv1D(filters = 100, kernel_size = 4, padding = 'same', activation = 'relu' )(x)
    x = MaxPooling1D(pool_size =4)(x)
    x = Bidirectional(GRU(60, return_sequences=True, dropout=dropout, recurrent_dropout=0.2))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(dropout)(x)
    x = Dense(6, activation = "sigmoid")(x)
    model = Model(inputs= inp, outputs = x)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [16]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE,encoding="utf8"))


In [17]:
word_index = tokenizer.word_index
num_words = min(MAXFEATURES, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDSIZE))
for word, i in word_index.items():
    if i >= MAXFEATURES:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [18]:
model = get_model(embedding_matrix, dropout=0.2)
model.summary()
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 50)           100000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 200, 100)          20100     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 50, 100)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 120)           57960     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6050      
__________

In [19]:
lstm_model = get_LSTM_model(embedding_matrix, dropout=0.2)
lstm_model.summary()
plot_model(lstm_model, to_file='model_lstm_plot.png', show_shapes=True, show_layer_names=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 50)           100000    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 200, 100)          20100     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 50, 100)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 50, 120)           57960     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 120)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                6050      
__________

In [20]:
# Call backs
wtFile = "weights.best.hdf5"
checkpoint = ModelCheckpoint(wtFile, monitor = 'val_loss', verbose=1, save_best_only=True, mode = 'min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early] #early

In [17]:
model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_te,y_test), callbacks=callbacks_list)
print("Saved Model Weights to file!!")

Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_loss improved from inf to 0.07072, saving model to weights.best.hdf5
Epoch 2/3

Epoch 00002: val_loss improved from 0.07072 to 0.06144, saving model to weights.best.hdf5
Epoch 3/3

Epoch 00003: val_loss improved from 0.06144 to 0.06117, saving model to weights.best.hdf5
Saved Model Weights to file!!


In [21]:
model.load_weights(wtFile)

# y_test = model.predict(X_te)

In [30]:
lstm_model.load_weights('weights.best_lstm.hdf5')


In [14]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
print("Saved Model to file!!")    

Saved Model to file!!


In [18]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("Saved Tokenizer to file!!")  

Saved Tokenizer to file!!


In [175]:
with open('embedding_index.pickle', 'wb') as handle:
    pickle.dump(embeddings_index, handle, protocol= pickle.HIGHEST_PROTOCOL)
print("Saved Embedding Index to file!!!")

Saved Embedding Index to file!!!


In [36]:
wtFile = "weights.best_lstm.hdf5"
checkpoint = ModelCheckpoint(wtFile, monitor = 'val_loss', verbose=1, save_best_only=True, mode = 'min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early] #early
lstm_model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_te,y_test),callbacks=callbacks_list)
model_lstm_json = lstm_model.to_json()
with open("model_lstm.json", "w") as json_file:
    json_file.write(model_lstm_json)
print("Saved LSTM Model to file!!") 
print("Saved  LSTM Model Weights to file!!")

Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_loss improved from inf to 0.07673, saving model to weights.best_lstm.hdf5
Epoch 2/3

Epoch 00002: val_loss improved from 0.07673 to 0.06452, saving model to weights.best_lstm.hdf5
Epoch 3/3

Epoch 00003: val_loss improved from 0.06452 to 0.05795, saving model to weights.best_lstm.hdf5
Saved LSTM Model to file!!
Saved  LSTM Model Weights to file!!


In [22]:
lstm_model.summary()
lstm_model.metrics
lstm_model.loss

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 50)           100000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 100, 100)          20100     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 25, 100)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 25, 120)           57960     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 120)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 50)                6050      
__________

'binary_crossentropy'

In [143]:
NUM_OF_WORDS = 1

In [144]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [164]:
def highlight_sentence(toxic_text, words_to_highlight):
    listOfWords = toxic_text.split()
    out_str = ''
    for i in range(len(listOfWords)):
        if i in words_to_highlight:
            for k in range(NUM_OF_WORDS):
                listOfWords[i+k] = bcolors.WARNING + listOfWords[i+k] + bcolors.ENDC
#             listOfWords[i+1] = bcolors.WARNING + listOfWords[i+1] + bcolors.ENDC
    out_str = ' '.join(listOfWords)
    highlight_txt = bcolors.FAIL + 'Highlighted' + bcolors.ENDC
    print(highlight_txt, ' - ', out_str)
#     print(out_str)
    

In [165]:
def predict_toxicity(toxic_text, word):
#     print(toxic_text)
    text_pp = glove_preprocess(toxic_text)
#     print(text_pp)
    tok_text = tokenizer.texts_to_sequences([text_pp])
#     print(tok_text)
    x = pad_sequences(tok_text, maxlen=MAXLEN)
#     print(x.shape)
    y = model.predict(x)
#     print(word, '-', np.round(y,3))
    return y

In [166]:
def text_representation(toxic_text):
    listOfWords = toxic_text.split()
#     print(listOfWords)
    nWords = len(listOfWords)
    baseline = predict_toxicity(toxic_text, 'BASELINE')
    words_to_highlight = []
    for i in range(nWords-(NUM_OF_WORDS-1)):
        listOfWords = toxic_text.split()
        curWord = ' '.join(listOfWords[i:i+2])
    #     print('Iter - ', i)
    #     print(listOfWords)
        for k in range(NUM_OF_WORDS):
            del listOfWords[i]
    #     del listOfWords[i]
        cur_toxic_text = ' '.join(listOfWords)
        curValue = predict_toxicity(cur_toxic_text, curWord)
        if curValue[0][0] < baseline[0][0]:
            words_to_highlight.append(i)
#     print(words_to_highlight)        
    highlight_sentence(toxic_text, words_to_highlight)
    

In [168]:
idx =test_labels.index[test_labels['toxic'] ==1].tolist()
print('Number of Toxic Comments - ', len(idx))
for i in range(10):
    toxic_idx = idx[i]
    toxic_text = test['comment_text'][toxic_idx]
    orignial_txt = bcolors.BOLD + 'Original   ' + bcolors.ENDC
    print(orignial_txt, ' - ', toxic_text)
    text_representation(toxic_text)


Number of Toxic Comments -  6090
[1mOriginal   [0m  -  == Arabs are committing genocide in Iraq, but no protests in Europe. == 

 May Europe also burn in hell.
[91mHighlighted[0m  -  [93m==[0m [93mArabs[0m [93mare[0m committing [93mgenocide[0m [93min[0m Iraq, [93mbut[0m [93mno[0m protests in Europe. == [93mMay[0m [93mEurope[0m [93malso[0m [93mburn[0m in [93mhell.[0m
[1mOriginal   [0m  -  DJ Robinson is gay as hell! he sucks his dick so much!!!!!
[91mHighlighted[0m  -  DJ Robinson is [93mgay[0m [93mas[0m [93mhell![0m [93mhe[0m [93msucks[0m his [93mdick[0m [93mso[0m [93mmuch!!!!![0m
[1mOriginal   [0m  -  :Fuck off, you anti-semitic cunt.  |
[91mHighlighted[0m  -  [93m:Fuck[0m [93moff,[0m [93myou[0m anti-semitic [93mcunt.[0m [93m|[0m
[1mOriginal   [0m  -  How dare you vandalize that page about the HMS Beagle! Don't vandalize again, demon!
[91mHighlighted[0m  -  How dare [93myou[0m vandalize that page [93mabout[0m [93mt

In [41]:
lstm_model.evaluate(X_test, test_labels, verbose = 1)



[0.39151233960310006, 0.0025321204163931349]

In [42]:
y_predicted = lstm_model.predict(X_test)

In [65]:
y_predicted = lstm_model.predict([X_test], batch_size=1024, verbose=1)



In [64]:
y_pred = (y_predicted == y_predicted.max(axis=1)[:,None]).astype(int)
print(y_pred)
print(y_predicted[-2,:])

[[0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 ..., 
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]]
[ 0.43294343  0.33663076  0.470263    0.36656839  0.44876203  0.39546281]


In [27]:
y_true = np.array(test_labels)
y_true = y_true[:,1:]
print(y_true)

[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ..., 
 [0 0 0 0 0 0]
 [1 0 1 0 1 0]
 [0 0 0 0 0 0]]


In [21]:
y_predicted = model.predict([X_test], batch_size=1024, verbose=1)



In [22]:
print(y_predicted)

[[  5.24215028e-03   4.76808509e-06   5.76814346e-04   1.55739726e-05
    7.61029485e-04   6.87712673e-05]
 [  1.65921282e-02   3.43151223e-05   2.45818496e-03   1.28129454e-04
    3.00476723e-03   4.45229874e-04]
 [  1.06181921e-02   1.75847053e-05   1.67270552e-03   2.93064240e-05
    1.86486216e-03   1.52371911e-04]
 ..., 
 [  3.39214504e-01   1.34784477e-02   8.66590887e-02   3.15312371e-02
    1.76694095e-01   4.85188439e-02]
 [  9.26178515e-01   7.36789703e-02   8.10041368e-01   2.19827201e-02
    5.57457924e-01   6.67238161e-02]
 [  1.28179435e-02   1.73915632e-05   2.00568093e-03   4.35755865e-05
    2.16767122e-03   2.06897355e-04]]


In [25]:
print(y_true[-2,:])
print(y_predicted[-2,:])

['fffac2a094c8e0e2' 1 0 1 0 1 0]
[ 0.92617851  0.07367897  0.81004137  0.02198272  0.55745792  0.06672382]


In [28]:
model.evaluate(X_test, y_true, verbose = 1)



[0.075009704749482611, 0.97380088891535543]

In [37]:
lstm_model.evaluate(X_test, y_true, verbose = 1)



[0.073987366234456106, 0.97335542349150694]

In [6]:
train.shape

(159571, 8)

In [10]:
model = Model()
model.load_weights(wtFile)


ValueError: You are trying to load a weight file containing 5 layers into a model with 0 layers.

In [171]:
infile = open('tokenizer.pickle','rb')
new_dict = pickle.load(infile)
infile.close()

In [172]:
new_dict

<keras_preprocessing.text.Tokenizer at 0x25feb66acc0>