**Toxic comment classification** CNN

In [1]:
import re
from nltk.corpus import stopwords

In [2]:
from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.layers import *
from keras.models import Model

Using TensorFlow backend.


In [3]:
MAX_NB_WORDS = 200000    # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 400 # max length of each entry (sentence), including padding
VALIDATION_SPLIT = 0.2   # data for validation (not used in training)
EMBEDDING_DIM = 100      # embedding dimensions for word vectors (GloVe)

In [4]:
import numpy as np
import pandas as pd
data = pd.read_csv('data/train.csv')

In [5]:
data.sample(frac=1)
data.shape

(159571, 8)

In [6]:
labels = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values


In [7]:
num_test_samples = int(VALIDATION_SPLIT*data.shape[0])
x_train = data[: -num_test_samples]
y_train = labels[: -num_test_samples]
x_test = data[-num_test_samples: ]
y_test = labels[-num_test_samples: ]

In [8]:
y_train.shape, y_test.shape

((127657, 6), (31914, 6))

In [9]:
X_train = x_train['comment_text']
X_test = x_test['comment_text'] 

counting how many words are there in each type of toxic comment 


In [10]:
print('each count in train: ', y_train.sum(axis=0))
print('each count in test: ', y_test.sum(axis=0))

each count in train:  [12257  1284  6780   386  6295  1100]
each count in test:  [3037  311 1669   92 1582  305]


#### removing stop words, punctuations, making lowercase

In [11]:
X_train = X_train.apply(lambda x: x.strip().lower())
X_train = X_train.apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))

stopWords = set(stopwords.words('english'))
def remove_stopwords(line):
    clean_words = [word for word in line.split() if word not in stopWords]
    return ' '.join(clean_words)
X_train = X_train.apply(remove_stopwords)

In [12]:
X_test = X_test.apply(lambda x: x.strip().lower())
X_test = X_test.apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))

stopWords = set(stopwords.words('english'))
def remove_stopwords(line):
    clean_words = [word for word in line.split() if word not in stopWords]
    return ' '.join(clean_words)
X_test = X_test.apply(remove_stopwords)

In [13]:
X_train, X_test

(0         explanation edits made username hardcore metal...
 1         daww matches background colour im seemingly st...
 2         hey man im really trying edit war guy constant...
 3         cant make real suggestions improvement wondere...
 4                       sir hero chance remember page thats
                                 ...                        
 127652                                             well see
 127653    speedy deletion 27 tricor ave new paltz ny 125...
 127654    additions made 100 factual apparent youre anot...
 127655    nathan thousand times seems let try explain ar...
 127656    march 2007 utc imagemarist high school oregon ...
 Name: comment_text, Length: 127657, dtype: object,
 127657    notice changed username accordance wiki polici...
 127658    wp articles genealogical entries trees says wp...
 127659                  redirect talkjohn rogers footballer
 127660    nfl draft batch copyandpasting draft order tim...
 127661    discussion copied wp r

In [14]:
print('Sample data:', X_train[1], y_train[1])

Sample data: daww matches background colour im seemingly stuck thanks talk 2151 january 11 2016 utc [0 0 0 0 0 0]


**oov_token** (if it ever sees a new word)

In [15]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token = True)
tokenizer.fit_on_texts(X_train)

In [16]:

sequences = tokenizer.texts_to_sequences(X_train)
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))

Vocabulary size: 213981


In [17]:
sequences2 = tokenizer.texts_to_sequences(X_test)
word_index2 = tokenizer.word_index
print('Vocabulary size:', len(word_index2))

Vocabulary size: 213981


In [18]:
sequences[0]

[517,
 44,
 48,
 512,
 4356,
 12081,
 1155,
 212,
 1928,
 10536,
 6576,
 2517,
 2689,
 37,
 1021,
 14712,
 2651,
 6,
 10,
 137,
 309,
 5,
 3,
 59,
 14,
 3457,
 58446]

In [19]:

data = pad_sequences(sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y_train.shape)

Shape of data tensor: (127657, 400)
Shape of label tensor: (127657, 6)


In [20]:
data2 = pad_sequences(sequences2, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data2.shape)
print('Shape of label tensor:', y_test.shape)

Shape of data tensor: (31914, 400)
Shape of label tensor: (31914, 6)


In [21]:
print('Tokenized sentences: \n', data[0])
print('One hot label: \n', y_train[0])

Tokenized sentences: 
 [  517    44    48   512  4356 12081  1155   212  1928 10536  6576  2517
  2689    37  1021 14712  2651     6    10   137   309     5     3    59
    14  3457 58446     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0    

In [22]:
from keras.constraints import max_norm
from keras.layers import (
    Input, Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout
)
from keras.models import Model
from keras.optimizers import Adam

### Model

**References**:

Georgakopoulos, S. V., Tasoulis, S. K., Vrahatis, A. G., & Plagianakos, V. P. (2018, July). Convolutional neural networks for toxic comment classification. In Proceedings of the 10th Hellenic Conference on Artificial Intelligence (p. 35). ACM.

Kim, Y. (2014). Convolutional neural networks for sentence classification. arXiv preprint arXiv:1408.5882.

In [23]:
# params
# see Georgakopoulos et al. (2018)
n_filters = 128
dropout_rate = 0.5
fc_dim = 300
# l2_norm constraint, see Kim (2014)
s = 3.

learning_rate = 0.005
batch_size = 64
epochs = 1

In [24]:
# we get in (examples, words, embedding_size) tensor
input_tensor = Input(shape=(MAX_SEQUENCE_LENGTH,))

embedding_tensor = Embedding(
    MAX_NB_WORDS,  # vocabulary size
    EMBEDDING_DIM,  # dimension of dense embedding
    input_length=MAX_SEQUENCE_LENGTH 
)(input_tensor)  # outputs (, seq_length, embeddin_dims) tensor

# -- convolution block --
block_1 = Conv1D(
    n_filters, 
    kernel_size=3,
    padding='valid',
    activation='relu',
    strides=1,
    kernel_constraint=max_norm(s)
)(embedding_tensor)  # output is (batch, new_steps, filters)
# max over time pooling
block_1 = GlobalMaxPooling1D()(block_1)  # output is tensor of shape (batch, filters)

block_2 = Conv1D(
    n_filters, 
    kernel_size=4,
    padding='valid',
    activation='relu',
    strides=1,
    kernel_constraint=max_norm(s)
)(embedding_tensor)
block_2 = GlobalMaxPooling1D()(block_2)

block_3 = Conv1D(
    n_filters, 
    kernel_size=5,
    padding='valid',
    activation='relu',
    strides=1,
    kernel_constraint=max_norm(s)
)(embedding_tensor)
# max-over-time pooling
block_3 = GlobalMaxPooling1D()(block_3)


# -- fully-connected block --
# concatenate results of into tensor of shape (batch, filters + filters + filters)
concat = Concatenate()([block_1, block_2, block_3])
# dropout
concat = Dropout(dropout_rate)(concat)

fc = Dense(
    fc_dim,
    activation='relu',
    kernel_constraint=max_norm(s)
)(concat)
fc = Dropout(dropout_rate)(fc)

# fully-connected softmax layer with l2 regularization
predictions = Dense(
    6,
    activation='sigmoid',
    kernel_constraint=max_norm(s)
)(fc)

In [25]:
# This creates a model that includes
# the Input layer and three Dense layers
model = Model(inputs=input_tensor, outputs=predictions)


adam = Adam(lr=learning_rate)
model.compile(optimizer=adam,
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 400, 100)     20000000    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 398, 128)     38528       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 397, 128)     51328       embedding_1[0][0]                
____________________________________________________________________________________________

In [26]:
data.shape, y_train.shape

((127657, 400), (127657, 6))

In [27]:
import timeit 

In [28]:
start_time = timeit.default_timer()
history = model.fit(
    data, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(data2, y_test)
)
elapsed_pa = timeit.default_timer() - start_time

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 127657 samples, validate on 31914 samples
Epoch 1/1


In [29]:
print(elapsed_pa)

1096.59785493


In [30]:
predict = model.predict(data2)

In [31]:
from sklearn.metrics import hamming_loss #for multiclass
from sklearn.metrics import accuracy_score

#### Loss and accuracy

In [32]:
predict = np.round(predict)
loss = hamming_loss(y_test,predict)
print("Hamming_loss : {}".format(loss*100))
accuracy = accuracy_score(y_test,predict)
print("Accuracy : {}".format(accuracy*100))

Hamming_loss : 2.0064339579286
Accuracy : 91.11675126903553
