# Multi-label text Classification using CNNs

Dataset from: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

In [7]:
from __future__ import print_function, division
from builtins import range
from tqdm import tqdm
import os
import sys
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score

#### Setting some configurations

In [3]:
max_sequence_length = 100 #Since input data is internet comments 100 is reasonable
max_vocab_size = 20000 #based on the study that an average native english speaker knows 20k words
embedding_dim = 100 #for pre-trained embeddings size is usually in multiples of 50
validation_split = 0.2
batch_size = 128
epochs = 10

#### Loading pre-trained word vectors

What we want to do is to create a dictionary in which the key is the word and the value is the word vector

In [9]:
print("...Loading Word Vectors...")

word_vec_dict = {}

with open(os.path.join('data/glove.6B.%sd.txt'%embedding_dim)) as word_vec_file:
    for line in tqdm(word_vec_file):
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:],dtype='float32')
        word_vec_dict[word] = vec
        
#     print('Found {} word vectors'.format(word_vec_dict))

7748it [00:00, 38732.79it/s]

...Loading Word Vectors...


400000it [00:10, 38094.21it/s]


#### Loading Dataset

In [12]:
train = pd.read_csv("data/train.csv")
sentences = train['comment_text'].values
possible_labels = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
targets = train[possible_labels].values

In [19]:
(sentences.shape,targets.shape)

((159571,), (159571, 6))

#### Preprocessing

In [18]:
#Tokenization
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [23]:
#Word-index mapping
word_index_mapping = tokenizer.word_index

In [21]:
word_index_mapping

{'the': 1,
 'to': 2,
 'of': 3,
 'and': 4,
 'a': 5,
 'you': 6,
 'i': 7,
 'is': 8,
 'that': 9,
 'in': 10,
 'it': 11,
 'for': 12,
 'this': 13,
 'not': 14,
 'on': 15,
 'be': 16,
 'as': 17,
 'have': 18,
 'are': 19,
 'your': 20,
 'with': 21,
 'if': 22,
 'article': 23,
 'was': 24,
 'or': 25,
 'but': 26,
 'page': 27,
 'wikipedia': 28,
 'my': 29,
 'an': 30,
 'from': 31,
 'by': 32,
 'do': 33,
 'at': 34,
 'me': 35,
 'about': 36,
 'so': 37,
 'talk': 38,
 'what': 39,
 'can': 40,
 'there': 41,
 'all': 42,
 'has': 43,
 'will': 44,
 'please': 45,
 'no': 46,
 'would': 47,
 'one': 48,
 'like': 49,
 'just': 50,
 'they': 51,
 'he': 52,
 'which': 53,
 'any': 54,
 'been': 55,
 'should': 56,
 'more': 57,
 'we': 58,
 "don't": 59,
 'some': 60,
 'other': 61,
 'who': 62,
 'here': 63,
 'see': 64,
 'also': 65,
 'his': 66,
 'think': 67,
 'because': 68,
 'know': 69,
 'how': 70,
 'edit': 71,
 'am': 72,
 "i'm": 73,
 'people': 74,
 'why': 75,
 'up': 76,
 'only': 77,
 "it's": 78,
 'out': 79,
 'articles': 80,
 'use': 81,

In [24]:
#padding sequences
data = pad_sequences(sequences, maxlen=max_sequence_length)

In [28]:
#Prepare embedding matrix
print("Filling pre-trained embeddings")
num_words = min(max_vocab_size, len(word_index_mapping)+1) #because keras embeddings start from 1, 0 is reserved for padding
embedding_matrix = np.zeros((num_words,embedding_dim))
for word,i in word_index_mapping.items():
    if i<max_vocab_size:
        embedding_vector = word_vec_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings


#### Building Model

In [32]:
embedding_layer = Embedding(num_words,
                           embedding_dim,
                           weights=[embedding_matrix],
                           input_length=max_sequence_length,
                           trainable=False)

input_layer = Input(shape=(max_sequence_length,))
x = embedding_layer(input_layer)
x = Conv1D(128,3,activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128,3,activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128,3,activation='relu')(x)
x = MaxPooling1D(3)(x)
x = GlobalMaxPooling1D()(x)
output_layer = Dense(len(possible_labels),activation='sigmoid')(x)
model = Model(input_layer,output_layer)

model.compile(loss='binary_crossentropy',metrics=['accuracy'],optimizer='rmsprop')

print("Training model")

history = model.fit(data,targets,batch_size=batch_size,
                    epochs=epochs,validation_split=validation_split)


Training model
Train on 127656 samples, validate on 31915 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
