In [28]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [29]:
# Constants shared by the cells below.

# -- text preprocessing --
MAX_VOCAB_SIZE = 20000       # passed to the Tokenizer as num_words; also caps the embedding rows
MAX_SEQUENCE_LENGTH = 80     # maxlen used when padding the integer sequences
EMBEDDING_DIM = 100          # selects glove.6B.<dim>d.txt and sets the embedding width

# -- training --
BATCH_SIZE = 128
EPOCHS = 1
VALIDATION_SPLIT = 0.2       # forwarded to model.fit(validation_split=...)

#### GloVe from Stanford
http://nlp.stanford.edu/data/glove.6B.zip

In [30]:
# Read the pre-trained GloVe vectors into a {word: float32 vector} dictionary.
# The file is plain text with one entry per line, whitespace-separated:
#     word vec[0] vec[1] vec[2] ...
glove_path = '../../datasets/word2vec/glove.6B/glove.6B.%sd.txt' % EMBEDDING_DIM
word2vec = {}
with open(glove_path, encoding='utf8') as f:
    for line in f:
        fields = line.split()
        # first field is the token, everything after it is the vector
        word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')
print('Found %s word vectors (from loaded encodings).' % len(word2vec))

Found 400000 word vectors (from loaded encodings).


#### Dataset
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/

In [31]:
# Load the training CSV and split it into model inputs and outputs.
train = pd.read_csv("../../datasets/classification/toxic comments/train.csv")

# Every column from the third one onward is treated as a target class.
classes = train.columns[2:]
targets = train[classes].values

# Raw comment strings; missing comments are replaced by the placeholder 'DUMMY'.
sentences = train["comment_text"].fillna('DUMMY').values

In [32]:
# Convert each sentence (string) into a sequence of integer word indices.
# The tokenizer is fitted on the same sentences it then encodes, and is
# configured with num_words=MAX_VOCAB_SIZE.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [33]:
# Basic statistics on the number of tokens per encoded sentence.
# The lengths are sorted once and the same list serves all three figures.
s = sorted(map(len, sequences))
print("max sequence length:", max(s))
print("min sequence length:", min(s))
# middle element of the sorted lengths (upper median when the count is even)
print("median sequence length:", s[len(s) // 2])

max sequence length: 1400
min sequence length: 0
median sequence length: 35


In [34]:
# Get the word -> integer index mapping learned by the tokenizer.
# The reported count is the size of this whole mapping, which can exceed
# MAX_VOCAB_SIZE (see the recorded output below).
word2idx = tokenizer.word_index
print('Found %s unique tokens (from sentences)' % len(word2idx))

Found 210337 unique tokens (from sentences)


In [35]:
# Bring every integer sequence to exactly MAX_SEQUENCE_LENGTH entries so the
# result is a single 2-D array. padding/truncating are the Keras defaults,
# spelled out here: short sequences are zero-padded at the front and long
# sequences lose their leading tokens.
data = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
    truncating='pre',
)
print('Shape of data tensor:', data.shape)

Shape of data tensor: (159571, 80)


In [36]:
# Build the embedding matrix: row i holds the pre-trained vector of the word
# whose tokenizer index is i. Words without a pre-trained vector keep an
# all-zero row.
print('Filling pre-trained embeddings...')
# NOTE(review): the +1 presumably accounts for index 0 being unused by the
# tokenizer's word_index - confirm against the Keras Tokenizer docs.
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i >= num_words:
        # index falls outside the matrix (word is beyond the vocabulary cap)
        continue
    vector = word2vec.get(word)
    if vector is not None:
        embedding_matrix[i] = vector

Filling pre-trained embeddings...


In [37]:
# Embedding layer initialised from the pre-trained matrix and frozen
# (trainable=False), so its weights are not updated during training.
embedding = tf.keras.layers.Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
    weights=[embedding_matrix],
    trainable=False,
)

In [38]:
# Define (not yet train) a 1D convnet:
#   embedding -> [Conv1D -> MaxPooling1D] x 2 -> Conv1D -> GlobalMaxPooling1D
#   -> Dense(relu) -> Dense(sigmoid), with one sigmoid output per target class.
input_ = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding(input_)

x = tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = tf.keras.layers.MaxPooling1D(pool_size=3)(x)

x = tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = tf.keras.layers.MaxPooling1D(pool_size=3)(x)

x = tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)

x = tf.keras.layers.Dense(units=128, activation='relu')(x)
output_ = tf.keras.layers.Dense(units=len(classes), activation='sigmoid')(x)

In [39]:
# Wrap the graph in a Model and configure training. Binary cross-entropy is
# paired with the per-class sigmoid outputs, so each class is scored
# independently of the others.
model = tf.keras.Model(inputs=input_, outputs=output_)
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [40]:
# Print a per-layer table of output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 80)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 80, 100)           2000000   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 78, 128)           38528     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 26, 128)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 24, 128)           49280     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 8, 128)            0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 6, 128)            4928

In [41]:
# Fit the model on the padded sequences. A VALIDATION_SPLIT fraction of the
# rows is held out for validation (127656 train / 31915 validation in the
# recorded run). The object returned by fit() is kept in `r`.
print('Training model...')
r = model.fit(
    x=data,
    y=targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)

Training model...
Train on 127656 samples, validate on 31915 samples
