In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import math

from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams, pad_sequences

Using TensorFlow backend.


In [2]:
# Label columns of the training CSV; each is used as one binary target.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

df_train = pd.read_csv('input/train.csv')

# Raw comment text (model input) and the label matrix (one column per category).
x_train = df_train['comment_text']
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values yields the same ndarray and exists on every pandas version.
y_train = df_train[categories].values

display(df_train.head())
# m = number of training comments.
m = x_train.shape[0]
display(f'total m = {m}')


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


'total m = 159571'

In [3]:
# Keras text tokenizer shared by the rest of the notebook; lower=True requests lower-casing of the input text.
tokenizer = Tokenizer(lower=True)

In [4]:
# Build the vocabulary from the training comments, then encode every comment
# as a list of integer word indices.
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

# Longest encoded comment; 0 when there are no sequences at all.
max_seq_len = max((len(tokens) for tokens in sequences), default=0)
print('max length {}'.format(max_seq_len))

# Number of distinct words the tokenizer has indexed.
vocabulary_size = len(tokenizer.word_index)
display('vocabulary size {}'.format(vocabulary_size))

max length 1403


'vocabulary size 210337'

In [5]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten

# Width of each learned word vector.
embedding_dim = 50

# Embedding trained by the skip-gram task; it receives (target, context) index pairs,
# hence input_length=2. The table has vocabulary_size + 1 rows
# (presumably to leave index 0 free for padding -- TODO confirm).
train_embedding = Embedding(
    input_dim=vocabulary_size + 1,
    output_dim=embedding_dim,
    input_length=2,
    name='embedding',
)

# Binary classifier over a word pair: embed both words, flatten, one sigmoid unit.
model = Sequential([
    train_embedding,
    Flatten(),
    Dense(1, name='dense'),
    Activation('sigmoid', name='activation'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Value passed as `negative_samples` to every skipgrams() call in this notebook.
negative_samples = 5


def skipgram_inputs_generator(epochs):
    """Yield one (pairs, labels) batch per tokenised comment, `epochs` times over.

    Reads the module-level `sequences`, `vocabulary_size` and
    `negative_samples`. Each yielded item is a tuple of two NumPy arrays built
    from what `skipgrams` returns for one comment. When `skipgrams` produces
    no pairs, a single all-zero placeholder batch is yielded instead, so the
    generator still emits exactly one batch per comment.
    """
    for _ in range(epochs):
        for tokens in sequences:
            couples, labels = skipgrams(
                sequence=tokens,
                vocabulary_size=vocabulary_size,
                negative_samples=negative_samples,
                shuffle=True,
            )
            if not couples:
                # Nothing to train on for this comment: substitute a dummy batch.
                couples, labels = np.zeros((1, 2)), np.zeros((1, 1))
            yield np.array(couples), np.array(labels)
            
def skipgram_inputs():
    """Materialise the skip-gram training set for every tokenised comment.

    Reads the module-level `sequences`, `vocabulary_size` and
    `negative_samples`, and shows progress with `tqdm`.

    Comments for which `skipgrams` yields no pairs are skipped. The previous
    implementation appended an all-zero NumPy placeholder for them, which
    mixed shape-(1,) arrays with plain integer labels (rejected by current
    NumPy as an inhomogeneous sequence), silently turned the index pairs into
    floats, and added a fabricated (0, 0) training example.

    Returns
    -------
    tuple of np.ndarray
        `(x, y)` where `x` has shape `(n_pairs, 2)` holding the word-index
        pairs and `y` has shape `(n_pairs,)` holding the labels returned by
        `skipgrams`. Both are empty (with those shapes) when no pair exists.
    """
    x_list = []
    y_list = []
    for sequence in tqdm(sequences):
        couples, labels = skipgrams(sequence=sequence, vocabulary_size=vocabulary_size, negative_samples=negative_samples, shuffle=True)
        if not couples:
            # Nothing to learn from this comment.
            continue
        x_list.extend(couples)
        y_list.extend(labels)
    # reshape keeps the (n, 2) contract even when x_list is empty.
    return np.array(x_list).reshape(-1, 2), np.array(y_list)

# Build every skip-gram pair up front and keep them in memory
# (the progress bar logged below shows this run took about 1h10m).
x_emb_train, y_emb_train = skipgram_inputs()

100%|██████████| 159571/159571 [1:09:36<00:00, 38.21it/s]


In [None]:
# Disabled streaming alternative: train from skipgram_inputs_generator, one comment per step.
#model.fit_generator(generator=skipgram_inputs_generator(3), epochs=3, steps_per_epoch=m)

# Train the word-pair classifier (and with it the embedding) on the pre-built pairs.
model.fit(x=x_emb_train, y=y_emb_train, batch_size=2048, epochs=3)

Epoch 1/3

In [None]:
from keras.layers import Bidirectional, LSTM, GRU, Dense

# Reuse the skip-gram embedding matrix as a frozen lookup table.
# mask_zero=True makes index 0 a padding marker that the GRU skips: the
# inputs are zero-padded at the end up to max_seq_len, and without masking
# the GRU's final state would be computed after a long run of padding steps.
# input_dim is already vocabulary_size + 1, as mask_zero requires.
embedding = Embedding(
    vocabulary_size + 1,
    embedding_dim,
    weights=train_embedding.get_weights(),
    trainable=False,
    mask_zero=True,
)

# Comment classifier: frozen embeddings -> GRU -> one sigmoid unit per label
# (six outputs, trained as independent binary targets).
input_model = Sequential()
input_model.add(embedding)
input_model.add(GRU(32))
input_model.add(Dense(6, activation='sigmoid'))

input_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Mini-batch size used when fitting the comment classifier.
batch_size = 1024


def inputs():
    """Return the padded comment matrix and the matching label matrix.

    Reads the module-level `sequences`, `max_seq_len` and `y_train`. Every
    sequence is zero-padded at the end (`padding='post'`) to `max_seq_len`.

    The padded matrix is returned directly instead of being copied row by row
    into a Python list and converted back, which needlessly duplicated a
    large array. As with the original `zip`, both outputs are cut to the
    shorter of the two inputs.

    Returns
    -------
    tuple of np.ndarray
        `(x, y)`: `x` is the padded index matrix, `y` a copy of the first
        rows of `y_train`.
    """
    padded_sequences = np.asarray(
        pad_sequences(sequences=sequences, maxlen=max_seq_len, padding='post')
    )
    # np.array copies, so the returned labels never alias y_train.
    labels = np.array(y_train)
    n_rows = min(len(padded_sequences), len(labels))
    return padded_sequences[:n_rows], labels[:n_rows]

# Build the padded inputs and label matrix once for the whole training set.
x_inputs, y_inputs = inputs()

# NOTE(review): no validation data is passed to fit(), so `hist` only records training metrics.
hist = input_model.fit(x=x_inputs, y=y_inputs, epochs=6, batch_size=batch_size)

# Dump the per-epoch metrics recorded by fit().
print(hist.history.items())


In [None]:
# Predict on x_inputs, i.e. the same data the model was fitted on (training-set predictions).
y_pred = input_model.predict(x=x_inputs)

In [None]:
display(y_pred)

# Turn the predicted scores into hard 0/1 labels: a column entry becomes 1
# when its score is strictly above 0.5, for the first len(categories) columns.
n_labels = len(categories)
y_pred_labels = np.zeros(y_pred.shape[:2])
y_pred_labels[:, :n_labels] = y_pred[:, :n_labels] > 0.5

In [None]:
# Show the thresholded 0/1 label matrix.
display(y_pred_labels)


In [None]:
from sklearn.metrics import accuracy_score

# Compares predictions against y_train, so this is a training-set score.
# NOTE(review): with 2-D multilabel targets scikit-learn documents accuracy_score as subset
# accuracy (a row counts only if all of its labels match) -- confirm this is the intended metric.
accuracy_score(y_train, y_pred_labels)