In [1]:
import numpy as np
import pandas as pd


In [2]:
# Label columns of the dataset; their order defines the column order of y_train.
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Fixed seed so the shuffle below is reproducible across kernel restarts.
shuffle_seed = 42

df_raw = pd.read_csv('input/train.csv')

# Work on the first 10000 rows only.
df_subset = df_raw.iloc[0:10000]

# Oversample the positive rows: append the first 100 `toxic` rows and the
# first 30 `severe_toxic` rows of the subset a second time.
df_train_toxic = df_subset.loc[(df_subset['toxic'] == 1)]
df_train_toxic = df_train_toxic.iloc[0:100]

df_train_severe_toxic = df_subset.loc[(df_subset['severe_toxic'] == 1)]
df_train_severe_toxic = df_train_severe_toxic.iloc[0:30]

# sample(frac=1) shuffles all rows; the original index labels are kept, so
# duplicated rows share an index label with their source row.
df_train = pd.concat([df_subset, df_train_toxic, df_train_severe_toxic]).sample(frac=1, random_state=shuffle_seed)

x_train = df_train['comment_text']
# `.values` replaces the deprecated `DataFrame.as_matrix()` (removed in pandas 1.0).
y_train = df_train[categories].values

display(df_train.head())
m = x_train.shape[0]  # number of training examples after oversampling
display(f'total m = {m}')


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
2885,07c7b3ccc7c5f128,ow! someone just blocked me! I don't think I c...,1,0,1,0,0,0
6586,119399eb6301e601,That was nice and civil wasn't it?,0,0,0,0,0,0
5007,0d41e1ae1b2dacab,older talk at: /helpfile\n notes as /notes,0,0,0,0,0,0
435,011c5f909f6956d2,Perhaps GoodDay you could provide a diff for t...,0,0,0,0,0,0
4619,0c3f6b56f0aa8355,I agree with everyone (except 202.47.52.206). ...,0,0,0,0,0,0


'total m = 10130'

In [3]:
from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams, pad_sequences

Using TensorFlow backend.


In [4]:
# Keras text tokenizer (from the keras.preprocessing.text wildcard import);
# lower=True asks it to lowercase the text before tokenizing.
tokenizer = Tokenizer(lower=True)

In [5]:
# Build the vocabulary from the training comments, then encode every comment
# as a list of integer word indices.
tokenizer.fit_on_texts(texts=x_train)
sequences = tokenizer.texts_to_sequences(texts=x_train)

# Length of the longest encoded comment (0 when there are no sequences).
max_seq_len = max((len(encoded) for encoded in sequences), default=0)
print('max length {}'.format(max_seq_len))

# Number of distinct words the tokenizer has indexed.
vocabulary_size = len(tokenizer.word_index)
display('vocabulary size {}'.format(vocabulary_size))

max length 1403


'vocabulary size 38830'

In [6]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten

embedding_dim = 50

# Embedding table with one row per word index (plus index 0); each input is a
# pair of word indices, hence input_length=2.
train_embedding = Embedding(
    input_dim=vocabulary_size + 1,
    output_dim=embedding_dim,
    input_length=2,
    name='embedding',
)

# Pair classifier: embed both words, flatten the two vectors, and map them to
# a single sigmoid output.
model = Sequential([
    train_embedding,
    Flatten(),
    Dense(1, name='dense'),
    Activation('sigmoid', name='activation'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [7]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Passed to skipgrams() as `negative_samples` (negative pairs per positive pair).
negative_samples = 5


def skpgram_generator():
    """Yield one skip-gram training batch per encoded comment.

    Reads the notebook globals ``sequences``, ``vocabulary_size`` and
    ``negative_samples``.

    Yields:
        tuple: ``(x_input, y_input)`` where ``x_input`` is the array of word-index
        pairs and ``y_input`` the array of matching labels returned by
        ``skipgrams``. A comment that produces no pairs yields a single
        all-zero pair with label 0, so exactly one batch is produced per
        comment and the generator length matches ``len(sequences)``.
    """
    # skipgrams() documents `vocabulary_size` as "maximum possible word index + 1"
    # and draws negatives from [1, vocabulary_size - 1]. The Tokenizer reserves
    # index 0, so word indices go up to len(word_index); passing that value
    # unchanged would exclude the highest-indexed word from negative sampling.
    sampling_vocabulary_size = vocabulary_size + 1

    # The raw (unpadded) sequences are used on purpose: skipgrams() skips
    # index 0, so zero-padding would add no pairs and only cost time.
    for sequence in sequences:
        couples, labels = skipgrams(
            sequence=sequence,
            vocabulary_size=sampling_vocabulary_size,
            negative_samples=negative_samples,
            shuffle=True,
        )
        if not couples:
            # Placeholder batch keeps the one-batch-per-comment contract.
            yield np.zeros((1, 2)), np.zeros((1, 1))
            continue
        yield np.array(couples), np.array(labels)


In [8]:
# Train the skip-gram pair classifier for one epoch. skpgram_generator yields
# one batch per training comment, so steps_per_epoch=m consumes it exactly once.
model.fit_generator(generator=skpgram_generator(), steps_per_epoch=m, epochs=1)


Epoch 1/1


<keras.callbacks.History at 0x109bc6cc0>

In [10]:
# Frozen embedding layer initialised with the weights learned by the skip-gram
# model. The classifier inputs are post-padded with 0 up to max_seq_len, so
# mask_zero=True is set to make the recurrent layer skip the padded timesteps;
# without it the GRU's final state is computed after stepping through all the
# trailing padding.
embedding = Embedding(vocabulary_size + 1, embedding_dim, input_length=max_seq_len,
                      weights=train_embedding.get_weights(), trainable=False,
                      mask_zero=True)

from keras.layers import Bidirectional, LSTM, GRU, Dense

# Multi-label classifier: embedded comment -> GRU -> one sigmoid per category.
input_model = Sequential()
input_model.add(embedding)
input_model.add(GRU(6))
input_model.add(Dense(6, activation='sigmoid'))

# binary_crossentropy treats each of the 6 outputs as an independent yes/no label.
input_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
batch_size = 1024
n_epochs = 2


def seq_generator(epochs, batch_size=1):
    """Yield padded comment batches with their label rows, `epochs` times over.

    Reads the notebook globals ``sequences``, ``max_seq_len`` and ``y_train``.

    Args:
        epochs: Number of full passes over the training data to generate.
        batch_size: Number of comments per yielded batch. The default of 1
            yields one ``(1, max_seq_len)`` / ``(1, n_labels)`` pair per comment.

    Yields:
        tuple: ``(x_input, y_input)`` with shapes ``(b, max_seq_len)`` and
        ``(b, n_labels)``, where ``b`` is ``batch_size`` except possibly for
        the last batch of each pass.
    """
    padded_sequences = pad_sequences(sequences=sequences, maxlen=max_seq_len, padding='post')
    # Pair inputs and labels by position; stop at the shorter of the two.
    n_samples = min(len(padded_sequences), len(y_train))
    for _ in range(epochs):
        for start in range(0, n_samples, batch_size):
            stop = min(start + batch_size, n_samples)
            yield padded_sequences[start:stop], y_train[start:stop]


# One epoch must cover every sample once, including a final partial batch.
steps_per_epoch = int(np.ceil(m / batch_size))

hist = input_model.fit_generator(generator=seq_generator(n_epochs, batch_size=batch_size),
                                 steps_per_epoch=steps_per_epoch, epochs=n_epochs)

# Print the recorded metrics themselves (not the bound `items` method).
print(hist.history)


Epoch 1/2
Epoch 2/2

In [None]:
def eval_generator():
    """Yield every training comment as a single-row, post-padded batch.

    Each yielded array has shape ``(1, max_seq_len)``; rows follow the order
    of ``df_train['comment_text']``.
    """
    encoded_comments = tokenizer.texts_to_sequences(df_train['comment_text'])
    padded_comments = pad_sequences(encoded_comments, maxlen=max_seq_len, padding='post')
    for row in padded_comments:
        single_batch = np.array(row).reshape(1, max_seq_len)
        yield single_batch

# One prediction step per training row.
evaluation_result = input_model.predict_generator(generator=eval_generator(), steps=len(df_train))

In [None]:
# Show the shuffled, oversampled training frame (comments with their true labels).
display(df_train)


In [None]:
# Label the predictions so they can be read against df_train: columns follow
# `categories` (the order used to build y_train) and rows reuse df_train's
# index, since eval_generator walks df_train['comment_text'] in order.
display(pd.DataFrame(evaluation_result, columns=categories, index=df_train.index))