In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import math

from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams, pad_sequences

Using TensorFlow backend.


In [2]:
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

df_train = pd.read_csv('input/train.csv')

x_train = df_train['comment_text']
y_train = df_train[categories].as_matrix()

display(df_train.head())
m = x_train.shape[0]
display(f'total m = {m}')

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


'total m = 159571'

In [3]:
embedding_index = {}

f = open('input/glove.6B.300d.txt')
for line in f:
    values = line.split()
    w = values[0]
    weights = np.asarray(values[1:], dtype='float32')
    embedding_index[w] = weights
f.close()

In [4]:
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(texts=x_train)

vocabulary_size = len(tokenizer.word_index)
sequences = tokenizer.texts_to_sequences(texts=x_train)

max_seq_len = 0
for sequence in sequences:
    max_seq_len = max(max_seq_len, len(sequence))

print('max length {}'.format(max_seq_len))

min_seq_len = max_seq_len
for sequence in sequences:
    min_seq_len = min(min_seq_len, len(sequence))
    
print('min length {}'.format(min_seq_len))

max length 1403
min length 1


In [5]:
np.mean([len(x) for x in sequences])


68.221569082101382

In [6]:
embedding_dim = 300

embedding_matrix = np.zeros((vocabulary_size + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten
from keras.metrics import categorical_accuracy
from keras.layers import Bidirectional, LSTM, GRU, Dense

import keras.backend as K
import tensorflow as tf

embedding = Embedding(vocabulary_size + 1, embedding_dim, weights=[embedding_matrix], trainable=False)

def init_weighted_binary_crossentropy(one_weights, zero_weights):

    def weight_binary_crossentropy(y_true, y_pred):
        y_pred = K.clip(y_pred, K.epsilon(), (1.0 - K.epsilon()))
        
        o_terms = tf.multiply(tf.multiply(y_true, tf.log(y_pred)), one_weights)
        z_terms = tf.multiply(tf.subtract(1.0, y_true), tf.log(tf.subtract(1.0, y_pred)))

        terms = tf.add(o_terms, z_terms)
        terms = tf.negative(K.mean(terms))

        return K.mean(terms)
    
    return weight_binary_crossentropy

input_model = Sequential()
input_model.add(embedding)
input_model.add(Bidirectional(LSTM(32)))
input_model.add(Dense(6, activation='sigmoid'))

c_count = [15294, 1595, 8449, 478, 7877, 1405]
o_weights = [max(1.0, math.log(0.50 * m / float(c))) for c in c_count]
print(o_weights)
z_weights = np.zeros(6)

input_model.compile(optimizer='adam', loss=init_weighted_binary_crossentropy(o_weights, z_weights), metrics=[categorical_accuracy])

[1.6518811890628589, 3.912468047448084, 2.2452936925141844, 5.1174863301755735, 2.3153946629637048, 4.039304480899183]


In [8]:
batch_size = 3072

max_len = 300

def inputs():
    padded_sequences = pad_sequences(sequences=sequences, maxlen=max_len, padding='post')
    x_list = []
    y_list = []
    for sequence, y_train_val in zip(padded_sequences, y_train):
        x_list.append(sequence)
        y_list.append(y_train_val)
    return np.array(x_list), np.array(y_list)

x_inputs, y_inputs = inputs()

hist = input_model.fit(x=x_inputs, y=y_inputs, epochs=25, batch_size=batch_size, shuffle=True)

print(hist.history.items())


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40

KeyboardInterrupt: 

In [9]:
y_pred = input_model.predict(x=x_inputs[0:1000, :])

In [10]:
cat_max = [np.max(y_pred[:,i]) for i in range(len(categories))]

display(cat_max)

display(y_pred)

y_pred_labels = np.zeros((y_pred.shape[0], y_pred.shape[1]))

for cmax, i in zip(cat_max, range(len(categories))):
    y_pred_labels[y_pred[:,i] >= cmax, i] = 1
    y_pred_labels[y_pred[:,i] < cmax, i] = 0


[0.99770749, 0.89564937, 0.99714655, 0.71901119, 0.97666901, 0.93951154]

array([[ 0.00125607,  0.00013607,  0.00068925,  0.00038308,  0.0005201 ,
         0.00022621],
       [ 0.00251123,  0.00021255,  0.00112786,  0.00041527,  0.00075355,
         0.00038411],
       [ 0.01852244,  0.00060659,  0.00832517,  0.00072019,  0.00672409,
         0.00055937],
       ..., 
       [ 0.0018507 ,  0.00017301,  0.00100719,  0.00036031,  0.00060202,
         0.00031139],
       [ 0.09676775,  0.00080178,  0.01402193,  0.00266502,  0.01525144,
         0.00199935],
       [ 0.00909878,  0.00030876,  0.00228652,  0.00049891,  0.0013282 ,
         0.00060197]], dtype=float32)

In [11]:
display(y_pred_labels)



array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train[0:1000,:], y_pred_labels)

0.89200000000000002

In [13]:
df_test = pd.read_csv('input/test.csv')

x_test = df_test['comment_text']

print(x_test.shape)

(153164,)


In [14]:
sequences_test = tokenizer.texts_to_sequences(x_test)
padded_sequences_test = pad_sequences(sequences=sequences_test, maxlen=max_len, padding='post')

print(len(padded_sequences_test))

153164


In [15]:
y_test_pred = input_model.predict(padded_sequences_test)

In [20]:
print(y_test_pred.shape)

test_cat_max = [np.max(y_test_pred[:,i]) for i in range(len(categories))]

print(test_cat_max)

test_cat_min = [np.min(y_test_pred[:,i]) for i in range(len(categories))]

print(test_cat_min)

(153164, 6)
[0.99961036, 0.99925131, 0.99978405, 0.98086286, 0.99352783, 0.99145246]
[0.00027808954, 1.4030286e-05, 0.00012915889, 2.5833902e-05, 0.00011248782, 8.6635955e-06]


In [21]:
y_test_pred_labels = np.zeros((y_test_pred.shape[0], y_test_pred.shape[1]))

print(y_test_pred_labels.shape)

(153164, 6)


In [23]:
threshold = [0.95, 0.95, 0.95, 0.95, 0.95, 0.95]

for t, i in zip(threshold, range(len(threshold))):
    y_test_pred_labels[y_test_pred[:,i] >= t, i] = 1

display(y_test_pred_labels)

array([[ 1.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])

In [24]:
df_submission = pd.DataFrame()
df_submission['id'] = df_test['id']
for c, i in zip(categories, range(len(categories))):
    df_submission[c] = y_test_pred_labels[:, i]

    
print(df_submission.shape)

df_submission.to_csv('output/lstm_glve_submission.csv',index=False)

(153164, 7)
