In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import math

from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams, pad_sequences

Using TensorFlow backend.


In [2]:
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

df_train = pd.read_csv('input/train.csv')

x_train = df_train['comment_text']
y_train = df_train[categories].as_matrix()

display(df_train.head())
m = x_train.shape[0]
display(f'total m = {m}')

tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(texts=x_train)

vocabulary_size = len(tokenizer.word_index)
sequences = tokenizer.texts_to_sequences(texts=x_train)

max_seq_len = 0
for sequence in sequences:
    max_seq_len = max(max_seq_len, len(sequence))

print('max length {}'.format(max_seq_len))

min_seq_len = max_seq_len
for sequence in sequences:
    min_seq_len = min(min_seq_len, len(sequence))
    
print('min length {}'.format(min_seq_len))



Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


'total m = 159571'

max length 1403
min length 1


In [3]:
np.mean([len(x) for x in sequences])


68.221569082101382

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten
from keras.metrics import categorical_accuracy

embedding_dim = 50
weights = np.load('output/embedding_weights_20180312.npy')

embedding = Embedding(vocabulary_size + 1, embedding_dim, weights=weights, trainable=False)

from keras.layers import Bidirectional, LSTM, GRU, Dense

import keras.backend as K

toxic_weighting = 2.0

import tensorflow as tf

def init_weighted_binary_crossentropy(one_weights, zero_weights):

    def weight_binary_crossentropy(y_true, y_pred):
        y_pred = K.clip(y_pred, K.epsilon(), (1.0 - K.epsilon()))
        
        o_terms = tf.multiply(tf.multiply(y_true, tf.log(y_pred)), one_weights)
        z_terms = tf.multiply(tf.subtract(1.0, y_true), tf.log(tf.subtract(1.0, y_pred)))

        terms = tf.add(o_terms, z_terms)
        terms = tf.negative(K.mean(terms))

        return K.mean(terms)
    
    return weight_binary_crossentropy

input_model = Sequential()
input_model.add(embedding)
input_model.add(Bidirectional(LSTM(64)))
input_model.add(Dense(6, activation='sigmoid'))

c_count = [15294, 1595, 8449, 478, 7877, 1405]
o_weights = [max(1.0, math.log(0.70 * m / float(c))) for c in c_count]
print(o_weights)
z_weights = np.zeros(6)

input_model.compile(optimizer='adam', loss=init_weighted_binary_crossentropy(o_weights, z_weights), metrics=[categorical_accuracy])

[1.9883534256840718, 4.248940284069297, 2.5817659291353974, 5.4539585667967865, 2.6518668995849177, 4.375776717520396]


In [5]:
batch_size = 3072

max_len = 100

def inputs():
    padded_sequences = pad_sequences(sequences=sequences, maxlen=max_len, padding='post')
    x_list = []
    y_list = []
    for sequence, y_train_val in zip(padded_sequences, y_train):
        x_list.append(sequence)
        y_list.append(y_train_val)
    return np.array(x_list), np.array(y_list)

x_inputs, y_inputs = inputs()

hist = input_model.fit(x=x_inputs, y=y_inputs, epochs=20, batch_size=batch_size, shuffle=True)

print(hist.history.items())


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
dict_items([('loss', [0.34074443263684417, 0.2818681114807482, 0.27754681579824081, 0.27389591039838868, 0.26729537701827283, 0.25675827328538092, 0.25345750279738277, 0.24349939269811804, 0.24115482206150995, 0.23422783297207442, 0.22853277482879084, 0.22547081999436414, 0.22389961879653333, 0.22194479719589613, 0.21411467051836766, 0.21533431935822175, 0.21684206476135109, 0.2084069662868826, 0.20713438770546325, 0.2026038954302094]), ('categorical_accuracy', [0.7108434464198905, 0.9241027502954563, 0.95536156752201995, 0.94212607490279865, 0.93131583942807816, 0.94261488480163358, 0.94953343053787176, 0.95806255180951627, 0.95477875010499635, 0.96431682644855521, 0.95865163767592776, 0.97210646424475888, 0.95373219411690535, 0.95516729258524868, 0.97349142

In [6]:
y_pred = input_model.predict(x=x_inputs[0:1000, :])

In [7]:
cat_max = [np.max(y_pred[:,i]) for i in range(len(categories))]

display(cat_max)

display(y_pred)

y_pred_labels = np.zeros((y_pred.shape[0], y_pred.shape[1]))

for cmax, i in zip(cat_max, range(len(categories))):
    y_pred_labels[y_pred[:,i] >= cmax, i] = 1
    y_pred_labels[y_pred[:,i] < cmax, i] = 0


[0.89519352, 0.82692993, 0.90450907, 0.41762441, 0.90842015, 0.56149977]

array([[ 0.04515218,  0.00324848,  0.02079026,  0.00253346,  0.02343567,
         0.00421558],
       [ 0.11117588,  0.01238674,  0.06037787,  0.00652811,  0.0509252 ,
         0.01808086],
       [ 0.05455003,  0.00463146,  0.04517697,  0.00419739,  0.03413592,
         0.00487014],
       ..., 
       [ 0.0293682 ,  0.00140283,  0.01754333,  0.00195153,  0.01610556,
         0.00315573],
       [ 0.01945841,  0.00098746,  0.01308971,  0.00067254,  0.00921343,
         0.00192902],
       [ 0.06026293,  0.00364184,  0.03679466,  0.00436013,  0.03072782,
         0.00960463]], dtype=float32)

In [8]:
display(y_pred_labels)



array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])

In [10]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train[0:1000,:], y_pred_labels)

0.89200000000000002

In [34]:
df_test = pd.read_csv('input/test.csv')

x_test = df_test['comment_text']

print(x_test.shape)

(153164,)


In [35]:
sequences_test = tokenizer.texts_to_sequences(x_test)
padded_sequences_test = pad_sequences(sequences=sequences_test, maxlen=max_len, padding='post')

print(len(padded_sequences_test))

153164


In [41]:
y_test_pred = input_model.predict(padded_sequences_test)

In [42]:
print(y_test_pred.shape)

test_cat_max = [np.max(y_test_pred[:,i]) for i in range(len(categories))]

print(test_cat_max)

(153164, 6)
[0.97336537, 0.91963977, 0.96198243, 0.68882245, 0.96235347, 0.81159014]


In [43]:
y_test_pred_labels = np.zeros((y_test_pred.shape[0], y_test_pred.shape[1]))

print(y_test_pred_labels.shape)

(153164, 6)


In [44]:
y_test_pred_labels[y_test_pred[:,0] >= 0.90, 0] = 1
y_test_pred_labels[y_test_pred[:,1] >= 0.90, 1] = 1
y_test_pred_labels[y_test_pred[:,2] >= 0.90, 2] = 1
y_test_pred_labels[y_test_pred[:,3] >= 0.60, 3] = 1
y_test_pred_labels[y_test_pred[:,4] >= 0.90, 4] = 1
y_test_pred_labels[y_test_pred[:,5] >= 0.80, 5] = 1


display(y_test_pred_labels)

array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])

In [45]:
df_submission = pd.DataFrame()
df_submission['id'] = df_test['id']
for c, i in zip(categories, range(len(categories))):
    df_submission[c] = y_test_pred_labels[:, i]

    
print(df_submission.shape)

df_submission.to_csv('output/lstm_submission.csv',index=False)

(153164, 7)
