In [5]:
import codecs
import collections
import csv
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Load the labelled training comments and the unlabelled test comments.
train, test = map(pd.read_csv, ('input/train.csv', 'input/test.csv'))



In [6]:
# Target columns of the multi-label problem; their order fixes the column order of `y`.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Label matrix: one row per training comment, one column per class.
y = train[list_classes].values

# Raw comment text for both splits.
list_sentences_train, list_sentences_test = train["comment_text"], test["comment_text"]

In [9]:
# Peek at the label matrix built from train[list_classes]: one row per training
# comment, one column per entry of list_classes.
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ..., 
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [11]:

# Vocabulary size cap: passed to the tokenizer as `num_words` and reused later
# as the input dimension of the Embedding layer.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)

# The vocabulary is fitted on the train and test comments together.
tokenizer.fit_on_texts(
    list(list_sentences_train) + list(list_sentences_test)
)

# Turn every comment of each split into a list of integer word indices.
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

# Show the first five tokenized training comments.
list_tokenized_train[:5]

[[733,
  78,
  1,
  140,
  131,
  182,
  30,
  712,
  4438,
  10284,
  1252,
  86,
  368,
  51,
  2230,
  14039,
  49,
  6744,
  15,
  60,
  2624,
  151,
  7,
  2832,
  33,
  115,
  1246,
  16129,
  2517,
  5,
  50,
  59,
  256,
  1,
  370,
  31,
  1,
  46,
  29,
  144,
  72,
  3931,
  89,
  4208,
  6368,
  2687,
  1183],
 [52,
  2911,
  13,
  450,
  3782,
  72,
  4871,
  2676,
  21,
  95,
  46,
  912,
  3225,
  1024,
  616,
  9983,
  216],
 [455,
  400,
  72,
  126,
  14,
  268,
  2,
  79,
  326,
  71,
  49,
  9,
  13,
  613,
  8,
  2470,
  530,
  511,
  105,
  5,
  609,
  2,
  37,
  330,
  140,
  382,
  3,
  30,
  46,
  29,
  52,
  195,
  2,
  466,
  58,
  35,
  1,
  2404,
  93,
  1,
  743,
  487],
 [58,
  7,
  229,
  97,
  55,
  325,
  1408,
  15,
  2120,
  7,
  5715,
  22,
  1,
  116,
  2388,
  56,
  16,
  519,
  15,
  25,
  4,
  3854,
  3,
  1369,
  3,
  10640,
  7,
  65,
  1,
  295,
  87,
  122,
  13597,
  36,
  9,
  51,
  18,
  40,
  10,
  1,
  1523,
  139,
  1296,
  2126,
  435

In [13]:
# Inspect the per-word occurrence counts collected by fit_on_texts. The output
# is not sorted by frequency (it appears to follow first-seen order).
tokenizer.word_counts

OrderedDict([('explanation', 3095),
             ('why', 31804),
             ('the', 917801),
             ('edits', 16189),
             ('made', 17181),
             ('under', 12228),
             ('my', 78385),
             ('username', 3172),
             ('hardcore', 320),
             ('metallica', 91),
             ('fan', 1679),
             ('were', 28080),
             ('reverted', 6605),
             ('they', 51322),
             ("weren't", 820),
             ('vandalisms', 54),
             ('just', 52182),
             ('closure', 178),
             ('on', 161441),
             ('some', 42158),
             ('gas', 662),
             ('after', 15124),
             ('i', 364217),
             ('voted', 599),
             ('at', 72989),
             ('new', 19602),
             ('york', 1685),
             ('dolls', 43),
             ('fac', 696),
             ('and', 408961),
             ('please', 51937),
             ("don't", 42820),
             ('remove', 8962),
   

In [18]:
# Vocabulary ranked by corpus frequency, most frequent first.
# `collections` is imported in the notebook's top import cell rather than here,
# so the notebook's dependencies are all visible in one place.
# `sorted` is stable, so words with equal counts keep their relative order
# from tokenizer.word_counts.
collections.OrderedDict(
    sorted(
        tokenizer.word_counts.items(),
        key=lambda item: item[1],
        reverse=True,
    )
)


OrderedDict([('the', 917801),
             ('to', 538991),
             ('of', 410735),
             ('a', 409108),
             ('and', 408961),
             ('you', 370216),
             ('i', 364217),
             ('is', 333927),
             ('that', 282864),
             ('in', 269391),
             ('it', 241839),
             ('for', 185939),
             ('this', 181030),
             ('not', 168604),
             ('on', 161441),
             ('be', 153140),
             ('as', 140529),
             ('are', 132419),
             ('have', 128011),
             ('your', 111923),
             ('with', 109334),
             ('if', 106433),
             ('article', 102987),
             ('was', 99465),
             ('or', 96861),
             ('but', 95605),
             ('an', 81065),
             ('wikipedia', 79242),
             ('page', 78860),
             ('my', 78385),
             ('from', 74592),
             ('by', 74398),
             ('at', 72989),
             ('do', 7

In [19]:
# Fixed sequence length fed to the network; pad_sequences brings every
# tokenized comment to this length.
maxlen = 200
X_t, X_te = (
    pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_tokenized_train, list_tokenized_test)
)

In [20]:
# Symbolic input for sequences of length `maxlen`.
# NOTE(review): the model-building cell further down starts with this same
# assignment, so this standalone cell looks redundant.
inp = Input(shape=(maxlen, ))

In [24]:
# First padded training sequence; the displayed output starts with a long run
# of zeros, i.e. the padding sits at the front of the sequence.
X_t[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [None]:


# --- Model: embedding -> LSTM -> max-pool over time -> dense head ------------
inp = Input(shape=(maxlen,))
embed_size = 128

x = Embedding(max_features, embed_size)(inp)
# return_sequences=True keeps the per-timestep outputs for the pooling layer.
x = LSTM(60, name='lstm_layer', return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# Six sigmoid units, one per label in list_classes.
x = Dense(6, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# --- Train (10% of the training rows are held out for validation) ------------
print("start fitting...")
model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1)

# --- Predict on the test split -----------------------------------------------
print("start preditcting...")
y_pred = model.predict(X_te, batch_size=1024)

# --- Submission file: the id column followed by one score column per class ---
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_pos, class_name in enumerate(list_classes):
    submission[class_name] = y_pred[:, class_pos]
submission.to_csv('submission.csv', index=False)

start fitting...
Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2