In [3]:
import numpy as np
import pandas as pd

In [39]:
# Load the raw competition files. Only a 10% subsample of the training set is
# kept to make experimentation fast; the seed is fixed so that a
# Restart & Run All selects the same rows (and thus reproduces the same model
# inputs) every time. Reading and sampling in one expression also keeps this
# cell idempotent.
train = pd.read_csv('./toxic/train.csv').sample(frac=0.1, random_state=42)
test = pd.read_csv('./toxic/test.csv')

In [57]:
# Keep 1% of the test rows for a quick prediction smoke test. Seeded so the
# same rows are picked on every fresh run.
# NOTE: this rebinds `test`, so re-running the cell without reloading the CSV
# shrinks the frame again.
test = test.sample(frac=0.01, random_state=42)

In [40]:
# Preview the first five sampled training rows.
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
20875,215761881896,She did it to me. How come you are not warning...,0,0,0,0,0,0
11266,115875573056,F1 2015 \nWhy did you delete the page F1 2015 ...,0,0,0,0,0,0
46463,485097075683,"""\nWell, the USA has an embassy in Beijing, Ch...",0,0,0,0,0,0
23173,239429844802,"""\nYes, that seems strange for Top Gear (2002 ...",0,0,0,0,0,0
87230,910308952502,I'm just a strong adherant to avoiding groupin...,0,0,0,0,0,0


In [42]:
# Column dtypes and non-null counts for the sampled training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9585 entries, 20875 to 47139
Data columns (total 8 columns):
id               9585 non-null int64
comment_text     9585 non-null object
toxic            9585 non-null int64
severe_toxic     9585 non-null int64
obscene          9585 non-null int64
threat           9585 non-null int64
insult           9585 non-null int64
identity_hate    9585 non-null int64
dtypes: int64(7), object(1)
memory usage: 673.9+ KB


In [43]:
# Summary statistics of the numeric columns; the mean of each 0/1 label
# column is the fraction of comments carrying that label.
train.describe()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,9585.0,9585.0,9585.0,9585.0,9585.0,9585.0,9585.0
mean,500020000000.0,0.09734,0.00939,0.056025,0.004382,0.050913,0.009703
std,288608700000.0,0.296435,0.096449,0.229982,0.066054,0.219831,0.098028
min,139353100.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,245621000000.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,503793800000.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,749613000000.0,0.0,0.0,0.0,0.0,0.0,0.0
max,999982800000.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
# Label columns to predict. Their order fixes the column order of the
# target matrix `y` built from `train[classes]`.
classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [45]:
# Target matrix: one row per training comment, one 0/1 column per label
# in `classes`.
y = train.loc[:, classes].values

In [46]:
# features   -> vocabulary cap given to the tokenizer and the Embedding layer
# max_length -> number of tokens every comment is padded/truncated to
features, max_length = 20000, 100

In [47]:
from keras.preprocessing import text, sequence
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.models import Model


In [48]:
# Word-level tokenizer whose vocabulary cap (`num_words`) is the same
# `features` value that sizes the Embedding layer in get_model().
tokenizer = text.Tokenizer(num_words=features)

In [49]:
# Sanity check only: displays the tokenizer object's repr.
tokenizer

<keras.preprocessing.text.Tokenizer at 0x7f2bedc0c090>

In [58]:
# Raw comment strings as numpy arrays for both splits; missing comments are
# replaced by the placeholder token "CVxTz" so the tokenizer always
# receives a string.
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("CVxTz").values
    for frame in (train, test)
)


In [51]:
# Build the word index from the training comments only, then encode those
# same comments as lists of integer word ids.
tokenizer.fit_on_texts(list_sentences_train.tolist())
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)

In [59]:
# Encode the test comments with the tokenizer fitted on the training text.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [60]:
# Bring every encoded comment to exactly `max_length` ids so the batches are
# rectangular (shorter comments come out zero-padded at the front, as the
# X_te[0] dump further down shows).
X_t, X_te = (
    sequence.pad_sequences(token_ids, maxlen=max_length)
    for token_ids in (list_tokenized_train, list_tokenized_test)
)

In [54]:
def get_model(embed_size=128, lstm_units=50, dense_units=50,
              dropout_rate=0.1, num_classes=6):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    The network reads sequences of `max_length` word ids (module-level
    constant) drawn from a vocabulary of `features` ids (module-level
    constant). All arguments default to the values that were previously
    hard-coded, so `get_model()` builds the same architecture as before.

    Args:
        embed_size: Dimension of the learned word embeddings.
        lstm_units: Hidden units per direction of the bidirectional LSTM.
        dense_units: Width of the fully connected ReLU layer.
        dropout_rate: Dropout rate applied before and after the dense layer.
        num_classes: Number of independent sigmoid outputs (one per label).

    Returns:
        A compiled Keras `Model` using binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    inp = Input(shape=(max_length, ))
    x = Embedding(features, embed_size)(inp)
    # return_sequences=True keeps the per-timestep outputs so that the
    # pooling layer below can take the maximum over the time axis.
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # Sigmoid rather than softmax: each label is scored independently.
    x = Dense(num_classes, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [55]:
# Training hyper-parameters used by the fit call below.
batch_size, epochs = 32, 2

# Freshly initialised, compiled network.
model = get_model()

In [56]:
# Train on the padded training sequences; validation_split=0.1 holds out
# 10% of the rows for validation (8626 train / 959 validate in the log below).
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1)


Train on 8626 samples, validate on 959 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f2c3ce12550>

In [61]:
# Model outputs for the sampled test comments: one row per comment and one
# sigmoid score per label in `classes`. Despite the name, these are
# predictions, not ground-truth labels.
y_test = model.predict(X_te)

In [62]:
# Inspect the padded id sequence of the first test comment.
X_te[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,    99,    12,     1,  2291,
          81,    15,    13,     7,    68,   646,     9,     8,    90,
        1236,     9,     7,    79,  8692, 11950,    54,     8,     3,
         472,  1302,     4,    44,   691,    17,   105,    15,    28,
         101,    37,    29,    99,    12,  3262,    36,    69,     3,     1], dtype=int32)

In [63]:
# Predicted per-label scores for that same first test comment.
y_test[0]

array([ 0.00951733,  0.00016216,  0.00263019,  0.00014698,  0.00204822,
        0.00094031], dtype=float32)

In [69]:
# Show up to 40 rows of the sampled test frame to eyeball the raw comments.
test.head(40)

Unnamed: 0,id,comment_text
21654,95539727572,":Thanks Reaverdrop, for the heads up on this. ..."
67874,299573338687,Pancho\nA nice job on Gonzales now if the bus...
99905,440825241207,"""\n Dear 207.165.148.1, hello, and welcome to ..."
221874,977476566922,== Merge from [WIKI_LINK: Supply chain diversi...
113231,499796815989,":I understand that, but you just disregarded w..."
15548,68386207364,This would be especially relevant to understan...
128002,564109329859,"""\n\n Holy Redeemer Catholic parish, Belize Ci..."
20712,91363299147,== Speedy deletion nomination of Plaza 8 ==
107816,475984698298,== Screw yourself ==
122827,541602497430,== Top infobox image ==


In [73]:
# For the first 1200 predictions, find the index of the highest-scoring label
# and list those that are not index 0 ("toxic", the first entry of `classes`).
# Done as a single vectorised argmax so that:
#   * no loop variable named `y` shadows the training labels (under Python 2
#     a list-comprehension variable leaks and would overwrite them), and
#   * argmax is computed once per row instead of twice.
# Caveat: the outputs are independent sigmoids, so an empty result only means
# "toxic" always has the largest score, not that any comment was flagged.
top_label = np.argmax(y_test[:1200], axis=1)
list(top_label[top_label != 0])

[]