In [2]:
import os
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, Dense, Conv1D, Flatten, LSTM, GlobalMaxPooling1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [3]:
embed_num_dims = 100
max_seq_len = 1000

In [4]:
data = pd.read_csv('data/train.csv')

In [6]:
sentences = data['comment_text']

In [29]:
sentences.shape

(159571,)

In [7]:
sentences[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [8]:
dictt = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [10]:
# Y: dependent variable
Y = data[dictt].values

In [13]:
# Tokenizer
tokenizer = Tokenizer(num_words = 4000) # 规定需要保留的最大的词数
tokenizer.fit_on_texts(sentences) # 要训练的texts

In [18]:
sequence = tokenizer.texts_to_sequences(sentences) # 返回sentences的sequence

In [31]:
len(sequence)

159571

In [33]:
print(sequence[0])

[688, 75, 1, 126, 130, 177, 29, 672, 1116, 86, 331, 51, 2278, 50, 15, 60, 2756, 148, 7, 2937, 34, 117, 1221, 2825, 4, 45, 59, 244, 1, 365, 31, 1, 38, 27, 143, 73, 3462, 89, 3085, 2273, 985]


In [21]:
index_of_words = tokenizer.word_index # 讲单词映射为索引

In [37]:
index_of_words['we']

58

In [34]:
len(index_of_words)

210337

In [25]:
padded_seq = pad_sequences(sequence, maxlen = max_seq_len)

In [26]:
padded_seq

array([[   0,    0,    0, ..., 3085, 2273,  985],
       [   0,    0,    0, ...,  992,  589,  182],
       [   0,    0,    0, ...,    1,  737,  468],
       ...,
       [   0,    0,    0, ...,   23,   12, 3509],
       [   0,    0,    0, ...,  151,   34,   11],
       [   0,    0,    0, ..., 1627, 2056,   88]], dtype=int32)

In [27]:
padded_seq.shape

(159571, 1000)

In [38]:
from keras.utils import to_categorical

In [39]:
from sklearn.cross_validation import train_test_split

In [40]:
X_train, X_test, Y_train, Y_test = train_test_split(padded_seq, Y, train_size = 0.55)

In [41]:
Y_train.shape

(87764, 6)

In [42]:
X_train.shape

(87764, 1000)

In [43]:
X_test.shape

(71807, 1000)

In [None]:
embedd_index = {}

f = open('data/glove.6B.100d.txt')

for line in f:
    val = line.split()
    word = val[0]
    coff = np.asarray(val[1:], dtype = 'float')
    embedd_index[word] = coff
    
f.close()

In [45]:
print('Found %s word vectors.' % len(embedd_index))

Found 400000 word vectors.


In [46]:
embedd_index['good']

array([-0.030769 ,  0.11993  ,  0.53909  , -0.43696  , -0.73937  ,
       -0.15345  ,  0.081126 , -0.38559  , -0.68797  , -0.41632  ,
       -0.13183  , -0.24922  ,  0.441    ,  0.085919 ,  0.20871  ,
       -0.063582 ,  0.062228 , -0.051234 , -0.13398  ,  1.1418   ,
        0.036526 ,  0.49029  , -0.24567  , -0.412    ,  0.12349  ,
        0.41336  , -0.48397  , -0.54243  , -0.27787  , -0.26015  ,
       -0.38485  ,  0.78656  ,  0.1023   , -0.20712  ,  0.40751  ,
        0.32026  , -0.51052  ,  0.48362  , -0.0099498, -0.38685  ,
        0.034975 , -0.167    ,  0.4237   , -0.54164  , -0.30323  ,
       -0.36983  ,  0.082836 , -0.52538  , -0.064531 , -1.398    ,
       -0.14873  , -0.35327  , -0.1118   ,  1.0912   ,  0.095864 ,
       -2.8129   ,  0.45238  ,  0.46213  ,  1.6012   , -0.20837  ,
       -0.27377  ,  0.71197  , -1.0754   , -0.046974 ,  0.67479  ,
       -0.065839 ,  0.75824  ,  0.39405  ,  0.15507  , -0.64719  ,
        0.32796  , -0.031748 ,  0.52899  , -0.43886  ,  0.6740

In [56]:
embedding_matrix = np.zeros((len(index_of_words) + 1, embed_num_dims))

tokens = []
labels = []

for word, i in index_of_words.items():
    temp = embedd_index.get(word)
    
    if temp is not None:
        embedding_matrix[i] = temp
        
        # for plotting 
        tokens.append(embedding_matrix[i])
        labels.append(word)

In [57]:
embedding_matrix.shape

(210338, 100)

In [58]:
def tsne():
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    
    new_values = tsne_model.fit_transform(tokens[:200])
    print(new_values.shape)
    
    x = []
    y = []
    
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16,16))
    for i in range(len(x)):
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i],
                    xy=(x[i], y[i]),
                    xytext=(5,2),
                    textcoords='offset points',
                    ha='right',
                    va='bottom')
        
    plt.show()
    
tsne()

ValueError: array must not contain infs or NaNs

In [59]:
# embedding layer before the actual BLSTM

embedd_layer = Embedding(len(index_of_words) + 1, embed_num_dims, input_length = max_seq_len, weights = [embedding_matrix])

In [61]:
model = Sequential()
model.add(embedd_layer)
model.add(Bidirectional(LSTM(30, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
model.add(GlobalMaxPooling1D())
model.add(Dense(30,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(6,activation='sigmoid'))

In [62]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 100)         21033800  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1000, 60)          31440     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 60)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 30)                1830      
_________________________________________________________________
dropout_1 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 186       
Total params: 21,067,256
Trainable params: 21,067,256
Non-trainable params: 0
________________________________________________________________

In [63]:
from keras.models import model_from_json

def load():
    load_json = open('weights.json', 'r')
    loaded = load_json.read()
    
    load_json.close()
    
    load = model_from_json(loaded)
    load.load_weights('model.h5')
    print('Loaded')
    model = load

In [64]:
from keras.optimizers import Adam

add = Adam(lr = 0.01)
model.compile(loss = 'categorical_crossentropy', optimizer=add, metrics=['accuracy'])


In [66]:
hist = model.fit(X_train, Y_train, epochs=1, batch_size=500, validation_data=(X_test, Y_test))

Train on 87764 samples, validate on 71807 samples
Epoch 1/1


In [67]:
result = model.evaluate(X_test, Y_test)



In [70]:
print(result)

[0.2841468418561061, 0.9922709485147687]
