In [1]:
import keras

import pandas as pd
import numpy as np
import re, string

from keras.models import Sequential
from keras.layers import Input, Embedding, Dense, Dropout, LSTM, GlobalMaxPool1D, Bidirectional
from keras.layers import BatchNormalization, concatenate
from keras.preprocessing import text, sequence
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import stopwords
import collections
import nltk
nltk.download('stopwords')
stopwords = set(stopwords.words("english"))
import gensim
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import wordpunct_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Mount Google Drive at /content/drive; the CSV inputs and the model
# checkpoint used by later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Locations of the comment data sets on the mounted Drive.
train_csv = '/content/drive/My Drive/Colab Notebooks/comment/train.csv'
test_csv = '/content/drive/My Drive/Colab Notebooks/comment/test.csv'

# `train` carries the label columns used for fitting; `test` is only scored.
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [4]:
# Sequence length fed to the model's Input layer.
max_length = 100

# Multi-label targets: one column per label, in this fixed order.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
Y_train = train[label_columns].values
print(Y_train.shape)

# Tokenise every training comment, then learn a phrase model on those tokens
# and apply it so that frequent word pairs are merged into single tokens
# (e.g. 'january_utc').
simple_tokens = train["comment_text"].apply(gensim.utils.simple_preprocess)
phrases = gensim.models.phrases.Phrases(simple_tokens)
tokenizer = gensim.models.phrases.Phraser(phrases)
tokenized_text = [tokens for tokens in tokenizer[simple_tokens]]

# Token <-> integer id mapping built from the phrase-merged training corpus.
corpus_dict = gensim.corpora.dictionary.Dictionary(tokenized_text)

# Word vectors trained on the same corpus; tokens seen fewer than
# `min_count` times get no vector.
word2vec = gensim.models.word2vec.Word2Vec(
    tokenized_text,
    size=100,
    window=5,
    min_count=2,
    workers=6,
)

(159571, 6)


In [5]:
# Sanity check: show one raw comment next to its tokenised form.
sample_idx = 1
print(train["comment_text"][sample_idx])
print(tokenized_text[sample_idx])

# Nearest neighbours in the learned embedding space (displayed as cell output).
word2vec.wv.most_similar('shit')

D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
['aww', 'he', 'matches', 'this', 'background', 'colour', 'seemingly', 'stuck', 'with', 'thanks', 'talk', 'january_utc']


[('crap', 0.8439592123031616),
 ('fucking', 0.816267192363739),
 ('bullshit', 0.7952636480331421),
 ('bitch', 0.7541518211364746),
 ('asshole', 0.7289239168167114),
 ('garbage', 0.720348060131073),
 ('dick', 0.7169774174690247),
 ('loser', 0.7101936340332031),
 ('scum', 0.7099797129631042),
 ('fuck', 0.7084515690803528)]

In [6]:
MAX_SEQ_LEN = 100

# Map every token to its dictionary id shifted by one, so that id 0 stays
# free for the padding value used below.
docs = [
    [token_id + 1 for token_id in corpus_dict.doc2idx(tokens)]
    for tokens in tokenized_text
]

# Fixed-width id matrix: comments longer than MAX_SEQ_LEN keep their first
# MAX_SEQ_LEN ids, shorter ones are filled with 0.
X_train = keras.preprocessing.sequence.pad_sequences(
    docs,
    maxlen=MAX_SEQ_LEN,
    truncating='post',
    value=0,
)

# Largest id that occurs anywhere; empty documents contribute nothing.
max_idx = max(max(ids) for ids in docs if ids)
print(X_train.shape, max_idx)

(159571, 100) 185872


In [7]:
# Build the initial embedding table, one row per id used in X_train.
# Row 0 belongs to the padding id and is drawn at random.
vector_rows = [np.random.normal(size=word2vec.vector_size)]

# Row i + 1 belongs to dictionary id i (ids were shifted by one when the
# sequences were built). Tokens without a word2vec vector get a random row.
for token_id in range(max_idx):
    token = corpus_dict[token_id]
    if token in word2vec.wv.vocab:
        vector_rows.append(word2vec.wv[token])
    else:
        vector_rows.append(np.random.normal(size=word2vec.vector_size))

embeddings = np.array(vector_rows)
embeddings.shape

(185873, 100)

In [8]:
# Bidirectional-LSTM multi-label classifier over padded token-id sequences.
#
# The input width must equal the width X_train was padded to, so the same
# MAX_SEQ_LEN constant is used for both the Input and the Embedding layer
# (previously the Input used a separate `max_length` constant that merely
# happened to have the same value).
inp_text = Input(shape=(MAX_SEQ_LEN, ))

# Embedding table has max_idx + 1 rows (ids 0..max_idx, 0 being padding) and
# starts from the word2vec-based `embeddings` matrix.
x = Embedding(max_idx + 1, word2vec.vector_size, weights=[embeddings], input_length=MAX_SEQ_LEN)(inp_text)
x = Bidirectional(LSTM(200, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# Collapse the time axis by taking the per-feature maximum over all steps.
x = GlobalMaxPool1D()(x)

x = Dense(75, activation="relu")(x)
x = Dropout(0.1)(x)
# Six sigmoid outputs, one per label column of Y_train.
x = Dense(6, activation="sigmoid")(x)

model = Model(inputs=[inp_text], outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 100)          18587300  
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 400)          481600    
_________________________________________________________________
global_max_pooling1d (Global (None, 400)               0         
_________________________________________________________________
dense (Dense)                (None, 75)                30075     
_________________________________________________________________
dropout (Dropout)            (None, 75)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 456   

In [None]:
# Checkpoint file on Drive. To resume from an earlier run, call
# model.load_weights(file_path) before fitting.
file_path = "/content/drive/My Drive/Colab Notebooks/comment/bilstm_w2v_weight.h5"

# Keep only the weights of the epoch with the lowest validation loss.
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# NOTE(review): patience (20) exceeds the number of epochs (3), so this
# callback presumably never stops the run as configured — confirm intent.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early]

hist = model.fit(
    [X_train],
    Y_train,
    batch_size=32,
    epochs=3,
    validation_split=0.1,
    shuffle=False,
    callbacks=callbacks_list,
)


Epoch 1/3

In [None]:
print(hist.history.keys())

# The accuracy metric is stored as 'acc' by older Keras releases and as
# 'accuracy' by Keras >= 2.3 / tf.keras 2.x; use whichever key is present
# instead of raising KeyError on the other.
acc_key = 'acc' if 'acc' in hist.history else 'accuracy'

# One figure per metric: training curve vs. the 'val_'-prefixed curve.
# The legend entry 'test' refers to the validation curve.
for history_key, label in ((acc_key, 'accuracy'), ('loss', 'loss')):
    plt.plot(hist.history[history_key])
    plt.plot(hist.history['val_' + history_key])
    plt.title('model ' + label)
    plt.ylabel(label)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
def comment_to_sequential_input(comment):
    """Encode one raw comment as a fixed-length vector of embedding row ids.

    The comment is tokenised with the same preprocessing and phrase model as
    the training corpus, mapped to dictionary ids shifted by one (0 is the
    padding id), and padded/truncated to MAX_SEQ_LEN.

    Args:
        comment: Raw comment text. Anything that is not str/bytes (e.g. a NaN
            from a missing CSV field) is treated as an empty comment.

    Returns:
        1-D integer array of length MAX_SEQ_LEN.
    """
    # Missing values would otherwise make simple_preprocess raise.
    if not isinstance(comment, (str, bytes)):
        comment = ""
    tokens = tokenizer[gensim.utils.simple_preprocess(comment)]
    # NOTE(review): tokens lacking a word2vec vector are dropped here, whereas
    # the training sequences keep them (with a randomly initialised embedding
    # row) — confirm which behaviour is intended.
    t_ids = [corpus_dict.token2id[t] + 1 for t in tokens if t in word2vec.wv.vocab and t in corpus_dict.token2id]
    # Must mirror the training-time padding call: truncating='post' keeps the
    # FIRST MAX_SEQ_LEN tokens of long comments (the library default keeps the
    # last ones, which would feed the model a different part of the text).
    return keras.preprocessing.sequence.pad_sequences(
        [t_ids], maxlen=MAX_SEQ_LEN, truncating='post', value=0)[0]

# Encode every test comment and stack the per-comment id vectors row-wise.
X_test = np.array(list(map(comment_to_sequential_input, test.comment_text)))

In [None]:
# Restore the checkpointed weights from file_path, then score the encoded
# test comments; Y_test holds one row of six label scores per comment.
model.load_weights(file_path)
Y_test = model.predict(
    [X_test],
    verbose=1,
    batch_size=1024,
)

In [None]:
# Fill the sample-submission template with the predicted label scores.
# NOTE(review): assumes the template rows are in the same order as test.csv —
# confirm before submitting.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

final_test = pd.read_csv("../data/sample_submission.csv")
final_test[label_columns] = Y_test
final_test.to_csv("bilstm_w2v_weight.csv", index=False)

final_test.head()