In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Embedding, Input, Flatten, SpatialDropout1D
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, Dropout, AveragePooling1D, AveragePooling2D, GRU
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

#train = pd.read_csv('./train.csv')
#test = pd.read_csv('./test.csv')
df = pd.read_csv('./train.csv')
train, test = train_test_split(df, test_size=0.2)

Using TensorFlow backend.


In [61]:
#Replace all blank comments with text in training set
#extract training comments 
comments_train = train["comment_text"].fillna("cbarcelon").values
#extract the toxciity ratings
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_ratings = train[classes].values
test_ratings = test[classes].values
#extract test comments
comments_test = test["comment_text"].fillna("cbarcelon").values

#tokenizer the text
#vectorize text
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(comments_train))
tokenized_comments_train = tokenizer.texts_to_sequences(comments_train)
tokenized_comments_test = tokenizer.texts_to_sequences(comments_test)
#pad the text so each comment is uniform in length
X_train = sequence.pad_sequences(tokenized_comments_train, maxlen=150, truncating='post')
X_test = sequence.pad_sequences(tokenized_comments_test, maxlen=150,  truncating='post')

In [62]:
#load in glove embeddings
embeddings_index = dict()
f = open('glove.twitter.27B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1193514 word vectors.


In [70]:
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [2]:
#define GRU sequential model
gru = Sequential()
gru.add(Embedding(vocab_size, output_dim=100, weights=[embedding_matrix], input_length=150, trainable=True))
gru.add(SpatialDropout1D(.2))
gru.add(Bidirectional(GRU(128, return_sequences=True)))
gru.add(Bidirectional(GRU(64, return_sequences=True)))
gru.add(Bidirectional(GRU(32, return_sequences=True)))
gru.add(GlobalMaxPooling1D())
gru.add(GlobalAveragePooling1D())

gru.add(Dense(2000, activation = 'relu'))
gru.add(Dense(6, activation = "sigmoid"))

gru.summary()

NameError: name 'vocab_size' is not defined

In [114]:
gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [115]:
#create checkpoint file
file_path = "gru_weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

#early stop checkpoint
early = EarlyStopping(monitor='val_loss', mode='min', patience=5)
callbacks_list = [checkpoint, early] 

In [116]:
gru.fit(X_train, train_ratings, batch_size=700, epochs=4, validation_split=0.1, callbacks=callbacks_list)

Train on 114890 samples, validate on 12766 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fd638f01690>

In [117]:
#load the best weights
gru.load_weights(file_path)

#make predictions on test set
grupred = gru.predict(X_test)

In [111]:
from sklearn.metrics import roc_auc_score
def calc_auc(y_true, y_pred):
    return np.mean([roc_auc_score(y_true[:, i], y_pred[:, i]) 
                    for i in range(y_true.shape[1])])

In [118]:
auc = calc_auc(test_ratings, grupred)
print('gru aucroc ', auc)

('gru aucroc ', 0.98181420980178613)
