In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Embedding, Input, Flatten, SpatialDropout1D
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, AveragePooling1D, AveragePooling2D, GRU
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
#df = pd.read_csv('./train.csv')
#train, test = train_test_split(df, test_size=0.2)

Using TensorFlow backend.


In [2]:
#Replace all blank comments with text in training set
#extract training comments 
comments_train = train["comment_text"].fillna("cbarcelon").values
#extract the toxciity ratings
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_ratings = train[classes].values
#test_ratings = test[classes].values
#extract test comments
comments_test = test["comment_text"].fillna("cbarcelon").values

#tokenizer the text
#vectorize text
tokenizer = text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(list(comments_train))
tokenized_comments_train = tokenizer.texts_to_sequences(comments_train)
tokenized_comments_test = tokenizer.texts_to_sequences(comments_test)
#pad the text so each comment is uniform in length
X_train = sequence.pad_sequences(tokenized_comments_train, maxlen=800, truncating='post')
X_test = sequence.pad_sequences(tokenized_comments_test, maxlen=800,  truncating='post')
vocab_size = len(tokenizer.word_index) + 1

In [47]:
#X_train2 = sequence.skipgrams(tokenized_comments_train, vocab_size, window_size=4, negative_samples=1.0, shuffle=False, categorical=False, sampling_table=None, seed=None)

ValueError: setting an array element with a sequence.

In [33]:
vocab_size= len(tokenizer.word_index)+1
print(vocab_size)

206


In [3]:
#define GRU sequential model
gru = Sequential()
gru.add(Embedding(vocab_size, output_dim=800, input_length=800))
gru.add(SpatialDropout1D(.2))
gru.add(Bidirectional(GRU(128, return_sequences=True)))
gru.add(AveragePooling1D())
gru.add(Bidirectional(GRU(64, return_sequences=True)))
gru.add(AveragePooling1D())
gru.add(Bidirectional(GRU(32, return_sequences=True)))
gru.add(Flatten())
gru.add(Dropout(.3))
gru.add(Dense(1500, activation = 'relu'))
gru.add(Dropout(.3))
gru.add(Dense(6, activation = "sigmoid"))

gru.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 800, 800)          164800    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 800, 800)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 800, 256)          713472    
_________________________________________________________________
average_pooling1d_1 (Average (None, 400, 256)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 400, 128)          123264    
_________________________________________________________________
average_pooling1d_2 (Average (None, 200, 128)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200, 64)           30912     
__________

In [4]:
gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [5]:
#create checkpoint file
file_path = "gru_weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

#early stop checkpoint
early = EarlyStopping(monitor='val_loss', mode='min', patience=5)
callbacks_list = [checkpoint, early] 

In [7]:
gru.fit(X_train, train_ratings, batch_size=350, epochs=15, validation_split=0.1, callbacks=callbacks_list)

Train on 143613 samples, validate on 15958 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15


<keras.callbacks.History at 0x7f12e0f11990>

In [8]:
#load the best weights
gru.load_weights(file_path)

#make predictions on test set
grupred = gru.predict(X_test)

In [66]:
from sklearn.metrics import roc_auc_score
def calc_auc(y_true, y_pred):
    return np.mean([roc_auc_score(y_true[:, i], y_pred[:, i]) 
                    for i in range(y_true.shape[1])])

In [67]:
auc = calc_auc(test_ratings, grupred)
print('gru aucroc ', auc)

('gru aucroc ', 0.90867178569693507)


In [9]:
subm =pd.read_csv('./sample_submission.csv')
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(grupred, columns = classes)], axis=1)
submission.to_csv('GRU_char_submission.csv', index=False)

In [21]:
nbsvm_sub = pd.read_csv('./NBSVM_char_submission.csv')
gru_sub = pd.read_csv('./GRU_submission.csv')

In [27]:
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
ensemble = gru_sub.copy()
ensemble[labels] = (gru_sub[labels] + nbsvm_sub[labels]) / 2

In [28]:
subm =pd.read_csv('./sample_submission.csv')
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(ensemble, columns = classes)], axis=1)
submission.to_csv('GRU_NBSVM_char_submission.csv', index=False)