In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the train and test data

In [3]:
# Load the training comments (with label columns) and the test comments,
# then print each frame's (rows, columns) shape on its own line.
train_df = pd.read_csv('./Data/train.csv')
test_df = pd.read_csv('./Data/test.csv')
print(train_df.shape, test_df.shape, sep='\n')

(159571, 8)
(153164, 2)


In [4]:
# Path to the fastText word-vector file (presumably the pre-trained English
# Wikipedia vectors, judging by the file name -- TODO confirm).
# No visible cell reads this path; the embedding matrix used by the model is
# loaded from a cached .npy file instead.
FASTTEXT_EMBEDFILE = './fasttext_wordvec/wiki.en.vec'

In [5]:
# Preview the first rows to check the column layout: id, comment_text and the
# six label columns.
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Drop training rows whose comment text is missing. The result is rebound to
# the same name, so the unfiltered frame is not available after this cell.
train_df = train_df.dropna(subset=['comment_text'])

In [7]:
# Raw comment strings and the multi-label target matrix for training.
X_train = train_df['comment_text'].values
label_column_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train_df[label_column_names].values
# Test rows cannot be dropped: predictions are later written row-for-row into
# the submission frame. A missing test comment is therefore replaced with an
# empty string instead of being passed on as a float NaN, which the text
# tokenizer cannot lower-case/split.
X_test = test_df['comment_text'].fillna('').values

In [8]:
# Preprocess the text: tokenize the comments and pad them to fixed-length sequences

In [9]:
from keras.preprocessing import text,sequence

Using TensorFlow backend.


In [10]:
# Hyper-parameters shared by the tokenizer and the model.
# vocab_size: passed as the tokenizer's num_words; the Embedding layer is built
#             with vocab_size + 1 rows.
# max_len:    length every comment is padded/truncated to; also the model's
#             input length.
# embed_size: width of each embedding vector (second dimension of the
#             Embedding layer).
vocab_size = 30000
max_len = 100
embed_size = 300

In [11]:
def text2sequence(vocab_size,max_len,X_train,X_test):
    """Tokenize both corpora and turn them into fixed-length integer sequences.

    One Keras ``Tokenizer`` (``num_words=vocab_size``) is fitted on the train
    and test texts together. Each set is then encoded with that tokenizer and
    run through ``pad_sequences`` with ``maxlen=max_len``.

    Args:
        vocab_size: value handed to the tokenizer as ``num_words``.
        max_len: ``maxlen`` used by ``pad_sequences``.
        X_train: iterable of training texts.
        X_test: iterable of test texts.

    Returns:
        A tuple ``(train_sequences, test_sequences, word_index)`` where
        ``word_index`` is the fitted tokenizer's word-to-index mapping.
    """
    corpus = list(X_train) + list(X_test)
    tok = text.Tokenizer(num_words=vocab_size)
    tok.fit_on_texts(corpus)
    vocabulary = tok.word_index

    def _encode(texts):
        # Map texts to index lists, then pad to a common length.
        return sequence.pad_sequences(tok.texts_to_sequences(texts), maxlen=max_len)

    train_seq = _encode(X_train)
    test_seq = _encode(X_test)
    return train_seq, test_seq, vocabulary

In [12]:
# Replace the raw comment strings with padded integer sequences.
# NOTE: X_train / X_test are overwritten in place, so this cell is not safe to
# re-run on its own -- a second run would hand integer arrays to the tokenizer.
# Re-run the cell that extracts the raw text first.
X_train, X_test, word_index = text2sequence(vocab_size,max_len,X_train,X_test)

In [13]:
# Cache file holding the embedding matrix that is loaded in the next cell.
embed_mat_file = './Data/embed_mat.npy'
# Left commented out: presumably the call that originally wrote the cache.
# The code that builds `embedding_matrix` is not part of the visible cells, so
# the notebook depends on this file already existing.
#embedding_matrix.dump(embed_mat_file)

In [14]:
# Load the cached embedding matrix.
# The commented-out writer in the previous cell uses ndarray.dump, which
# pickles the array. np.load refuses pickled files unless allow_pickle=True is
# given (the default became False in NumPy 1.16.3), so it is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load a cache file
# you generated yourself; regenerating it with np.save would remove the need
# for pickle altogether.
embedding_matrix = np.load(embed_mat_file, allow_pickle=True)
# Fail fast if the cache does not match the Embedding layer that is built from
# it (vocab_size + 1 rows, embed_size columns).
assert embedding_matrix.shape == (vocab_size + 1, embed_size), (
    "embedding matrix has shape %s, expected %s"
    % (embedding_matrix.shape, (vocab_size + 1, embed_size))
)

In [15]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Lambda, dot, Activation
from keras.layers import GRU ,CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras import backend as K
from keras.initializers import Constant
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score
from keras.optimizers import adam

In [16]:
# Reset the Keras backend session before building the model, so that state left
# over from earlier model builds in this kernel is discarded.
K.clear_session()

In [17]:
# Initializer that fills the Embedding layer with the loaded embedding matrix.
weights = Constant(embedding_matrix)

In [18]:
# Model: embedding -> spatial dropout -> bidirectional GRU -> avg+max pooling
# -> sigmoid output per label.
#
# Input is one padded sequence of max_len token indices.
inputs = Input(shape=(max_len,))
# Embedding table has vocab_size + 1 rows and is initialised from the loaded
# matrix; trainable=False is not passed, so the layer keeps Keras' default
# trainability.
embed_layer = Embedding(vocab_size+1,embed_size,embeddings_initializer=weights)(inputs)
# SpatialDropout1D with rate 0.2 applied to the embedded sequence.
embed_layer = SpatialDropout1D(0.2)(embed_layer)
# Bidirectional CuDNNGRU (GPU-only layer) with 80 units per direction,
# returning the full sequence so it can be pooled over time.
bi_rnn = Bidirectional(CuDNNGRU(80,return_sequences=True))(embed_layer)
# Summarise the sequence with both average and max pooling, then concatenate.
avg_pool = GlobalAveragePooling1D()(bi_rnn)
max_pool = GlobalMaxPooling1D()(bi_rnn)
concat = concatenate([avg_pool,max_pool])
# Six independent sigmoid outputs, one per entry of label_column_names.
outputs = Dense(6,activation='sigmoid')(concat)

opt = adam(lr=0.001)

model = Model(inputs=inputs,outputs=outputs)
# Binary cross-entropy treats each of the six labels as its own binary problem.
model.compile(loss='binary_crossentropy',
             optimizer=opt,
             metrics=['accuracy'])

In [19]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     9000300     input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 100, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 100, 160)     183360      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
global_ave

In [20]:
# Training configuration passed to model.fit: mini-batch size and number of
# passes over the training split.
batch_size = 32
epochs = 2

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 5% of the training data for validation; random_state is fixed so the
# split is the same on every run.
X_tra, X_val, y_tra, y_val = train_test_split(X_train,y_train,train_size=0.95,random_state=42)



In [23]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    At the end of every ``interval``-th epoch (counting from the first one) the
    current model predicts on ``X_val`` and the score returned by
    ``roc_auc_score(y_val, predictions)`` is printed.

    Args:
        validation_data: an ``(X_val, y_val)`` pair. The default ``()`` is kept
            for signature compatibility but is not usable and raises
            ``ValueError``.
        interval: evaluate when ``epoch % interval == 0`` (``epoch`` is the
            zero-based index Keras passes to ``on_epoch_end``).

    Raises:
        ValueError: if ``validation_data`` cannot be unpacked into exactly two
            items.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself. The previous
        # super(Callback, self) form skipped Callback.__init__ and went
        # straight to its parent, leaving base-class attributes unset.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        try:
            self.X_val, self.y_val = validation_data
        except ValueError:
            # Same exception type as the bare unpacking, with a clearer message.
            raise ValueError("validation_data must be an (X_val, y_val) pair")

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is accepted for the Keras callback signature but is not used;
        # None replaces the shared mutable default dict.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Report a one-based epoch number to match Keras' progress output.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [24]:
# Callback instance that reports validation ROC-AUC after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

In [25]:
# Train on the 95% split. The validation split is passed both to Keras (for its
# own per-epoch validation metrics) and, via the callback, for ROC-AUC.
history = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                    callbacks=[RocAuc])

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.988060 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.988112 



In [26]:
# Predict label probabilities for every test comment; a larger batch size than
# in training is used since no gradients are computed.
predictions = model.predict(X_test,batch_size=1024)

In [27]:
# Load the test-set labels.
# NOTE(review): later cells index `predictions` with row positions taken from
# this frame, which assumes its rows are in the same order as test.csv --
# confirm.
test_labels_df = pd.read_csv('./Data/test_labels.csv')

In [28]:
# Rows whose label value is -1 are not used for scoring, so keep only the rows
# where 'toxic' >= 0.
# The index labels are later used as positions into NumPy arrays; this works
# because the frame was read without index_col and so has the default 0..n-1
# index.
count_indices = test_labels_df.loc[test_labels_df['toxic']>=0].index.values

In [29]:
# Label matrix for the test set, with columns in the same order as the model
# outputs (label_column_names).
test_labels = test_labels_df[label_column_names].values

In [30]:
# ROC-AUC on the scored test rows only (rows selected by count_indices).
roc_auc_score(test_labels[count_indices],predictions[count_indices])

0.9799266234463476

In [31]:
# Fill the sample submission's label columns with the predictions and save it.
# NOTE(review): the assignment is positional, so it assumes
# sample_submission.csv lists the test ids in the same order as test.csv --
# confirm.
submission = pd.read_csv('./Data/sample_submission.csv')
submission[label_column_names] = predictions
submission.to_csv('submission2.csv',index=False)