In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the train and test data

In [3]:
# Load the English training file, the de/es/fr training files and the test file
# (presumably the de/es/fr files are translated copies of train.csv — confirm).
train_df, train_de_df, train_es_df, train_fr_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        './Data/train.csv',
        './Data/train_de.csv',
        './Data/train_es.csv',
        './Data/train_fr.csv',
        './Data/test.csv',
    )
]
# Report (rows, columns) of the English training frame, then of the test frame.
print(train_df.shape, test_df.shape, sep='\n')

(159571, 8)
(153164, 2)


In [4]:
# Path to the fastText wiki (English) word-vector file.
# Not referenced anywhere else in the visible part of this notebook.
FASTTEXT_WIKI_EMBEDFILE = './fasttext_wordvec/wiki.en.vec'

In [5]:
# Path to the fastText crawl word-vector file.
# NOTE(review): the leading "y" in "ycrawl-300d-2M.vec" may be a typo — confirm the file name on disk.
FASTTEXT_CRAWL_EMBEDFILE = './fasttext_wordvec/ycrawl-300d-2M.vec'

In [6]:
# Path to the GloVe word-vector file (stored in the same directory as the fastText files).
GLOVE_EMBEDFILE = './fasttext_wordvec/glove.840B.300d.txt'

In [7]:
# The six target columns; reused later for scoring and for the submission file.
label_column_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw 'comment_text' column of every frame as a NumPy array.
X_train, X_train_de, X_train_es, X_train_fr, X_test = [
    frame['comment_text'].values
    for frame in (train_df, train_de_df, train_es_df, train_fr_df, test_df)
]

# Label matrix taken from the English training frame, one column per target.
y_train = train_df[label_column_names].values

In [8]:
# Drop the references to the four training frames; test_df is kept.
del train_df, train_de_df, train_es_df, train_fr_df

In [9]:
from keras.preprocessing import text,sequence

Using TensorFlow backend.


In [10]:
# Sizes shared by the model definition further down.
max_len = 150       # length of the model's Input: shape=(max_len,)
embed_size = 300    # output dimension of the Embedding layer
vocab_size = 60000  # the Embedding layer is built with vocab_size + 1 rows

In [11]:
# Location of the cached fastText-wiki embedding matrix (hard-coded absolute path on drive F:).
ftwiki_mat_file = 'F:/ftwiki.npy'
# Disabled: presumably the call that originally wrote the cache file — confirm before re-enabling.
#fasttext_wiki_mat.dump(ftwiki_mat_file)

In [12]:
# Embedding matrix that get_model_norm() uses to initialise its Embedding layer.
# NOTE(review): if the file was written with ndarray.dump (a pickle), recent NumPy
# versions need allow_pickle=True here — confirm how the file was produced.
ftwiki_mat = np.load(ftwiki_mat_file)

In [13]:
# Location of the cached fastText-crawl embedding matrix (hard-coded absolute path on drive F:).
ftcrawl_mat_file = 'F:/ftcrawl.npy'
# Disabled: presumably the call that originally wrote the cache file — confirm before re-enabling.
#fasttext_crawl_mat.dump(ftcrawl_mat_file)

In [14]:
# Cached crawl embedding matrix; not referenced anywhere else in the visible part of this notebook.
ftcrawl_mat = np.load(ftcrawl_mat_file)

In [15]:
# Location of the cached GloVe embedding matrix (hard-coded absolute path on drive F:).
glove_mat_file = 'F:/glove_mat.npy'
# Disabled: presumably the call that originally wrote the cache file — confirm before re-enabling.
#glove_crawl_mat.dump(glove_mat_file)

In [16]:
# Cached GloVe embedding matrix; not referenced anywhere else in the visible part of this notebook.
glove_matrix = np.load(glove_mat_file)

In [17]:
from sklearn.model_selection import KFold

In [18]:
# Overwrite the raw-text arrays with the arrays cached on disk
# (presumably padded token-id sequences matching the model input — confirm).
X_train, X_train_de, X_train_es, X_train_fr = [
    np.load(npy_path)
    for npy_path in (
        './Data/X_train.npy',
        './Data/X_train_de.npy',
        'F:/X_train_es.npy',
        'F:/X_train_fr.npy',
    )
]

In [19]:
def generate(splits,X_train,y_train,*args):
    """Yield K-fold train/validation splits of the original plus augmented data.

    The fold indices are computed on ``X_train`` and applied unchanged to every
    array in ``args``, so a row and its augmented copies always land in the
    same fold.

    Parameters
    ----------
    splits : int
        Number of folds, passed to ``KFold``.
    X_train : array
        Original samples; indexed with the fold index arrays.
    y_train : array
        Labels aligned row-for-row with ``X_train``.
    *args : arrays
        Zero or more augmented copies of ``X_train`` with the same row order.
        They share ``y_train`` as their labels.

    Yields
    ------
    tuple
        ``(train_x, train_y, valid_x, valid_y)`` where the x arrays stack
        ``X_train`` followed by each array of ``args`` along axis 0 and the
        y arrays repeat the fold's labels once per stacked array.
    """
    kf = KFold(splits)
    # One copy of the labels for X_train plus one for every augmented array.
    copies = 1 + len(args)
    for train_idx, valid_idx in kf.split(X_train):
        # Building one list per concatenation also works when ``args`` is empty;
        # concatenating an empty list of augmented arrays raised ValueError.
        train_x = np.concatenate([X_train[train_idx]] + [x[train_idx] for x in args],axis=0)
        train_y = np.concatenate([y_train[train_idx]] * copies,axis=0)

        valid_x = np.concatenate([X_train[valid_idx]] + [x[valid_idx] for x in args],axis=0)
        valid_y = np.concatenate([y_train[valid_idx]] * copies,axis=0)

        yield train_x, train_y, valid_x, valid_y

In [20]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, average
from keras.layers import GRU ,CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D
from keras.layers.normalization import BatchNormalization
from keras.engine.topology import Layer
from keras import backend as K
from keras.initializers import Constant
from keras import initializers,regularizers,constraints
from keras.callbacks import Callback,EarlyStopping,ModelCheckpoint
from sklearn.metrics import roc_auc_score
from keras.optimizers import adam

In [21]:
def get_model_norm():
    """Build and compile the bidirectional-GRU classifier on frozen wiki embeddings.

    Reads the module-level ``ftwiki_mat``, ``max_len``, ``vocab_size`` and
    ``embed_size``. The network is: Embedding (initialised from ``ftwiki_mat``,
    not trainable) -> SpatialDropout1D(0.2) -> Bidirectional CuDNNGRU(80)
    -> concatenated global average and max pooling -> Dense(6, sigmoid).

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy, adam (lr=0.001) and the
        accuracy metric.
    """
    session_config = tf.ConfigProto()
    session_config.gpu_options.allocator_type = 'BFC'
    # NOTE(review): this session is closed when the `with` block exits, i.e.
    # before any training happens — confirm the config is actually picked up
    # by the session Keras trains with.
    with tf.Session(config=session_config):
        frozen_wiki = Constant(ftwiki_mat)

        tokens = Input(shape=(max_len,))

        # Pre-trained vectors are used as-is: the layer is excluded from training.
        wiki_embedding = Embedding(vocab_size+1, embed_size, embeddings_initializer=frozen_wiki)
        wiki_embedding.trainable = False
        embedded = wiki_embedding(tokens)

        dropped = SpatialDropout1D(0.2)(embedded)

        recurrent = Bidirectional(CuDNNGRU(80, return_sequences=True))(dropped)

        # Summarise the sequence by both its mean and its maximum over time.
        pooled_avg = GlobalAveragePooling1D()(recurrent)
        pooled_max = GlobalMaxPooling1D()(recurrent)
        pooled = concatenate([pooled_avg, pooled_max])

        # One independent sigmoid per target column.
        probabilities = Dense(6, activation='sigmoid')(pooled)

        optimizer = adam(lr=0.001)

        model = Model(inputs=[tokens], outputs=probabilities)
        model.compile(
            loss='binary_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy'],
        )
    return model

In [22]:
# Build a first model instance so its architecture can be inspected.
model = get_model_norm()

In [23]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 300)     18000300    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 150, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 150, 160)     183360      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
global_ave

In [24]:
# Training hyper-parameters handed to model.fit.
epochs = 5
batch_size = 32

In [25]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair. Anything that does not unpack into exactly
        two items raises ``ValueError`` (this includes the empty default).
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) would start the lookup *after* Callback in the
        # MRO and therefore skip Callback.__init__; name this class instead.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set and print its ROC-AUC.

        ``logs`` is accepted for compatibility with the Keras callback API and
        is not used; ``None`` replaces the former mutable ``{}`` default.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Keras passes zero-based epoch indices; report them one-based.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

## Use a simple average of different embeddings

In [26]:
from sklearn.model_selection import train_test_split

def load_data(*args,train_size=0.95):
    """Split aligned feature arrays and labels, then stack the feature arrays.

    Every array in ``args`` is split with the same shuffled indices
    (``random_state=42``), so a row and its augmented copies always end up on
    the same side of the split.

    Parameters
    ----------
    *args : arrays
        One or more feature arrays followed by the label array as the LAST
        positional argument. All arrays must have the same number of rows.
        Any number of feature arrays is accepted (previously exactly four).
    train_size : float
        Passed through to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_tra_aug, X_val_aug, y_tra_aug, y_val_aug)``: the feature parts
        concatenated along axis 0 in argument order, and the label parts
        repeated once per feature array so they stay aligned.

    Raises
    ------
    ValueError
        If fewer than two arrays are given.
    """
    if len(args) < 2:
        raise ValueError("load_data needs at least one feature array followed by the label array")

    # train_test_split returns [a0_train, a0_valid, a1_train, a1_valid, ...].
    parts = train_test_split(*args,train_size=train_size,random_state=42)
    train_parts = parts[0::2]
    valid_parts = parts[1::2]

    n_feature_arrays = len(args) - 1
    X_tra_aug = np.concatenate(train_parts[:-1],axis=0)
    X_val_aug = np.concatenate(valid_parts[:-1],axis=0)

    # The last entry of each list is the label split; repeat it per feature array.
    y_tra_aug = np.concatenate([train_parts[-1]] * n_feature_arrays,axis=0)
    y_val_aug = np.concatenate([valid_parts[-1]] * n_feature_arrays,axis=0)
    return X_tra_aug,X_val_aug,y_tra_aug,y_val_aug

# 90/10 split. load_data stacks the four feature arrays, so both the training and
# the validation part contain the original rows plus their de/es/fr counterparts.
X_tra,X_val,y_tra,y_val = load_data(X_train,X_train_de,X_train_es,X_train_fr,y_train,train_size=0.9)

# Weights file that is reloaded before predicting on the test set.
filepath="weights_wiki_best.hdf5"
# Save the weights only when validation accuracy improves.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, save_weights_only=True, mode='max')
# Stop after 2 epochs without an improvement in validation accuracy.
early = EarlyStopping(monitor="val_acc", mode="max", patience=2)
# ROC-AUC is only printed each epoch; checkpointing and early stopping both watch val_acc.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
callback_list = [checkpoint,early,RocAuc]

# Drop the reference to the previously built model before creating a fresh one.
model = None
model = get_model_norm()

# Train with the callbacks above; the returned History object is kept in `history`.
history = model.fit(X_tra,y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                callbacks=callback_list,shuffle=True)



Train on 574452 samples, validate on 63832 samples
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.98293, saving model to weights_wiki_best.hdf5

 ROC-AUC - epoch: 1 - score: 0.984765 

Epoch 2/5

Epoch 00002: val_acc improved from 0.98293 to 0.98335, saving model to weights_wiki_best.hdf5

 ROC-AUC - epoch: 2 - score: 0.985631 

Epoch 3/5

Epoch 00003: val_acc did not improve from 0.98335

 ROC-AUC - epoch: 3 - score: 0.985750 

Epoch 4/5

Epoch 00004: val_acc did not improve from 0.98335

 ROC-AUC - epoch: 4 - score: 0.985322 



In [None]:
# Replace the raw-text X_test (taken from test_df earlier) with the array cached on disk.
# NOTE(review): this cell has no execution count, yet model.predict(X_test) depends on it
# — make sure it runs before the prediction cell on a fresh kernel.
X_test = np.load('F:/X_test.npy')

## use CV

In [27]:
# Restore the weights written by the ModelCheckpoint callback (best val_acc epoch).
model.load_weights(filepath)

In [29]:
# Predicted probabilities for the test set: one row per sample, one column per Dense(6) output.
predictions = model.predict(X_test,batch_size=64)

In [30]:
# Labels for the test set, used below to score the predictions.
test_labels_df = pd.read_csv('./Data/test_labels.csv')

In [31]:
# Rows labelled -1 are not used for scoring; keep the index values of rows with toxic >= 0.
# These index values are later used to index NumPy arrays positionally, which relies on
# the default integer index produced by read_csv.
count_indices = test_labels_df.loc[test_labels_df['toxic']>=0].index.values

In [32]:
# Label matrix with the same column order as label_column_names.
test_labels = test_labels_df[label_column_names].values

In [33]:
# ROC-AUC of the predictions, restricted to the scored test rows.
roc_auc_score(test_labels[count_indices],predictions[count_indices])

0.9821473815187113

In [42]:
# Start from the sample submission so the non-label columns and row order are kept.
submission = pd.read_csv('./Data/sample_submission.csv')
# Overwrite the label columns with the predicted probabilities (column order = label_column_names).
submission[label_column_names] = predictions
# Write the file without the DataFrame index.
submission.to_csv('submission_wiki.csv',index=False)