In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from multiprocessing import Pool

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['train.csv', 'test.csv', 'embeddings']


In [2]:
# Read both competition CSVs first, then derive the arrays used downstream.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Raw question strings as NumPy arrays; missing questions become the "_na_" placeholder.
X_train = train_df["question_text"].fillna("_na_").values
X_test = test_df["question_text"].fillna("_na_").values

# Label column of the training set (kept as a pandas Series).
y = train_df["target"]

In [3]:
# Preview the first rows to check the loaded columns (qid, question_text, target).
train_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, GlobalMaxPool1D
from keras.layers import Reshape, Flatten, Concatenate, Conv2D, MaxPool2D
from keras.preprocessing import text, sequence
from keras.engine.topology import Layer
from keras import backend as K
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import Callback

from gensim.models import KeyedVectors

from sklearn import metrics

Using TensorFlow backend.


In [5]:
# Every question is padded/truncated to `maxlen` tokens, and only word ids
# below `max_features` are kept by the tokenizer.
maxlen = 150
max_features = 50000

# Re-derive the raw question strings from the DataFrames instead of reading
# them from X_train / X_test: this cell rebinds those two names to integer
# sequences below, so reading them here would make a second run of the cell
# tokenise token ids instead of text.
train_texts = train_df["question_text"].fillna("_na_").values
test_texts = test_df["question_text"].fillna("_na_").values

# The vocabulary is fitted on train and test questions together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_texts) + list(test_texts))

# X_* : variable-length lists of word ids; x_* : fixed-width padded arrays.
X_train = tokenizer.texts_to_sequences(train_texts)
X_test = tokenizer.texts_to_sequences(test_texts)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [6]:
# Word ids of the first training question, before padding.
print(X_train[0])

[9, 48, 6692, 7163, 158, 55, 5999, 36, 4, 1207, 6, 1, 8262]


In [7]:
# The same question after pad_sequences: zeros fill the front up to maxlen entries.
print(x_train[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    9   48 6692
 7163  158   55 5999   36    4 1207    6    1 8262]


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the padded training questions for validation; the fixed
# random_state keeps the split the same on every run.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y,
    test_size=0.1,
    random_state=42,
)

In [9]:
def get_coefs(word, *arr):
    """Pair a token with its embedding coefficients.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields, convertible to float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def get_embedding_matrix(embedding_file, word_index=None, num_words=None):
    """Build an embedding matrix for the vocabulary from a text embedding file.

    Each line of ``embedding_file`` is expected to be a token followed by its
    space-separated coefficients. Lines of 100 characters or fewer are skipped,
    which drops short header lines such as a ``<count> <dim>`` first line.

    Rows for words missing from the file are drawn from a normal distribution
    with the mean and standard deviation of all loaded coefficients.

    Args:
        embedding_file: Path to the text embedding file.
        word_index: Mapping ``word -> integer id``. Defaults to the
            module-level ``tokenizer.word_index``.
        num_words: Upper bound on the number of rows. Defaults to the
            module-level ``max_features``.

    Returns:
        Array of shape ``(min(num_words, len(word_index) + 1), embed_size)``;
        row ``i`` holds the vector of the word whose id is ``i``.

    Raises:
        ValueError: If no embedding line could be read from the file.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    if num_words is None:
        num_words = max_features

    # Parse inside a context manager so the file handle is always closed.
    embeddings_index = {}
    with open(embedding_file, encoding="utf8", errors='ignore') as emb_file:
        for line in emb_file:
            if len(line) <= 100:
                continue
            word, *coefs = line.rstrip().split(" ")
            embeddings_index[word] = np.asarray(coefs, dtype='float32')
    if not embeddings_index:
        raise ValueError("no embedding vectors found in %r" % (embedding_file,))

    # np.stack needs a real sequence; a dict view is rejected by recent NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # Word ids start at 1 (row 0 is left for the padding id), so a vocabulary
    # of V words needs V + 1 rows; without the "+ 1" the highest id would fall
    # outside the matrix whenever V < num_words.
    nb_words = min(num_words, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def get_word2vac_matrix(word2vac_file, embed_size = 300, word_index=None, num_words=None):
    """Build an embedding matrix for the vocabulary from a binary word2vec file.

    Rows for words missing from the word2vec model keep their random
    initialisation, uniform in ``[-0.1, 0.1)``.

    Args:
        word2vac_file: Path to a word2vec file in binary format.
        embed_size: Number of columns of the matrix; it has to equal the
            dimension of the loaded vectors for the row assignment to work.
        word_index: Mapping ``word -> integer id``. Defaults to the
            module-level ``tokenizer.word_index``.
        num_words: Upper bound on the number of rows. Defaults to the
            module-level ``max_features``.

    Returns:
        Array of shape ``(min(num_words, len(word_index) + 1), embed_size)``;
        row ``i`` holds the vector of the word whose id is ``i``.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    if num_words is None:
        num_words = max_features

    embeddings_index = KeyedVectors.load_word2vec_format(word2vac_file, binary=True)

    # Word ids start at 1 (row 0 is left for the padding id), so a vocabulary
    # of V words needs V + 1 rows; without the "+ 1" the highest id would fall
    # outside the matrix whenever V < num_words.
    nb_words = min(num_words, len(word_index) + 1)
    embedding_matrix = (np.random.rand(nb_words, embed_size) - 0.5) / 5.0
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index.get_vector(word)
    return embedding_matrix

%%time
# Sequential, timed load of the first text embedding file.
# NOTE(review): the In [10] cell rebuilds embedding1..embedding4 from the same
# four files through a multiprocessing Pool, so running both repeats the work.
EMBEDDING_FILE1 = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
embedding1 = get_embedding_matrix(EMBEDDING_FILE1)
print (EMBEDDING_FILE1, embedding1.shape)

%%time
# Second text embedding file, parsed by the same helper.
EMBEDDING_FILE2 = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
embedding2 = get_embedding_matrix(EMBEDDING_FILE2)
print (EMBEDDING_FILE2, embedding2.shape)

%%time
# Third text embedding file, parsed by the same helper.
EMBEDDING_FILE3 = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
embedding3 = get_embedding_matrix(EMBEDDING_FILE3)
print (EMBEDDING_FILE3, embedding3.shape)

%%time
# The fourth file is a binary word2vec file, so it goes through get_word2vac_matrix.
EMBEDDING_FILE4 = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embedding4 = get_word2vac_matrix(EMBEDDING_FILE4)
print (EMBEDDING_FILE4, embedding4.shape)

In [10]:
%%time
EMBEDDING_FILE1 = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
EMBEDDING_FILE2 = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
EMBEDDING_FILE3 = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
EMBEDDING_FILE4 = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

pool = Pool()
result1 = pool.apply_async(get_embedding_matrix, [EMBEDDING_FILE1]) # evaluate "solve1(A)" asynchronously
result2 = pool.apply_async(get_embedding_matrix, [EMBEDDING_FILE2]) # evaluate "solve1(A)" asynchronously
result3 = pool.apply_async(get_embedding_matrix, [EMBEDDING_FILE3]) # evaluate "solve1(A)" asynchronously
result4 = pool.apply_async(get_word2vac_matrix, [EMBEDDING_FILE4]) # evaluate "solve1(A)" asynchronously

embedding1 = result1.get(timeout=600)
embedding2 = result2.get(timeout=600)
embedding3 = result3.get(timeout=600)
embedding4 = result4.get(timeout=600)

print (EMBEDDING_FILE1, embedding1.shape)
print (EMBEDDING_FILE2, embedding2.shape)
print (EMBEDDING_FILE3, embedding3.shape)
print (EMBEDDING_FILE4, embedding4.shape)


../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec (50000, 300)
../input/embeddings/glove.840B.300d/glove.840B.300d.txt (50000, 300)
../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt (50000, 300)
../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin (50000, 300)
CPU times: user 201 ms, sys: 677 ms, total: 878 ms
Wall time: 2min 9s


In [11]:
class F1Evaluation(Callback):
    """Keras callback that prints the validation F1 score during training.

    At the end of every ``interval``-th epoch the model predicts on the
    validation inputs, the predictions are binarised with ``threshold`` and
    the F1 score against the validation labels is printed.

    Args:
        validation_data: ``(X_val, y_val)`` pair; anything that does not
            unpack into exactly two items raises ``ValueError``.
        interval: Evaluate on epochs whose zero-based index is a multiple of
            this value.
        threshold: Predictions strictly above this value count as the
            positive class. It used to be read from an undefined global
            name, which raised ``NameError`` at the end of the first epoch.
    """

    def __init__(self, validation_data=(), interval=1, threshold=0.5):
        # Initialise the Callback base class itself; super(Callback, self)
        # would start the lookup *after* Callback and skip its __init__.
        super().__init__()

        self.interval = interval
        self.threshold = threshold
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the validation F1 score for this epoch.

        ``logs`` is accepted for compatibility with the Keras callback
        interface and is not used.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            y_pred = (y_pred > self.threshold).astype(int)
            score = metrics.f1_score(self.y_val, y_pred)
            print("\n F1 Score - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [12]:
# Heights (in tokens) of the convolution windows, and filters per window size.
filter_sizes = [1,2,3,5]
num_filters = 42

def get_model(embedding_matrix):
    """Build and compile a multi-window text CNN on top of given embeddings.

    The padded token ids are embedded, reshaped to a one-channel "image" of
    shape ``(maxlen, embed_size, 1)`` and passed through one Conv2D branch per
    entry of ``filter_sizes``. Each branch spans the full embedding width and
    is max-pooled over all window positions; the pooled branches are
    concatenated, flattened, regularised with dropout and fed to a single
    sigmoid unit.

    Args:
        embedding_matrix: 2-D array ``(vocab_size, embed_size)`` used as the
            initial weights of the Embedding layer.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    # Size the Embedding layer from the weights themselves so the layer and
    # the matrix cannot disagree on the number of rows.
    vocab_size, embed_size = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, embed_size, weights=[embedding_matrix])(inp)
    x = Reshape((maxlen, embed_size, 1))(x)

    # One convolution per window size; all convolutions are created before the
    # pooling layers, matching the layer creation order of the unrolled version.
    conv_outputs = [
        Conv2D(num_filters, kernel_size=(size, embed_size),
               kernel_initializer='he_normal', activation='tanh')(x)
        for size in filter_sizes
    ]
    # A window of height `size` has maxlen - size + 1 positions; pooling over
    # all of them keeps the strongest response of every filter.
    pooled = [
        MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv)
        for size, conv in zip(filter_sizes, conv_outputs)
    ]

    # Concatenate requires at least two inputs, so a single branch is used as is.
    z = pooled[0] if len(pooled) == 1 else Concatenate(axis=1)(pooled)
    z = Flatten()(z)
    z = Dropout(0.1)(z)

    outp = Dense(1, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [13]:
# Join the four embedding tables column-wise so every word id maps to the
# concatenation of its four vectors, then build the CNN on top of the result.
embedding_matrix = np.concatenate(
    (embedding1, embedding2, embedding3, embedding4),
    axis=1,
)

model = get_model(embedding_matrix)

In [None]:
# Training hyper-parameters (256 is the alternative batch size noted earlier).
batch_size = 16 #256
epochs = 2

# Report the validation F1 score after every epoch alongside the Keras metrics.
F1_Score = F1Evaluation(validation_data=(X_val, y_val), interval=1)
hist = model.fit(
    x=X_tra,
    y=y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[F1_Score],
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2


In [None]:
# Predicted probabilities for the held-out validation questions, in large batches.
y_pred = model.predict(x=X_val, batch_size=2048)