In [1]:
import numpy as np
import pandas as pd
from sklearn.externals.joblib import dump
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, SpatialDropout1D, Convolution1D, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from common import SEED, TARGETS

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# --- Text representation ---------------------------------------------------
# Passed to the tokenizer as `num_words` and used as the row count of the
# embedding matrix / Embedding layer.
VOCABULARY_SIZE = 100000
# Length every tokenized comment is padded/truncated to (`maxlen`).
SEQ_MAX_LEN = 100

# --- Pre-trained word vectors ----------------------------------------------
EMBEDDINGS_SIZE = 300
EMBEDDINGS_FILE = "crawl-300d-2M.vec"

# --- Cross-validation ------------------------------------------------------
K_FOLDS = 10

# --- Output artefacts ------------------------------------------------------
# Checkpoint file written by ModelCheckpoint and reloaded after each fold.
WEIGHTS_CACHE = "cache/cnn_fasttext_weights.hdf5"
# `%s` is filled with the fold index when validation predictions are dumped.
VALIDATION_PRED_FILE = "cache/cnn_fasttext_validation_pred_fold_%s.pkl"
SUBMISSION_FILE = "submissions/submission_cnn_fasttext.csv.gz"

In [3]:
# Seed NumPy's global RNG with the project-wide SEED.
# NOTE(review): only NumPy is seeded in the visible cells; no seed is set for
# the Keras/TensorFlow backend here, so training results may still vary
# between runs — confirm whether `common` handles that.
np.random.seed(SEED)
# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [4]:
from tensorflow.python.client import device_lib

def get_available_devices():
    """Return the ``name`` of every device listed by ``device_lib.list_local_devices()``."""
    device_names = []
    for device in device_lib.list_local_devices():
        device_names.append(device.name)
    return device_names


# Show which devices are visible before any training starts.
print(get_available_devices())

['/device:CPU:0', '/device:GPU:0']


In [5]:
# Load the pre-cleaned training and test comments.
train = pd.read_csv("data/train_clean.csv", encoding="utf-8")
test = pd.read_csv("data/test_clean.csv", encoding="utf-8")
# Submission template: its target columns are overwritten with the averaged
# fold predictions at the end of the notebook.
submission = pd.read_csv("data/sample_submission.csv", encoding="utf-8")

In [6]:
%%time
# Word-level tokenizer limited to VOCABULARY_SIZE via `num_words`.
word_tokenizer = text.Tokenizer(num_words=VOCABULARY_SIZE)
# The word index is fitted on the training comments only; the test comments
# are not used to build the vocabulary.
word_tokenizer.fit_on_texts(train.comment_text.values)

CPU times: user 11.9 s, sys: 72 ms, total: 12 s
Wall time: 12 s


In [7]:
%%time
# Convert every train/test comment to a sequence of integer word ids using
# the tokenizer fitted on the training comments.
list_tokenized_word_train = word_tokenizer.texts_to_sequences(train.comment_text.values)
list_tokenized_word_test = word_tokenizer.texts_to_sequences(test.comment_text.values)

CPU times: user 16.7 s, sys: 108 ms, total: 16.8 s
Wall time: 16.8 s


In [8]:
%%time
# Bring every sequence to exactly SEQ_MAX_LEN ids so they can be stacked into
# one 2-D array per dataset.
# NOTE(review): no `padding`/`truncating` arguments are passed, so the Keras
# defaults decide which end is padded/cut — confirm that is the intended side.
x_train_word_sequences = sequence.pad_sequences(list_tokenized_word_train, maxlen=SEQ_MAX_LEN)
x_test_word_sequences = sequence.pad_sequences(list_tokenized_word_test, maxlen=SEQ_MAX_LEN)

CPU times: user 2.67 s, sys: 156 ms, total: 2.83 s
Wall time: 2.83 s


In [9]:
def load_embeddings(path=None, expected_dim=None):
    """Load pre-trained word vectors from a text file into a dict.

    Every data line is expected to look like ``<word> <v1> ... <vN>`` with
    single-space separators. Lines that do not carry exactly ``expected_dim``
    numeric coefficients are reported and skipped.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the notebook-level ``EMBEDDINGS_FILE``.
    expected_dim : int, optional
        Number of coefficients per vector. Defaults to the notebook-level
        ``EMBEDDINGS_SIZE``.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` array of length ``expected_dim``.
    """
    if path is None:
        path = EMBEDDINGS_FILE
    if expected_dim is None:
        expected_dim = EMBEDDINGS_SIZE

    embeddings = {}
    # `with` guarantees the handle is closed even if reading raises.
    with open(path, 'r', encoding="utf-8", errors="ignore") as f:
        for index, line in enumerate(f):
            # Split from the right on plain spaces only: the last `expected_dim`
            # fields are the coefficients, whatever precedes them is the word.
            # (A bare `split()` would also cut on Unicode whitespace that can
            # occur inside a token.)
            parts = line.rstrip().rsplit(" ", expected_dim)
            if len(parts) != expected_dim + 1:
                # A two-field first line is the "<count> <dim>" header some
                # vector files (e.g. fastText .vec) start with; drop it quietly.
                # Storing it would create a bogus 1-element "vector".
                if not (index == 0 and len(parts) == 2):
                    print("Unable to parse line %d, skipping" % index)
                continue
            try:
                embeddings[parts[0]] = np.asarray(parts[1:], dtype="float32")
            except ValueError:
                # Non-numeric coefficient: keep the original best-effort skip.
                print("Unable to parse line %d, skipping" % index)
    return embeddings

In [10]:
%%time
# Read the pre-trained vectors from EMBEDDINGS_FILE into a word -> vector dict.
embeddings = load_embeddings()
print("Loaded %d word vectors" % len(embeddings))

NameError: name 'embeddings_index' is not defined

In [11]:
def compute_embedding_matrix(embeddings, word_index, vocabulary_size=None, embeddings_size=None):
    """Build the weight matrix for the embedding layer.

    Row ``i`` holds the pre-trained vector of the word whose tokenizer index
    is ``i``. Rows stay all-zero for words that have no pre-trained vector or
    whose vector does not have the expected shape.

    Parameters
    ----------
    embeddings : dict
        Maps word -> 1-D vector, as returned by ``load_embeddings``.
    word_index : dict
        Maps word -> integer index.
    vocabulary_size : int, optional
        Number of rows. Defaults to the notebook-level ``VOCABULARY_SIZE``.
    embeddings_size : int, optional
        Number of columns. Defaults to the notebook-level ``EMBEDDINGS_SIZE``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocabulary_size, embeddings_size)``.
    """
    if vocabulary_size is None:
        vocabulary_size = VOCABULARY_SIZE
    if embeddings_size is None:
        embeddings_size = EMBEDDINGS_SIZE

    embedding_matrix = np.zeros((vocabulary_size, embeddings_size))
    for word, i in word_index.items():
        # Skip (do not stop at) out-of-range indices: dict iteration order is
        # not guaranteed to be ascending by index, so a `break` here could
        # leave every later in-range word without its vector.
        if i >= vocabulary_size:
            continue
        embedding_vector = embeddings.get(word)
        if embedding_vector is None:
            continue
        # A wrongly sized vector (e.g. a parsed header line) would otherwise
        # be broadcast silently across the whole row.
        if np.shape(embedding_vector) != (embeddings_size,):
            continue
        embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [12]:
%%time
# Align the pre-trained vectors with the tokenizer's word indices; the result
# is used as the initial weights of the Embedding layer.
embedding_matrix = compute_embedding_matrix(embeddings, word_tokenizer.word_index)
print(embedding_matrix.shape)

(100000, 300)
CPU times: user 240 ms, sys: 80 ms, total: 320 ms
Wall time: 320 ms


In [13]:
def build_network(embedding_matrix):
    """Assemble and compile the 1-D convolutional classifier.

    Architecture, in order: a frozen embedding initialised from
    ``embedding_matrix``, spatial dropout, one 1-D convolution (120 filters,
    kernel size 3), global max pooling, a 120-unit dense layer, dropout, and a
    6-unit sigmoid output layer. The model is compiled with binary
    cross-entropy loss and the Adam optimizer.

    Parameters
    ----------
    embedding_matrix : array of shape (VOCABULARY_SIZE, EMBEDDINGS_SIZE)
        Initial (non-trainable) weights of the embedding layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    frozen_embedding = Embedding(
        VOCABULARY_SIZE,
        EMBEDDINGS_SIZE,
        weights=[embedding_matrix],
        trainable=False,
        input_length=SEQ_MAX_LEN,
    )
    layers = [
        frozen_embedding,
        SpatialDropout1D(0.3),
        Convolution1D(120, 3, padding="valid", activation="relu", strides=1),
        GlobalMaxPooling1D(),
        Dense(120, activation="sigmoid"),
        Dropout(0.5),
        Dense(6, activation="sigmoid"),
    ]
    model = Sequential(layers)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [14]:
# Unshuffled K-fold split (contiguous folds, same assignment as before).
# `random_state` is intentionally not passed: KFold only uses it when
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is
# set without shuffling. For seeded, shuffled folds use
# KFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED) instead — note that
# this changes which rows land in which fold.
kfold = KFold(n_splits=K_FOLDS)

In [15]:
# Per-fold validation ROC-AUC scores, appended by the training loop.
scores = []
# Per-fold predictions on the test set, averaged later to build the submission.
test_preds = []

In [16]:
# Start from empty accumulators so that re-running this cell on its own cannot
# leave results of a previous run in `scores` / `test_preds`; the submission
# cell reads only the first K_FOLDS entries of `test_preds`.
scores = []
test_preds = []

# Extract the label matrix once instead of rebuilding it twice per fold.
y_all = train[TARGETS].values

for index, (train_index, test_index) in enumerate(kfold.split(x_train_word_sequences, y_all)):

    x_train, x_val = x_train_word_sequences[train_index], x_train_word_sequences[test_index]
    y_train, y_val = y_all[train_index], y_all[test_index]

    # Fresh callbacks per fold. No `patience` is passed to EarlyStopping, so
    # the Keras default applies. The checkpoint keeps only the best epoch and
    # is written to the same WEIGHTS_CACHE file for every fold.
    callbacks = [
        EarlyStopping(monitor="val_loss"),
        ModelCheckpoint(filepath=WEIGHTS_CACHE, save_best_only=True)
    ]

    network = build_network(embedding_matrix)
    history = network.fit(
        x_train, y_train,
        epochs=100, batch_size=32, validation_data=(x_val, y_val), verbose=1, callbacks=callbacks
    )

    # Roll back to the best checkpoint rather than the last trained epoch.
    network.load_weights(WEIGHTS_CACHE)

    # `predict` replaces the deprecated `Sequential.predict_proba`; for this
    # sigmoid-output model it returns the same per-label probabilities.
    y_pred = network.predict(x_val)
    # Persist the out-of-fold predictions for this fold.
    dump(y_pred, VALIDATION_PRED_FILE % index)

    score = roc_auc_score(y_val, y_pred)
    print("ROC-AUC score: %0.4f" % score)
    scores.append(score)

    test_preds.append(network.predict(x_test_word_sequences))

Train on 143613 samples, validate on 15958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
ROC-AUC score: 0.9856
Train on 143614 samples, validate on 15957 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
ROC-AUC score: 0.9870
Train on 143614 samples, validate on 15957 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
ROC-AUC score: 0.9886
Train on 143614 samples, validate on 15957 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
ROC-AUC score: 0.9834
Train on 143614 samples, validate on 15957 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
ROC-AUC score: 0.9851
Train on 143614 samples, validate on 15957 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
ROC-AUC score: 0.9833
Train on 143614 samples, validate on 15957 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
ROC-AUC score: 0.9851
Train on 143614 samples, validate on 15957 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
ROC-AUC score: 0.9849
Train on 143614 samples, validat

In [21]:
# Mean of the per-fold validation ROC-AUC scores collected in `scores`.
print("Average ROC-AUC: %0.4f" % np.mean(scores))

Average ROC-AUC: 0.9852


In [22]:
# For every target column, average the K_FOLDS per-fold test predictions and
# store the result in the submission frame.
for index, target in enumerate(TARGETS):
    fold_columns = (test_preds[fold][:, index] for fold in range(K_FOLDS))
    submission[target] = sum(fold_columns) / K_FOLDS

In [23]:
# Write the averaged predictions as a gzip-compressed CSV without the index.
submission.to_csv(SUBMISSION_FILE, index=False, encoding="utf-8", compression="gzip")
# Author's note: this submission scored 0.9825 (presumably the competition
# leaderboard metric — TODO confirm which metric/board).