In [1]:
## Some imports

# python built-in imports
import json, re, gc, time, os
from collections import defaultdict
from warnings import filterwarnings
filterwarnings(category=FutureWarning, action="ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# data manipulation
import pandas as pd
import numpy as np
from scipy.stats import rankdata

# keras imports
import tensorflow as tf
import keras.backend as K
from keras.models import Model
from keras.layers import (Input, Embedding, Dropout, Dense,
                          Concatenate, GlobalMaxPooling1D,
                          SpatialDropout1D, Bidirectional,
                          CuDNNLSTM, CuDNNGRU)

# scikit-learn imports
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# progress-bar imports
from tqdm import tqdm
tqdm.pandas('my Bar ! ')

Using TensorFlow backend.


In [2]:
## Translate LaTeX in texts
LATEX_DICT = {r'\\mathrm': ' LaTex math mode ', r'\\mathbb': ' LaTex math mode ',
  r'\\boxed': ' LaTex equation ', r'\\begin': ' LaTex equation ', r'\\end': ' LaTex equation ',
  r'\\left': ' LaTex equation ', r'\\right': ' LaTex equation ', r'\\(over|under)brace': ' LaTex equation ',
  r'\\text': ' LaTex equation ', r'\\vec': ' vector ', r'\\var': ' variable ', r'\\theta': ' theta ',
  r'\\mu': ' average ', r'\\min': ' minimum ', r'\\max': ' maximum ', r'\\sum': ' + ', r'\\times': ' * ',
  r'\\cdot': ' * ', r'\\hat': ' ^ ', r'\\frac': ' / ', r'\\div': ' / ', r'\\sin': ' Sine ', r'\\cos': ' Cosine ',
  r'\\tan': ' Tangent ', r'\\infty': ' infinity ', r'\\int': ' integer ', r'\\in': ' in '}

RE_LATEX = re.compile('(%s)' % '|'.join(LATEX_DICT.keys()))
LATEX_DICT = {k.strip('\\'): v for k, v in LATEX_DICT.items()}


## Some RegExs to filter the texts
RE_URL = re.compile(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)")
RE_ALPHA_NUMERIC = re.compile(r'[^\w *+?!-]')
RE_REPEATS = re.compile(r"([A-Za-z])\1{2,}", re.DOTALL)
RE_NUMBERS = re.compile(r'[0-9]{2,}', re.DOTALL)
RE_NON_ASCII = re.compile(r'^[\u0080-\uFFFF]+$')


## Composed words to be splitted
TO_SPLIT = set([w.strip() for w in open('to_split.txt', 'r', encoding='utf8').readlines()])


# Annoying exadecimal characters
hex_chars = ['\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
             '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
             '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
             '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f']


def replace_latex(match):
    try:
        word = LATEX_DICT.get(match.group(0).strip('\\'))
    except KeyError:
        word = match.group(0)
    return word


def clean_latex(text):
    text = re.sub(r'\[math\]', ' LaTex math ', text)
    text = re.sub(r'\[\/math\]', ' LaTex math ', text)

    return RE_LATEX.sub(replace_latex, text)


def replace_numbers(match):
    try:
        n = match.group(0)
        if n == "11":
            return n
        n = min(len(n), 7)
        return " 1" + "0" * (n - 1) + " "
    except:
        return "number"


def clean_numbers(text):
    text = RE_NUMBERS.sub(replace_numbers, text)
    return text


def strip_non_ascii(s):
    return RE_NON_ASCII.sub('<NON_ASCII>', s)


# Loads spelling corrections stored on the Disk
def load_dicts():
    wmaps = dict()

    def load_json(path):
        with open(path, 'r', encoding='utf8') as fp:
            data = json.load(fp)
        return data

    for file in os.listdir('./'):
        if file.endswith('.json'):
            wmaps[file.split('.')[0]] = load_json(file)
            
    return wmaps


# Sort keys such that substrings are well ordered
def sort_keys(wmap):
    keys = list(wmap.keys())
    keys = sorted(keys, key=lambda k: -len(k))
    return keys


wmaps = load_dicts()

keys_emojis = sort_keys(wmaps["emojis"])
keys_first_order = sort_keys(wmaps["first_order_corrections"])
keys_second_order = sort_keys(wmaps["second_order_corrections"])
keys_lemmas = sort_keys(wmaps["lemmas_corrections"])

# Loop over keys, word_maps
keys_wmaps_iterator = list(zip([keys_emojis, keys_first_order,
                           keys_second_order, keys_lemmas],
                           [wmaps["emojis"], wmaps["first_order_corrections"],
                           wmaps["second_order_corrections"], wmaps["lemmas_corrections"]]))

In [3]:
def process(s):
    s = RE_URL.sub(' URL ', s)
    s = re.sub('’', "'", s)
    s = re.sub("' ", "'", s)

    s = clean_latex(s)

    for hc in hex_chars:
        s = s.replace(hc, ' ')

    # Remove repeatitions
    s = RE_REPEATS.sub(r"\1" * 2, s)
      
    for keys, wmap in keys_wmaps_iterator:
        for word in keys:
            s = s.replace(word, wmap[word])

    s = RE_ALPHA_NUMERIC.sub(' ', s).replace('_', '')
    s = " ".join(s.split()) + " "
    s = s.replace(' 9 11 ', ' 9/11 ')
    s = s.replace(" s ", " 's ")
    s = clean_numbers(s)

    tokens = sum([t.split('-') if t in TO_SPLIT else [t]
                  for t in s.split()], [])

    tokens = [strip_non_ascii(t) for t in tokens]

    return tokens

In [4]:
## Adapted from keras tokenizer Object
class MyTokenizer:
    def __init__(self, maxwords):
        self.word_index = defaultdict(int)
        self.maxwords = maxwords
        self.oov = None
        self.pad = 0
        
    def fit(self, docs):
        for doc in docs:
            for word in doc:
                self.word_index[word] += 1
        
        self.word_index = sorted(self.word_index.items(),
                        key=lambda k : -k[1])[:self.maxwords]

        self.word_index = {w: ix + 1 for ix, (w, _) in
                           enumerate(self.word_index)}
        
        self.word_index['<OOV>'] = self.oov = max(self.word_index.values()) + 1

    def to_sequences(self, docs, maxlen):
        seqs = [self.__index_and_pad(d, maxlen=maxlen) for d in docs]

        return seqs

    def __index_and_pad(self, tokens, maxlen):
        seq = [self.word_index.get(t, self.oov) for t in tokens]
        seq = seq + [self.pad] * (maxlen - len(seq)) if len(seq) < maxlen else seq[:maxlen]

        return seq

In [5]:
## Loads embeddings from filepath and compute embedding matrix
def load_embeddings(word_index, embedding_file, corrections):
    embedding_dim = 300
    nb_words = len(word_index)+1
    
    embeddings_index = {}
    f = open(embedding_file, 'r', encoding='utf8', errors="ignore")
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        if len(values) == embedding_dim + 1 and word in word_index:
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    f.close()
    
    embeddings_index['<NON_ASCII>'] = np.random.normal(-0.005838499, 0.48782197, (embedding_dim,))

    embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
    oov_vector = np.zeros((embedding_dim,), dtype=np.float32) - 1.
    
    for word, i in word_index.items():
        if word == "<OOV>":
            embedding_matrix[i] = oov_vector
            continue
        
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            continue
            
        embedding_vector = embeddings_index.get(word.lower())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            continue
            
        embedding_vector = embeddings_index.get(word.upper())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            continue
            
        embedding_vector = embeddings_index.get(word.capitalize())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            continue
            
        corr = corrections.get(word)
        if corr is not None:
            embedding_vector = embeddings_index.get(corr)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                continue
        
        embedding_matrix[i] = oov_vector
        
    return embedding_matrix

In [6]:
## Search optimal threshold for predictions
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in GRID:
        score = f1_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'f1': best_score}
    return search_result

## Build Neural model
def get_lstm_model():
    inputs = Input(shape=(MAX_LEN,), dtype='int32')
    embedding = Embedding(input_dim=MAX_WORDS,
                                output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                trainable=False)(inputs)
    embedding = SpatialDropout1D(0.3)(embedding)
    x1 = Bidirectional(CuDNNLSTM(256, return_sequences=True))(embedding)
    x2 = Bidirectional(CuDNNGRU(128, return_sequences=True))(x1)
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    conc = Concatenate()([max_pool1, max_pool2])
    predictions = Dense(1, activation='sigmoid')(conc)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [7]:
t0 = time.time()

df = pd.concat([pd.read_csv('../input/train.csv'),
                pd.read_csv('../input/test.csv')],
               axis=0).reset_index(drop=True)

print("Dataset loaded in {:02d}m{:02d}s".format(*divmod(int(time.time() - t0), 60)))
print("Process dataset ...")
time.sleep(0.5)

docs = df.question_text.progress_apply(process)
docs = docs.tolist()

print('Dataset successfully processed')
print('Tokenize corpus ...')

MAX_WORDS = 100000
tokenizer = MyTokenizer(maxwords=MAX_WORDS)
tokenizer.fit(docs)
word_index = tokenizer.word_index
print('Corpus successfully tokenized')
print('All words : %d' % len(word_index))

print('Load GLOVE embeddings ...')
time.sleep(0.5)
glove_embeddings = load_embeddings(word_index,
                                   "../input/embeddings/glove.840B.300d/glove.840B.300d.txt",
                                   spell_corrections_glove)

print('Load PARAGRAM embeddings ...')
time.sleep(0.5)
paragram_embeddings = load_embeddings(word_index,
                                   "../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt",
                                   spell_corrections_paragram)

embedding_matrix = np.concatenate([glove_embeddings, paragram_embeddings], axis = 1)

print('Embedding matrix ready')
print('Compute text sequences ...')

MAX_WORDS, EMBEDDING_DIM = embedding_matrix.shape
MAX_LEN = 55

n_train = df[~pd.isnull(df.target)].shape[0]

X_train = tokenizer.to_sequences(docs[:n_train], maxlen=MAX_LEN)
X_test = tokenizer.to_sequences(docs[n_train:], maxlen=MAX_LEN)

X_train = np.array(X_train)
X_test = np.array(X_test)

Y_train = df[~pd.isnull(df.target)].target.values.astype(int)

print('Sequences ready')

print('Remove garbage ...')
del df, docs, glove_embeddings, paragram_embeddings, tokenizer
gc.collect()

print('Starts training ...\n')

# Grid to inspect for optimal prediction threshold
GRID = np.arange(0.9, 0.95, 0.001)

N_FOLDS = 8

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True)
model_count = 1

Y_pred = np.zeros((len(X_test),))

# Stores threshold search results
df_thresholds = pd.DataFrame(GRID, columns = ['th'])

for idx_train, idx_val in skf.split(Y_train, Y_train):
    print("MODEL:", model_count)
    x_train, y_train = X_train[idx_train], Y_train[idx_train]
    x_val, y_val = X_train[idx_val], Y_train[idx_val]
    
    K.clear_session()
    model = get_lstm_model()

    hist = model.fit(x_train, y_train, batch_size = 512, epochs=4, verbose=1)
                 
    preds_val = model.predict(x_val, batch_size=512).squeeze()
    preds_val = rankdata(preds_val, method = "min")/len(preds_val) # Compute threshold on ranks rather than probs
    search_result = threshold_search(y_val, preds_val)
    best_th, best_f1 = search_result['threshold'], search_result["f1"]
    
    print('\nTh : %.3f  -->  F1 : %.5f \n' % (best_th, best_f1))
    
    devs = []
    for threshold in GRID:
        score = f1_score(y_true=y_val, y_pred=preds_val > threshold)
        devs.append(abs(score - best_f1))
        
    df_thresholds['fold_%d' % model_count] = devs
                 
    Y_pred += rankdata(model.predict(X_test, batch_size=512).squeeze(), method="min")/len(X_test)
        
    model_count += 1
    
print('Finished training!')
print('Computes optimal threshold ...')
df_thresholds['mean_dev'] = df_thresholds.apply(lambda x : x[1:].mean(), axis = 1, raw=True)
opt_threshold = df_thresholds.sort_values('mean_dev').reset_index(drop=True).th[0]
print('Found optimal threshold %.3f' % opt_threshold)
print('Prepare submission.csv ...')
Y_pred_ = Y_pred / N_FOLDS
Y_pred_ = (Y_pred_ > opt_threshold).astype(int)

sub = pd.read_csv('../input/sample_submission.csv', encoding='utf8')
sub['prediction'] = Y_pred_
sub.to_csv('submission.csv', encoding='utf8', index=False)

S = int(time.time() - t0)
M, S = divmod(S, 60)
H, M = divmod(M, 60)

print('\n\n DONE ! Total run-time : {:02d}h{:02d}m{:02d}s'.format(H, M, S))

Dataset loaded in 00m03s
Process dataset ...

100%|██████████| 1681928/1681928 [06:04<00:00, 4614.53it/s]

Dataset successfully processed
Tokenize corpus ...
Corpus successfully tokenized
All words : 100001
Load GLOVE embeddings ...

2196017it [00:59, 36810.86it/s]

Load PARAGRAM embeddings ...

1703756it [00:47, 35916.62it/s]

Embedding matrix ready
Compute text sequences ...
Sequences ready
Remove garbage ...
Starts training ...

MODEL: 1
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Th : 0.929  -->  F1 : 0.69678 

MODEL: 2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Th : 0.932  -->  F1 : 0.69257 

MODEL: 3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Th : 0.930  -->  F1 : 0.69076 

MODEL: 4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Th : 0.927  -->  F1 : 0.69655 

MODEL: 5
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Th : 0.927  -->  F1 : 0.69255 

MODEL: 6
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Th : 0.927  -->  F1 : 0.69246 

MODEL: 7
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Th : 0.931  -->  F1 : 

<h1> Now the solution ranks 27th!</h1>
<p><img src="score_0.70704.png" width=800></p>