In [None]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors

from collections import Counter
from functools import reduce
import random
import re
import gc
from tqdm.auto import tqdm
import time

import torch
import torch.utils.data
import torch.nn as nn
import torch.tensor as tensor
import torch.autograd as autograd
from torch.autograd import Variable

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import GridSearchCV, StratifiedKFold

import scipy.stats
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

tqdm.pandas()

## Load dataset

In [None]:
# All input paths are resolved relative to DATA_DIR so the location can be changed in one place.
DATA_DIR = "../input"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV = f"{DATA_DIR}/test.csv"

# Read both splits fully into memory; every later cell works on these two frames.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Quick sanity check: row/column counts and column names of each split.
print(f"Train shape: {train_df.shape}; cols: {list(train_df.columns)}")
print(f"Test shape: {test_df.shape}; cols: {list(test_df.columns)}")

In [None]:
# Split the training set by label: rows with target == 0 are reported as
# "sincere", rows with target == 1 as "insincere".
sincere = train_df.loc[train_df['target'] == 0]
insincere = train_df.loc[train_df['target'] == 1]

# Class balance: absolute counts and share of the training set.
print(
    f"sincere: {len(sincere)} ({round(100.0 * len(sincere)/len(train_df), 3)}%); "
    f"insincere: {len(insincere)} ({round(100.0 * len(insincere)/len(train_df), 3)}%)\n"
)

# Show one random example per class.
# random.randrange(n) draws from [0, n); random.randint(0, n) would include n
# itself, which is one past the last valid positional index for .iloc.
print(
    f"sincere: {sincere.iloc[random.randrange(len(sincere))]['question_text']}\n\n"
    f"insincere: {insincere.iloc[random.randrange(len(insincere))]['question_text']}"
)
print()

## Load embeddings

In [None]:
# Locations of the pretrained embedding files under DATA_DIR.
# In the cells visible here only EMB_GLOVE is actually read (by load_glove);
# the word2vec loading code is commented out.
EMB_GLOVE = f"{DATA_DIR}/embeddings/glove.840B.300d/glove.840B.300d.txt"
EMB_WORD2VEC = f"{DATA_DIR}/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
EMB_PARAGRAM = f"{DATA_DIR}/embeddings/paragram_300_sl999/paragram_300_sl999.txt"
EMB_WIKI = f"{DATA_DIR}/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec"

### word2vec

In [None]:
# emb_word2vec = KeyedVectors.load_word2vec_format(EMB_WORD2VEC, binary=True)

In [None]:
# len(emb_word2vec.vocab)
# print("barbiturates" in emb_word2vec)

### GloVe

In [None]:
def load_glove(path=None):
    """Read a GloVe-style text embedding file into a dict.

    Each line of the file is expected to be ``word v1 v2 ... vN`` separated by
    single spaces.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the module-level ``EMB_GLOVE`` path, which
        keeps the original zero-argument call working.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` numpy array of its coefficients.
    """
    if path is None:
        path = EMB_GLOVE

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # The context manager guarantees the file handle is released even if a
    # malformed line makes the parsing raise half-way through.
    with open(path, encoding='latin') as emb_file:
        # Drop the trailing newline so the last coefficient is a clean number.
        embeddings_index = dict(
            get_coefs(*line.rstrip("\n").split(" ")) for line in emb_file
        )

    return embeddings_index

# Load the GloVe vectors once; later cells reuse emb_glove for the coverage
# checks and for building the embedding matrix.
emb_glove = load_glove()

In [None]:
# Sanity check: number of loaded words x dimensionality of one vector.
print(f"{len(emb_glove)} x {emb_glove['a'].size}")
# Membership probe for one of the replacement words used in GLOVE_SYN_DICT ('redmi' -> 'xiaomi').
print("xiaomi" in emb_glove)

## Clean dataset

In [None]:
# Character classes consumed by clean_text, applied in this order:
#   SEP_PUNCTS         -> each character is replaced by a single space
#   SHOULD_KEEP_PUNCTS -> each character is kept but surrounded by spaces
#   TO_REMOVE_PUNCTS   -> each character is deleted
# "'", "-" and "/" appear in both SEP_PUNCTS and TO_REMOVE_PUNCTS; because the
# separator pass runs first they end up as spaces.
SEP_PUNCTS         = u'\u200b' + "/-'´‘…—−–"
SHOULD_KEEP_PUNCTS = "&"
TO_REMOVE_PUNCTS   = '?!.,，"#$%\'()*+-/:;<=>@[\\]^_`{|}~“”’™•°'

# Substring -> replacement table used by clean_syn to rewrite words and symbols.
# NOTE(review): '$' is also listed in TO_REMOVE_PUNCTS and clean_all runs
# clean_text before clean_syn, so the '$' entry below never sees a '$'.
# NOTE(review): clean_text lower-cases its input first, so the upper-case Greek
# keys (\u0398, \u03A0, \u03A9, \u0392) cannot match via clean_all either.
GLOVE_SYN_DICT = {
    'cryptocurrencies': 'crypto currencies',
    'ethereum'        : 'crypto currency',
    'fortnite'        : 'video game',
    'quorans'         : 'quora members',
    'brexit'          : 'britain exit',
    'redmi'           : 'xiaomi',
    '√'               : 'square root',
    '÷'               : 'division',
    '∞'               : 'infinity',
    '€'               : 'euro',
    '£'               : 'pound sterling',
    '$'               : 'dollar',
    '₹'               : 'rupee',
    '×'               : 'product',
    'ã'               : 'a',
    'è'               : 'e',
    'é'               : 'e',
    'ö'               : 'o',
    '²'               : 'squared',
    '∈'               : 'in',
    '∩'               : 'intersection',
    u'\u0398'         : 'Theta',
    u'\u03A0'         : 'Pi',
    u'\u03A9'         : 'Omega',
    u'\u0392'         : 'Beta',
    u'\u03B8'         : 'theta',
    u'\u03C0'         : 'pi',
    u'\u03C9'         : 'omega',
    u'\u03B2'         : 'beta',
}

def tokenize(s: str):
    """Split ``s`` on runs of whitespace and return the stripped tokens as a list."""
    return [token.strip() for token in s.split()]


def clean_text(x):
    """Lower-case ``x`` and normalise punctuation.

    Three passes run in a fixed order: separator characters become a space,
    characters worth keeping are padded with spaces, and the remaining
    punctuation characters are deleted.
    """
    text = x.lower()

    # (characters to process, replacement builder) -- order matters.
    passes = (
        (SEP_PUNCTS, lambda ch: " "),
        (SHOULD_KEEP_PUNCTS, lambda ch: f" {ch} "),
        (TO_REMOVE_PUNCTS, lambda ch: ""),
    )
    for chars, replacement_for in passes:
        for ch in chars:
            text = text.replace(ch, replacement_for(ch))

    return text

def clean_numbers(x):
    """Mask digit runs of length >= 2 with '#' placeholders.

    Runs of five or more digits collapse to '#####'; runs of four, three and
    two digits become the same number of '#'. Substitutions are applied from
    the longest pattern to the shortest.
    """
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for digit_pattern, placeholder in masks:
        x = re.sub(digit_pattern, placeholder, x)

    return x

def clean_syn(x, syn_dict=None):
    """Replace every occurrence of a key of ``syn_dict`` in ``x`` by its value.

    Parameters
    ----------
    x : str
        Text to rewrite.
    syn_dict : dict, optional
        Mapping ``substring -> replacement``. Defaults to the module-level
        ``GLOVE_SYN_DICT`` so the original one-argument call keeps working.

    Returns
    -------
    str
        ``x`` with all key occurrences substituted.
    """
    if syn_dict is None:
        syn_dict = GLOVE_SYN_DICT
    if not syn_dict:
        # An empty alternation would match the empty string everywhere.
        return x

    # Keys are literal text, not regex fragments: without re.escape a key such
    # as '$' is read as an end-of-string anchor and is never substituted.
    # Longest keys go first so a key that is a prefix of another cannot win.
    keys = sorted(syn_dict, key=len, reverse=True)
    regex = re.compile('|'.join(re.escape(key) for key in keys))
    return regex.sub(lambda m: syn_dict[m.group(0)], x)

def clean_all(x):
    """Run the full cleaning pipeline: punctuation/case cleanup first, then synonym substitution."""
    return clean_syn(clean_text(x))
    


def build_vocabulary(df: pd.Series) -> "Tuple[Counter, np.ndarray]":
    """Count token occurrences and sentence lengths over a column of texts.

    Parameters
    ----------
    df : pd.Series
        Column of strings (the callers pass ``frame["clean_question_text"]``).
        It must support ``progress_apply``, i.e. ``tqdm.pandas()`` has to have
        been called beforehand.

    Returns
    -------
    vocab : Counter
        Token -> number of occurrences across all texts.
    s_len : np.ndarray
        Number of tokens of each text, in input order.
    """
    sentences = df.progress_apply(tokenize).values
    vocab = Counter()
    s_len = []

    for sentence in tqdm(sentences):
        s_len.append(len(sentence))
        # Counter.update adds one count per token occurrence.
        vocab.update(sentence)
    return vocab, np.array(s_len)


# Clean the raw question text of both splits; the result is stored in a new
# column so the original "question_text" stays untouched.
train_df["clean_question_text"] = train_df["question_text"].progress_apply(clean_all)
test_df["clean_question_text"] = test_df["question_text"].progress_apply(clean_all)

# Per-split vocabulary (token -> count) and per-question token counts,
# both computed from the cleaned text.
train_vocab, train_s_len = build_vocabulary(train_df["clean_question_text"])
test_vocab, test_s_len = build_vocabulary(test_df["clean_question_text"])

In [None]:
# Summary statistics (count, min/max, mean, variance, ...) of the token counts per question.
d_train = scipy.stats.describe(train_s_len)
d_test = scipy.stats.describe(test_s_len)
print(f"train: {d_train}")
print(f"test: {d_test}")

# Number of histogram bins; with range=[0, 60] below this gives one bin per token count.
nb = 60

plt.figure(figsize=(10, 6))

# Train is drawn first, so the test histogram is painted on top of it.
plt.hist(train_s_len, bins=nb, range=[0, 60], facecolor='red', label='train')

plt.hist(test_s_len, bins=nb, range=[0, 60], facecolor='blue', label='test')
# Vertical marker at the mean test sentence length (no marker is drawn for train).
plt.axvline(x=d_test.mean, color='cyan')

plt.title("Sentence length", size=24)
# Legend is anchored outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., prop={'size': 16})
# Ticks every 5 tokens: 0, 5, ..., 65.
plt.xticks([5*i for i in range(14)])
plt.tight_layout()
plt.show()

In [None]:
# How many entries to show from each end of the frequency ranking.
_n = 10
# Most frequent tokens, then the _n least frequent ones (reverse slice from the end).
print(train_vocab.most_common(_n))
print(train_vocab.most_common()[:-_n-1:-1])
print("-"*64)
# Same two views for the test vocabulary.
print(test_vocab.most_common(_n))
print(test_vocab.most_common()[:-_n-1:-1])

In [None]:
def get_emb_vocab_coverage(vocab, emb) -> "Tuple[Counter, Counter]":
    """Report how much of ``vocab`` is covered by the embedding ``emb``.

    Prints the number of unique in-/out-of-vocabulary words and two coverage
    percentages: over unique words and over all word occurrences.

    Parameters
    ----------
    vocab : mapping
        Word -> occurrence count (e.g. the Counter from ``build_vocabulary``).
    emb : container
        Anything supporting ``word in emb`` (e.g. the embedding dict).

    Returns
    -------
    oov : Counter
        Words of ``vocab`` missing from ``emb``, with their counts.
    inv : Counter
        Words of ``vocab`` present in ``emb``, with their counts.
    """
    oov = Counter()  # out-of-vocab
    inv = Counter()  # in-vocab

    for w in tqdm(vocab):
        bucket = inv if w in emb else oov
        bucket[w] = vocab[w]

    # Derive the totals from the two counters; integer counts print as "123"
    # rather than "123.0".
    inv_uniq_num, oov_uniq_num = len(inv), len(oov)
    inv_all_num, oov_all_num = sum(inv.values()), sum(oov.values())
    uniq_total = len(vocab)
    all_total = inv_all_num + oov_all_num

    # An empty vocabulary (or all-zero counts) has no meaningful coverage;
    # report 0% instead of dividing by zero.
    cov_uniq = round(100.0 * inv_uniq_num / uniq_total, 3) if uniq_total else 0.0
    cov_all = round(100.0 * inv_all_num / all_total, 3) if all_total else 0.0

    print(f"oov_uniq: {oov_uniq_num}; inv_uniq: {inv_uniq_num}; all_uniq: {uniq_total}")
    print("embeddings-vocabulary coverage (unique): %.3f%%" % cov_uniq)
    print("embeddings-vocabulary coverage (all text): %.3f%%" % cov_all)

    return oov, inv

In [None]:
# GloVe coverage of the training vocabulary; display the 20 most frequent uncovered words.
oov, inv = get_emb_vocab_coverage(train_vocab, emb_glove)
oov.most_common(20)

In [None]:
# GloVe coverage of the test vocabulary.
# NOTE: this rebinds oov/inv, so the training-set results from the previous cell are no longer available.
oov, inv = get_emb_vocab_coverage(test_vocab, emb_glove)
oov.most_common(50)

## Make embedding matrix

In [None]:
def make_data(df, len_voc, sentence_maxlen):
    """Turn the cleaned questions of ``df`` into padded index sequences.

    A fresh Tokenizer (``num_words=len_voc``, no character filtering) is fitted
    on ``df['clean_question_text']``; the same texts are then converted to
    integer sequences and padded to ``sentence_maxlen``.

    Returns
    -------
    X : padded sequences, one row per question
    Y : ``df['target'].values``
    word_index : the fitted tokenizer's word -> index mapping
    """
    questions = df['clean_question_text']

    tokenizer = Tokenizer(num_words=len_voc, filters='')
    tokenizer.fit_on_texts(questions)

    sequences = tokenizer.texts_to_sequences(questions)
    X = pad_sequences(sequences, maxlen=sentence_maxlen)
    Y = df['target'].values

    return X, Y, tokenizer.word_index

# X_train has shape (train_size, sentence_maxlen); Y_train holds the targets.
# The vocabulary cap is the number of distinct tokens counted in the training
# text, and 60 matches the upper bound of the sentence-length histogram range.
X_train, Y_train, word_index = make_data(
    train_df, len_voc=len(train_vocab), sentence_maxlen=60
)

In [None]:
def make_embedding_matrix(embeddings_dict, word_index, len_voc):
    """
    Build a (len_voc, embed_size) matrix of word vectors.

    Every row starts as a random draw from a normal distribution whose mean and
    standard deviation are taken over all values in ``embeddings_dict``; rows of
    words that have a pretrained vector are then overwritten with it, so only
    out-of-vocabulary rows keep their random values. Words whose index is
    ``>= len_voc`` are ignored.
    """
    stacked = np.stack(list(embeddings_dict.values()))
    mu = stacked.mean()
    sigma = stacked.std()
    dim = stacked.shape[1]

    matrix = np.random.normal(mu, sigma, (len_voc, dim))

    for token, row in tqdm(word_index.items()):
        if row < len_voc:
            vector = embeddings_dict.get(token, None)
            if vector is not None:
                matrix[row] = vector

    return matrix


# Build the matrix with the same vocabulary cap that was passed to make_data.
embedding_matrix = make_embedding_matrix(emb_glove, word_index, len_voc=len(train_vocab))
# Spot check: the row for the word 'test' must equal its GloVe vector.
print(all(embedding_matrix[word_index['test'], :] == emb_glove['test']))

# Free the word -> index mapping.
# NOTE(review): after this del, re-running this cell raises NameError until the
# make_data cell has been run again.
del word_index
gc.collect()

In [None]:
# Expected: (len(train_vocab), embedding dimension).
embedding_matrix.shape