In [1]:
import gc
import re
import os
import pandas as pd
import numpy as np
import random
from sklearn import metrics
import string
import math
import operator
import time
from keras.preprocessing import text, sequence
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.utils.data
from gensim import utils

Using TensorFlow backend.


In [2]:
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# GENERAL HYPERPARAMS
num_folds = 5
seed = 42

# HYPERPARAMS FOR TEXT PROCESSING
max_features = 120000
maxlen = 100

# HYPERPARAMS FOR NN
batch_size = 1024
epochs_fixed = 4
epochs_trainable = 1
embed_size = 300
early_stopping_patience = 2
hidden_size = 60

set_seed(seed)

In [3]:
PATH = "./input/"

puncts = {'\u200b', ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√'}

def clean_text(x):
    x = str(x)
    table = str.maketrans({key: ' {punct} ' for key in puncts})
    return x.translate(table)

def clean_numbers(x):
    if bool(re.search(r'\d', x)):
        x = re.sub('[0-9]{5,}', '#####', x)
        x = re.sub('[0-9]{4}', '####', x)
        x = re.sub('[0-9]{3}', '###', x)
        x = re.sub('[0-9]{2}', '##', x)
    return x

mispell_dict = {
"tamilans" : "tamilians",
"feku" : "liar",
"quorans" : "people who use quora",
"qoura": "quora",
"xiomi" : "phone",
"ipill" : "contraception",
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "i would",
"i'd" : "i had",
"i'll" : "i will",
"i'm" : "i am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "i have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" we will",
"didn't": "did not",
"tryin'":"trying"}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

def replace_typical_misspell(text):
    mispellings, mispellings_re = _get_mispell(mispell_dict)
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

In [4]:
%%time

train_df = pd.read_csv(PATH+'train.csv', usecols=['question_text', 'target'])
test_df = pd.read_csv(PATH+'test.csv', usecols = ['question_text'])

# 3RD PARTY CLEAN
#train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower())
#test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower())

train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))

train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_numbers(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_numbers(x))

train_df["question_text"] = train_df["question_text"].apply(lambda x: replace_typical_misspell(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: replace_typical_misspell(x))

# FOR CREATING PROCESSED DATA AND LABELS
train_sentences = train_df['question_text']
train_labels = train_df['target']
test_sentences = test_df['question_text']

del train_df, test_df

gc.collect()

# TOKENIZE TEXT
tokenizer = text.Tokenizer(num_words=max_features, oov_token='OOV', lower=False)
tokenizer.fit_on_texts(list(train_sentences) + list(test_sentences))

tokenized_train = tokenizer.texts_to_sequences(train_sentences)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

tokenized_test = tokenizer.texts_to_sequences(test_sentences)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

del tokenized_test, tokenized_train, train_sentences, test_sentences
gc.collect()

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index)+1)

CPU times: user 1min 47s, sys: 573 ms, total: 1min 47s
Wall time: 1min 47s


In [5]:
# LIST OF ALL EMBEDDINGS USED
embedding_list = [PATH+'embeddings/paragram_300_sl999/paragram_300_sl999.txt', 
PATH+'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',
PATH+'embeddings/glove.840B.300d/glove.840B.300d.txt']

# MEAN AND STD VALUES FOR EMBEDDINGS
emb_mean_dict = {'paragram_300_sl999':-0.005324783269315958,
            'wiki-news-300d-1M':-0.0033469984773546457,
            'glove.840B.300d':-0.005838498938828707,
            'GoogleNews-vectors-negative300':-0.0051106834}

emb_std_dict = {'paragram_300_sl999':0.4934646189212799,
            'wiki-news-300d-1M':0.10985549539327621,
            'glove.840B.300d':0.4878219664096832,
            'GoogleNews-vectors-negative300':0.18445626}

global_mean = np.mean([i for i in emb_mean_dict.values()])
global_std = np.mean([i for i in emb_std_dict.values()])
global_embedding = np.random.normal(global_mean, global_std, (nb_words, embed_size))
embedding_count = np.zeros((nb_words,1))

In [6]:
%%time
for EMBEDDING_FILE in embedding_list:
    embedding_name = EMBEDDING_FILE.split('/')[3]
    for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore'):
        word, vec = o.split(' ', 1)
        if word not in word_index:
            continue
        i = word_index[word]
        if i >= nb_words:
            continue
        embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:embed_size]
        if len(embedding_vector) == embed_size:
            if embedding_count[i] == 0:
                global_embedding[i] = embedding_vector
            else:
                global_embedding[i] = (embedding_count[i]*global_embedding[i] + embedding_vector)/(embedding_count[i] + 1)
            embedding_count[i] += 1
    del embedding_vector
    gc.collect()

CPU times: user 32.4 s, sys: 2.62 s, total: 35 s
Wall time: 35 s


In [7]:
pd.Series(embedding_count.flatten()).value_counts().sort_index()/len(embedding_count) * 100

0.0     7.446667
1.0     3.472500
2.0    42.301667
3.0    46.779167
dtype: float64

In [8]:
word2vecpath = PATH + 'embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

In [9]:
unique_words = set()

In [10]:
%%time
with utils.smart_open(word2vecpath) as fin:
    header = utils.to_unicode(fin.readline(), encoding='utf8')
    vocab_size, vector_size = (int(x) for x in header.split())
    binary_len = np.dtype(np.float32).itemsize * vector_size
    for _ in tqdm(range(vocab_size)):
        word = []
        while True:
            ch = fin.read(1)
            if ch == b' ':
                break
            if ch == b'':
                raise EOFError("unexpected end of input")
            if ch != b'\n':
                word.append(ch)
        word = utils.to_unicode(b''.join(word), encoding='utf8', errors='strict')
        weights = np.fromstring(fin.read(binary_len), dtype=np.float32).astype(np.float32)
        if word not in word_index:
            continue
        i = word_index[word]
        if i >= nb_words or embedding_count[i] > 0:
            continue
        unique_words.add((word,i))
        global_embedding[i] = weights
        embedding_count[i] += 1

  app.launch_new_instance()
100%|██████████| 3000000/3000000 [00:33<00:00, 90613.33it/s]

CPU times: user 32.2 s, sys: 1.51 s, total: 33.7 s
Wall time: 33.1 s





In [11]:
pd.Series(embedding_count.flatten()).value_counts().sort_index()/len(embedding_count) * 100

0.0     7.184167
1.0     3.735000
2.0    42.301667
3.0    46.779167
dtype: float64

In [12]:
len(unique_words)

315

In [13]:
sorted(unique_words, key=lambda tup: tup[1])

[('bhakts', 12849),
 ('NICMAR', 13079),
 ('Awdhesh', 15039),
 ('Skripal', 16686),
 ('Alshamsi', 17181),
 ('BIPC', 17547),
 ('IISERs', 20109),
 ('Byju', 21234),
 ('UPSE', 22209),
 ('Kalpit', 23119),
 ('EFLU', 25644),
 ('Kovind', 26999),
 ('Strzok', 27353),
 ('Kattankulathur', 27857),
 ('Kainerugaba', 28464),
 ('Warmbier', 28831),
 ('PESU', 29713),
 ('RGIPT', 30995),
 ('DSCE', 32526),
 ('LHMC', 33946),
 ('KGMU', 34097),
 ('Gurmehar', 37441),
 ('PRMO', 38584),
 ('NIFTEM', 39386),
 ('IPHO', 40070),
 ('Jozwik', 42042),
 ('Bregoli', 43445),
 ('1crore', 45068),
 ('Delloite', 46168),
 ('Rupay', 47777),
 ('Kejariwal', 48961),
 ('Kahoot', 49659),
 ('PIVX', 51176),
 ('GTPL', 51465),
 ('multibaggers', 51954),
 ('JN0', 52726),
 ('SJMSOM', 52989),
 ('Mersal', 53257),
 ('MNLU', 53935),
 ('SGTB', 54938),
 ('PSIT', 55481),
 ('IGRUA', 56196),
 ('Bijwasan', 56394),
 ('Kolkey', 56666),
 ('Awadesh', 57230),
 ('Hampankatta', 57999),
 ('Dwarikesh', 58225),
 ('Netralaya', 59341),
 ('OPTCL', 59489),
 ('Meghman

In [14]:
missing_index = set()
for i in range(1,nb_words):
    if not embedding_count[i]:
        missing_index.add(i)

In [15]:
missing_words = {}
for k, v in word_index.items():
    if v in missing_index:
        missing_words[k] = v

In [16]:
len(missing_words)

8620

In [17]:
missing_words

{'Quorans': 1939,
 'UCEED': 8354,
 'BNBR': 9579,
 'Machedo': 9601,
 'Qoura': 11097,
 'LNMIIT': 11984,
 'Zerodha': 12348,
 'Kavalireddi': 12540,
 'Doklam': 13167,
 'Vajiram': 13318,
 'Unacademy': 13523,
 'MUOET': 14021,
 'AlShamsi': 14074,
 'Bhakts': 14745,
 'HackerRank': 14785,
 'eLitmus': 15602,
 'SRMJEE': 15894,
 'coinbase': 16222,
 'SGSITS': 16618,
 'upwork': 17566,
 'BMSCE': 17576,
 'Binance': 17722,
 'SRMJEEE': 17985,
 'Zebpay': 17990,
 'Golang': 18566,
 'MHCET': 19298,
 'Adhaar': 20110,
 'PESSAT': 20299,
 'Koinex': 20329,
 'adhaar': 20485,
 'demonitisation': 20511,
 'USICT': 20823,
 'LBSNAA': 20863,
 'unacademy': 21260,
 'Whst': 21455,
 'ReactJS': 21721,
 'Codeforces': 22331,
 'Demonetization': 22390,
 'Fiitjee': 22422,
 'Trumpcare': 22461,
 'Bittrex': 22624,
 'Irodov': 22638,
 'FTRE': 22672,
 'tensorflow': 22692,
 'Tensorflow': 22803,
 'Simpliv': 22983,
 'Howcan': 23015,
 'JoSAA': 23197,
 'NLUs': 23362,
 'MHTCET': 23456,
 'RLWL': 23509,
 'IIITH': 23541,
 'IGDTUW': 23621,
 'apist

In [18]:
high_freq = set()
for i in range(1,nb_words):
    if embedding_count[i] > 10:
        high_freq.add(i)

In [19]:
high_freq_words = {}
for k, v in word_index.items():
    if v in high_freq:
        high_freq_words[k] = v

In [20]:
len(high_freq_words)

0

In [21]:
high_freq_words

{}

# Changing ordering of embeddings and creating matrix

In [26]:
# LIST OF ALL EMBEDDINGS USED
embedding_list = [PATH+'embeddings/glove.840B.300d/glove.840B.300d.txt', PATH+'embeddings/paragram_300_sl999/paragram_300_sl999.txt', PATH+'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec']

# MEAN AND STD VALUES FOR EMBEDDINGS
emb_mean_dict = {'paragram_300_sl999':-0.005324783269315958,
            'wiki-news-300d-1M':-0.0033469984773546457,
            'glove.840B.300d':-0.005838498938828707,
            'GoogleNews-vectors-negative300':-0.0051106834}

emb_std_dict = {'paragram_300_sl999':0.4934646189212799,
            'wiki-news-300d-1M':0.10985549539327621,
            'glove.840B.300d':0.4878219664096832,
            'GoogleNews-vectors-negative300':0.18445626}

global_mean = np.mean([i for i in emb_mean_dict.values()])
global_std = np.mean([i for i in emb_std_dict.values()])
global_embedding = np.random.normal(global_mean, global_std, (nb_words, embed_size))
embedding_count = np.zeros((nb_words,1))

In [27]:
%%time
word_count_embedding = [0, 0 ,0]
for k in range(len(embedding_list)):
    embedding_name = embedding_list[k].split('/')[3]
    for o in open(embedding_list[k], encoding="utf8", errors='ignore'):
        word, vec = o.split(' ', 1)
        if word not in word_index:
            continue
        i = word_index[word]
        if i >= nb_words or embedding_count[i] > 0:
            continue
        embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:embed_size]
        if len(embedding_vector) == embed_size:
            global_embedding[i] = embedding_vector
            embedding_count[i] += 1
            word_count_embedding[k] += 1
    print(embedding_name, '\t', word_count_embedding[k])
    del embedding_vector
    gc.collect()

glove.840B.300d 	 109017
paragram_300_sl999 	 1164
wiki-news-300d-1M 	 883
CPU times: user 19 s, sys: 2.66 s, total: 21.6 s
Wall time: 21.5 s


In [39]:
pd.Series(embedding_count.flatten()).value_counts().sort_index()/len(embedding_count) * 100

0.0     7.446667
1.0    92.553333
dtype: float64

In [40]:
word2vecpath = PATH + 'embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

In [41]:
unique_words = set()

In [42]:
%%time
with utils.smart_open(word2vecpath) as fin:
    header = utils.to_unicode(fin.readline(), encoding='utf8')
    vocab_size, vector_size = (int(x) for x in header.split())
    binary_len = np.dtype(np.float32).itemsize * vector_size
    for _ in tqdm(range(vocab_size)):
        word = []
        while True:
            ch = fin.read(1)
            if ch == b' ':
                break
            if ch == b'':
                raise EOFError("unexpected end of input")
            if ch != b'\n':
                word.append(ch)
        word = utils.to_unicode(b''.join(word), encoding='utf8', errors='strict')
        weights = np.fromstring(fin.read(binary_len), dtype=np.float32).astype(np.float32)
        if word not in word_index:
            continue
        i = word_index[word]
        if i >= nb_words or embedding_count[i] > 0:
            continue
        unique_words.add((word,i))
        global_embedding[i] = weights
        embedding_count[i] += 1

  app.launch_new_instance()
100%|██████████| 3000000/3000000 [00:27<00:00, 109972.39it/s]

CPU times: user 26.5 s, sys: 1.1 s, total: 27.6 s
Wall time: 27.3 s





In [43]:
pd.Series(embedding_count.flatten()).value_counts().sort_index()/len(embedding_count) * 100

0.0     7.184167
1.0    92.815833
dtype: float64

In [44]:
len(unique_words)

315

In [45]:
sorted(unique_words, key=lambda tup: tup[1])

[('bhakts', 12849),
 ('NICMAR', 13079),
 ('Awdhesh', 15039),
 ('Skripal', 16686),
 ('Alshamsi', 17181),
 ('BIPC', 17547),
 ('IISERs', 20109),
 ('Byju', 21234),
 ('UPSE', 22209),
 ('Kalpit', 23119),
 ('EFLU', 25644),
 ('Kovind', 26999),
 ('Strzok', 27353),
 ('Kattankulathur', 27857),
 ('Kainerugaba', 28464),
 ('Warmbier', 28831),
 ('PESU', 29713),
 ('RGIPT', 30995),
 ('DSCE', 32526),
 ('LHMC', 33946),
 ('KGMU', 34097),
 ('Gurmehar', 37441),
 ('PRMO', 38584),
 ('NIFTEM', 39386),
 ('IPHO', 40070),
 ('Jozwik', 42042),
 ('Bregoli', 43445),
 ('1crore', 45068),
 ('Delloite', 46168),
 ('Rupay', 47777),
 ('Kejariwal', 48961),
 ('Kahoot', 49659),
 ('PIVX', 51176),
 ('GTPL', 51465),
 ('multibaggers', 51954),
 ('JN0', 52726),
 ('SJMSOM', 52989),
 ('Mersal', 53257),
 ('MNLU', 53935),
 ('SGTB', 54938),
 ('PSIT', 55481),
 ('IGRUA', 56196),
 ('Bijwasan', 56394),
 ('Kolkey', 56666),
 ('Awadesh', 57230),
 ('Hampankatta', 57999),
 ('Dwarikesh', 58225),
 ('Netralaya', 59341),
 ('OPTCL', 59489),
 ('Meghman

In [46]:
missing_index = set()
for i in range(1,nb_words):
    if not embedding_count[i]:
        missing_index.add(i)

In [47]:
missing_words = {}
for k, v in word_index.items():
    if v in missing_index:
        missing_words[k] = v

In [48]:
len(missing_words)

8620

In [49]:
missing_words

{'Quorans': 1939,
 'UCEED': 8354,
 'BNBR': 9579,
 'Machedo': 9601,
 'Qoura': 11097,
 'LNMIIT': 11984,
 'Zerodha': 12348,
 'Kavalireddi': 12540,
 'Doklam': 13167,
 'Vajiram': 13318,
 'Unacademy': 13523,
 'MUOET': 14021,
 'AlShamsi': 14074,
 'Bhakts': 14745,
 'HackerRank': 14785,
 'eLitmus': 15602,
 'SRMJEE': 15894,
 'coinbase': 16222,
 'SGSITS': 16618,
 'upwork': 17566,
 'BMSCE': 17576,
 'Binance': 17722,
 'SRMJEEE': 17985,
 'Zebpay': 17990,
 'Golang': 18566,
 'MHCET': 19298,
 'Adhaar': 20110,
 'PESSAT': 20299,
 'Koinex': 20329,
 'adhaar': 20485,
 'demonitisation': 20511,
 'USICT': 20823,
 'LBSNAA': 20863,
 'unacademy': 21260,
 'Whst': 21455,
 'ReactJS': 21721,
 'Codeforces': 22331,
 'Demonetization': 22390,
 'Fiitjee': 22422,
 'Trumpcare': 22461,
 'Bittrex': 22624,
 'Irodov': 22638,
 'FTRE': 22672,
 'tensorflow': 22692,
 'Tensorflow': 22803,
 'Simpliv': 22983,
 'Howcan': 23015,
 'JoSAA': 23197,
 'NLUs': 23362,
 'MHTCET': 23456,
 'RLWL': 23509,
 'IIITH': 23541,
 'IGDTUW': 23621,
 'apist

In [50]:
high_freq = set()
for i in range(1,nb_words):
    if embedding_count[i] > 10:
        high_freq.add(i)

In [51]:
high_freq_words = {}
for k, v in word_index.items():
    if v in high_freq:
        high_freq_words[k] = v

In [52]:
len(high_freq_words)

0

In [53]:
high_freq_words

{}