In [1]:
import gc
import re
import os
import pandas as pd
import numpy as np
import random
from sklearn import metrics
import string
import math
import operator
import time
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import gensim

In [14]:
tqdm.pandas()

In [2]:
PATH = './input/'

In [3]:
puncts = {',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√'}

def clean_text(x):
    x = str(x)
    table = str.maketrans({key: ' {punct} ' for key in puncts})
    return x.translate(table)

def clean_numbers(x):
    if bool(re.search(r'\d', x)):
        x = re.sub('[0-9]{5,}', '#####', x)
        x = re.sub('[0-9]{4}', '####', x)
        x = re.sub('[0-9]{3}', '###', x)
        x = re.sub('[0-9]{2}', '##', x)
    return x

mispell_dict = {"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying",
"colour":"color", # Added lines after this (inclusive)
"centre":"center",
"didnt":"did not",
"doesnt":"does not",
"isnt":"is not",
"shouldnt":"should not",
"favourite":"favorite",
"travelling":"traveling",
"counselling":"counseling",
"theatre":"theater",
"cancelled":"canceled",
"labour":"labor",
"organisation":"organization",
"wwii": "world war 2",
"citicise":"criticize",
"instagram": "social medium",
"whatsapp": "social medium",
"snapchat": "social medium"}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

def replace_typical_misspell(text):
    mispellings, mispellings_re = _get_mispell(mispell_dict)
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

In [4]:
train_df = pd.read_csv(PATH+'train.csv', usecols=['question_text', 'target'])
test_df = pd.read_csv(PATH+'test.csv', usecols = ['question_text'])

In [5]:
word2vec_path = PATH+'embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

In [6]:
embedding_list = [PATH+'embeddings/paragram_300_sl999/paragram_300_sl999.txt', 
PATH+'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',
PATH+'embeddings/glove.840B.300d/glove.840B.300d.txt']

In [7]:
emb_mean_dict = {'paragram_300_sl999':-0.005324783269315958,
            'wiki-news-300d-1M':-0.0033469984773546457,
            'glove.840B.300d':-0.005838498938828707}

emb_std_dict = {'paragram_300_sl999':0.4934646189212799,
            'wiki-news-300d-1M':0.10985549539327621,
            'glove.840B.300d':0.4878219664096832}

In [33]:
def check_coverage(data_vocab, embedding_word_list):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(data_vocab):
        if word in embedding_word_list:
            a[word] = 1
            k += data_vocab[word]
        else:
            oov[word] = data_vocab[word]
            i += data_vocab[word]

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(data_vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [12]:
def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [8]:
embedding_word_list = set()

In [9]:
for EMBEDDING_FILE in embedding_list:
    embedding_name = EMBEDDING_FILE.split('/')[3]
    print('Adding words from {}\n'.format(embedding_name))
    for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore'):
        word, _ = o.split(' ', 1)
        embedding_word_list.add(word)

Adding words from paragram_300_sl999

Adding words from wiki-news-300d-1M

Adding words from glove.840B.300d



In [57]:
embedding_word_list

{'02:55:06',
 '07:19:01',
 'HANLEY',
 'tilleys',
 '1.256',
 'happyy',
 'MimeMessage',
 'Olcan',
 'HOPKIN',
 'xavin',
 'Nelisse',
 'Ostersund',
 '29219',
 'May-05-09',
 'shallowness',
 '1.819',
 '25.98',
 'Mtoni',
 'dc500',
 'KellyGang',
 'Polanski',
 'results.htm',
 'Hermanni',
 'Bayston',
 'SIP/VoIP',
 'Rev-A-Shelf',
 '24/08/2012',
 'Cario',
 'arven',
 'estácio',
 'natality',
 '00607',
 '03:31:09',
 'Hoshimittsu',
 'mediasanta',
 '15-mar-2013',
 'Maxconsole',
 'PMStart',
 'BFFL',
 'self-programming',
 'agonuvid',
 'Balconies/Patio',
 'bepreve',
 'schologirl',
 'Droops',
 '17/10/2008',
 'aeromechanical',
 'Nedelcheva',
 'dukehealth.org',
 '16:23:36',
 '2002,2003',
 'C.C.H.',
 '4,947',
 'PCI-X',
 'eusb',
 'nv400',
 'Sonnier',
 'unfavourable',
 'chaseville',
 '10:42:33',
 'monhs',
 'jurmain',
 'G35s',
 'rq-4',
 '22:10:37',
 'speedweeks',
 'polyken',
 'palhaço',
 'Sentral',
 'zdnet',
 'Grauerholz',
 'tsukuyomi',
 'MVJ',
 'Victoriana',
 'Sadir',
 'dentalplans.com',
 'crÈme',
 'plichta',
 '

# No cleaning

In [35]:
train_sentences = train_df["question_text"].progress_apply(lambda x: x.split()).values
test_sentences = test_df["question_text"].progress_apply(lambda x: x.split()).values
data_vocab = build_vocab(np.concatenate((test_sentences, train_sentences)))
len(data_vocab)

100%|██████████| 1306122/1306122 [00:04<00:00, 301994.67it/s]
100%|██████████| 56370/56370 [00:00<00:00, 445662.59it/s]
100%|██████████| 1362492/1362492 [00:05<00:00, 258262.72it/s]


522569

In [36]:
oov = check_coverage(data_vocab,embedding_word_list)

100%|██████████| 522569/522569 [00:00<00:00, 940715.39it/s]


Found embeddings for 34.58% of vocab
Found embeddings for  88.28% of all text


In [37]:
oov[:10]

[('India?', 17082),
 ('it?', 13436),
 ("What's", 12985),
 ('do?', 9112),
 ('life?', 8074),
 ('you?', 6553),
 ('me?', 6485),
 ('them?', 6421),
 ('time?', 5994),
 ('world?', 5632)]

# Lower

In [38]:
train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower())
test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower())

train_sentences = train_df["question_text"].progress_apply(lambda x: x.split()).values
test_sentences = test_df["question_text"].progress_apply(lambda x: x.split()).values
data_vocab = build_vocab(np.concatenate((test_sentences, train_sentences)))
len(data_vocab)

100%|██████████| 1306122/1306122 [00:05<00:00, 241413.38it/s]
100%|██████████| 56370/56370 [00:00<00:00, 450499.44it/s]
100%|██████████| 1362492/1362492 [00:05<00:00, 259922.69it/s]


462453

In [39]:
oov = check_coverage(data_vocab,embedding_word_list)

100%|██████████| 462453/462453 [00:00<00:00, 961131.67it/s]


Found embeddings for 31.88% of vocab
Found embeddings for  88.29% of all text


In [40]:
oov[:10]

[('india?', 17092),
 ("what's", 13977),
 ('it?', 13702),
 ('do?', 9125),
 ('life?', 8114),
 ('why?', 7674),
 ('you?', 6572),
 ('me?', 6525),
 ('them?', 6423),
 ('time?', 6021)]

# Clean punct

In [41]:
train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))

train_sentences = train_df["question_text"].progress_apply(lambda x: x.split()).values
test_sentences = test_df["question_text"].progress_apply(lambda x: x.split()).values
data_vocab = build_vocab(np.concatenate((test_sentences, train_sentences)))
len(data_vocab)

100%|██████████| 1306122/1306122 [00:06<00:00, 210630.66it/s]
100%|██████████| 56370/56370 [00:00<00:00, 431326.50it/s]
100%|██████████| 1362492/1362492 [00:05<00:00, 264052.24it/s]


200396

In [42]:
oov = check_coverage(data_vocab,embedding_word_list)

100%|██████████| 200396/200396 [00:00<00:00, 925217.04it/s]

Found embeddings for 74.45% of vocab
Found embeddings for  87.68% of all text





In [44]:
oov[:10]

51211

# Lower + punct

In [45]:
train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower())
test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower())

train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))

train_sentences = train_df["question_text"].progress_apply(lambda x: x.split()).values
test_sentences = test_df["question_text"].progress_apply(lambda x: x.split()).values
data_vocab = build_vocab(np.concatenate((test_sentences, train_sentences)))
len(data_vocab)

100%|██████████| 1306122/1306122 [00:05<00:00, 253850.76it/s]
100%|██████████| 56370/56370 [00:00<00:00, 376318.35it/s]
100%|██████████| 1362492/1362492 [00:06<00:00, 216370.83it/s]


200397

In [46]:
oov = check_coverage(data_vocab,embedding_word_list)

100%|██████████| 200397/200397 [00:00<00:00, 905695.24it/s]


Found embeddings for 74.45% of vocab
Found embeddings for  80.39% of all text


In [47]:
oov[:10]

[('{punct}', 4831944),
 ('quorans', 885),
 ('redmi', 398),
 ('coinbase', 150),
 ('oneplus', 144),
 ('uceed', 126),
 ('bhakts', 118),
 ('upwork', 117),
 ('machedo', 112),
 ('gdpr', 110)]

# lower, punct, num

In [48]:
train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower())
test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower())

train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))

train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_numbers(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_numbers(x))

train_sentences = train_df["question_text"].progress_apply(lambda x: x.split()).values
test_sentences = test_df["question_text"].progress_apply(lambda x: x.split()).values
data_vocab = build_vocab(np.concatenate((test_sentences, train_sentences)))
len(data_vocab)

100%|██████████| 1306122/1306122 [00:08<00:00, 154528.85it/s]
100%|██████████| 56370/56370 [00:00<00:00, 293366.20it/s]
100%|██████████| 1362492/1362492 [00:07<00:00, 170613.16it/s]


192114

In [49]:
oov = check_coverage(data_vocab,embedding_word_list)

100%|██████████| 192114/192114 [00:00<00:00, 804125.61it/s]

Found embeddings for 73.91% of vocab
Found embeddings for  71.87% of all text





In [50]:
oov[:10]

[('{punct}', 9663888),
 ('##th', 6188),
 ('##k', 1336),
 ('##s', 1187),
 ('####s', 1121),
 ('quorans', 885),
 ('###k', 537),
 ('##st', 430),
 ('redmi', 398),
 ('##m', 259)]

# Lower, punct, num, misspelling

In [54]:
train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower())
test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower())

train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))

train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_numbers(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_numbers(x))

train_df["question_text"] = train_df["question_text"].apply(lambda x: replace_typical_misspell(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: replace_typical_misspell(x))

train_sentences = train_df["question_text"].progress_apply(lambda x: x.split()).values
test_sentences = test_df["question_text"].progress_apply(lambda x: x.split()).values
data_vocab = build_vocab(np.concatenate((test_sentences, train_sentences)))
len(data_vocab)

100%|██████████| 1306122/1306122 [00:15<00:00, 85337.24it/s] 
100%|██████████| 56370/56370 [00:00<00:00, 151752.66it/s]
100%|██████████| 1362492/1362492 [00:18<00:00, 75467.28it/s]


189726

In [58]:
data_vocab

{'my': 117716,
 'voice': 1603,
 'range': 1220,
 'is': 462412,
 'a2': 95,
 '{punct}': 39427450,
 'punct': 36625529,
 'c5': 13,
 'chest': 685,
 'goes': 1381,
 'up': 20130,
 'to': 425532,
 'f4': 16,
 'included': 588,
 'sample': 480,
 'in': 394408,
 'higher': 2257,
 'what': 491736,
 'type': 5292,
 'how': 302822,
 'much': 24490,
 'does': 90391,
 'a': 429577,
 'tutor': 155,
 'earn': 3137,
 'bangalore': 3028,
 'are': 253577,
 'the': 694607,
 'best': 65202,
 'made': 9386,
 'pocket': 267,
 'knives': 102,
 'under': 6490,
 'why': 171117,
 'would': 64130,
 'they': 46242,
 'add': 2475,
 'hypothetical': 235,
 'scenario': 727,
 'that': 112556,
 's': 90840,
 'impossible': 596,
 'happen': 8828,
 'link': 1641,
 'below': 1034,
 'it': 154035,
 'shows': 1711,
 'meters': 179,
 'rise': 1033,
 'sea': 1023,
 'level': 4153,
 'look': 8077,
 'like': 51549,
 'dresscode': 5,
 'for': 213328,
 'techmahindra': 12,
 'freshers': 391,
 'well': 5540,
 'you': 210975,
 'adapting': 39,
 'trump': 15382,
 'era': 746,
 'should'

In [55]:
oov = check_coverage(data_vocab,embedding_word_list)

100%|██████████| 189726/189726 [00:00<00:00, 890805.93it/s]

Found embeddings for 74.91% of vocab
Found embeddings for  57.85% of all text





In [56]:
oov[:10]

[('{punct}', 39427450),
 ('quorans', 885),
 ('redmi', 399),
 ('coinbase', 150),
 ('oneplus', 144),
 ('uceed', 127),
 ('bhakts', 118),
 ('upwork', 117),
 ('machedo', 112),
 ('gdpr', 110)]