In [1]:
import os
import sys
sys.path.append('..')
from embeddings import load_glove

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
data_path = os.path.join('.input')
glove_filepath = os.path.join(data_path, 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')
quora_path = os.path.join(data_path, 'train.csv')

In [3]:
MAX_FEATURES = 50_000
MAX_LEN = 50

## Data Preprocessing

### Spacy Playground

In [30]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [32]:
doc = nlp("She hasn't been there. Harry Potter walked home. She's at Hermione's house", disable=['tagger', 'parser', 'ner'])
print(' '.join([token.orth_ for token in doc]))

She has n't been there . Harry Potter walked home . She 's at Hermione 's house


### Load Data

In [44]:
quora = pd.read_csv(quora_path, usecols=['question_text'])
quora.head()

Unnamed: 0,question_text
0,How did Quebec nationalists see their province...
1,"Do you have an adopted dog, how would you enco..."
2,Why does velocity affect time? Does velocity a...
3,How did Otto von Guericke used the Magdeburg h...
4,Can I convert montra helicon D to a mountain b...


### Preprocess Text using Spacy

In [45]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

In [46]:
def clean_text(doc):
    return ' '.join([token.orth_ for token in doc])

In [47]:
%%time

text = [clean_text(doc) for doc in nlp.pipe(tqdm(quora.question_text))]
quora['text'] = text

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1306122/1306122 [09:00<00:00, 2415.69it/s]


Wall time: 9min


In [48]:
quora.head()

Unnamed: 0,question_text,text
0,How did Quebec nationalists see their province...,How did Quebec nationalists see their province...
1,"Do you have an adopted dog, how would you enco...","Do you have an adopted dog , how would you enc..."
2,Why does velocity affect time? Does velocity a...,Why does velocity affect time ? Does velocity ...
3,How did Otto von Guericke used the Magdeburg h...,How did Otto von Guericke used the Magdeburg h...
4,Can I convert montra helicon D to a mountain b...,Can I convert montra helicon D to a mountain b...


### Build Word Index using Keras Tokenizer

In [49]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = MAX_FEATURES, lower=False, filters='')

In [50]:
%%time

tokenizer.fit_on_texts(quora.text)

Wall time: 18.7 s


In [51]:
seqs = tokenizer.texts_to_sequences(quora.question_text[:1000])

In [52]:
seqs[:2]

[[10, 63, 7009, 8085, 180, 67, 6740, 39, 5, 1224, 6, 2],
 [57, 16, 28, 34, 3898, 77, 46, 16, 3663, 41, 4, 3096, 11, 47]]

In [53]:
tokenizer.texts_to_sequences(['privet kak dela'.split()])

[[]]

In [54]:
from keras.preprocessing.sequence import pad_sequences

seqs = pad_sequences(seqs, maxlen = MAX_LEN)

In [55]:
seqs[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   10,   63, 7009, 8085,  180,   67,
        6740,   39,    5, 1224,    6,    2],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,   57,   16,   28,   34, 3898,   77,   46,   16,
        3663,   41,    4, 3096,   11,   47]])

## Word Vectors Loading

In [40]:
glove = load_glove(glove_filepath)

===> running load_glove ...
<=== finished load_glove in 233.58 s.


In [38]:
glove[","]

array([-0.082752 ,  0.67204  , -0.14987  , -0.064983 ,  0.056491 ,
        0.40228  ,  0.0027747, -0.3311   , -0.30691  ,  2.0817   ,
        0.031819 ,  0.013643 ,  0.30265  ,  0.0071297, -0.5819   ,
       -0.2774   , -0.062254 ,  1.1451   , -0.24232  ,  0.1235   ,
       -0.12243  ,  0.33152  , -0.006162 , -0.30541  , -0.13057  ,
       -0.054601 ,  0.037083 , -0.070552 ,  0.5893   , -0.30385  ,
        0.2898   , -0.14653  , -0.27052  ,  0.37161  ,  0.32031  ,
       -0.29125  ,  0.0052483, -0.13212  , -0.052736 ,  0.087349 ,
       -0.26668  , -0.16897  ,  0.015162 , -0.0083746, -0.14871  ,
        0.23413  , -0.20719  , -0.091386 ,  0.40075  , -0.17223  ,
        0.18145  ,  0.37586  , -0.28682  ,  0.37289  , -0.16185  ,
        0.18008  ,  0.3032   , -0.13216  ,  0.18352  ,  0.095759 ,
        0.094916 ,  0.008289 ,  0.11761  ,  0.34046  ,  0.03677  ,
       -0.29077  ,  0.058303 , -0.027814 ,  0.082941 ,  0.1862   ,
       -0.031494 ,  0.27985  , -0.074412 , -0.13762  , -0.2186

## Emmbeddings

In [67]:
%%time

from collections import defaultdict


def get_vector(index, word, stats, unknowns):
    vector = index.get(word)
    if vector is not None:
        stats['found'] += 1
        return vector
        
    vector = index.get(word.capitalize())
    if vector is not None:
        stats['found_capitalized'] += 1
        return vector
        
        
    vector = index.get(word.upper())
    if vector is not None:
        stats['found_upper'] += 1
        return vector
    
    vector = index.get(word.lower())
    if vector is not None:
        stats['found_lower'] += 1
        return vector

    stats['not_found'] += 1
    unknowns.append(word)
    return None
    

def build_embeddings(embeddings_index, word_index, max_features):
    embed_mean, embed_std = -0.005838499,0.48782197
    embed_size = 300
    num_words = min(max_features, len(word_index))
    print(embed_size, num_words)
    embeddings = np.random.normal(embed_mean, embed_std, (num_words, embed_size))
    
    stats = defaultdict(int)
    unknowns = []
    for word, index in word_index.items():
        if index >= max_features: continue
        vector = get_vector(embeddings_index, word, stats, unknowns)
        if vector is not None: embeddings[index] = vector
    
    print('Statistics')
    stats = pd.DataFrame(data = {'Counts': stats.values()}, index=stats.keys())
    print(stats)
    
    print('==== Unknows Words ====')
    for words in zip(sorted(unknowns), sorted(unknowns, key=lambda x: x[::-1])):
        print('{:20} {:>20}'.format(*words))
    
    return embeddings

embeddings = build_embeddings(glove, tokenizer.word_index, MAX_FEATURES)

print(embeddings[:1])

300 50000
Statistics
                                     Counts
found              (49077, 819, 17, 54, 32)
not_found          (49077, 819, 17, 54, 32)
found_lower        (49077, 819, 17, 54, 32)
found_capitalized  (49077, 819, 17, 54, 32)
found_upper        (49077, 819, 17, 54, 32)
==== Unknows Words ====

                                       

"-                                     :(
,-                                     8)
-1/12                                90%+
-1/2                                 95%+
-i                                     A+
-ve                                    B+
-x                                    AB+
-x^2                                   C+
..                                     D+
.22LR                                  G+
.How                                   H+
.I                                     K+
.If                                    O+
.Is                                  H3O+
.NET                               LGBTQ+
.Net               

Byju                                   5=
C+                                     A=
C-                                     B=
CAT'17                                 x=
CIWG                                  dx=
CPTSD                                  y=
CUCET                              LBSNAA
CarPlay                             AIEEA
Cas9                                JCPOA
Caulifla                            BALLB
Chromecast                           JGEC
Ciaz                                 LHMC
Codeforces                           BIPC
Coinbase                            JECRC
Cryptocurrencies                     E&TC
D+                                   EXTC
D3300                               1300D
D3400                               UCEED
D5300                               CPTSD
D7200                               IIITD
DCEU                                 DSCE
DILR                                BMSCE
DSATM                             SRMJEEE
DSCE                              

Otsutsuki                          Azdome
P(x                            dichlorine
PESSAT                            Zenfone
PGDBF                             zenfone
PGDHRM                        WikiTribune
PGDIE                         Wikitribune
PMAY                               Mbappe
PRMO                            Trumpcare
Padmaavat                       Hashflare
Padmavat                             ftre
Parmanu                          Coinbase
Patreon                          coinbase
Petuhov                       chapterwise
Philando                       uncollapse
PhonePe                        Arrowverse
Pizzagate                       polyhouse
Plancess                        Pizzagate
Poloniex                         Delloite
Practo                          YourQuote
Puneri                                -ve
PyQt5                            OneDrive
PyTorch                              .exe
Qidian                                .If
Qoura                             

bhakts                            Aksener
binance                          Antminer
biromantic                     gaslighter
bitconnect                        Jupyter
blockchain.info                       /or
blockchains                        Trezor
book(s                              toppr
boruto                     Kattankulathur
boys’                              empowr
brexit                           friend(s
bschools                            god(s
byju                           language(s
byjus                             movie(s
c^2                                 one(s
can`t                             thing(s
can´t                              book(s
chapterwise                      reason(s
class11                          parent(s
class12                             way(s
class9                               A2As
clickbait                             .Is
cm^2                                LSTMs
cm^3                                 RNNs
coinbase                          

w/                                CarPlay
way(s                              Snapay
wish.com                           Zebpay
without​                           zebpay
wumao                           covalency
x(t                         crytocurrency
x+1                         cryptocurency
x+1/x                                .Why
x+2                                  .why
x+3                            musical.ly
x=                            WebAssembly
x=0                             Unacademy
x=2                             unacademy
x=3                         cryptocurreny
x[/math                          WannaCry
x^                                sallery
x^2                                \infty
x^2+y^2                           Nearbuy
x^3                                  Ciaz
x^4                                   |x|
x^5                                   mc²
x^n                                    m²
x^x                                  करना
y(t                               

In [51]:
def strip_non_alpha(word):
    return ''.join(c for c in word if c.isalpha())

strip_non_alpha("'real-5'")

'real'