In [1]:
import os
import sys
sys.path.append('..')
from embeddings import load_glove

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
data_path = os.path.join('.input')
glove_filepath = os.path.join(data_path, 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')
quora_path = os.path.join(data_path, 'train.csv')

In [3]:
MAX_FEATURES = 50_000
MAX_LEN = 50

## Data Preprocessing

### NLTK Playground

In [4]:
from nltk.tokenize import word_tokenize

doc = word_tokenize("She hasn't been there. Harry Potter walked home. She's at Hermione's house")
print(' '.join([token for token in doc]))

She has n't been there . Harry Potter walked home . She 's at Hermione 's house


### Load Data

In [5]:
quora = pd.read_csv(quora_path, usecols=['question_text'])
quora.head()

Unnamed: 0,question_text
0,How did Quebec nationalists see their province...
1,"Do you have an adopted dog, how would you enco..."
2,Why does velocity affect time? Does velocity a...
3,How did Otto von Guericke used the Magdeburg h...
4,Can I convert montra helicon D to a mountain b...


### Preprocess Text using NLTK

In [6]:
from nltk.tokenize import word_tokenize

def clean_data(df):
    df['text'] = df.question_text.apply(lambda x: ' '.join(word_tokenize(x)))

In [7]:
%%time

clean_data(quora)

Wall time: 3min 48s


In [8]:
quora.head()

Unnamed: 0,question_text,text
0,How did Quebec nationalists see their province...,How did Quebec nationalists see their province...
1,"Do you have an adopted dog, how would you enco...","Do you have an adopted dog , how would you enc..."
2,Why does velocity affect time? Does velocity a...,Why does velocity affect time ? Does velocity ...
3,How did Otto von Guericke used the Magdeburg h...,How did Otto von Guericke used the Magdeburg h...
4,Can I convert montra helicon D to a mountain b...,Can I convert montra helicon D to a mountain b...


### Build Word Index using Keras Tokenizer

In [9]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = MAX_FEATURES, lower=False, filters='')

Using TensorFlow backend.


In [10]:
%%time

tokenizer.fit_on_texts(quora.text)

Wall time: 21.9 s


In [11]:
seqs = tokenizer.texts_to_sequences(quora.question_text[:1000])

In [12]:
seqs[:2]

[[10, 62, 6988, 8328, 179, 67, 6705, 39, 5, 1231, 6, 2],
 [55, 16, 28, 33, 3873, 76, 44, 16, 3616, 40, 4, 3080, 11, 45]]

In [13]:
tokenizer.texts_to_sequences(['privet kak dela'.split()])

[[]]

In [14]:
from keras.preprocessing.sequence import pad_sequences

seqs = pad_sequences(seqs, maxlen = MAX_LEN)

In [15]:
seqs[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   10,   62, 6988, 8328,  179,   67,
        6705,   39,    5, 1231,    6,    2],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,   55,   16,   28,   33, 3873,   76,   44,   16,
        3616,   40,    4, 3080,   11,   45]])

## Word Vectors Loading

In [16]:
glove = load_glove(glove_filepath)

===> running load_glove ...
<=== finished load_glove in 227.84 s.


In [17]:
glove[","]

array([-0.082752 ,  0.67204  , -0.14987  , -0.064983 ,  0.056491 ,
        0.40228  ,  0.0027747, -0.3311   , -0.30691  ,  2.0817   ,
        0.031819 ,  0.013643 ,  0.30265  ,  0.0071297, -0.5819   ,
       -0.2774   , -0.062254 ,  1.1451   , -0.24232  ,  0.1235   ,
       -0.12243  ,  0.33152  , -0.006162 , -0.30541  , -0.13057  ,
       -0.054601 ,  0.037083 , -0.070552 ,  0.5893   , -0.30385  ,
        0.2898   , -0.14653  , -0.27052  ,  0.37161  ,  0.32031  ,
       -0.29125  ,  0.0052483, -0.13212  , -0.052736 ,  0.087349 ,
       -0.26668  , -0.16897  ,  0.015162 , -0.0083746, -0.14871  ,
        0.23413  , -0.20719  , -0.091386 ,  0.40075  , -0.17223  ,
        0.18145  ,  0.37586  , -0.28682  ,  0.37289  , -0.16185  ,
        0.18008  ,  0.3032   , -0.13216  ,  0.18352  ,  0.095759 ,
        0.094916 ,  0.008289 ,  0.11761  ,  0.34046  ,  0.03677  ,
       -0.29077  ,  0.058303 , -0.027814 ,  0.082941 ,  0.1862   ,
       -0.031494 ,  0.27985  , -0.074412 , -0.13762  , -0.2186

## Emmbeddings

In [28]:
%%time

from collections import defaultdict


def get_vector(index, word, stats, unknowns):
    vector = index.get(word)
    if vector is not None:
        stats['found'] += 1
        return vector
        
    vector = index.get(word.capitalize())
    if vector is not None:
        stats['found_capitalized'] += 1
        return vector
        
        
    vector = index.get(word.upper())
    if vector is not None:
        stats['found_upper'] += 1
        return vector
    
    vector = index.get(word.lower())
    if vector is not None:
        stats['found_lower'] += 1
        return vector
    
    if word.startswith("'") and len(word) > 1:
        return get_vector(index, word[1:], stats, unknowns)
    
    if word.startswith(".") and len(word) > 1:
        return get_vector(index, word[1:], stats, unknowns)

    stats['not_found'] += 1
    unknowns.append(word)
    return None
    

def build_embeddings(embeddings_index, word_index, max_features):
    embed_mean, embed_std = -0.005838499,0.48782197
    embed_size = 300
    num_words = min(max_features, len(word_index))
    print(embed_size, num_words)
    embeddings = np.random.normal(embed_mean, embed_std, (num_words, embed_size))
    
    stats = defaultdict(int)
    unknowns = []
    for word, index in word_index.items():
        if index >= max_features: continue
        vector = get_vector(embeddings_index, word, stats, unknowns)
        if vector is not None: embeddings[index] = vector
    
    print('Statistics')
    stats = pd.DataFrame(data = {'Counts': list(stats.values())}, index=stats.keys())
    print(stats)
    
    print('==== Unknows Words ====')
    for words in zip(sorted(unknowns), sorted(unknowns, key=lambda x: x[::-1])):
        print('{:20} {:>20}'.format(*words))
    
    return embeddings

embeddings = build_embeddings(glove, tokenizer.word_index, MAX_FEATURES)

print(embeddings[:1])

300 50000
Statistics
                   Counts
found               49004
not_found             900
found_lower            16
found_capitalized      48
found_upper            31
==== Unknows Words ====
*p                                     A*
++                                     ++
+/-                                 C/C++
+1/                                 c/c++
+\frac                              1000+
+\sqrt                              2000+
+f                                   100+
+ve                                  200+
+x                                   300+
+…                                   400+
-1/12                                500+
-1/2                                1500+
-\frac                               600+
-a                                    10+
-i                                    20+
-ve                                  320+
-x                                    30+
-x^2                                  40+
//www.youtube.com/watch                 140

BASLP                                  =3
BIPC                                  x=3
BJP/RSS                                ^3
BMSCE                                 a^3
BMSIT                                 b^3
BNBR                                  x^3
BREXIT                               GE14
BS-MS                                  =4
BVCOE                                  ^4
BYJU                                  x^4
Baahubali                           note4
Bajjika                             kmno4
Bartetzko                             x+5
Beerus                               FZ25
Bhakts                               fz25
Binance                             2+2=5
BitConnect                             ^5
Bitconnect                          PyQt5
Bitfinex                          ESP8266
Bittrex                              XeF6
Bolsonaro                            24*7
Boruto                             CAT'17
Bregoli                          neet2017
Brexit                            

MH17                                   b^
MH370                                  e^
MHCET                                  n^
MHTCET                                 x^
ML/AI                                 1x^
MMMUT                                 2x^
MPSTME                                3x^
MSQE                                   a_
MU-OET                               log_
MUOET                               \lim_
MVIT                                \sum_
Machedo                \displaystyle\sum_
Mangalapuram                 \sum\limits_
Mangalyaan                   \int\limits_
Mbappe                              \int_
Microservices          \displaystyle\int_
Midoriya                               x_
Minance                                -a
MobiKwik                               =a
NC-OBC                             lbsnaa
NCERTs                        Kainerugaba
NEET-PG                           Zerodha
NEET-UG                           zerodha
NEET/AIIMS                        

\\                                  ipill
\alpha                            Randyll
\begin                               0.1m
\beta                                I'am
\big                               Doklam
\binom                             doklam
\boxed                           Gurugram
\cdot                             Vajiram
\cos                         Mangalapuram
\dfrac                          Trump-Kim
\displaystyle                        \lim
\displaystyle\int                   3.5mm
\displaystyle\int_              Quora.com
\displaystyle\sum_               wish.com
\div                               \binom
\end                              B.pharm
\frac                             b.pharm
\gamma                            \mathrm
\geq                             fullform
\in                              Trumpism
\infty                        Electroneum
\int                          electroneum
\int\limits_                         \sum
\int_                             

millionaire/billionaire           Bitconnect
mmmut                                lnct
muoet                               mhcet
musical.ly                         mhtcet
musigma                             muoet
n+1                              alt-left
n+2                                 \left
n=0                             Uber/Lyft
n=1                             Alt-Right
n\to\infty                      alt-right
n^                                 \right
n^2                                 ARKit
nanodegree                      clickbait
narcisists                        Steemit
narcissit                          Kalpit
ncerts                          narcissit
neet2017                           Brexit
no.1                          post-Brexit
non-                               brexit
note4                              did'nt
o+                          electrovalent
oswaal                               \int
phycopath               \displaystyle\int
polyhouse                      

  -0.0735891   0.19720939 -0.62764985  0.28531321  0.03653605 -0.26083545]]
Wall time: 2.38 s


In [19]:
def strip_non_alpha(word):
    return ''.join(c for c in word if c.isalpha())

strip_non_alpha("'real-5'")

'real'