In [1]:
import os
import sys
sys.path.append('..')
from embeddings import load_glove

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
data_path = os.path.join('.input')
glove_filepath = os.path.join(data_path, 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')
quora_path = os.path.join(data_path, 'train.csv')

In [3]:
MAX_FEATURES = 50_000
MAX_LEN = 50

## Data Preprocessing

### Spacy Playground

In [30]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [32]:
doc = nlp("She hasn't been there. Harry Potter walked home. She's at Hermione's house", disable=['tagger', 'parser', 'ner'])
print(' '.join([token.orth_ for token in doc]))

She has n't been there . Harry Potter walked home . She 's at Hermione 's house


### Load Data

In [33]:
quora = pd.read_csv(quora_path, usecols=['question_text'], nrows=100_000)
quora.head()

Unnamed: 0,question_text
0,How did Quebec nationalists see their province...
1,"Do you have an adopted dog, how would you enco..."
2,Why does velocity affect time? Does velocity a...
3,How did Otto von Guericke used the Magdeburg h...
4,Can I convert montra helicon D to a mountain b...


### Preprocess Text using Spacy

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

In [20]:
def clean_text(doc):
    return ' '.join([token.orth_ for token in doc])

In [21]:
%%time

text = [clean_text(doc) for doc in nlp.pipe(tqdm(quora.question_text))]
quora['text'] = text

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [01:03<00:00, 1563.51it/s]


Wall time: 1min 3s


In [25]:
quora.head()

Unnamed: 0,question_text,text
0,How did Quebec nationalists see their province...,How did Quebec nationalists see their province...
1,"Do you have an adopted dog, how would you enco...","Do you have an adopted dog , how would you enc..."
2,Why does velocity affect time? Does velocity a...,Why does velocity affect time ? Does velocity ...
3,How did Otto von Guericke used the Magdeburg h...,How did Otto von Guericke used the Magdeburg h...
4,Can I convert montra helicon D to a mountain b...,Can I convert montra helicon D to a mountain b...


### Build Word Index using Keras Tokenizer

In [23]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = MAX_FEATURES, lower=False, filters='')

In [26]:
%%time

tokenizer.fit_on_texts(quora.text)

Wall time: 2.96 s


In [27]:
seqs = tokenizer.texts_to_sequences(quora.question_text[:1000])

In [28]:
seqs[:2]

[[10, 61, 4596, 7058, 192, 66, 6675, 41, 4, 1107, 6, 2],
 [57, 16, 28, 34, 4197, 77, 46, 16, 3307, 39, 5, 3459, 11, 47]]

In [34]:
tokenizer.texts_to_sequences(['privet kak dela'.split()])

[[]]

In [35]:
from keras.preprocessing.sequence import pad_sequences

seqs = pad_sequences(seqs, maxlen = MAX_LEN)

In [36]:
seqs[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   10,   61, 4596, 7058,  192,   66,
        6675,   41,    4, 1107,    6,    2],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,   57,   16,   28,   34, 4197,   77,   46,   16,
        3307,   39,    5, 3459,   11,   47]])

## Word Vectors Loading

In [None]:
glove = load_glove(glove_filepath)

===> running load_glove ...


In [38]:
glove[","]

array([-0.082752 ,  0.67204  , -0.14987  , -0.064983 ,  0.056491 ,
        0.40228  ,  0.0027747, -0.3311   , -0.30691  ,  2.0817   ,
        0.031819 ,  0.013643 ,  0.30265  ,  0.0071297, -0.5819   ,
       -0.2774   , -0.062254 ,  1.1451   , -0.24232  ,  0.1235   ,
       -0.12243  ,  0.33152  , -0.006162 , -0.30541  , -0.13057  ,
       -0.054601 ,  0.037083 , -0.070552 ,  0.5893   , -0.30385  ,
        0.2898   , -0.14653  , -0.27052  ,  0.37161  ,  0.32031  ,
       -0.29125  ,  0.0052483, -0.13212  , -0.052736 ,  0.087349 ,
       -0.26668  , -0.16897  ,  0.015162 , -0.0083746, -0.14871  ,
        0.23413  , -0.20719  , -0.091386 ,  0.40075  , -0.17223  ,
        0.18145  ,  0.37586  , -0.28682  ,  0.37289  , -0.16185  ,
        0.18008  ,  0.3032   , -0.13216  ,  0.18352  ,  0.095759 ,
        0.094916 ,  0.008289 ,  0.11761  ,  0.34046  ,  0.03677  ,
       -0.29077  ,  0.058303 , -0.027814 ,  0.082941 ,  0.1862   ,
       -0.031494 ,  0.27985  , -0.074412 , -0.13762  , -0.2186

## Emmbeddings

In [39]:
%%time

from collections import defaultdict


def get_vector(index, word, stats):
    vector = index.get(word)
    if vector is not None:
        stats['found'] += 1
        return vector
        
    vector = index.get(word.capitalize())
    if vector is not None:
        stats['found_capitalized'] += 1
        return vector
        
        
    vector = index.get(word.upper())
    if vector is not None:
        stats['found_upper'] += 1
        return vector
    
    vector = index.get(word.lower())
    if vector is not None:
        stats['found_lower'] += 1
        return vector

    stats['not_found'] += 1
    print(word)
    return None
    

def build_embeddings(embeddings_index, word_index, max_features):
    embed_mean, embed_std = -0.005838499,0.48782197
    embed_size = 300
    num_words = min(max_features, len(word_index))
    print(embed_size, num_words)
    embeddings = np.random.normal(embed_mean, embed_std, (num_words, embed_size))
    
    stats = defaultdict(int)
    for word, index in word_index.items():
        if index >= max_features: continue
        vector = get_vector(embeddings_index, word, stats)
        if vector is not None: embeddings[index] = vector
    print(stats)
    return embeddings

embeddings = build_embeddings(glove, tokenizer.word_index, MAX_FEATURES)

print(embeddings[:1])

300 50000
..
Quorans
Brexit
cryptocurrencies
Redmi
/math
f(x
x^2
Machedo
DCEU
GDPR
SJWs
Qoura
Upwork
BNBR
Doklam
\frac
Boruto
Coinbase
w/
5'4
A+
OnePlus
Bhakts
LNMIIT
Qur'an
AlShamsi
D+
bhakts
demonetisation
Waymo
H+
Zerodha
brexit
.what
altcoins
LGBTQ+
Alshamsi
MSQE
Plancess
Unacademy
SRMJEEE
y=
\sqrt
I`m
Amazon.in
BIPC
B.Des
demonitisation
upwork
2x^2
IISERs
5'7
BMSCE
microservices
.net
Baahubali
eLitmus
A-
5'11
^2
^3
Kilimall
5'6
B+
Simpliv
altcoin
x^3
and/
Koinex
\|
nanodegree
,-
clickbait
B-
apist
ciswomen
Kavalireddi
Adityanath
2017/2018
mc^2
UCEED
fullform
a^2
A2A'd
Vajiram
AKTU
Swachh
Xiomi
Awdhesh
Whydo
ln(x
non-
litecoin
Google+
coinbase
PMAY
don'TS
.NET
RWBY
IIITH
y^2
Splatoon
f(1
x^x
दिल
Kovind
2n+1
UPESEAT
Binance
^1/2
\pi
math]x[/math
USICT
Feku
log(1
internshala
Padmaavat
Kubernetes
vssut
liùlâo
Filmora
adhaar
NS200
को
Anderfels
TISSNET
MUOET
easyazon
Quora.com
|x|
PragerU
Tensorflow
Bregoli
LGBT+
2^n
₹5000
5'3
Gorillavid
3x^2
INSOFE
Kainerugaba
Internshala
note4
α2-α1
a

GTPL
3G.
NIT(Cs
worldmax
Nehisi
Botlan
Talkspace
Betterhelp
diferring
passivly
Chartio
Radhswaomi
rebaring
air<50
unieuro.com
phsychic
decate
R4PG
warmane
Liebesträume
Rajasthany
transgendering
NDTL
repyament
teraformed
fivver
depawali
Akkusative
upesat
VPN.once
after​
Chennaiyin
rashtiya
1mnth
stylictic
shechudle
XTR-150
sutface
^(1/n
2017should
onRestoreInstance
~15
Zygorhiza
topic.so
FETERS
90's/00
nimity
2^x
4^x
3^x
6^x
Tsamara
arigatō
tasukarimashita
femint
sdisha
Palours
unheaven
pay3
enjeenearing
immigate
architect/
SamCoin
BobCoin
6.023×10
digutal
Ichange
UPites
Isn't
thatwhy
internationalizes
ceificate
Family’
Xcent
adark
Ryuugamine
tipsneats.com
tak’
BSc(H
supermaneuverable
kolbri
Brexiter
innerwares
/S
/P
B.tec
pagadi
Islamophilia
right"after
LEDs’
does't
Listify
cocoaine
8.36.And
rashifal
x+π÷4
-cosx
Chijin
swellness
dumbel
Metalness
PGDIM
Earthians
6S+
cuckservative
UberPOOL
Oneplus
Graylog
alaykumu
Ahok
IBFF
CH3-CH2-CH2-Ch2-COOH
a'Quoran
balanca
s10AA
GOV24/7
ILEGALY
Dard

XDVDN8190
PlayerUnknown
womenizer
100^C
1.0gcm^-3
aftrr
yesrc
18000USD
differntiate
mijbil
discreminate
hero.seriously
atlst
SamClay
forget’
sovergion
simultqneously
DS-260
untikl
Wjy
Edwisor
edwisor
homale
DSLR.Is
f/1.8
dtst
karolbagh
Leangap
Endevvr
leagueleagues
Enginx
I=1;I&lt
10;I++
panotorium
ADP6000IX
statatstic
dehliwal
brila
destinaton
soluions
weyuker
disadvantags
insemenated
bleching
-stabilizing
Whatifalthist
EmperorTigerstar
tamilzan
introdece
watssapp
Modi-
Feroglobin
warehouse.where
Ahch
gregnant
coolspot
अनुस्वार
rhizoid
JEE'17
plant(s
PBPD
Radiocarbon(C14
नाकतोडा
1-
127EQ
bit*h
Aswagandha
math]y
masqutos
nomintates
Minister/
sepreation
telanga
R22.5
v2.0
10,12th
unrulier
softsongs
webassembly
9.81m_s2
preons
S20xi
pet(s
XEIB1
electrovalent
aspien
cruxifiction
a(n
hydropenia
Centuriate
8800p
righrs
.bash_profile
obectives
pacitine
detoriate
Vaporwave
basmathi
entrance​
Eilhard
réalisable
metaphysicists
15%and
quota85
benzylalcohol
MIRIOR
Emperialism
solution(s
X^2
mattu

ghanti
x^7
14x^5
16x^3
30x-560=0
a=5
a++8a++
transaction.can
आहें
भरना
जीतना
टटोलना
लगाना
हाले
amoksha
depration
77,263
transpising
rajtarangani
boglin
illegal|
Pudface
einpacken
periotic
stroncium
galiam
electrolysing
left.how
zebpay
plurimum
religion(s
attaract
~NCL
Feitan
-1,1,-1
offersThe
Winesteen
math]\int_{-2}^{2}\int_{-\sqrt{4-x^{2}}}^{\sqrt{4-x^{2}}}\int_{0}^{\sqrt{4-(x^{2}+y^{2}})}z^2\sqrt{x^{2}+y^{2}+z^2}dzdydx[/math
Pt(NH3
kmno4
Kattankulathur
1AA06PA
80TV026WIH
Laureanna
tablet/
coupd
Vadrakar
Abstinence/12step
line(s
AB²
AB³
Tadanaga
Meritchell
Batet
BE(DTE
femenism
HowHow
3x3200
984563456785439782413
bills/
BMMM
E-7018
E-7018H4R
cos^2x=1/2
1/2cos2x
regime/
posdoctoral
admissin
slimido
msc.actuarial
hypertypical
tactic(s
Zukerburg
VueJS
65yo
Ripio
cryptyocurrency
8:00pm
9:00pm
I^n
andhbhakts
.Aim
Denacci
CH2=CH
CH2-CH
4+++
MMVY
demonatisation
Nazifi
Asnanic
Myeshia
math]\f(z)=sqrt(z)[/math
Speedglide
s'abattre
dissapont
60000rs
nodemcu
emprasario
bhagavatgita
sheeshail
Kr

:--
IITH
typeform.com
last?What
prensure
metronaunt
madheshis
math]d_k[/math
math]c_k[/math
visaHe
Zlnd
emergemcy
projrect
triphthong
tmblr.com
me“be
/lgbt
adultration
intropersonal
BBr3>BCl3>BF3
encypt
Eienstein
univarsality
advanstages
gunarathne
Walcaott
crountries
Mirty
relationshithp
Ca10(PO4
6(OH
pentahydride
Stefanianna
Verlac
meroclopromide
phytobezoar
fecolith
inducter
261312
makmu
sempakan
@Siddhartha
incissors
180/100
Airpods
802.11p
lessparanoid
predeturminations
2007–08
repirting
mom./
himanity
elctron
E3b1
0.1ns
decòr
invissible
traveks
kaytra
alopathy
diyhrcodine
sin^3
1-sin^3
Mankita
Yong'an
resonace
Giglo
Killmonger
Iseyin
pentadactyle
Acronal
vipareet
rajyog
y=10
ermenisin
dedim
polycabornate
V2.0
xln(x
onvisage
writing​
phrophet
rebbilion
math]a[/math
math]b[/math
b[/math
2ndyr
MH17
1-w+w^2
1+w
w^2
Xiami
arctanx^2
Kundalatha
Maremares
englishuncle.com
2700.50&.
4400.75
theTV
Azoreans
=-
critizices
strechmarks
awscli
172.120.0.0/16
Hyperfire
10.03.1814
1+(1
SIvagami
c

   5.31251138e-02  2.73308756e-01  6.38542732e-01 -3.50919995e-01]]
Wall time: 12.1 s


In [51]:
def strip_non_alpha(word):
    return ''.join(c for c in word if c.isalpha())

strip_non_alpha("'real-5'")

'real'