In [86]:
import os
import sys
sys.path.append('..')
from embeddings import load_glove

import numpy as np
import pandas as pd
from tqdm import tqdm

In [4]:
data_path = os.path.join('.input')
glove_filepath = os.path.join(data_path, 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')
quora_path = os.path.join(data_path, 'train.csv')

In [29]:
glove = load_glove(glove_filepath)

===> running load_glove ...
<=== finished load_glove in 228.53 s.


In [25]:
MAX_FEATURES = 50_000
MAX_LEN = 50

In [111]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = MAX_FEATURES, lower=False, split='#')

In [85]:
quora = pd.read_csv(quora_path, usecols=['question_text'], nrows=100_000)
quora.head()

Unnamed: 0,question_text
0,How did Quebec nationalists see their province...
1,"Do you have an adopted dog, how would you enco..."
2,Why does velocity affect time? Does velocity a...
3,How did Otto von Guericke used the Magdeburg h...
4,Can I convert montra helicon D to a mountain b...


In [81]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

In [105]:
def clean_text(doc):
    return '#'.join([token.orth_ for token in doc])

In [106]:
%%time

text = [clean_text(doc) for doc in nlp.pipe(tqdm(quora.question_text))]
quora['text'] = text

100%|████████████████████████████████████████████████████████████████████████| 100000/100000 [00:55<00:00, 1791.29it/s]


Wall time: 55.9 s


In [110]:
quora.head()

Unnamed: 0,question_text,text
0,How did Quebec nationalists see their province...,How#did#Quebec#nationalists#see#their#province...
1,"Do you have an adopted dog, how would you enco...","Do#you#have#an#adopted#dog#,#how#would#you#enc..."
2,Why does velocity affect time? Does velocity a...,Why#does#velocity#affect#time#?#Does#velocity#...
3,How did Otto von Guericke used the Magdeburg h...,How#did#Otto#von#Guericke#used#the#Magdeburg#h...
4,Can I convert montra helicon D to a mountain b...,Can#I#convert#montra#helicon#D#to#a#mountain#b...


In [112]:
%%time

tokenizer.fit_on_texts(quora.text)

Wall time: 2.37 s


In [113]:
seqs = tokenizer.texts_to_sequences(quora.question_text[:1000])

In [114]:
seqs[:2]

[[], []]

In [98]:
tokenizer.texts_to_sequences(['privet kak dela'.split()])

[[45990]]

In [99]:
from keras.preprocessing.sequence import pad_sequences

seqs = pad_sequences(seqs, maxlen = MAX_LEN)

In [100]:
seqs[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    9,   48, 6234, 7086,  161,   55, 6134,
          36,    4, 1186,    6,    1, 8138],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,   11,   14,   24,   29, 3917,  497,    9,   35,   14, 3624,
          37,    5, 3137,   10,   43, 1795]])

In [101]:
%%time

from collections import defaultdict


def strip_non_alpha(word):
    return ''.join(c for c in word if c.isalpha())

def get_vector(index, word, stats):
    vector = index.get(word)
    if vector is not None:
        stats['found'] += 1
        return vector
        
    vector = index.get(word.capitalize())
    if vector is not None:
        stats['found_capitalized'] += 1
        return vector
        
        
    vector = index.get(word.upper())
    if vector is not None:
        stats['found_upper'] += 1
        return vector
    
    vector = index.get(word.lower())
    if vector is not None:
        stats['found_lower'] += 1
        return vector

    stats['not_found'] += 1
    print(word)
    return None
    

def build_embeddings(embeddings_index, word_index, max_features):
    embed_mean, embed_std = -0.005838499,0.48782197
    embed_size = 300
    num_words = min(max_features, len(word_index))
    print(embed_size, num_words)
    embeddings = np.random.normal(embed_mean, embed_std, (num_words, embed_size))
    
    stats = defaultdict(int)
    for word, index in word_index.items():
        if index >= max_features: continue
        vector = get_vector(embeddings_index, word, stats)
        if vector is not None: embeddings[index] = vector
    print(stats)
    return embeddings

embeddings = build_embeddings(glove, tokenizer.word_index, MAX_FEATURES)

print(embeddings[:1])

300 50000
what's
isn't
i’m
you've
don’t
aren't
what’s
won't
trump's
they're
haven't
shouldn't
he's
it’s
can’t
wouldn't
quorans
who's
today's
doesn’t
someone's
there's
wasn't
people's
one's
hasn't
couldn't
india's
she's
brexit
i’ve
women's
isn’t
cryptocurrencies
master's
didn’t
world's
you’ve
person's
we're
redmi
china's
earth's
country's
you’re
men's
how's
bachelor's
america's
weren't
aren’t
man's
won’t
let's
god's
he’s
'the
friend's
quora's
obama's
woman's
they've
they’re
trump’s
company's
father's
children's
else's
mother's
you'd
child's
that’s
girl's
haven’t
modi's
mcdonald's
we've
dog's
wouldn’t
coinbase
shouldn’t
iitians
google's
year's
“the
she’s
other's
wife's
hadn't
oneplus
there’s
where's
driver's
russia's
bhakts
newton's
upwork
korea's
husband's
asperger's
hitler's
uceed
gdpr
israel's
who’s
demonetisation
wasn’t
boyfriend's
hasn’t
government's
pakistan's
bnbr
someone’s
canada's
girlfriend's
everyone's
boruto
'i
people’s
dceu
son's
90's
adityanath
machedo
japan's
apple's
alsha

£200
berkeley's
spotify's
autoencoder
goku's
mccain's
sweden's
humanity's
worker's
pilot's
iert
6'5
government’s
you’
kohl's
candidate's
nearbuy
niftem
star's
actor's
phonepe
macy's
20’s
black's
london's
nietzsche's
empire's
dogecoin
philippines'
dumbledore's
imessages
angular2
diana's
“do
mit's
musigma
lawrence's
aunt's
£100
language's
murphy's
minance
fitjee
customers'
'black
whatapp
patrick's
'quora'
₹5000
'this
note4
α1
ge14
“just
snape's
“good”
canada’s
left's
bible's
dragon's
employer's
prmo
governor's
'real'
indians'
chicago's
iqoption
cnn's
1900's
male's
i'
jobs'
germany’s
girlfriend’s
cringiest
kakashi's
where’s
lawyer's
puppy's
'we
2π
win10
atheist's
agent's
startup's
ford's
rakshaks
animoji
whydoes
universe's
cosecx
jesus’
curry's
bieber's
dont's
he’d
“to
hololens
singer's
incel
√4
pytorch
carter's
testbook
strowman
o'
'go
israel’s
alphago
'hindu
britain’s
etoos
bohr's
kavalireddi
'free'
ibm's
1700's
professor's
baby’s
uk’s
sanghis
they’d
dilr
dynamodb
buffett's
parents’
tru

harry’s
becl2
tutorialspoint
austin's
iitgn
fasttext
mirakee
₹2
jn0
'why'
dujat
hogan's
presley's
bhagat's
him…
nanodegrees
obergefell
mccurdy's
gravational
kilgrave
'black'
what'd
sendgrid
cloudformation
yygs
'everything
avicii's
why'd
calltend
cos3x
kissanime
“more
ψ
licl
fiance's
industry's
918kiss
nobody's
carey's
sjmsom
4xy
esdeath
murakami's
person”
kekistan
bates'
piaget's
fmge
affleck's
puneri
heart's
dick's
doctor’s
chinese's
x'
pune's
y53
attorney's
izuku
yourself'
fish's
office's
atheists'
russia’s
pixar's
obcs
bull's
yale's
visitor's
simpson's
locopilot
wales'
hulk's
50’s
un’s
1crore
grant's
'time'
logan's
bdes
“happy
akbar's
cleartax
intj's
angellist
in”
i's
jones's
writer'
trek's
foodpanda
kejariwal
earth'
bacl2
4'8
zncl2
rabbit's
george's
lion's
rice's
zzzquil
carnegie's
say's
antardasha
corporation’s
cryptocoin
psit
3's
work'
walton's
daenerys'
sasuke's
dubsmash
idea's
is'
3'
nazi’s
talk”
'l'
“a”
cinderella's
₹1000
momoshiki
airbnb's
‘s
atom's
prince's
duterte's
“racist

   2.54795768e-02  5.12552267e-02  2.08517170e-01 -8.38197044e-02]]
Wall time: 7.24 s


In [51]:
def strip_non_alpha(word):
    return ''.join(c for c in word if c.isalpha())

strip_non_alpha("'real-5'")

'real'

In [76]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])



In [103]:
doc = nlp("She hasn't been there. Harry Potter walked home. She's at india's home")
print(' '.join([token.orth_ for token in doc]))

She has n't been there . Harry Potter walked home . She 's at india 's home


In [80]:
glove[","]

array([-0.082752 ,  0.67204  , -0.14987  , -0.064983 ,  0.056491 ,
        0.40228  ,  0.0027747, -0.3311   , -0.30691  ,  2.0817   ,
        0.031819 ,  0.013643 ,  0.30265  ,  0.0071297, -0.5819   ,
       -0.2774   , -0.062254 ,  1.1451   , -0.24232  ,  0.1235   ,
       -0.12243  ,  0.33152  , -0.006162 , -0.30541  , -0.13057  ,
       -0.054601 ,  0.037083 , -0.070552 ,  0.5893   , -0.30385  ,
        0.2898   , -0.14653  , -0.27052  ,  0.37161  ,  0.32031  ,
       -0.29125  ,  0.0052483, -0.13212  , -0.052736 ,  0.087349 ,
       -0.26668  , -0.16897  ,  0.015162 , -0.0083746, -0.14871  ,
        0.23413  , -0.20719  , -0.091386 ,  0.40075  , -0.17223  ,
        0.18145  ,  0.37586  , -0.28682  ,  0.37289  , -0.16185  ,
        0.18008  ,  0.3032   , -0.13216  ,  0.18352  ,  0.095759 ,
        0.094916 ,  0.008289 ,  0.11761  ,  0.34046  ,  0.03677  ,
       -0.29077  ,  0.058303 , -0.027814 ,  0.082941 ,  0.1862   ,
       -0.031494 ,  0.27985  , -0.074412 , -0.13762  , -0.2186