In [1]:
import torchtext.vocab

# Load the GloVe embeddings named '6B' with 100-dimensional vectors.
glove = torchtext.vocab.GloVe(name='6B', dim=100)

# `itos` is the index-to-string list, so its length is the vocabulary size.
print(f'There are {len(glove.itos)} words in the vocabulary')

There are 400000 words in the vocabulary


In [2]:
# Embedding matrix: one row per vocabulary word, one column per dimension.
glove.vectors.shape

torch.Size([400000, 100])

In [3]:
# `itos` maps an integer index to its token; peek at the first ten entries.
glove.itos[:10]

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s"]

In [4]:
# `stoi` is the reverse mapping: token -> integer index.
glove.stoi['the']

0

In [5]:
# Index the embedding matrix with a token's index to get that token's vector.
glove.vectors[glove.stoi['the']].shape

torch.Size([100])

Looking up a word that is not in the vocabulary raises an error.

Get a vector from a word:

In [6]:
def get_vector(embeddings, word):
    """Look up the embedding vector for ``word``.

    Args:
        embeddings: object exposing ``stoi`` (word -> index mapping) and
            ``vectors`` (indexable by that index).
        word: the token to look up.

    Returns:
        ``embeddings.vectors[embeddings.stoi[word]]``.

    Raises:
        AssertionError: if ``word`` is not a key of ``embeddings.stoi``.
    """
    vocab = embeddings.stoi
    assert word in vocab, f'{word} not in vocab!'
    index = vocab[word]
    return embeddings.vectors[index]

In [7]:
# Sanity check: the helper returns a single embedding vector for the word.
get_vector(glove, 'the').shape

torch.Size([100])

In [8]:
import torch

def closest_words(embeddings, vector, n=10):
    """Return the ``n`` vocabulary words whose vectors are nearest to ``vector``.

    Distances are Euclidean (L2), the same metric as ``torch.dist`` with its
    default ``p=2``. The embedding matrix is compared in batched tensor
    operations instead of one Python-level ``torch.dist`` call per vocabulary
    word, and only the ``n`` smallest distances are selected rather than
    sorting the whole vocabulary.

    Args:
        embeddings: object exposing ``vectors`` (2-D tensor, one row per word)
            and ``itos`` (sequence mapping row index -> word). Assumes row
            ``i`` of ``vectors`` is the vector of ``itos[i]``.
        vector: 1-D query tensor with the same number of elements as one row
            of ``embeddings.vectors``.
        n: maximum number of neighbours to return. Values larger than the
            vocabulary are clamped; ``n <= 0`` yields an empty list.

    Returns:
        A list of ``(word, distance)`` tuples (``str``, ``float``) ordered by
        ascending distance.
    """
    vectors = embeddings.vectors
    k = min(n, vectors.shape[0])
    if k <= 0:
        return []

    # Process the matrix in row chunks so the temporary ``chunk - vector``
    # difference stays small even for multi-million-word vocabularies.
    chunk_rows = 65536
    distances = torch.cat([
        (chunk - vector).pow(2).sum(dim=1).sqrt()
        for chunk in torch.split(vectors, chunk_rows)
    ])

    # largest=False selects the k smallest distances; sorted=True returns
    # them in ascending order.
    values, indices = torch.topk(distances, k, largest=False, sorted=True)
    return [(embeddings.itos[i], d) for i, d in zip(indices.tolist(), values.tolist())]

In [9]:
# Nearest neighbours of 'japan'; the word itself comes first at distance 0.
closest_words(glove, get_vector(glove, 'japan'))

[('japan', 0.0),
 ('japanese', 4.091249465942383),
 ('korea', 4.551243782043457),
 ('tokyo', 4.565995216369629),
 ('china', 4.857661247253418),
 ('thailand', 5.292530536651611),
 ('indonesia', 5.313706874847412),
 ('philippines', 5.3697509765625),
 ('asia', 5.389328479766846),
 ('vietnam', 5.42373514175415)]

In [10]:
def print_tuples(tuples):
    """Print each ``(word, distance)`` pair on its own line.

    Each line has the form ``(<distance>) <word>``, with the distance shown
    to four decimal places.

    Args:
        tuples: iterable of two-item ``(word, distance)`` pairs.
    """
    for pair in tuples:
        w, d = pair
        print(f'({d:02.04f}) {w}')

In [11]:
# Same 'japan' query, printed one neighbour per line.
print_tuples(closest_words(glove, get_vector(glove, 'japan')))

(0.0000) japan
(4.0912) japanese
(4.5512) korea
(4.5660) tokyo
(4.8577) china
(5.2925) thailand
(5.3137) indonesia
(5.3698) philippines
(5.3893) asia
(5.4237) vietnam


In [12]:
def analogy(embeddings, word1, word2, word3, n=5):
    """Answer "word1 is to word2 as word3 is to ...?" with vector arithmetic.

    Builds the query vector ``word2 - word1 + word3``, looks up its nearest
    vocabulary words via ``closest_words`` and drops the three input words
    from the candidates.

    Prints a header line describing the analogy and returns at most ``n``
    ``(word, distance)`` tuples in the order ``closest_words`` produced them.
    """
    # Look the words up in the order word2, word1, word3 and combine them as
    # (word2 - word1) + word3.
    query = get_vector(embeddings, word2)
    query = query - get_vector(embeddings, word1)
    query = query + get_vector(embeddings, word3)

    # Ask for n + 3 neighbours so that n can remain even if all three input
    # words appear among them.
    neighbours = closest_words(embeddings, query, n + 3)

    inputs = [word1, word2, word3]
    kept = []
    for candidate in neighbours:
        if candidate[0] not in inputs:
            kept.append(candidate)
    kept = kept[:n]

    print(f'\n{word1} is to {word2} as {word3} is to...')

    return kept

In [13]:
# king - man + woman; the three input words are filtered out of the results.
print_tuples(analogy(glove, 'man', 'king', 'woman'))


man is to king as woman is to...
(4.0811) queen
(4.6429) monarch
(4.9055) throne
(4.9216) elizabeth
(4.9811) prince


You can think of vector('king') - vector('man') as a "royalty vector"; thus, when you add this "royalty vector" to "woman", you get "queen". If you add it to "boy" you should get "prince", and if you add it to "girl" you should get "princess". Let's test:

In [14]:
# Apply the same (king - man) offset to 'boy' and to 'girl'.
print_tuples(analogy(glove, 'man', 'king', 'boy'))
print_tuples(analogy(glove, 'man', 'king', 'girl'))


man is to king as boy is to...
(5.3084) queen
(5.4616) prince
(5.5430) uncle
(5.6069) brother
(5.6418) son

man is to king as girl is to...
(4.6916) queen
(5.3437) princess
(5.4683) prince
(5.5591) daughter
(5.5735) sister


In [15]:
# (word1, word2, word3) triples, each read as "word1 is to word2 as word3 is to ...".
analogy_triples = [
    ('man', 'actor', 'woman'),
    ('cat', 'kitten', 'dog'),
    ('dog', 'puppy', 'cat'),
    ('russia', 'moscow', 'france'),
    ('obama', 'president', 'trump'),
    ('rich', 'mansion', 'poor'),
    ('elvis', 'rock', 'eminem'),
    ('paper', 'newspaper', 'screen'),
    ('monet', 'paint', 'michelangelo'),
    ('beer', 'barley', 'wine'),
    ('earth', 'moon', 'sun'),
    ('house', 'roof', 'castle'),
    ('building', 'architect', 'software'),
    ('boston', 'bruins', 'phoenix'),
    ('good', 'heaven', 'bad'),
    ('jordan', 'basketball', 'woods'),
]

# Run every analogy in order and print its candidates.
for first, second, third in analogy_triples:
    print_tuples(analogy(glove, first, second, third))


man is to actor as woman is to...
(2.8133) actress
(5.0039) comedian
(5.1399) actresses
(5.2773) starred
(5.3085) screenwriter

cat is to kitten as dog is to...
(3.8146) puppy
(4.2944) rottweiler
(4.5888) puppies
(4.6086) pooch
(4.6520) pug

dog is to puppy as cat is to...
(3.8146) kitten
(4.0255) puppies
(4.1575) kittens
(4.1882) pterodactyl
(4.1945) scaredy

russia is to moscow as france is to...
(3.2697) paris
(4.6857) french
(4.7085) lyon
(4.9087) strasbourg
(5.0362) marseille

obama is to president as trump is to...
(6.4302) executive
(6.5149) founder
(6.6997) ceo
(6.7524) hilton
(6.7729) walt

rich is to mansion as poor is to...
(5.8262) residence
(5.9444) riverside
(6.0283) hillside
(6.0328) abandoned
(6.0681) bungalow

elvis is to rock as eminem is to...
(5.6597) rap
(6.2057) rappers
(6.2161) rapper
(6.2444) punk
(6.2690) hop

paper is to newspaper as screen is to...
(4.7810) tv
(5.1049) television
(5.3818) cinema
(5.5524) feature
(5.5646) shows

monet is to paint as michelang

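The following cells explore using word vectors to fix common spelling errors; see this fast.ai forum thread: http://forums.fast.ai/t/nlp-any-libraries-dictionaries-out-there-for-fixing-common-spelling-errors/16411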

In [16]:
# NOTE: this rebinds `glove` to the 300-dimensional '840B' vectors, replacing the
# 100-dimensional '6B' embeddings loaded at the top; cells run after this point
# see the new embeddings.
glove = torchtext.vocab.GloVe(name='840B', dim=300)

In [17]:
# Nearest neighbours of a misspelled word ('relieable').
print_tuples(closest_words(glove, get_vector(glove, 'relieable')))

(0.0000) relieable
(5.0366) relyable
(5.2610) realible
(5.4719) realiable
(5.5402) relable
(5.5917) relaible
(5.6412) reliabe
(5.8802) relaiable
(5.9593) stabel
(5.9981) consitant


In [18]:
# Vector of the correctly spelled word.
reliable_vector = get_vector(glove, 'reliable')

# Misspellings of "reliable": 'relieable' plus its seven nearest neighbours from the query above.
reliable_misspellings = ['relieable', 'relyable', 'realible', 'realiable', 'relable', 'relaible', 'reliabe', 'relaiable']

# For each misspelling, the offset from the misspelled vector to the correct one;
# unsqueeze(0) adds a leading dimension so the offsets can be concatenated row-wise.
diff_reliable = [(reliable_vector - get_vector(glove, s)).unsqueeze(0) for s in reliable_misspellings]

In [19]:
# Concatenate the per-word offsets along dim 0 and average them into one "misspelling vector".
misspelling_vector = torch.cat(diff_reliable, dim=0).mean(dim=0)

In [20]:
# Misspelling of "because": add the averaged misspelling offset to the
# misspelled word's vector and list the nearest vocabulary words.

print_tuples(closest_words(glove, get_vector(glove, 'becuase') + misspelling_vector))

(6.1090) because
(6.4250) even
(6.4358) fact
(6.4914) sure
(6.5094) though
(6.5601) obviously
(6.5682) reason
(6.5856) if
(6.6099) but
(6.6415) why


In [21]:
# Misspelling of "definitely": add the averaged misspelling offset to the
# misspelled word's vector and list the nearest vocabulary words.

print_tuples(closest_words(glove, get_vector(glove, 'defintiely') + misspelling_vector))

(5.4070) definitely
(5.5643) certainly
(5.7192) sure
(5.8152) well
(5.8588) always
(5.8812) also
(5.9557) simply
(5.9667) consider
(5.9821) probably
(5.9948) definately


In [22]:
# Misspelling of "consistent": add the averaged misspelling offset to the
# misspelled word's vector and list the nearest vocabulary words.

print_tuples(closest_words(glove, get_vector(glove, 'consistant') + misspelling_vector))

(5.9641) consistent
(6.3674) reliable
(7.0195) consistant
(7.0299) consistently
(7.1605) accurate
(7.2737) fairly
(7.3037) good
(7.3520) reasonable
(7.3801) dependable
(7.4027) ensure


In [23]:
# Misspelling of "package": add the averaged misspelling offset to the
# misspelled word's vector and list the nearest vocabulary words.

print_tuples(closest_words(glove, get_vector(glove, 'pakage') + misspelling_vector))

(6.6117) package
(6.9315) packages
(7.0195) pakage
(7.0911) comes
(7.1241) provide
(7.1469) offer
(7.1861) reliable
(7.2431) well
(7.2434) choice
(7.2453) offering
