# GloVe Word Vectors - Word similarity (semantic)

In [1]:
import pandas as pd
import csv

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

import numpy as np

## Import data & create data frame

In [2]:
path = 'glove.6B.50d.txt'

# Each line of the file is a token followed by its vector components, separated by single spaces.
# - quoting=csv.QUOTE_NONE: quote characters appearing as tokens are read as ordinary text.
# - keep_default_na=False: without it pandas converts tokens such as "nan", "null" or "n/a"
#   into NaN index labels, so those words could no longer be looked up by name.
words = pd.read_table(
    path,
    sep=" ",
    index_col=0,
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

A random sample of rows from the `words` DataFrame.

In [3]:
# Peek at three randomly chosen rows; no seed is set, so the rows differ between runs.
words.sample(n=3)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,41,42,43,44,45,46,47,48,49,50
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
supercharger,0.26481,-0.59335,0.56178,-0.72124,-0.72991,0.66387,0.65279,-0.68577,0.28926,0.24604,...,1.0759,0.015435,-1.788,0.03281,0.26302,0.007122,1.0521,0.46217,-0.37131,0.72106
mahabad,-1.4998,-0.81779,-0.061154,0.44163,-0.12481,-0.15817,1.1115,1.2464,0.70766,-0.49984,...,-0.057966,0.55513,-0.082088,-1.455,-0.77707,1.0739,-0.29587,-0.72214,-0.51878,0.32553
verapamil,-0.20199,-0.67403,0.2081,-0.035614,-1.2199,0.44553,0.82464,-0.1574,1.0971,1.3094,...,0.58847,1.0811,-0.13623,0.51677,0.24699,-0.17816,-0.34939,-0.21223,-0.26362,0.78475


## Create vocabulary & word2idx

Create a word-number reference.

In [4]:
# The index of `words` holds the tokens; it doubles as the position -> word lookup.
vocab = words.index

# Word -> row position. enumerate() yields plain Python ints, whereas
# zipping with np.arange would store NumPy integer scalars as the dict values.
word2idx = {word: idx for idx, word in enumerate(vocab)}

In [5]:
# Position -> word: show the token stored at row 18.
vocab[18]

'he'

In [6]:
# Word -> position: the inverse of the vocab[...] lookup.
word2idx['he']

18

## Semantic similarity between word vectors (measured by the cosine of the angle between two vectors)

Semantically similar words have a cosine similarity close to 1, while unrelated words have a value close to 0 (cosine similarity ranges from -1 to 1).

Select words to compare.

In [7]:
# Fetch each word's embedding as a fresh (1, n_features) array: cosine_similarity
# expects 2-D input, and ndmin=2 prepends the missing leading axis.
cat, dog, car = (
    np.array(words.loc[token], ndmin=2) for token in ('cat', 'dog', 'car')
)

The words "cat" and "dog" are semantically similar, so their vectors point in nearly the same direction.

In [8]:
# Both inputs are single-row matrices, so the result is a 1x1 similarity matrix.
cosine_similarity(X=cat, Y=dog)

array([[0.92180053]])

The words "cat" and "car" are almost identical in spelling; however, there is little semantic relationship between them.

In [9]:
# Same comparison for a pair of words that look alike but mean different things.
cosine_similarity(X=cat, Y=car)

array([[0.36382526]])

The vector for "car" is clearly farther from the vector for "cat" than the vector for "dog" is.

## Get similar words (words that are closest to the selected word vector are said to be similar)

In this case, closeness is defined by the Euclidean distance between the points in space (the default metric of `NearestNeighbors`), rather than by the cosine similarity used above.

In [10]:
# Fit the neighbour index on every word vector; each query will return the 10 closest rows.
nneigh = NearestNeighbors(n_neighbors=10).fit(words)
nneigh

NearestNeighbors(n_neighbors=10)

Get the 10 nearest words to the selected word (the first hit is the query word itself, at distance 0).

In [11]:
# Pull the embedding row for the query word.
exciting = words.loc['exciting', :]

In [12]:
# Query the fitted index with a one-row batch; both results have one row per query.
distances, indices = nneigh.kneighbors([exciting])

# Translate the neighbour row positions back to words and pair each with its distance.
list(zip(vocab[indices[0]], distances[0]))

[('exciting', 0.0),
 ('terrific', 2.792611993603401),
 ('interesting', 2.8611622732844078),
 ('fantastic', 2.861798400678322),
 ('amazing', 3.007153936813019),
 ('enjoyable', 3.008023059154467),
 ('frustrating', 3.0421646103779865),
 ('entertaining', 3.049089698192792),
 ('incredibly', 3.1307719297730716),
 ('wonderful', 3.232475676976922)]

Repeat the lookup for another word.

In [13]:
# Pull the embedding row for the second query word.
politics = words.loc['politics', :]

In [14]:
# Query the fitted index with a one-row batch; both results have one row per query.
distances, indices = nneigh.kneighbors([politics])

# Translate the neighbour row positions back to words and pair each with its distance.
list(zip(vocab[indices[0]], distances[0]))

[('politics', 0.0),
 ('political', 2.518940798601468),
 ('debate', 3.540192338124724),
 ('struggle', 3.7507524907807785),
 ('rooted', 3.754068066487207),
 ('circles', 3.7649280054914733),
 ('liberal', 3.7939710137099043),
 ('minded', 3.8021530920671776),
 ('politicians', 3.80693941162609),
 ('turning', 3.816618306548744)]