## Importing Libraries

In [26]:
import pandas as pd
import numpy as np
import torch
from scipy.spatial.distance import cosine

## Loading CSV File

In [2]:
df = pd.read_csv("glove_embeddings.csv")

In [3]:
df.head()

Unnamed: 0,a,aardvark,abacus,abandon,abbey,abbreviation,abdicate,abdomen,abduct,abduction,...,zipper,zit,zodiac,zombie,zombies,zone,zoo,zoom,zucchini,bandanna
0,0.053397,-0.008875,-0.013335,-0.006122,0.019756,0.030545,-0.069542,-0.056377,0.003284,0.003534,...,-0.06462,0.029034,-0.00244,-0.044165,0.098675,-0.044571,0.005611,0.067518,-0.145417,-0.039534
1,-0.004251,0.002891,0.047071,-0.083403,-0.078187,0.082258,0.0232,0.045625,0.018514,0.037448,...,0.126727,-0.127292,0.033536,-0.010979,0.0027,0.086612,-0.035475,0.083952,0.076169,0.014235
2,-0.110815,-0.045953,-0.011582,-0.048378,-0.039875,-0.107963,0.015946,-0.011218,-0.008692,-0.120792,...,-0.024138,-0.056325,-0.072633,0.08001,-0.070075,-0.097581,-0.07247,-0.013439,0.009109,-0.02775
3,0.069536,0.047428,-0.018891,0.09027,0.012006,-0.053957,0.053257,-0.036753,-0.001521,-0.029523,...,-0.000864,-0.018538,0.002866,0.018774,0.049107,0.105542,0.024093,-0.006343,0.0275,-0.041883
4,-0.041571,-0.039259,-0.001189,-0.01041,0.030436,-0.064063,0.102261,-0.058631,0.11009,0.105033,...,-0.022811,0.056904,0.045423,0.060763,-0.004891,-0.014069,-0.027471,-0.018964,-0.077316,0.063027


## Task 1: Create a function to compute cosine similarity between any two word vectors:

In [31]:
def cosine_similarity(word1, word2):
    return 1 - cosine(torch.from_numpy(np.array(df[word1])),torch.from_numpy(np.array(df[word2])))

In [34]:
cosine_similarity("skull", "brain")

0.44759619387211047

## Task 2: Create a function to find the word closest to another word in the vocab using cosines:

In [38]:
def find_closest(word):
  sims = [cosine_similarity(word,x) for x in df.columns]
  y = np.array(sims)
  y_sorted = np.argsort(-y) ## gives sorted indices
  top5_indices = y_sorted[:5]
  w1 = [df.columns[i] for i in top5_indices]
  return w1

In [40]:
print(find_closest('brain'))
print(find_closest('cat'))
print(find_closest('apple'))
print(find_closest('exam'))

print(find_closest('brain'))
print(find_closest('cat'))
print(find_closest('apple'))
print(find_closest('exam'))

['brain', 'cerebral', 'tumor', 'spinal', 'tissue']
['cat', 'dog', 'pet', 'rabbit', 'pig']
['apple', 'applesauce', 'pear', 'apple pie', 'cherry']
['exam', 'examination', 'exams', 'admission', 'graduation']
['brain', 'cerebral', 'tumor', 'spinal', 'tissue']
['cat', 'dog', 'pet', 'rabbit', 'pig']
['apple', 'applesauce', 'pear', 'apple pie', 'cherry']
['exam', 'examination', 'exams', 'admission', 'graduation']


## Task 3: Create a function to compute the average vector of any two vectors:

In [43]:
def avg_vector(word1, word2):
    return torch.divide(torch.add(torch.from_numpy(np.array(df[word1])),torch.from_numpy(np.array(df[word2]))),2)

In [45]:
print(avg_vector("brain","skull").size())

torch.Size([300])


## Task 4: Create a function to find the word in the vocab closest to an average vector for each wordpair:

In [46]:
def closest_word(word1, word2):
    avg_vector_tensor = avg_vector(word1, word2)
    sims_cosine = [1 - cosine(avg_vector_tensor,torch.from_numpy(np.array(df[x]))) for x in df.columns]
    y = np.array(sims_cosine)
    y_sorted = np.argsort(-y) ## gives sorted indices
    top5_indices = y_sorted[:5]
    w1 = [df.columns[i] for i in top5_indices]
    return w1

In [47]:
print(closest_word("lion","tiger"))

['tiger', 'lion', 'leopard', 'elephant', 'wolf']


## Testing function on a csv file with pair of words:

In [48]:
df_word_pair = pd.read_csv("connector_wordpairs_boards.csv")

In [49]:
df_word_pair.head(5)

Unnamed: 0,Word1,Word2,Experiment,boardnames
0,void,couch,E1,e1_board1_words
1,giggle,abnormal,E1,e1_board1_words
2,exam,algebra,E1,e1_board1_words
3,tea,bean,E1,e1_board10_words
4,tourist,comedy,E1,e1_board10_words


In [50]:
len(df_word_pair)

60

In [51]:
similarity = []
for index, row in df_word_pair.iterrows():
    word1 = row["Word1"]
    word2 = row["Word2"]
    similarity.append(closest_word(word1, word2)[2:])

In [53]:
df_word_pair["top 3 similar words"] = similarity

## Results

In [54]:
df_word_pair.head(60)

Unnamed: 0,Word1,Word2,Experiment,boardnames,top 3 similar words
0,void,couch,E1,e1_board1_words,"[fill, sit, sofa]"
1,giggle,abnormal,E1,e1_board1_words,"[laughter, hysterical, symptom]"
2,exam,algebra,E1,e1_board1_words,"[mathematics, exams, examination]"
3,tea,bean,E1,e1_board10_words,"[coffee, rice, vegetable]"
4,tourist,comedy,E1,e1_board10_words,"[popular, movie, attraction]"
5,pendulum,dusk,E1,e1_board10_words,"[dawn, sunset, sunrise]"
6,beginning,brake,E1,e1_board2_words,"[end, start, early]"
7,birds,aircraft,E1,e1_board2_words,"[fly, airplane, flight]"
8,school,stop,E1,e1_board2_words,"[go, continue, start]"
9,circle,dance,E1,e1_board3_words,"[dancing, dancer, music]"


In [55]:
df_word_pair.to_csv("word_pair_results(Glove).csv")