In [None]:
#STEP 1
#Download pre-trained embeddings

!gdown --id 0B7XkCwpI5KDYNlNUTTlSS21pQmM

from gensim.models import KeyedVectors

word_vector_file_path = "/content/GoogleNews-vectors-negative300.bin.gz"
model = KeyedVectors.load_word2vec_format(word_vector_file_path, binary=True)

In [4]:
#STEP 1
#Method 2
# Run this if first cell shows error
import gensim.downloader
import os
import numpy as np

model = gensim.downloader.load('word2vec-google-news-300')



In [5]:
#STEP 2
# Word Vectors using gensim

# word 'cementary' is excluded as it is not present in the word2vec pretrained model
n_words = ['asylum','autograph','automobile','boy','bird','brother','cord','cushion','coast','crane','car','cock','fruit','furnace','forest','food','grin','graveyard','glass','gem','hill','implement','jewel','journey','lad','monk','midday','madhouse','mound','magician','noon','oracle','pillow','rooster','smile','string','sage','shore','serf','slave','signature','stove','tumbler','tool','voyage','wizard','woodland']

wordvec =[]

for word in n_words:
  wordvec.append(model[word])


In [6]:
#STEP 3
#Cosine Distance
from sklearn.metrics.pairwise import cosine_similarity

S_word_vec=[]
word_pair1=['cord','rooster','noon','fruit','autograph','automobile','mound','grin','graveyard','glass','boy','cushion','monk','coast','grin','shore','monk','boy','automobile','mound','lad','forest','food','shore','bird','coast','furnace','crane','hill','car','glass','magician','crane','brother','sage','oracle','bird','bird','food','brother','furnace','magician','hill','cord','glass','grin','journey','autograph','coast','forest','implement','cock','boy','cushion','automobile','midday','gem','asylum','asylum','asylum','serf']
word_pair2=['smile','voyage','string','furnace','shore','wizard','stove','implement','madhouse','magician','rooster','jewel','slave','forest','lad','woodland','oracle','sage','cushion','shore','wizard','graveyard','rooster','voyage','woodland','hill','implement','rooster','woodland','journey','jewel','oracle','implement','lad','wizard','sage','crane','cock','fruit','monk','stove','wizard','mound','string','tumbler','smile','voyage','signature','shore','woodland','tool','rooster','lad','pillow','car','noon','jewel','fruit','monk','madhouse','slave']
S_Manual=[0.02,0.04,0.04,0.05,0.06,0.11,0.14,0.18,0.42,0.44,0.44,0.45,0.57,0.85,0.88,0.9,0.91,0.96,0.97,0.97,0.99,1,1.09,1.22,1.24,1.26,1.37,1.41,1.48,1.55,1.78,1.82,2.37,2.41,2.46,2.61,2.63,2.63,2.69,2.74,3.11,3.21,3.29,3.41,3.45,3.46,3.58,3.59,3.6,3.65,3.66,3.68,3.82,3.84,3.92,3.94,3.94,0.19,0.39,3.46,3.04]


for i in range(len(word_pair1)):
  w1 = word_pair1[i]
  w2 = word_pair2[i]
  index_w1 = n_words.index(w1)
  index_w2 = n_words.index(w2)

  S_word_vec.append(cosine_similarity(wordvec[index_w1].reshape(1,300),wordvec[index_w2].reshape(1,300))[0][0])

In [7]:
#STEP 3
#Pearson Correlation
from scipy import stats

P_word_vec = stats.pearsonr(S_Manual, S_word_vec)

print('The Pearson Correlations are as follows:')
print('Word2Vec: ',P_word_vec)

The Pearson Correlations are as follows:
Word2Vec:  PearsonRResult(statistic=0.762628302486367, pvalue=9.210274605623711e-13)


In [9]:
#STEP 4
# Word Analogies

#Reading the analogies
file1 = open('./Analogies/top_5028_semantic_analogies.txt', 'r')
sem_analogies = file1.readlines()

file2 = open('./Analogies/top_5028_syntactic_analogies.txt', 'r')
syn_analogies = file2.readlines()

sem_analogy_words = [w.split() for w in sem_analogies]
syn_analogy_words = [w.split() for w in syn_analogies]

In [16]:
#STEP 4
# Word Analogies - Word2Vec Analysis

sem_analogy_accuracy = 0
syn_analogy_accuracy = 0

for i in range(len(sem_analogy_words)):
  w1=model[sem_analogy_words[i][0]]
  w2=model[sem_analogy_words[i][1]]
  w3=model[sem_analogy_words[i][2]]
  p_w4 = w3-w1+w2
  gt=model[sem_analogy_words[i][3]]
  sem_analogy_accuracy+=cosine_similarity(p_w4.reshape(1,300),gt.reshape(1,300))[0][0]

sem_analogy_accuracy = sem_analogy_accuracy/len(sem_analogy_words)

for i in range(len(syn_analogy_words)):
  w1=model[syn_analogy_words[i][0]]
  w2=model[syn_analogy_words[i][1]]
  w3=model[syn_analogy_words[i][2]]
  p_w4 = w3-w1+w2
  gt=model[syn_analogy_words[i][3]]
  syn_analogy_accuracy+=cosine_similarity(p_w4.reshape(1,300),gt.reshape(1,300))[0][0]

syn_analogy_accuracy = syn_analogy_accuracy/len(syn_analogy_words)

print('The semantic analogy accuracy using word2vec method is :',sem_analogy_accuracy*100, " %.")
print('The syntactic analogy accuracy using word2vec method is :',syn_analogy_accuracy*100, " %.")

The semantic analogy accuracy using word2vec method is : 71.79592225964613  %.
The syntactic analogy accuracy using word2vec method is : 60.75930756541075  %.


In [17]:
#STEP 4
# Word Analogies - LSA Preprocessing

# Creating the word vectors using LSA
import nltk
from collections import Counter
import numpy as np
nltk.download("brown")
nltk.download("stopwords")
from string import punctuation
from nltk.corpus import brown
from nltk import bigrams
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt

#Getting High frequency words
freq_of_words = Counter({})
freq_bi = Counter({})
stop_words = set(stopwords.words('english')+list(punctuation)+["``","''","--"])
biagrams = []
for c in brown.categories():
  words_wo_numbers = [w for w in brown.words(categories=c) if not w.isnumeric()]
  words_wo_stop_words = [w for w in words_wo_numbers if not w.lower() in stop_words]
  freq_of_words += Counter(words_wo_stop_words)
  freq_bi += Counter(FreqDist(bigrams(words_wo_stop_words)))

freq_of_words = dict(freq_of_words)

all_freq_words = sorted(freq_of_words,key=freq_of_words.get,reverse=True)[:]
highest_freq_words = all_freq_words[:5000]
n_words = ['asylum','autograph','automobile','boy','bird','brother','cord','cushion','coast','cementary','crane','car','cock','fruit','furnace','forest','food','grin','graveyard','glass','gem','hill','implement','jewel','journey','lad','monk','midday','madhouse','mound','magician','noon','oracle','pillow','rooster','smile','string','sage','shore','serf','slave','signature','stove','tumbler','tool','voyage','wizard','woodland']
words_in_brown_corpus = [ w for w in n_words if w in freq_of_words ]

words=highest_freq_words+[w for w in words_in_brown_corpus if w not in highest_freq_words]

M1= np.zeros((len(words),len(words)))
# Constructing biagrams and calculating frequency
for i in range(len(words)):
  for j in range(len(words)):
    bigram=(words[i],words[j])
    M1[i][j] = freq_bi[(bigram)]

#PPMI
M1_plus = np.zeros_like(M1)
total_bigrams = np.sum(M1)
total_unigram_freq =0
for i,w1 in enumerate(words):
  total_unigram_freq+= freq_of_words[w1]

for i,w1 in enumerate(words):
  for j,w2 in enumerate(words):
    prob_wc = M1[i][j]/total_bigrams
    prob_w = freq_of_words[w1]/total_unigram_freq
    prob_c = freq_of_words[w2]/total_unigram_freq
    M1_plus[i][j] = max(np.log(prob_wc/(prob_c * prob_w)),0)

#PCA
#Scaled data
M1_plus_scaled = preprocessing.scale(M1_plus.T)

pca300 = PCA(n_components=300, svd_solver='full')
pca300.fit(M1_plus_scaled)
M2_300 = pca300.transform(M1_plus_scaled)

LSA={}

for i,w in enumerate(words):
  LSA[w] = M2_300[i:i+1]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  M1_plus[i][j] = max(np.log(prob_wc/(prob_c * prob_w)),0)


In [19]:
#STEP 4
# Word Analogies - LSA Analysis

sem_analogy_accurancy = 0
syn_analogy_accurancy = 0

for i in range(len(sem_analogy_words)):
  w1=LSA[sem_analogy_words[i][0]]
  w2=LSA[sem_analogy_words[i][1]]
  w3=LSA[sem_analogy_words[i][2]]
  p_w4 = w3-w1+w2
  gt=LSA[sem_analogy_words[i][3]]
  sem_analogy_accuracy+=cosine_similarity(p_w4,gt)[0][0]

sem_analogy_accuracy = sem_analogy_accuracy/len(sem_analogy_words)

for i in range(len(syn_analogy_words)):
  w1=LSA[syn_analogy_words[i][0]]
  w2=LSA[syn_analogy_words[i][1]]
  w3=LSA[syn_analogy_words[i][2]]
  p_w4 = w3-w1+w2
  gt=LSA[syn_analogy_words[i][3]]
  syn_analogy_accuracy+=cosine_similarity(p_w4,gt)[0][0]

syn_analogy_accuracy = syn_analogy_accuracy/len(syn_analogy_words)

print('The semantic analogy accuracy using LSA method is :',sem_analogy_accuracy*100, " %.")
print('The syntactic analogy accuracy using LSA method is :',syn_analogy_accuracy*100, " %.")

The semantic analogy accuracy using LSA method is : 11.37295300245644  %.
The syntactic analogy accuracy using LSA method is : 7.431095733440996  %.


In [20]:
#Miscellaneous - Word analogies on larger set for Word2Vec

file1 = open('./Analogies/full_c_semantic_analogies.txt', 'r')
sem_analogies = file1.readlines()

file2 = open('./Analogies/full_c_syntactic_analogies.txt', 'r')
syn_analogies = file2.readlines()

sem_analogy_words = [w.split() for w in sem_analogies]
syn_analogy_words = [w.split() for w in syn_analogies]

sem_analogy_accuracy = 0
syn_analogy_accuracy = 0

for i in range(len(sem_analogy_words)):
  w1=model[sem_analogy_words[i][0]]
  w2=model[sem_analogy_words[i][1]]
  w3=model[sem_analogy_words[i][2]]
  p_w4 = w3-w1+w2
  gt=model[sem_analogy_words[i][3]]
  sem_analogy_accuracy+=cosine_similarity(p_w4.reshape(1,300),gt.reshape(1,300))[0][0]

sem_analogy_accuracy = sem_analogy_accuracy/len(sem_analogy_words)

for i in range(len(syn_analogy_words)):
  w1=model[syn_analogy_words[i][0]]
  w2=model[syn_analogy_words[i][1]]
  w3=model[syn_analogy_words[i][2]]
  p_w4 = w3-w1+w2
  gt=model[syn_analogy_words[i][3]]
  syn_analogy_accuracy+=cosine_similarity(p_w4.reshape(1,300),gt.reshape(1,300))[0][0]

syn_analogy_accuracy = syn_analogy_accuracy/len(syn_analogy_words)

print('The semantic analogy accuracy using word2vec method is :',sem_analogy_accuracy*100, " %.")
print('The syntactic analogy accuracy using word2vec method is :',syn_analogy_accuracy*100, " %.")

The semantic analogy accuracy using word2vec method is : 63.855976842600725  %.
The syntactic analogy accuracy using word2vec method is : 62.19563980708954  %.
