## NLP HW1

In [1]:
import numpy as np

from scipy.stats import spearmanr
from scipy.sparse import dok_matrix, coo_matrix, csr_matrix, csc_matrix
from scipy.sparse.linalg import norm

import time

from tqdm.notebook import tqdm

import pickle

import pandas as pd

from collections import defaultdict

In [2]:
with open('./data/vocab-15kws.txt') as f:
    V = [l.strip() for l in f.readlines()]
    
with open('./data/vocab-5k.txt') as f:
    V_c = [l.strip() for l in f.readlines()]
    
word_to_ind = dict(zip(V, range(len(V))))
contextword_to_ind = dict(zip(V_c, range(len(V_c))))

#with open('./hw1-data-31190-fall-2020/wiki-1percent.txt') as f:
#    data = [l.strip() for l in f.readlines()]
    
with open('./data/men.txt') as f:
    men = f.readlines()[1:]
    
with open('./data/simlex-999.txt') as f:
    simlex = f.readlines()[1:]
    
def parse_scores(lines):
    counts = {}
    for line in lines:
        w1, w2, n = line.split()
        n = float(n)
        counts[w1, w2] = n
    return counts

men, simlex = parse_scores(men), parse_scores(simlex)


In [3]:
print(V[:5], '\n', V_c[:5])

['the', '.', ',', 'of', 'and'] 
 ['the', '.', ',', 'of', 'and']


### 1.1 (12 points) Implement distributional counting as described above for a provided w, V, and VC. Submit your code.

In [4]:
l = 997898 # len(data)

def get_vectors_and_idf(V, V_c, windows=[1, 3, 6]):
    """
        txt: a list of sentence strings
        V: a list of vocabulary words
        V_c: a list of context words
        window: an integer, window length
    """
    idf = {}
    word_to_ind = dict(zip(V, range(len(V))))
    contextword_to_ind = dict(zip(V_c, range(len(V_c))))
    df_v, df_v_c = dok_matrix((len(V), 1), dtype=np.float32), dok_matrix((len(V_c), 1), dtype=np.float32)
    t1 = time.time()
    vectors = {k: dok_matrix((len(V), len(V)), dtype=np.float32) for k in [('V', 1), ('V', 3), ('V', 6)]}
    for k in [('V_c', 1), ('V_c', 3), ('V_c', 6)]:
        vectors[k] = dok_matrix((len(V), len(V_c)), dtype=np.float32)
    with open('./data/wiki-1percent.txt') as f:
        for linenum, line in tqdm(enumerate(f)):
            if (linenum+1)%50000==0 or linenum==0:
                t2 = time.time()
                print(f"({linenum+1}/{l}) : {t2-t1} s")
                #with open(f'/Users/aabir/Documents/uchicago/nlp/hw1/vectors2.pkl', 'wb') as f:
                #    pickle.dump(vectors, f)
                #with open(f'/Users/aabir/Documents/uchicago/nlp/hw1/idf2.pkl', 'wb') as f:
                #    pickle.dump(idf, f)
            line = line.split()
            for word_ind, word in enumerate(line):
                if word not in V:
                    continue
                df_v[word_to_ind[word]] = df_v[word_to_ind[word]] + 1
                if word in V_c:
                    df_v_c[contextword_to_ind[word]] = df_v_c[contextword_to_ind[word]]+1
                continue
                for w in windows:
                    for cword_ind in range(word_ind-w, word_ind+w):
                        if cword_ind != word_ind and cword_ind <= len(line)-1 and cword_ind >= 0:
                            contextword = line[cword_ind]
                            if contextword in V_c:
                                i, j = word_to_ind[word], contextword_to_ind[contextword]
                                vectors['V_c', w][i, j] = vectors['V_c', w][i, j] + 1
                            if contextword in V:
                                i, j = word_to_ind[word], word_to_ind[contextword]
                                vectors['V', w][i, j] = vectors['V', w][i, j] + 1
    idf['V'] = df_v
    idf['V_c'] = df_v_c
    #df_v, df_v_c = df_v.toarray().flatten(), df_v_c.toarray().flatten()
    #idf['V'] = np.array([l/x if x!=0 else 0. for x in df_v])
    #idf['V_c'] = np.array([l/x if x!=0 else 0. for x in df_v_c])
    return vectors, idf

def get_idf(V, V_c, startind=0, lastind=np.inf):
    """
        txt: a list of sentence strings
        V: a list of vocabulary words
        V_c: a list of context words
        window: an integer, window length
    """
    idf = {}
    word_to_ind = dict(zip(V, range(len(V))))
    contextword_to_ind = dict(zip(V_c, range(len(V_c))))
    V, V_c = set(V), set(V_c)
    df_v, df_v_c = dok_matrix((len(V), 1), dtype=np.float32), dok_matrix((len(V_c), 1), dtype=np.float32)
    t1 = time.time()
    with open('./hw1-data-31190-fall-2020/wiki-1percent.txt') as f:
        for linenum, line in tqdm(enumerate(f)):
            if linenum<startind:
                continue
            if (linenum+1)%50000==0 or linenum==0:
                t2 = time.time()
                print(f"({linenum+1}/{l}) : {t2-t1} s")
                #with open(f'/Users/aabir/Documents/uchicago/nlp/hw1/vectors2.pkl', 'wb') as f:
                #    pickle.dump(vectors, f)
                #with open(f'/Users/aabir/Documents/uchicago/nlp/hw1/idf2.pkl', 'wb') as f:
                #    pickle.dump(idf, f)
            line = line.split()
            for word_ind, word in enumerate(line):
                if word not in V:
                    continue
                df_v[word_to_ind[word]] = df_v[word_to_ind[word]] + 1
                if word in V_c:
                    df_v_c[contextword_to_ind[word]] = df_v_c[contextword_to_ind[word]]+1
                continue
            if lastind==linenum:
                    return df_v, df_v_c
    #idf['V'] = np.array([l/x if x!=0 else 0. for x in df_v])
    #idf['V_c'] = np.array([l/x if x!=0 else 0. for x in df_v_c])
    return df_v, df_v_c

#### Note: my computer was struggling to run this so I ran it on RCC and downloaded the Pickled results for further analysis

In [5]:
loadpickle = True

counts, idfs = {}, {}

if loadpickle:
    vectors, dfs = [pickle.load(open(f'/Users/aabir/Documents/uchicago/nlp/hw1/{f}.pkl', 'rb')) 
                    for f in ['vectors', 'dfs']]
    dfs = {'V':dfs[0], 'V_c':dfs[1]}
else:
    vectors, dfs = get_vectors_and_idf(V, V_c, [1, 3, 6])
    if input('Overwrite pickles? y/n\t').lower()=='y':
        with open(f'/Users/aabir/Documents/uchicago/nlp/hw1/vectors.pkl', 'wb') as f:
            pickle.dump(vectors, f)
        with open(f'/Users/aabir/Documents/uchicago/nlp/hw1/dfs.pkl', 'wb') as f:
            pickle.dump(dfs, f)


In [6]:
idfs = {}
for k, v in dfs.items():
    idfs[k] = np.array([l/val if val!=0 else 0 for val in v.toarray()]).flatten()

In [7]:
vectors['V', 6].sum()

141228700.0

### 1.2 (6 points) Using vocab-15kws.txt to populate V and vocab-5k.txt to populate VC, use your code to report #(x, y) for the pairs in the following table for both w = 3 and w = 6.

The second time I ran my code, these values did not match up due to some changes (I modified it to run the whole ensemble with different context vocabulary and window sizes). However, I  ran it just for V_c, w=3 once previously and found counts that matched the expected value. I'm printing them here:

```
print("for w=3:\n")
print(counts[('chicken', 'the')])
print(counts[('chicken', 'wings')])
print(counts[('chicago', 'chicago')])
print(counts[('coffee', 'the')])
print(counts[('coffee', 'cup')])
print(counts[('coffee', 'coffee')])
>>> 52
>>> 6
>>> 38
>>> 95
>>> 10
>>> 4
```

I'm printing the results that don't match up (from the latest run) below.

In [8]:
print("for w=3:\n")
for (w1, w2) in [('chicken', 'the'), ('chicken', 'wings'), ('chicago', 'chicago'),
             ('coffee', 'the'), ('coffee', 'cup'), ('coffee', 'coffee')]:
    i, j = word_to_ind[w1], contextword_to_ind[w2]
    print((w1, w2), ':', vectors['V_c', 3][i, j])

for w=3:

('chicken', 'the') : 40.0
('chicken', 'wings') : 4.0
('chicago', 'chicago') : 22.0
('coffee', 'the') : 70.0
('coffee', 'cup') : 8.0
('coffee', 'coffee') : 3.0


In [9]:
print("for w=6:\n")
for (w1, w2) in [('chicken', 'the'), ('chicken', 'wings'), ('chicago', 'chicago'),
             ('coffee', 'the'), ('coffee', 'cup'), ('coffee', 'coffee')]:
    i, j = word_to_ind[w1], contextword_to_ind[w2]
    print((w1, w2), ':', vectors['V_c', 6][i, j])

for w=6:

('chicken', 'the') : 83.0
('chicken', 'wings') : 5.0
('chicago', 'chicago') : 101.0
('coffee', 'the') : 175.0
('coffee', 'cup') : 11.0
('coffee', 'coffee') : 29.0


### 1.3 (6 points) Using w = 3 (and again using vocab-15kws.txt for V and vocab-5k.txt for VC ), eval- uate your count-based word vectors using EVALWS and report your results on MEN and SimLex-999. As a sanity check, your Spearman correlation for MEN should be close to 0.22.

In [13]:
def cosine_similarity(vec1, vec2):
    n1, n2 = norm(vec1), norm(vec2)
    if n1*n2 != 0:
        return vec1.dot(vec2.T).toarray()[0, 0]/n1/n2
    else:
        return 0

def evalws(vectors, benchmark, name):
    cosinevals_dict = {}
    vectors = vectors.tocsr()
    vec_cos, benchmark_cos = [], []
    for (w1, w2), sim in tqdm(benchmark.items()):
        if w1 not in V or w2 not in V:
            vec_cos.append(0.)
        else:
            w1i, w2i = word_to_ind[w1], word_to_ind[w2]
            cos = cosine_similarity(vectors[w1i], vectors[w2i])
            vec_cos.append(cos)
            #print(cos, sim)
        benchmark_cos.append(sim)
    #print(vec_cos)
    #print(benchmark_cos)
    rho, pval = spearmanr(vec_cos, benchmark_cos)
    print(f"{name} spearmanr: rho = {rho}, pval = {pval}")
    return rho, pval

### EvalWS for w=3, V as vocab, V_c as context vocabulary with **raw counts** as vectors

In [11]:
_ = evalws(vectors['V_c', 3], men, 'MEN')
_ = evalws(vectors['V_c', 3], simlex, 'SIMLEX')

MEN spearmanr: rho = 0.2203416545296961, pval = 2.6173677002997682e-34
SIMLEX spearmanr: rho = 0.05480605099111539, pval = 0.08338305067574583


In [8]:
def check_cosine_similarity(w1, w2, vecs):
    w1i, w2i = word_to_ind[w1], word_to_ind[w2]
    cos = cosine_similarity(vecs[w1i], vecs[w2i])
    print(f"{w1}, {w2}: {cos}")
    
check_cosine_similarity('sun', 'sunlight', vectors['V_c', 3])
check_cosine_similarity('automobile', 'car', vectors['V_c', 3])
check_cosine_similarity('festival', 'whiskers', vectors['V_c', 3])
check_cosine_similarity('bakery', 'zebra', vectors['V_c', 3])

sun, sunlight: 0.8683664202690125
automobile, car: 0.8751132488250732
festival, whiskers: 0.5750707387924194
bakery, zebra: 0.8923996090888977


### 2.1 (10 points) Extend your implementation to be able to compute IDF-based word vectors using Eq. 1. Us- ing w = 3, vocab-15kws.txt to populate V , and vocab-5k.txt to populate VC , evaluate (EVALWS) your IDF-based word vectors and report your results.

In [8]:
def get_idf_vecs(vecs, idf, method='csc'):
    print('computing TF-IDF vectors')
    if method=='csr':
        return vecs.tocsr().multiply(idf)
    if method=='csc':
        return vecs.tocsc().multiply(idf)

In [9]:
t1 = time.time()
idf_vecs = get_idf_vecs(vectors['V_c', 3], idfs['V_c'], 'csc')
print(time.time()-t1)

computing TF-IDF vectors
7.508224010467529


In [10]:
idf_vecs.shape

(15228, 5000)

In [44]:
_ = evalws(dok_matrix(idf_vecs), men, "MEN")
_ = evalws(dok_matrix(idf_vecs), simlex, "SIMLEX")

MEN spearmanr: rho = 0.44886867503643035, pval = 1.0831563340443806e-148
SIMLEX spearmanr: rho = 0.1855666928945375, pval = 3.436639200620402e-09


### 3.1 (8 points) Implement the capability of computing PMIs. Use your code to calculate PMIs for w = 3 when using vocab-15kws.txt to populate V and vocab-5k.txt to populate VC . Note that since we are using different vocabularies for center words and context words, pmi(a, b) will not necessarily equal pmi(b, a) (though they will be similar). (If there is a word in V that has no counts, the numerator and denominator for all of its PMI values will be zero, so you can just define all such PMIs to be zero.) For center word x = “coffee”, print the 10 context words with the largest PMIs and the 10 context words with the smallest PMIs. Print both the words and the PMI values. (Note: using my implementation, the highest-PMI context word was “tea” with PMI 8.166, and the lowest-PMI context word was “he” with PMI -2.26034.)


### 3.2 (6 points) Now, define word vectors using PMI. That is, the word vector for a word x ∈ V has an entry for each word y ∈ VC with value given by pmi(x, y). As above, use w = 3, vocab-15kws.txt to populate V , and vocab-5k.txt to populate VC . Evaluate (EVALWS) your PMI-based word vectors and report your results.

In [9]:
N_v_c3 = vectors['V_c', 3].sum()
N_v3 = vectors['V', 3].sum()
def get_pmi_for_words(w1, w2, vecs, N, contextvocab='V_c'):
    i = word_to_ind[w1]
    j = contextword_to_ind[contextword] if contextvocab=='V_c' else word_to_ind[w2]
    vecs = vecs.tolil()
    if vecs[i,j]!=0:
        return np.log2(vecs[i,j])+np.log2(N)-np.log2(vecs[i,:].sum()*vecs[:,j].sum())
    else:
        return 0

In [22]:
coffee_pmis = []
for contextword in V_c:
    coffee_pmis.append(get_pmi_for_words('coffee', contextword, vectors['V_c', 3], N_v_c3))
    
args = np.argsort(coffee_pmis)
minwords, maxwords = [V_c[i] for i in args[:10]], [V_c[i] for i in args[-10:]]

In [23]:
print(f"'coffee' min PMI words:")
for i in args[:10]:
    print(f"\t{V_c[i]}: {coffee_pmis[i]}")

print(f"\n'coffee' max PMI words:")
for i in args[-1:-10:-1]:
    print(f"\t{V_c[i]}: {coffee_pmis[i]}")

'coffee' min PMI words:
	this: -2.6687450408935547
	he: -1.987762451171875
	be: -1.7789745330810547
	had: -1.6826629638671875
	not: -1.5677719116210938
	its: -1.5174312591552734
	after: -1.3077754974365234
	other: -1.2900466918945312
	all: -1.284261703491211
	;: -1.2233772277832031

'coffee' max PMI words:
	tea: 8.36812973022461
	shop: 7.819431304931641
	drinking: 7.684116363525391
	shops: 7.584140777587891
	costa: 7.232067108154297
	coffee: 6.40800666809082
	seattle: 6.275318145751953
	houses: 6.181125640869141
	lloyd: 6.114559173583984


In [11]:
def get_pmi_vectors(vecs, N=None):
    print('computing PMI vectors')
    pmi_vecs = dok_matrix(vecs.shape, dtype=np.float32)
    if N is None:
        N = vecs.sum()
    vecs = vecs.tolil()
    rowsums, colsums = vecs.sum(axis=1).flatten().T, vecs.sum(axis=0).flatten().T
    vecs = vecs.tocoo()
    for i, j, v in tqdm(zip(vecs.row, vecs.col, vecs.data)):
        pmi_vecs[i, j] = np.log2(v*N/(rowsums[i]*colsums[j]))
    return pmi_vecs

In [None]:
#13:13 pm Oct 18
pmi_vecs = get_pmi_vectors(vectors['V_c', 3], N_v_c3)

### EvalWS for w=3, V as vocab, V_c as context vocabulary

In [17]:
_ = evalws(pmi_vecs, men, 'MEN')
_ = evalws(pmi_vecs, simlex, 'SIMLEX')

MEN spearmanr: rho = 0.46153842598967837, pval = 3.549880350180334e-158
SIMLEX spearmanr: rho = 0.19900072623832812, pval = 2.2170010905021224e-10


### 4.1 (8 points) Evaluate the word vectors (EVALWS) corresponding to the three ways of comput- ing vectors (counts, IDF, and PMI), three values of w (1, 3, and 6), and two context vocabularies (vocab-15kws.txt and vocab-5k.txt). For all cases, use vocab-15kws.txt for V . Report the results (there should be 36 correlations in all) and describe your findings. What happens as window size changes for different methods of creating word vectors? What happens when context vocabulary changes? Why do you think you observe the trends you see? Do you see the same trends for MEN and SimLex or do they differ?


### 4.2 (4 points) You should observe systematic trends in terms of correlation as window size changes which should differ for MEN and SimLex-999. Look at some of the manually-annotated similarities in the MEN and SimLex-999 datasets and describe why you think the two datasets show the trends they do. Are these two datasets encoding the same type of similarity? How does the notion of similarity differ between them?

In [14]:
columns = ['rho', 'pval', 'context_vocab', 'windowsize', 'eval_data', 'vector_type']

scores_df = pd.DataFrame({}, columns=columns)

for vocab in ['V_c', 'V']:
    for window in [1, 3, 6]:
        if (vocab, window) == ('V_c', 3):
            continue
        N = vectors[vocab, window].sum()
        for vector_type in ['raw', 'tfidf', 'pmi']:
            if vector_type=='raw':
                vecs = vectors[vocab, window]
            if vector_type=='pmi':
                vecs = get_pmi_vectors(vectors[vocab, window], N)
            if vector_type=='tfidf' and vocab!='V':
                vecs = get_idf_vecs(vectors[vocab, window], idfs[vocab])
            if (vector_type, vocab) == ('tfidf', 'V'):
                continue
            for data_name, eval_data in zip(['MEN', 'SIMLEX'], [men, simlex]):
                print(f"\n{vocab}, {data_name}, {window}, {vector_type}")
                rho, pval = evalws(vecs, eval_data, data_name)
                scores_df.append(dict(zip(columns, [rho, pval, vocab, window, data_name, vector_type])),
                                 ignore_index=True)


V_c, MEN, 1, raw


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.1453646412214026, pval = 1.2324872951126856e-15

V_c, SIMLEX, 1, raw


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.05130971677729792, pval = 0.10506450447826148
computing TF-IDF vectors

V_c, MEN, 1, tfidf


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.29421919712354866, pval = 5.612330898012024e-61

V_c, SIMLEX, 1, tfidf


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.17777717010466934, pval = 1.5396835417652116e-08
computing PMI vectors


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



V_c, MEN, 1, pmi


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.3622824597101258, pval = 1.0183168811957188e-93

V_c, SIMLEX, 1, pmi


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.20832696064990097, pval = 2.942379302716302e-11

V_c, MEN, 6, raw


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.2405388357214604, pval = 9.544733802667223e-41

V_c, SIMLEX, 6, raw


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.04655659821922139, pval = 0.14143513010068376
computing TF-IDF vectors

V_c, MEN, 6, tfidf


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.5196876901148884, pval = 3.170819001168843e-207

V_c, SIMLEX, 6, tfidf


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.11878414885025199, pval = 0.0001678633374868749
computing PMI vectors


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



V_c, MEN, 6, pmi


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.4723553399745761, pval = 1.3478358479482169e-166

V_c, SIMLEX, 6, pmi


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.154171537123257, pval = 9.776058133618332e-07

V, MEN, 1, raw


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.14000684517560627, pval = 1.3263418238349479e-14

V, SIMLEX, 1, raw


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.05250729745927539, pval = 0.09718379879864335

V, MEN, 1, tfidf


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.14000684517560627, pval = 1.3263418238349479e-14

V, SIMLEX, 1, tfidf


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.05250729745927539, pval = 0.09718379879864335
computing PMI vectors


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



V, MEN, 1, pmi


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.38785047256811156, pval = 2.7867884669220316e-108

V, SIMLEX, 1, pmi


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.23303023453015143, pval = 8.712874018535476e-14

V, MEN, 3, raw


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.21551090456727312, pval = 7.352005167777831e-33

V, SIMLEX, 3, raw


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.05361097703127073, pval = 0.09034575849779028

V, MEN, 3, tfidf


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.21551090456727312, pval = 7.352005167777831e-33

V, SIMLEX, 3, tfidf


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.05361097703127073, pval = 0.09034575849779028
computing PMI vectors


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



V, MEN, 3, pmi


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.5224845465220976, pval = 7.833857588927124e-210

V, SIMLEX, 3, pmi


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.22330635121502992, pval = 9.364300306845717e-13

V, MEN, 6, raw


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.23572152690209489, pval = 3.731207056287394e-39

V, SIMLEX, 6, raw


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.04227952379792869, pval = 0.18179411276008164

V, MEN, 6, tfidf


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.23572152690209489, pval = 3.731207056287394e-39

V, SIMLEX, 6, tfidf


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.04227952379792869, pval = 0.18179411276008164
computing PMI vectors


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



V, MEN, 6, pmi


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


MEN spearmanr: rho = 0.5259332859986898, pval = 4.418948548753325e-213

V, SIMLEX, 6, pmi


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


SIMLEX spearmanr: rho = 0.16213999988025068, pval = 2.570823255684684e-07


In [25]:
results_csv = """vocab, test, window, method, rho, pval
‘V_c’, ‘MEN’, 1, ‘raw’, 0.1453646412214026, 1.2324872951126856e-15
‘V_c’, ‘MEN’, 1, ‘raw’, 0.05130971677729792, 0.10506450447826148
‘V_c’, ‘MEN’, 1, ‘tfidf’, 0.29421919712354866, 5.612330898012024e-61
‘V_c’, ‘SIMLEX’, 1, ‘tfidf’, 0.17777717010466934, 1.5396835417652116e-08
‘V_c’, ‘MEN’, 1, ‘pmi’, 0.3622824597101258, 1.0183168811957188e-93
‘V_c’, ‘SIMLEX’, 1, ‘pmi’, 0.20832696064990097, 2.942379302716302e-11
‘V_c’, ‘MEN’, 3, ‘raw’, 0.2203416545296961, 2.6173677002997682e-34
‘V_c’, ‘SIMLEX’, 3, ‘raw’, 0.05480605099111539, 0.08338305067574583
’V_c’, MEN’, 3, ’tfidf’,  0.44886867503643035, 1.0831563340443806e-148
‘V_c’, ‘SIMLEX’, 3, ‘tfidf’, 0.1855666928945375, 3.436639200620402e-09
‘V_c’, ‘MEN’, 3, ‘pmi’, 0.46153842598967837, 3.549880350180334e-158
‘V_c’, ‘SIMLEX’, 3, ‘pmi’, 0.19900072623832812, 2.2170010905021224e-10
‘V_c’, ‘MEN’, 6, ‘raw’, 0.2405388357214604, 9.544733802667223e-41
‘V_c’, ‘MEN’, 6, ‘raw’, 0.04655659821922139, 0.14143513010068376
‘V_c’, ‘MEN’, 6, ‘tfidf’, 0.5196876901148884, 3.170819001168843e-207
‘V_c’, ‘SIMLEX’, 6, ‘tfidf’, 0.11878414885025199, 0.0001678633374868749
‘V_c’, ‘MEN’, 6, ‘pmi’, 0.4723553399745761, 1.3478358479482169e-166
‘V_c’, ‘SIMLEX’, 6, ‘pmi’, 0.154171537123257, 9.776058133618332e-07
‘V’, ‘MEN’, 1, ‘raw’, 0.14000684517560627, 1.3263418238349479e-14
‘V’, ‘SIMLEX’, 1, ‘raw’, 0.05250729745927539, 0.09718379879864335
‘V’, ‘MEN’, 1, ‘pmi’, 0.38785047256811156, 2.7867884669220316e-108
‘V’, ‘SIMLEX’, 1, ‘pmi’, 0.23303023453015143, 8.712874018535476e-14
‘V’, ‘MEN’, 3, ‘raw’, 0.21551090456727312, 7.352005167777831e-33
‘V’, ‘SIMLEX’, 3, ‘raw’, 0.05361097703127073, 0.09034575849779028
‘V’, ‘MEN’, 3, ‘pmi’, 0.5224845465220976, 7.833857588927124e-210
‘V’, ‘SIMLEX’, 3, ‘pmi’, 0.22330635121502992, 9.364300306845717e-13
‘V’, ‘MEN’, 6, ‘raw’, 0.23572152690209489, 3.731207056287394e-39
‘V’, ‘SIMLEX’, 6, ‘raw’, 0.04227952379792869, 0.18179411276008164
‘V’, ‘MEN’, 6, ‘pmi’, 0.5259332859986898, 4.418948548753325e-213
‘V’, ‘SIMLEX’, 6, ‘pmi’, 0.16213999988025068, 2.570823255684684e-07""".strip("''")

with open('./scores_csv.csv', 'w') as f:
    f.write(results_csv)

In [26]:
scores_df2 = pd.read_csv('./scores_csv.csv')

scores_df2

Unnamed: 0,vocab,test,window,method,rho,pval
0,‘V_c’,‘MEN’,1,‘raw’,0.145365,1.232487e-15
1,‘V_c’,‘MEN’,1,‘raw’,0.05131,0.1050645
2,‘V_c’,‘MEN’,1,‘tfidf’,0.294219,5.612331e-61
3,‘V_c’,‘SIMLEX’,1,‘tfidf’,0.177777,1.539684e-08
4,‘V_c’,‘MEN’,1,‘pmi’,0.362282,1.018317e-93
5,‘V_c’,‘SIMLEX’,1,‘pmi’,0.208327,2.942379e-11
6,‘V_c’,‘MEN’,3,‘raw’,0.220342,2.617368e-34
7,‘V_c’,‘SIMLEX’,3,‘raw’,0.054806,0.08338305
8,’V_c’,MEN’,3,’tfidf’,0.448869,1.083156e-148
9,‘V_c’,‘SIMLEX’,3,‘tfidf’,0.185567,3.436639e-09


## Effect of vocabulary, window size and method

As the vocabulary size increases, we see a marginal (not large, but not insignificant) increase in the correlation. This can be explained by the additional context available from the increased number of words.

As the window size increases, the correlations also increase. This can be explained by the larger number of context words we are leveraging for each usage of a word. For both V and V_c as context vocabularies, this increase is large from 1 to 3, but smaller from 3 to 6. This tells us that there is a decreasing incentive to increase the window size beyond a limit. This also makes intuitive sense.

For small vocabularies and smaller windows, the PMI method dominates the TF-IDF one. However, with window size=6, TF-IDF surpasses PMI for the smaller context vocabulary.

With the full vocabulary, PMI performs significantly well for all cases. I was unable to compute the TF-IDF vectors for this case because matrix multiplication of a 15000x15000 matrix with a 15000 column was prohibitively slow.

## Qualitative differences between MEN and SIMLEX

Consider these examples from MEN:

```
morning	sunrise	49.000000
rain	storm	49.000000
festival	whiskers	1.000000
muscle	tulip	1.000000
```

And the following from SIMLEX-999:

```
happy	cheerful	9.55
hard	easy	0.95
fast	rapid	8.75
stupid	dumb	9.58
```

The notion of similarity between MEN and SIMLEX seems to differ in that MEN classifies concepts or entities (mostly nouns) based on their similarity. On the other hand, SIMLEX-999 seems to record adjectives and their similarity/dissimilarity. In fact, SIMLEX-999 seems to have many synonym pairs with high scores, and antonym pairs with low scores.

In general, this means we expect MEN correlations to be higher than SIMLEX ones. This is because the distributional hypothesis makes it difficult to infer the difference between synonyms and antonyms that are commonly used in the same contexts - e.g. This is a big/small house, A very tall/short man walked down the street, etc.

This is clearly seen in the printed correlations. **Correlations on MEN are uniformly higher than those on SIMLEX.**

### 5.1 (5 points) For the two window sizes w = 1 and w = 6, compute and print the 10 nearest neighbors for the query word judges. (Hint: using my implementation, the nearest neighbor for both window sizes is judge, followed by justices for w = 1 and appeals with w = 6. Your nearest neighbor lists may differ slightly from mine, but hopefully these words are high up in your lists.)


### 5.2 (10 points) Do nearest neighbors tend to have the same part-of-speech tag as the query word, or do they differ? Does the pattern differ across different part-of-speech tags for the query word? How does window size affect this? Explore these questions by choosing query words with different parts of speech and computing their nearest neighbors. When choosing query words, consider nouns, verbs, adjectives, and prepositions. (Hint: when considering verbs, use inflected forms like transported.) Try a few query words from each part of speech category and see if you can find any systematic patterns when comparing their nearest neighbors across window sizes 1 and 6. When the neighbors differ between window sizes, how do they differ? Can you find any query words that have almost exactly the same nearest neighbors with the two window sizes? Discuss your findings, showing examples of nearest neighbors for particular words to support your claims.


### 5.3 (10 points) Now try choosing words with multiple senses (e.g., bank, cell, apple, apples, axes, frame, light, well, etc.) as query words. What appears to be happening with multisense words based on the nearest neighbors that you observe? What happens when you compare the neighbors with different window sizes (w = 1 vs. w = 6)? Discuss your findings, showing examples of nearest neighbors for particular words to support your claims.

In [17]:
#pmi_vecs = {}
for w in [1, 6]:
    print(f'\nWindow = {w}')
    pmi_vecs[w] = get_pmi_vectors(vectors['V_c', w])


Window = 1
computing PMI vectors


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



Window = 6
computing PMI vectors


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [165]:
def nearest_neighbours(vecs, word, limit=10):
    word_ind = word_to_ind[word]
    wordvec = vecs[word_ind]
    vecs = vecs.tocsr()
    norms = norm(vecs, axis=1)
    similarities = vecs.multiply(wordvec).sum(axis=1)
    coeffs = np.array([1/(nrm*norms[word_ind]) if nrm!=0 else 0. for nrm in norms]).reshape((-1, 1))
    similarities = np.multiply(similarities, coeffs).flatten().tolist()
    args = np.argsort(similarities)[0][::-1]
    neighbours = [V[arg] for arg in args[1:limit+1]]
    nb_string = ', '.join(n for n in neighbours[:10])
    print(f"nearest 10 neighbours of {word}:\n\t{nb_string}")
    return neighbours

In [166]:
for w in [1, 6]:
    print(f"\nfor w={w}:")
    nearest_neighbours(pmi_vecs[w], 'judges', limit=10)


for w=1:
nearest 10 neighbours of judges:
	judge, players, justices, judiciary, members, courts, contestants, governments, officers, interceptions

for w=6:
nearest 10 neighbours of judges:
	judge, courts, jury, supreme, justice, appeals, panel, contestants, judicial, candidates


In [167]:
for w in [1, 6]:
    print(f'\nWindow = {w}')
    nearest_neighbours(pmi_vecs[w], 'happy', 10)
    nearest_neighbours(pmi_vecs[w], 'terrible', 10)
    nearest_neighbours(pmi_vecs[w], 'talk', 10)
    nearest_neighbours(pmi_vecs[w], 'walk', 10)


Window = 1
nearest 10 neighbours of happy:
	pleased, proud, afraid, worried, comfortable, sad, sure, satisfied, sorry, confident
nearest 10 neighbours of terrible:
	amazing, impressive, awful, weak, funny, blatant, beautiful, memorable, remarkable, tragic
nearest 10 neighbours of talk:
	own, main, second, last, article, page, talkpage, original, final, united
nearest 10 neighbours of walk:
	buy, hide, stay, get, reach, remember, write, leave, hear, remove

Window = 6
nearest 10 neighbours of happy:
	anyone, 'll, feel, everyone, ask, let, sure, saying, 'd, wants
nearest 10 neighbours of terrible:
	severe, worse, bad, telling, worst, thinks, happened, wrong, unfortunate, everyone
nearest 10 neighbours of talk:
	page, discussion, should, article, review, do, list, you, link, further
nearest 10 neighbours of walk:
	walking, ride, foot, drive, journey, onto, trail, door, train, ball


In [168]:
for w in [1, 6]:
    print(f'\nWindow = {w}')
    nearest_neighbours(pmi_vecs[w], 'bank', 10)
    nearest_neighbours(pmi_vecs[w], 'orange', 10)
    nearest_neighbours(pmi_vecs[w], 'term', 10)
    nearest_neighbours(pmi_vecs[w], 'well', 10)


Window = 1
nearest 10 neighbours of bank:
	banks, side, province, coast, banking, railway, africa, shore, hemisphere, library
nearest 10 neighbours of orange:
	blue, purple, pink, aluminum, yellow, red, green, coloured, bright, rain
nearest 10 neighbours of term:
	terms, name, period, word, version, title, sentence, largest, language, paragraph
nearest 10 neighbours of well:
	important, so, popular, few, likely, far, common, small, but, same

Window = 6
nearest 10 neighbours of bank:
	corporation, capital, railway, valley, southern, northern, centre, trade, banks, central
nearest 10 neighbours of orange:
	yellow, green, blue, dark, bright, brown, color, purple, grey, colors
nearest 10 neighbours of term:
	terms, word, example, subject, common, considered, language, meaning, means, words
nearest 10 neighbours of well:
	such, other, many, most, some, are, have, like, more, all


In [169]:
for w in [1, 6]:
    print(f'\nWindow = {w}')
    nearest_neighbours(pmi_vecs[w], 'transported', 10)
    nearest_neighbours(pmi_vecs[w], 'before', 10)
    nearest_neighbours(pmi_vecs[w], 'arrested', 10)
    nearest_neighbours(pmi_vecs[w], 'candidate', 10)


Window = 1
nearest 10 neighbours of transported:
	converted, promoted, corrected, relegated, baptized, deported, resurrected, eliminated, overlooked, augmented
nearest 10 neighbours of before:
	after, during, ;, when, until, without, since, while, under, -
nearest 10 neighbours of arrested:
	deposed, beaten, convicted, diagnosed, kidnapped, banned, reunited, disqualified, murdered, acquitted
nearest 10 neighbours of candidate:
	candidates, party, nominee, nomination, caucus, coalition, leader, member, candidacy, parties

Window = 6
nearest 10 neighbours of transported:
	cargo, supplies, transport, carrying, passengers, ships, captured, vessels, equipment, transporting
nearest 10 neighbours of before:
	after, when, until, then, he, during, back, again, later, time
nearest 10 neighbours of arrested:
	convicted, murder, prison, sentenced, arrest, charged, attacked, killed, captured, killing
nearest 10 neighbours of candidate:
	democratic, republican, candidates, election, nomination, lib

#### Do nearest neighbors tend to have the same part-of-speech tag as the query word, or do they differ?
Nearest neighbours are not bound to have the same POS tag as the query word - but they frequently do. This is usually violated for words with multiple sense. With window=1, 'bank' has neighbours like 'library' as well as 'coast', indicating the two different senses. On the other hand, adjectives tend to have nearest neighbors that are also adjectives. This makes sense due to similar styles of word usage.


#### Does the pattern differ across different part-of-speech tags for the query word? How does window size affect this? Explore these questions by choosing query words with different parts of speech and computing their nearest neighbors. When choosing query words, consider nouns, verbs, adjectives, and prepositions. (Hint: when considering verbs, use inflected forms like transported.) Try a few query words from each part of speech category and see if you can find any systematic patterns when comparing their nearest neighbors across window sizes 1 and 6.
As mentioned, adjectives tend to have adjective neighbors. Similarly, as seen for prepositions and adverbs, these also tend to have neighbors that have the same POS tag. 'Before' is closest to 'after', 'when', 'until' and so on.

With participles like 'arrested' or 'transported', we see some differences. As these can be used as both verbs and adjectives, we see a mix of neighbors. There are also related nouns in the list.



#### When the neighbors differ between window sizes, how do they differ? Can you find any query words that have almost exactly the same nearest neighbors with the two window sizes? Discuss your findings, showing examples of nearest neighbors for particular words to support your claims.
Often, we find that words with multiple senses (or POS) are represented better as the window size increases. This makes intuitive sense. In the example of 'bank', with window=1 we see limited contexts similar to the usage of 'river bank' - words like 'coast'. However, with increased window sizes, we see more trade related words as well.



#### 5.3 (10 points) Now try choosing words with multiple senses (e.g., bank, cell, apple, apples, axes, frame, light, well, etc.) as query words. What appears to be happening with multisense words based on the nearest neighbors that you observe? What happens when you compare the neighbors with different window sizes (w = 1 vs. w = 6)? Discuss your findings, showing examples of nearest neighbors for particular words to support your claims.

For words with multiple sense like 'talk', the nearest neighbors are often a mixed bag of different POS tags. It appears that these words get somewhat 'confused' by the distributional hypothesis, getting placed in the middle of many different contexts. For example, the nearest neighbors of 'bank' with w=1 show a mix of the two contexts in which the word can be used - as a river bank, or as the financial institution.

Word sense disambiguation might help us by potentially creating different embeddings for different senses of a word, thereby resolving the confusion.