In [9]:
import nltk
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import pickle

In [10]:
nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\anna\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [11]:
fdist = nltk.FreqDist(brown.words())
pdist = nltk.MLEProbDist(fdist)

In [12]:
W = [w[0] for w in fdist.most_common(5000)]

In [13]:
print(W[:5])
print(W[-5:])

['the', ',', '.', 'of', 'and']
['expanded', 'emphasize', 'Manhattan', 'temporarily', 'puts']


In [14]:
new_words = ["asylum", "autograph", "boy", "brother", "car", "coast", "cock", 
 "cord", "crane", "cushion", "food", "furnace", "gem", "glass", 
 "graveyard", "grin", "mound", "noon", "oracle", "slave", "tool", 
 "voyage", "wizard", "woodland", "automobile", "bird", "cemetery",
 "forest", "fruit", "hill", "implement", "jewel", "journey", "had",
 "madhouse", "magician", "midday", "monk", "pillow", "rooster", 
 "sage", "serf", "shore", "signature", "smile", "stove", "string",
 "tumbler"]

In [15]:
for w in new_words:
    if w not in W:
        W.append(w)
print(len(W))
W = np.array(W)

5030


In [16]:
fd_bigrams = nltk.ConditionalFreqDist(nltk.bigrams(brown.words()))
pd_bigrams = nltk.ConditionalProbDist(fd_bigrams, nltk.MLEProbDist)

In [28]:
print(fd_bigrams['the']['dog'])
print(pd_bigrams['the'].prob('dog')*pdist.prob('dog'))

16
1.5380002673367443e-08


In [29]:
M1_dense = np.zeros((len(W), len(W)))
for i, w1 in enumerate(W):
    for j, w2 in enumerate(W):
        M1_dense[i][j] = pd_bigrams[w1].prob(w2)*pdist.prob(w2)

In [40]:
M1_counts = np.zeros((len(W), len(W)))
for i, w1 in enumerate(W):
    for j, w2 in enumerate(W):
        M1_counts[i][j] = fd_bigrams[w1][w2]

In [41]:
M1 = csr_matrix(M1_counts)

In [31]:
probs = np.zeros((len(W), len(W)))
for i, w1 in enumerate(W):
    for j, w2 in enumerate(W):
        probs[i][j] = pdist.prob(w1) * pdist.prob(w2)

In [32]:
M1_plus_dense = np.zeros((len(W), len(W)))
div = np.divide(M1_dense, probs, where=(probs!=0))
M1_plus_dense = np.log2(div, where=(div!=0))
for i, _ in enumerate(M1_plus_dense):
    for j, _ in enumerate(M1_plus_dense[i]):
        M1_plus_dense[i][j] = max(0, M1_plus_dense[i][j])

In [33]:
M1_plus = csr_matrix(M1_plus_dense)

In [34]:
svd10 = PCA(n_components=10)
M2_10 = svd10.fit_transform(M1_plus_dense)
svd100 = PCA(n_components=100)
M2_100 = svd100.fit_transform(M1_plus_dense)
svd300 = PCA(n_components=300)
M2_300 = svd300.fit_transform(M1_plus_dense)

In [35]:
P = [("cord", "smile"), ("rooster", "voyage"), ("noon", "string"), 
     ("fruit", "furnace"), ("autograph", "shore"), ("automobile", "wizard"),
     ("mound", "stove"), ("grin", "implement"), ("asylum", "fruit"),
     ("asylum", "monk"), ("graveyard", "madhouse"), ("glass", "magician"),
     ("boy", "rooster"), ("cushion", "jewel"), ("monk", "slave"),
     ("asylum", "cemetery"), ("coast", "forest"),
     ("shore", "woodland"), ("monk", "oracle"), ("boy", "sage"),
     ("automobile", "cushion"), ("mound", "shore"),
     ("forest", "graveyard"), ("food", "rooster"), ("cemetery", "woodland"),
     ("shore", "voyage"), ("bird", "woodland"), ("coast", "hill"),
     ("furnace", "implement"), ("crane", "rooster"), ("hill", "woodland"),
     ("car", "journey"), ("cemetery", "mound"), ("glass", "jewel"),
     ("magician", "oracle"), ("crane", "implement"),
     ("sage", "wizard"), ("oracle", "sage"), ("bird", "crane"),
     ("bird", "cock"), ("food", "fruit"), ("brother", "monk"), 
     ("asylum", "madhouse"), ("furnace", "stove"),
     ("magician", "wizard"), ("hill", "mound"), ("cord", "string"),
     ("glass", "tumbler"), ("grin", "smile"), ("serf", "slave"),
     ("journey", "voyage"), ("autograph", "signature"), ("coast", "shore"),
     ("forest", "woodland"), ("implement", "tool"), ("cock", "rooster"),
     ("cushion", "pillow"), ("cemetery", "graveyard"),
     ("automobile", "car"), ("midday", "noon"), ("gem", "jewel")]
S = [0.02, 0.04, 0.04, 0.05, 0.06, 0.11, 0.14, 0.18, 0.19, 0.39, 0.42,
    0.44, 0.44, 0.45, 0.57, 0.79, 0.85, 0.90, 0.91, 0.96, 0.97,
    0.97, 1.00, 1.09, 1.18, 1.22, 1.24, 1.26, 1.37, 1.41, 1.48,
    1.55, 1.69, 1.78, 1.82, 2.37, 2.46, 2.61, 2.63, 2.63, 2.69,
    2.74, 3.04, 3.11, 3.21, 3.29, 3.41, 3.45, 3.46, 3.46, 3.58, 3.59, 
    3.60, 3.65, 3.66, 3.68, 3.84, 3.88, 3.92, 3.94, 3.94]
print(len(P), len(S))

61 61


In [42]:
S_M1 = [float(cosine_similarity(M1[np.where(W == w1)[0]], M1[np.where(W == w2)[0]])[0]) for (w1, w2) in P]
S_M1_plus = [float(cosine_similarity(M1_plus[np.where(W == w1)[0]], M1_plus[np.where(W == w2)[0]])[0]) for (w1, w2) in P]
S_M2_10 = [float(cosine_similarity(M2_10[np.where(W == w1)[0]], M2_10[np.where(W == w2)[0]])[0]) for (w1, w2) in P]
S_M2_100 = [float(cosine_similarity(M2_100[np.where(W == w1)[0]], M2_100[np.where(W == w2)[0]])[0]) for (w1, w2) in P]
S_M2_300 = np.array([float(cosine_similarity(M2_300[np.where(W == w1)[0]], M2_300[np.where(W == w2)[0]])[0]) for (w1, w2) in P])

# print(np.where(S_M2_300 >= 0.9))
np.array(P)[np.where(S_M2_300 >= 0.9)]

array([], shape=(0, 2), dtype='<U10')

In [43]:
print(pearsonr(S, S_M1))
print(pearsonr(S, S_M1_plus))
print(pearsonr(S, S_M2_10))
print(pearsonr(S, S_M2_100))
print(pearsonr(S, S_M2_300))

(0.14720707674998043, 0.2575842256998856)
(0.20666776280348725, 0.11003426959226345)
(0.20851598019109546, 0.10682021057203918)
(0.28473404526172186, 0.0261420090815637)
(0.31016088046240925, 0.014989193551728036)


In [38]:
with open("exercise_M2_300.pkl", "wb") as file:
    pickle.dump(M2_300, file)
with open("exercise_W.pkl", "wb") as file:
    pickle.dump(W, file)

In [39]:
from gensim.models import KeyedVectors
words = W
vectors = M2_300
model = KeyedVectors(vectors.shape[1])
model.add(words, vectors)
model.save('m2_300')