# NLP 4 lab

1) implement PPMI weighting with cooccurrence based on the presence withing the same document

In [56]:
import pandas as pd
def TermDocMatrix(documents):
    matrix = pd.DataFrame(columns=documents.keys())
    vocabulary = []
    for doc in documents.keys():
        for word in documents[doc]:
            vocabulary.append(word)
    vocabulary = list(set(vocabulary))
    vocabulary.sort()
    for word in vocabulary:
        matrix.loc[word] = 0
    for word in vocabulary:
        for doc in documents.keys():
            matrix.loc[word, doc] = documents[doc].count(word)
    return matrix

In [57]:
import nltk
import numpy as np
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
austen = nltk.corpus.gutenberg.words('austen-persuasion.txt')
# select the first 1000 words from each text
emma = emma[:1000]
austen = austen[:1000]
documents = {'emma':emma, 'austen':austen}

In [58]:
def probabilities_pij(matrix):
    total_sum_of_occurrences = matrix.sum().sum()
    p_ij = matrix / total_sum_of_occurrences
    return p_ij

In [59]:
def probabilities_pi(matrix):
    p_i = matrix.sum(axis=1) / matrix.sum().sum()
    return p_i

In [60]:
def probabilities_pj(matrix):
    p_j = matrix.sum(axis=0) / matrix.sum().sum()
    return p_j

In [61]:
def PPMI(matrix):
    p_ij = probabilities_pij(matrix)
    p_i = probabilities_pi(matrix)
    p_j = probabilities_pj(matrix)
    PPMI_matrix = matrix.copy()
    for i in range(len(matrix.index)):
        for j in range(len(matrix.columns)):
            PPMI_matrix.iloc[i,j] = max(0, np.log2(p_ij.iloc[i,j] / (p_i.iloc[i] * p_j.iloc[j])))
    return PPMI_matrix

In [62]:
F_matrix = TermDocMatrix(documents)
F_matrix.head(50)

Unnamed: 0,emma,austen
"""",0,4
',7,7
(,1,2
),1,1
"),",0,1
",",69,97
",""",0,2
-,5,2
--,7,2
.,28,17


In [63]:
p_ij = probabilities_pij(F_matrix)
p_ij.head(50)

Unnamed: 0,emma,austen
"""",0.0,0.002
',0.0035,0.0035
(,0.0005,0.001
),0.0005,0.0005
"),",0.0,0.0005
",",0.0345,0.0485
",""",0.0,0.001
-,0.0025,0.001
--,0.0035,0.001
.,0.014,0.0085


In [64]:
pi = probabilities_pi(F_matrix)
pi.head(50)

"             0.0020
'             0.0070
(             0.0015
)             0.0010
),            0.0005
,             0.0830
,"            0.0010
-             0.0035
--            0.0045
.             0.0225
."            0.0010
.,            0.0005
.--           0.0010
1             0.0015
15            0.0005
16            0.0005
1760          0.0005
1784          0.0005
1785          0.0005
1787          0.0005
1789          0.0005
1791          0.0005
1800          0.0005
1810          0.0005
1816          0.0005
1818          0.0005
20            0.0005
5             0.0005
9             0.0005
:             0.0010
:--           0.0005
:--"          0.0005
;             0.0165
?--           0.0005
A             0.0005
Anne          0.0005
August        0.0005
Austen        0.0010
Baronetage    0.0005
Be            0.0005
Between       0.0005
CHAPTER       0.0005
Chapter       0.0005
Charles       0.0015
Cheshire      0.0005
Christmas     0.0005
December      0.0005
Dugdale      

In [65]:
pj = probabilities_pj(F_matrix)
pj.head(50)

emma      0.5
austen    0.5
dtype: float64

In [66]:
PPMI_matrix = PPMI(F_matrix)
PPMI_matrix.head(50)

  PPMI_matrix.iloc[i,j] = max(0, np.log2(p_ij.iloc[i,j] / (p_i.iloc[i] * p_j.iloc[j])))


Unnamed: 0,emma,austen
"""",0.0,1.0
',0.0,0.0
(,0.0,0.415037
),0.0,0.0
"),",0.0,1.0
",",0.0,0.224873
",""",0.0,1.0
-,0.514573,0.0
--,0.63743,0.0
.,0.315502,0.0


2) implement PPMI weighting with cooccurrence based on the sliding window of size 4 of neighboring words

In [67]:
def TermDoc_slidingMatrix(document, window_size):
    vocabulary = []
    for word in document:
        vocabulary.append(word)
    vocabulary = list(set(vocabulary))
    vocabulary.sort()
    matrix = pd.DataFrame(index=vocabulary, columns=vocabulary)
    for word in vocabulary:
        matrix.loc[word] = 0
    for word1 in vocabulary:
        for word2 in vocabulary:
            count = 0
            for index, t in enumerate(document):
                if word1 in document[index-window_size:index+window_size] and word2 in document[index-window_size:index+window_size]:
                    count += 1
            matrix.loc[word1, word2] = count
    return matrix

In [69]:
emma = emma[:200]

In [70]:
tdsm = TermDoc_slidingMatrix(emma, 4)

In [71]:
tdsm.head(50)

Unnamed: 0,',",",-,.,1816,;,Austen,Between,CHAPTER,Emma,...,unite,very,vex,was,who,with,woman,world,years,youngest
',16,12,0,6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
",",12,79,0,6,0,8,0,2,4,10,...,5,8,0,0,7,8,5,0,0,0
-,0,0,8,0,0,2,0,0,0,0,...,0,1,0,0,0,2,0,3,6,0
.,6,6,0,48,0,0,0,7,0,7,...,0,6,6,12,0,0,0,0,7,4
1816,0,0,0,0,6,0,5,0,4,4,...,0,0,0,0,0,0,0,0,0,0
;,0,8,2,0,0,24,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Austen,0,0,0,0,5,0,5,0,3,3,...,0,0,0,0,0,0,0,0,0,0
Between,0,2,0,7,0,0,0,8,0,6,...,0,0,0,5,0,0,0,0,0,0
CHAPTER,0,4,0,0,4,0,3,0,8,6,...,0,0,0,0,0,0,0,0,0,0
Emma,0,10,0,7,4,0,3,6,6,18,...,0,0,0,3,0,0,0,0,0,0


In [72]:
PPMI_tdsm = PPMI(tdsm)
PPMI_tdsm.head(50)

  PPMI_matrix.iloc[i,j] = max(0, np.log2(p_ij.iloc[i,j] / (p_i.iloc[i] * p_j.iloc[j])))


Unnamed: 0,',",",-,.,1816,;,Austen,Between,CHAPTER,Emma,...,unite,very,vex,was,who,with,woman,world,years,youngest
',3.514714,0.885357,0.0,0.522248,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
",",0.885357,1.389857,0.0,0.0,0.0,0.0,0.0,0.0,0.49304,0.577929,...,0.668127,0.0,0.0,0.0,1.10775,0.381532,0.622323,0.0,0.0,0.0
-,0.0,0.0,4.514714,0.0,0.0,0.991152,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.595851,0.0,3.099677,3.099677,0.0
.,0.522248,0.0,0.0,1.944819,0.0,0.0,0.0,1.74464,0.0,0.700246,...,0.0,0.0,1.522248,1.579733,0.0,0.0,0.0,0.0,0.74464,1.12993
1816,0.0,0.0,0.0,0.0,5.11597,0.0,5.096862,0.0,4.215506,2.978467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
;,0.0,0.0,0.991152,0.0,0.0,3.052553,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Austen,0.0,0.0,0.0,0.0,5.096862,0.0,5.340787,0.0,4.044394,2.807355,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Between,0.0,0.0,0.0,1.74464,0.0,0.0,0.0,4.514714,0.0,3.055282,...,0.0,0.0,0.0,2.894128,0.0,0.0,0.0,0.0,0.0,0.0
CHAPTER,0.0,0.49304,0.0,0.0,4.215506,0.0,4.044394,0.0,4.900004,3.247928,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Emma,0.0,0.577929,0.0,0.700246,2.978467,0.0,2.807355,3.055282,3.247928,3.595851,...,0.0,0.0,0.0,1.112768,0.0,0.0,0.0,0.0,0.0,0.0


3) Very rare words might will have high PMI values. How would you solve the problem?

PMI has the problem of being biased toward infrequent events; very rare words tend to have very high PMI values. One way to reduce this bias toward low frequency events is to slightly change the computation for P(c), using a different function Pα(c) that raises the probability of the context word to the power of α. Levy et al. found that a setting of α = 0.75 improved performance of embeddings on a wide range of tasks. This works because raising the count to α = 0.75 increases the probability assigned to rare contexts, and hence lowers their PMI (Pα(c) > P(c) when c is rare). Another possible solution is Laplace smoothing: Before computing PMI, a small constant k (values of 0.1-3 are common) is added to each of the counts, shrinking (discounting) all the non-zero values. The larger the k, the more the non-zero counts are discounted

4) Check how algorithm works using English thesaurus. Pick some 10 words, find
synonyms for these, e.g. using https://www.merriam-webster.com/thesaurus.
Note that semantic similarity is represented in different shades of orange. Does
it match the output of PPMI weighting function? Would be nice if you could
also draw a table with shaded cells matching closeness given by PPMI.

In [87]:
words = ["sundry", "beautiful", "keister", "monkey", "frontier", "sad", "amazing", "wrench", "good", "love"]
synonyms = ["various", "lovely", "bum", "butt", "addiction"]
doc = {'doc1': words, 'doc2': synonyms}
TermDocMatrix(doc)


Unnamed: 0,doc1,doc2
addiction,0,1
amazing,1,0
beautiful,1,0
bum,0,1
butt,0,1
frontier,1,0
good,1,0
keister,1,0
love,1,0
lovely,0,1


In [88]:
PPMI_my = PPMI(TermDocMatrix(doc))
PPMI_my

  PPMI_matrix.iloc[i,j] = max(0, np.log2(p_ij.iloc[i,j] / (p_i.iloc[i] * p_j.iloc[j])))


Unnamed: 0,doc1,doc2
addiction,0.0,1.584963
amazing,0.584963,0.0
beautiful,0.584963,0.0
bum,0.0,1.584963
butt,0.0,1.584963
frontier,0.584963,0.0
good,0.584963,0.0
keister,0.584963,0.0
love,0.584963,0.0
lovely,0.0,1.584963


Я не зрозумів суть останнього завдання......