- https://www.nltk.org/howto/wordnet.html WordNet examples

In [1]:
from nltk.corpus import wordnet as wn
import numpy as np
from scipy import stats

### Utilities

In [2]:
def get_all_hypernyms(synset):
    # get all hypernyms of a synset until the root of wordnet
    # estendere per far prendere più iperonimi, non solo il primo ogni volta
    ret_list = []
    hypernyms = synset.hypernyms()
    while hypernyms:
        for hyper in hypernyms:
            ret_list.append(hyper)
        hypernyms = hypernyms[0].hypernyms()
    return ret_list


In [3]:
def lowest_common_subsumer(syn1, syn2): 
    return syn1.lowest_common_hypernyms(syn2)[0] if syn1.lowest_common_hypernyms(syn2) else None

def lowest_common_subsumer_2(syn1, syn2):
    # risale la gerarchia degli iperonimi, scegliendo sempre e solo il primo synset percjè non ho un PC non della NASA
    syn1_hypernyms = get_all_hypernyms(syn1)
    syn2_hypernyms = get_all_hypernyms(syn2)

    for h1 in syn1_hypernyms:
        if h1 in syn2_hypernyms:
            return h1

In [4]:
def depth(syn):
    return syn.min_depth() if syn else 0

In [5]:
def max_path(): # restituisce sempre 19, per velocizzare le esecuzioni salvo il valore in una costante
    max_path = 0
    for synset in wn.all_synsets():
        if synset.max_depth() > max_path:
            max_path = synset.max_depth()
    return max_path

In [6]:
def length(syn1, syn2): # NB, non esistono i cammini tra nomi e verbi in WordNet, pepr cui vanno rimossi i verbi credo
    return syn1.shortest_path_distance(syn2) if syn1.shortest_path_distance(syn2) else 0

In [7]:
MAX_PATH = 19

In [8]:
print(lowest_common_subsumer(wn.synset('dog.n.01'), wn.synset('big_cat.n.01')))
print(lowest_common_subsumer_2(wn.synset('dog.n.01'), wn.synset('big_cat.n.01')))

Synset('carnivore.n.01')
Synset('carnivore.n.01')


### Wu & Palmer

In [9]:
def wu_palmer(syn1, syn2):
    dep = 0

    lcs = lowest_common_subsumer(syn1, syn2)
    dep = (depth(syn1) + depth(syn2))
    
    if dep == 0:
        dep = 0.001
        
    return 2 * depth(lcs) / dep

### Shortest Path

In [10]:
def shortest_path(syn1, syn2):
    return 2 * MAX_PATH - length(syn1, syn2) if length(syn1, syn2) else 0

### Leakcock & Chodorow

In [11]:
def leakcock_chodorow(syn1, syn2):
    return -np.log(length(syn1, syn2) / 2 * MAX_PATH) if length(syn1, syn2) else -1000 #-1000 per indicare un valore di somiglianza basso

### Execution

read lines from WordSim353.csv

In [17]:
dataset = []

with open(r'C:\Users\andre\Desktop\Università\Magistrale\TLN\PART 2\TLN-LAB-PART-2\data\WordSim353.csv', 'r') as f:
    word_sim = f.readlines()[1:]
    for tuple in word_sim:
        dataset.append(tuple.split(','))

get synset from dataset

In [18]:
syns_1 = []
syns_2 = []

for tuple in dataset:
    syns_1.append(wn.synsets(tuple[0]))
    syns_2.append(wn.synsets(tuple[1]))

compute similarity using the three methods described above over all the combinations of synsets of every word in the input file.
For each couple, take the maximum value of each similarity 

In [19]:
wp = []
sp = []
lc = []

max_wu = 0
max_sp =  0
max_lc = -1000

for i in range(len(syns_1)):
    for j in range(len(syns_1[i])):
        for k in range(len(syns_2[i])):
            
            #print(f"syn1: {syns_1[i][j]}, syn2: {syns_2[i][k]} --> WU_PALMER: {wu_palmer(syns_1[i][j], syns_2[i][k])} - SHORTEST_PATH: {shortest_path(syns_1[i][j], syns_2[i][k])} - LEAKCOCK_CHODOROW: {leakcock_chodorow(syns_1[i][j], syns_2[i][k])}")
            if wu_palmer(syns_1[i][j], syns_2[i][k]) > float(max_wu):
                max_wu = wu_palmer(syns_1[i][j], syns_2[i][k])
            if shortest_path(syns_1[i][j], syns_2[i][k]) > float(max_sp):
                max_sp = shortest_path(syns_1[i][j], syns_2[i][k])
            if leakcock_chodorow(syns_1[i][j], syns_2[i][k]) > float(max_lc):
                max_lc = leakcock_chodorow(syns_1[i][j], syns_2[i][k])

    wp.append(max_wu) 
    max_wu = 0
    sp.append(max_sp)
    max_sp = 0
    lc.append(max_lc)
    max_lc = -1000

print(wp)
print(sp)
print(lc)

[0.9090909090909091, 0.9629629629629629, 1.0, 0.8571428571428571, 0.8, 0.5882352941176471, 0.7368421052631579, 0.75, 0, 0.9, 0.8, 0, 0.9090909090909091, 0.8571428571428571, 0.8, 0.625, 0.46153846153846156, 0, 0, 0.5882352941176471, 0.5333333333333333, 0.75, 0.8, 0.8, 0.5714285714285714, 0, 0, 0.6666666666666666, 0.75, 0.5, 1.0, 0.8571428571428571, 0.5263157894736842, 0.625, 1.0, 0.9, 0.7692307692307693, 0.6666666666666666, 0.23529411764705882, 0.18181818181818182, 0.8333333333333334, 0, 0.9473684210526315, 0.8888888888888888, 0.7777777777777778, 0.5555555555555556, 0, 0.5454545454545454, 0.7692307692307693, 0, 0.5454545454545454, 0.3157894736842105, 0.5, 0.6153846153846154, 0.125, 0.8888888888888888, 0.2857142857142857, 0.6, 0.875, 0.875, 0.16666666666666666, 0.36363636363636365, 0.4444444444444444, 0, 0.7142857142857143, 0.5454545454545454, 1.0, 1.0, 0.9473684210526315, 0.9090909090909091, 0.8888888888888888, 0.9473684210526315, 1.0, 1.0, 0.47058823529411764, 0.3076923076923077, 0.947

### Evaluation Methods
- similarity values in WordSim353.csv

In [20]:
word_sim_353 = [float(data[2].split('\n')[0]) for data in dataset]

#### Spearman

In [21]:
print("\nSPEARMAN'S CORRELATION COEFFICIENT\n")
print(f"WU & PALMER             {stats.spearmanr(wp, word_sim_353).correlation}")
print(f"SHORTEST PATH           {stats.spearmanr(sp, word_sim_353).correlation}")
print(f"LEAKCOCK & CHODOROW     {stats.spearmanr(lc, word_sim_353).correlation}\n\n")

print("\nPEARSON'S CORRELATION COEFFICIENT\n")
print(f"WU & PALMER             {stats.pearsonr(wp, word_sim_353)}")
print(f"SHORTEST PATH           {stats.pearsonr(sp, word_sim_353)}")
print(f"LEAKCOCK & CHODOROW     {stats.pearsonr(lc, word_sim_353)}")


SPEARMAN'S CORRELATION COEFFICIENT

WU & PALMER             0.32927825770618513
SHORTEST PATH           0.2236163845432123
LEAKCOCK & CHODOROW     0.2236163845432123



PEARSON'S CORRELATION COEFFICIENT

WU & PALMER             (0.2846480047884611, 5.266550780950281e-08)
SHORTEST PATH           (0.08984891365408158, 0.09188617940762013)
LEAKCOCK & CHODOROW     (-0.10399156735058393, 0.05091564036689271)
