- https://www.nltk.org/howto/wordnet.html WordNet examples

In [20]:
from nltk.corpus import wordnet as wn
import numpy as np
from scipy import stats
import math

### Utilities

In [3]:
def get_all_hypernyms(synset):
    # get all hypernyms of a synset until the root of wordnet
    # estendere per far prendere più iperonimi, non solo il primo ogni volta
    ret_list = []
    hypernyms = synset.hypernyms()
    while hypernyms:
        for hyper in hypernyms:
            ret_list.append(hyper)
        hypernyms = hypernyms[0].hypernyms()
    return ret_list

def get_all_hypernyms_2(synset):
    # get all hypernyms of a synset until the root of wordnet
    # estendere per far prendere più iperonimi, non solo il primo ogni volta
    ret_list = []
    temp_list = []
    hypernyms = synset.hypernyms()
    while hypernyms:
        for hyper in hypernyms:
            temp_list.append(hyper)
        ret_list.append(temp_list)
        temp_list = []
        hypernyms = hypernyms[0].hypernyms()
    return ret_list

In [4]:
def lowest_common_subsumer(syn1, syn2): 
    return syn1.lowest_common_hypernyms(syn2)[0] if syn1.lowest_common_hypernyms(syn2) else None

def lowest_common_subsumer_2(syn1, syn2):
    # risale la gerarchia degli iperonimi, scegliendo sempre e solo il primo synset perchè non ho un PC non della NASA
    syn1_hypernyms = get_all_hypernyms(syn1)
    syn2_hypernyms = get_all_hypernyms(syn2)

    for h1 in syn1_hypernyms:
        if h1 in syn2_hypernyms:
            return h1

lowest_common_subsumer(wn.synsets('car')[0], wn.synsets('dog')[0])

Synset('whole.n.02')

In [5]:
def depth(syn):
    return syn.min_depth() if syn else 0

In [6]:
def max_path(): # restituisce sempre 19, per velocizzare le esecuzioni salvo il valore in una costante
    max_path = 0
    for synset in wn.all_synsets():
        if synset.max_depth() > max_path:
            max_path = synset.max_depth()
    return max_path

In [10]:
def my_distance_between_syn(syn_1, syn_2):
    if syn_1 == syn_2:
        return 0
    s1_hyper = get_all_hypernyms_2(syn_1)
    s2_hyper = get_all_hypernyms_2(syn_2)

    print(s1_hyper)
    print(s2_hyper)
    s1_temp = get_all_hypernyms(syn_1)
    s2_temp = get_all_hypernyms(syn_2)

    c_1 = 0
    flag = False
    for i in range(0, len(s1_hyper)):
        for j in range(0, len(s1_hyper[i])):
            if s1_hyper[i][j] in s2_temp and not flag:
                flag = True
                c_1 += i+1

    c_2 = 0
    flag = False
    for i in range(0, len(s2_hyper)):
        for j in range(0, len(s2_hyper[i])):
            if s2_hyper[i][j] in s1_temp and not flag:
                flag = True
                c_2 += i+1
    
   
    return (c_1+c_2)
        
def length(syn1, syn2): # NB, non esistono i cammini tra nomi e verbi in WordNet, pepr cui vanno rimossi i verbi credo
    return syn1.shortest_path_distance(syn2) if syn1.shortest_path_distance(syn2) is not None else None


print(wn.synsets('chest')[0].shortest_path_distance(wn.synsets('dog')[0]))
print(length(wn.synsets('chest')[0], wn.synsets('dog')[0]))
print(my_distance_between_syn(wn.synsets('chest')[0], wn.synsets('dog')[0]))

11
11
[[Synset('body_part.n.01')], [Synset('part.n.03')], [Synset('thing.n.12')], [Synset('physical_entity.n.01')], [Synset('entity.n.01')]]
[[Synset('canine.n.02'), Synset('domestic_animal.n.01')], [Synset('carnivore.n.01')], [Synset('placental.n.01')], [Synset('mammal.n.01')], [Synset('vertebrate.n.01')], [Synset('chordate.n.01')], [Synset('animal.n.01')], [Synset('organism.n.01')], [Synset('living_thing.n.01')], [Synset('whole.n.02')], [Synset('object.n.01')], [Synset('physical_entity.n.01')], [Synset('entity.n.01')]]
16


In [21]:
DEPTH_MAX = 20

In [218]:
print(lowest_common_subsumer(wn.synset('dog.n.01'), wn.synset('big_cat.n.01')))
print(lowest_common_subsumer_2(wn.synset('dog.n.01'), wn.synset('big_cat.n.01')))

Synset('carnivore.n.01')
Synset('carnivore.n.01')


### Wu & Palmer
Resituisce valori tra 0 e 1, dove più ci si avvicina a 1 più i sensi sono simili.

In [219]:
def wu_palmer(syn1, syn2):
    dep = 0

    lcs = lowest_common_subsumer(syn1, syn2)
    dep = (depth(syn1) + depth(syn2))
    
    if dep == 0:
        dep = 0.001
        
    return 2 * depth(lcs) / dep
    #return syn1.wup_similarity(syn2)

### Shortest Path
Valore che può oscillare tra 0 e 2depthMax.
Più il valore si avvicina a 2depthMax più i sensi sono simili, questo perchè vuol dire che la distanza tra i due sensi (len(s1,s2)) è minima o uguale a 0

In [220]:
def shortest_path(syn1, syn2):
    #return syn1.path_similarity(syn2) 
    return 2 * DEPTH_MAX - length(syn1, syn2) if length(syn1, syn2) is not None else 0

### Leakcock & Chodorow
I valori sono compresi tra 0 e log(2depthMax + 1)

In [25]:
def leakcock_chodorow(syn1, syn2):
    '''if syn1.name().split('.')[1] == syn2.name().split('.')[1]:
        return syn1.lch_similarity(syn2)
    else:
        return 0'''
    distance = length(syn1, syn2) 
    if distance is not None:
        if distance != 0:
            return -math.log(distance / 2 * DEPTH_MAX)
        else:
             return -math.log(distance+1 / (2 * DEPTH_MAX)+1)
    else:
        return 0

print(leakcock_chodorow(wn.synsets('dog')[0], wn.synsets('cat')[0]))
print(wn.synsets('dog')[0].lch_similarity(wn.synsets('cat')[0]))

-3.6888794541139363
2.0281482472922856


### Execution

read lines from WordSim353.csv

In [222]:
dataset = []

with open(r'../resources/WordSim353.csv', 'r') as f:
    word_sim = f.readlines()[1:]
    for tuple in word_sim:
        dataset.append(tuple.split(','))

get synset from dataset

In [223]:
syns_1 = []
syns_2 = []

for tuple in dataset:
    syns_1.append(wn.synsets(tuple[0]))
    syns_2.append(wn.synsets(tuple[1]))

compute similarity using the three methods described above over all the combinations of synsets of every word in the input file.
For each couple, take the maximum value of each similarity 

In [224]:
wp = []
sp = []
lc = []

max_wu = 0
max_sp =  0
max_lc = 0

for i in range(len(syns_1)):
    for j in range(len(syns_1[i])):
        for k in range(len(syns_2[i])):
            
            #print(f"syn1: {syns_1[i][j]}, syn2: {syns_2[i][k]} --> WU_PALMER: {wu_palmer(syns_1[i][j], syns_2[i][k])} - SHORTEST_PATH: {shortest_path(syns_1[i][j], syns_2[i][k])} - LEAKCOCK_CHODOROW: {leakcock_chodorow(syns_1[i][j], syns_2[i][k])}")
            
            wu_temp = wu_palmer(syns_1[i][j], syns_2[i][k])
            sp_temp = shortest_path(syns_1[i][j], syns_2[i][k])
            lc_temp = leakcock_chodorow(syns_1[i][j], syns_2[i][k])

            if wu_temp > float(max_wu):
                max_wu = wu_temp
            if sp_temp > float(max_sp):
                max_sp = sp_temp
            if lc_temp < float(max_lc):
                max_lc = lc_temp

    wp.append(max_wu) 
    max_wu = 0
    sp.append(max_sp)
    max_sp = 0
    lc.append(max_lc)
    max_lc = 0

print(len(wp))
print(len(sp))
print(len(lc))

353
353
353


### Evaluation Methods
- similarity values in WordSim353.csv

In [225]:
word_sim_353 = [float(data[2].strip('\n')) for data in dataset]

#### Spearman

In [226]:
print("\nSPEARMAN'S CORRELATION COEFFICIENT\n")
print(f"WU & PALMER             {stats.spearmanr(wp, word_sim_353).correlation}")
print(f"SHORTEST PATH           {stats.spearmanr(sp, word_sim_353).correlation}")
print(f"LEAKCOCK & CHODOROW     {stats.spearmanr(lc, word_sim_353).correlation}\n\n")

print("\nPEARSON'S CORRELATION COEFFICIENT\n")
print(f"WU & PALMER             {stats.pearsonr(wp, word_sim_353)}")
print(f"SHORTEST PATH           {stats.pearsonr(sp, word_sim_353)}")
print(f"LEAKCOCK & CHODOROW     {stats.pearsonr(lc, word_sim_353)}")





SPEARMAN'S CORRELATION COEFFICIENT

WU & PALMER             0.32927825770618513
SHORTEST PATH           0.2895253335747299
LEAKCOCK & CHODOROW     0.10503987463606122



PEARSON'S CORRELATION COEFFICIENT

WU & PALMER             (0.2846480047884611, 5.266550780950281e-08)
SHORTEST PATH           (0.16653216769600956, 0.0016911511526584368)
LEAKCOCK & CHODOROW     (0.15199780024041876, 0.004205795923151336)
