# Es.1 - Conceptual Similarity

In questo esercizio andiamo ad esplorare varie tecniche per calcolare la similarità semantica tra 
due parole. Per farlo utilizzeremo il dataset WordSim353.csv, che contiene una serie di coppie di
 parole e il loro punteggio di similarità. Il punteggio è un valore reale compreso tra 0 e 10.

Le misure di similarità che utilizzeremo sono:

- Wu and Palmer
- Shortest Path
- Leacock & Chodorow

### Imports

In [121]:
from nltk.corpus import stopwords
from collections import Counter
from gensim.test.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
import random

### Data preparation

In [122]:
def get_text_from_file(path):
    '''
    Read a text file and return its rows as lists of preprocessed tokens.

    Each row is tokenized with word_tokenize, tokens found in the NLTK
    English stopword list (compared in lowercase) are dropped, and the
    remaining text is passed to gensim's simple_preprocess with deacc=True.

    path: path of the text file to read.
    Returns a list containing one list of tokens per row of the file.
    '''
    stop_words = set(stopwords.words('english'))
    rows = []
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'r') as f:
        for row in f:
            kept = [w for w in word_tokenize(row) if w.lower() not in stop_words]
            # Join the tokens back into plain text: str(list) would hand the
            # list's repr (quotes, commas, escape sequences) to the preprocessor.
            rows.append(simple_preprocess(" ".join(kept), deacc=True))
    return rows

def extract_triple(path):
    '''
    Read a comma-separated file and return its rows, header excluded.

    Every row is stripped of surrounding whitespace, split on ",", and
    truncated to its first three fields.

    path: path of the file to read.
    Returns a list of tuples of strings (at most three fields each); the
    first row of the file is treated as a header and left out. An empty
    file yields an empty list.
    '''
    with open(path, 'r') as f:
        rows = [tuple(row.strip().split(",")[:3]) for row in f]
    # Slicing (instead of pop(0)) drops the header row without failing
    # when the file has no rows at all.
    return rows[1:]

In [123]:
# Load the dataset rows (header row dropped) as tuples of up to three comma-separated fields.
values = extract_triple('../data/WordSim353.csv')

### Wu and Palmer

In [166]:
def get_synsets(term):
    '''
    Return the WordNet synsets of a term.

    term: the word to look up.
    Returns the list given by wn.synsets(term), or None when that list is empty.
    '''
    # Query WordNet once and reuse the result.
    synsets = wn.synsets(term)
    return synsets if synsets else None

def depth(syn):
    '''
    Return the depth of a synset, i.e. the value of its max_depth() method.

    syn: a single synset, a list/tuple of synsets (only the first element
         is considered), or None.
    Returns 0 for None and for an empty sequence.
    '''
    if syn is None:
        return 0
    # isinstance also accepts list subclasses and tuples, unlike an exact type check.
    if isinstance(syn, (list, tuple)):
        return syn[0].max_depth() if syn else 0
    return syn.max_depth()

def lch(syn1, syn2):
    '''
    Look up the lowest common hypernyms shared by two synsets.

    Gives back None if either synset is None; otherwise the result of
    syn1.lowest_common_hypernyms(syn2) is returned as-is.
    '''
    missing = syn1 is None or syn2 is None
    return None if missing else syn1.lowest_common_hypernyms(syn2)

def wu_pal_sim(syn1, syn2):
    '''
    Wu-Palmer similarity of two synsets, delegated to the synset's own
    wup_similarity method.

    Yields 0 when either argument is None; otherwise the value of
    syn1.wup_similarity(syn2) is passed through unchanged.
    '''
    both_present = syn1 is not None and syn2 is not None
    if both_present:
        return syn1.wup_similarity(syn2)
    return 0

def my_wu_pal_sim(syn1, syn2):
    '''
    Hand-written Wu-Palmer similarity of two synsets.

    Computes 2 * depth(lcs) / (depth(syn1) + depth(syn2)), where lcs is the
    result of lch(syn1, syn2) and depth() is the helper defined in this file.

    Returns 0 when either synset is None or when the two depths sum to 0.
    '''
    if syn1 is None or syn2 is None:
        # Same "no similarity" value as wu_pal_sim, so the score can always
        # be compared numerically by callers.
        return 0
    total_depth = depth(syn1) + depth(syn2)
    if total_depth == 0:
        # Avoid dividing by zero.
        return 0
    return 2.0 * depth(lch(syn1, syn2)) / total_depth
    
def max_similarity(syns1, syns2):
    '''
    Find the pair of synsets with the highest my_wu_pal_sim score.

    syns1, syns2: iterables of synsets, or None.
    Returns a tuple (synset1, synset2, score) for the best-scoring pair.
    Returns ("", "", 0) when either argument is None or when no pair
    scores above 0.
    '''
    sim_max = ("", "", 0)

    # Check for missing synsets before iterating: looping over None raises TypeError.
    if syns1 is None or syns2 is None:
        return sim_max

    for syn1 in syns1:
        for syn2 in syns2:
            sim = my_wu_pal_sim(syn1, syn2)
            # Skip a None score so the comparison below cannot fail.
            if sim is not None and sim > sim_max[2]:
                sim_max = (syn1, syn2, sim)

    return sim_max

In [167]:
# For every dataset row, print the pair of synsets with the highest similarity.
for val in values:
    # Synsets of the first and second field of the row (None when a term has no synsets).
    syns1 = get_synsets(val[0])
    syns2 = get_synsets(val[1])
        
    # (synset1, synset2, score) of the best pair; ("", "", 0) if no pair scored above 0.
    sim = max_similarity(syns1, syns2)

    print("\n", sim)



 (Synset('sexual_love.n.02'), Synset('sexual_activity.n.01'), 0.9090909090909091)

 (Synset('tiger.n.02'), Synset('big_cat.n.01'), 0.9629629629629629)

 (Synset('tiger.n.01'), Synset('tiger.n.01'), 1.0)

 (Synset('book.n.02'), Synset('newspaper.n.03'), 0.8571428571428571)

 (Synset('computer.n.01'), Synset('keyboard.n.01'), 0.8)

 (Synset('computer.n.01'), Synset('internet.n.01'), 0.5882352941176471)

 (Synset('airplane.n.01'), Synset('car.n.02'), 0.7)

 (Synset('train.n.01'), Synset('car.n.02'), 0.7058823529411765)

 ('', '', 0)

 (Synset('television_receiver.n.01'), Synset('radio_receiver.n.01'), 0.9)

 (Synset('medium.n.01'), Synset('radio.n.01'), 0.8)

 ('', '', 0)

 (Synset('bread.n.01'), Synset('butter.n.01'), 0.7142857142857143)

 (Synset('cucumber.n.01'), Synset('potato.n.02'), 0.8)

 (Synset('doctor.n.01'), Synset('nurse.n.01'), 0.8571428571428571)

 (Synset('professor.n.01'), Synset('doctor.n.01'), 0.7272727272727273)

 (Synset('student.n.01'), Synset('professor.n.01'), 0.63