#### Lab 7 
#### ROUGE Metrics

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
import numpy as np
from fuzzywuzzy import fuzz 
import ipywidgets as widgets
import pprint
from ipywidgets import interact, interact_manual
import re
# Relative path of the input CSV; it is read below with ';' as the field separator.
__PATH__ = "./data.csv"



In [2]:
# Load the article table; the first row of the file holds the column names.
df = pd.read_csv(
    __PATH__,
    sep=";",
    header=0,
)

#### Preprocessing each title into a tuple of tokens

In [3]:
# Build the stop-word set once. Calling stopwords.words('english') inside the
# per-token filter reloads the whole word list for every token and then does a
# linear list search; a set gives the same membership answers far more cheaply.
english_stopwords = frozenset(stopwords.words('english'))

# For every title: strip the '.pdf' suffix, pull out the capitalised words
# (one upper-case letter followed by lower-case letters), lower-case them and
# drop English stop words. Each title becomes a tuple so it can later be used
# as a dictionary key.
titles = [
    tuple(
        token
        for token in (
            word.lower()
            for word in re.findall('([A-Z]{1}[a-z]+)', raw_title.replace('.pdf', ''))
        )
        if token not in english_stopwords
    )
    for raw_title in df['title']
]

In [4]:
# Map every title (a tuple of tokens) to a {token: [synsets]} dictionary
# holding the WordNet synsets found for each of its tokens.
res = {
    title_tokens: {token: list(wn.synsets(token)) for token in title_tokens}
    for title_tokens in titles
}

In [5]:
#helpers

def get_hyps(word_set):
    """Collect, for each word, the hypernyms of all of its WordNet synsets.

    Parameters
    ----------
    word_set : iterable of str
        Words to look up with ``wn.synsets``.

    Returns
    -------
    dict
        ``{word: set_of_hypernym_synsets}``; a word with no synsets (or whose
        synsets have no hypernyms) maps to an empty set.
    """
    return {
        word: {
            hypernym
            for sense in wn.synsets(word)
            for hypernym in sense.hypernyms()
        }
        for word in word_set
    }

def calculate_f1(rcl, prec):
    """Return the F1 score, i.e. the harmonic mean of recall and precision.

    Parameters
    ----------
    rcl : float
        Recall.
    prec : float
        Precision.

    Returns
    -------
    float
        ``2 * rcl * prec / (rcl + prec)``, or ``0.0`` when both inputs are
        zero (the harmonic mean is undefined there; 0 is the conventional
        value and avoids a ZeroDivisionError).
    """
    denominator = rcl + prec
    if denominator == 0:
        return 0.0
    return 2 * rcl * prec / denominator

def get_f1_and_intr(a, b):
    """Score the overlap of two sets with F1 and return the overlap itself.

    Recall is measured against ``a`` and precision against ``b``.

    Parameters
    ----------
    a, b : set
        The two collections being compared.

    Returns
    -------
    tuple
        ``(f1, intersection)``; ``f1`` is ``0`` when the sets share nothing.
    """
    shared = a.intersection(b)
    if not shared:
        return 0, shared

    overlap = len(shared)
    return calculate_f1(overlap / len(a), overlap / len(b)), shared

def distance(a,b):
    """Distance between two token collections based on exact and hypernym overlap.

    The score combines two F1 values with weights 2:1:
    the F1 of the exact word overlap, and an F1 built from a "hypernym
    penalty" computed on the words that are *not* shared verbatim.

    Parameters
    ----------
    a, b : iterable of str
        Tokens of the two titles; duplicates are ignored.

    Returns
    -------
    float
        ``1 - (2 * f1_exact + f1_hypernym) / 3``, which lies in ``[0, 1]``.
        If either side has no words left after removing the exact matches,
        only the exact-overlap F1 is used (``1 - f1_exact``).
    """
    a = set(a)
    b = set(b)
    f1score, intr = get_f1_and_intr(a, b)
    # From here on only the words that did not match verbatim are compared.
    a = a - intr
    b = b - intr

    if len(a) == 0 or len(b) == 0:
        return 1.0 - f1score

    buff_a = get_hyps(a)
    buff_b = get_hyps(b)

    # First level: +1 for every (wordA, wordB) pair sharing a hypernym; the
    # lemma names of those shared hypernyms feed the second-level check.
    hyp_penalty = 0
    scnd_lvl_hyps = set()
    for wordA in a:
        for wordB in b:
            common_hyps = buff_a[wordA].intersection(buff_b[wordB])
            if common_hyps:
                hyp_penalty += 1
                for s in common_hyps:
                    scnd_lvl_hyps.update(s.lemma_names())

    # Second level: +0.5 whenever the hypernyms of a shared-hypernym lemma
    # overlap with the hypernyms of a remaining word on either side.
    if scnd_lvl_hyps:
        buff_scnd_lvl_hyps = get_hyps(scnd_lvl_hyps)
        for h in scnd_lvl_hyps:
            for wordA in a:
                if buff_scnd_lvl_hyps[h].intersection(buff_a[wordA]):
                    hyp_penalty += 0.5

            for wordB in b:
                if buff_scnd_lvl_hyps[h].intersection(buff_b[wordB]):
                    hyp_penalty += 0.5

    if hyp_penalty > 0:
        # hyp_penalty counts word *pairs* plus half-point bonuses, so it can
        # exceed len(a) or len(b). Clamp both ratios to 1.0; otherwise the
        # hypernym F1 can exceed 1 and the returned distance goes negative,
        # ranking a different title closer than an identical one.
        rcl_hyp = min(1.0, hyp_penalty / len(a))
        prec_hyp = min(1.0, hyp_penalty / len(b))
        f1score_hyp = calculate_f1(rcl_hyp, prec_hyp)
    else:
        f1score_hyp = 0

    f1res = (2*f1score+f1score_hyp)/3
    return (1.0 - f1res)

In [6]:
# Freeze the (title, synsets) pairs in a list so they can be addressed by index,
# then fill the full pairwise title-distance matrix.
buff = list(res.items())
n_titles = len(buff)
dist = np.zeros((n_titles, n_titles))
for row, (left_title, _) in enumerate(buff):
    for col, (right_title, _) in enumerate(buff):
        dist[row, col] = distance(left_title, right_title)
        

#### Top ten closest articles with fuzzy metrics of titles

In [7]:
@interact(ind=(0,len(buff)-1,1))
def h(ind=0):
    """Print the title at position ``ind`` and its ten nearest titles by ``dist``.

    Parameters
    ----------
    ind : int
        Index into ``buff`` / row of ``dist`` selecting the query title.
    """
    pp = pprint.PrettyPrinter(indent=4)
    print(' '.join(buff[ind][0]))
    # Exclude the query title by index instead of skipping the first sorted
    # position: with tied distances, or a title whose self-distance is not
    # the row minimum (e.g. a title with no tokens), the query is not
    # guaranteed to sort first and a real neighbour would be dropped.
    nearest = [i for i in dist[ind].argsort() if i != ind][:10]
    pp.pprint([buff[i][0] for i in nearest])


interactive(children=(IntSlider(value=0, description='ind', max=995), Output()), _dom_classes=('widget-interac…

In [8]:
@interact(ind=(0,len(buff)-1,1))
def hypernyms(ind=0):
    """Print the title at position ``ind`` and the synset lists stored for its words.

    Parameters
    ----------
    ind : int
        Index into ``buff`` selecting the title to display.
    """
    title_tokens, synsets_by_word = buff[ind]
    print(' '.join(title_tokens))
    pprint.pprint(synsets_by_word, indent=4)

interactive(children=(IntSlider(value=0, description='ind', max=995), Output()), _dom_classes=('widget-interac…