In [1]:
from nltk.corpus import wordnet as wn
from collections import defaultdict
import numpy as np
import json

In [2]:
# Tally how often each sense key is annotated for every lemma in the SemCor file.
# Layout: mfs_kb[lemma][sense_key] -> number of occurrences.
mfs_kb = defaultdict(lambda: defaultdict(int))
with open('data/jsonl/SemCor.jsonl', 'r') as f:
    for record in map(json.loads, f):
        # A record may list several ';'-separated sense keys; every one is counted.
        sense_keys = record['sense'].split(';')
        lemma_counts = mfs_kb[record['lemma']]
        for sense_key in sense_keys:
            lemma_counts[sense_key] += 1

In [3]:
# For each lemma keep the sense key with the highest count; ties go to the
# sense that was seen first. Unknown lemmas fall back to '' via defaultdict(str).
mfs = defaultdict(
    str,
    {lemma: max(counts, key=counts.get) for lemma, counts in mfs_kb.items()},
)

In [4]:
# Names of the evaluation sets; only 'ALL' is visibly used in this notebook
# (read as data/jsonl/ALL.jsonl).
# NOTE(review): presumably each name has a matching data/jsonl/<name>.jsonl file — confirm.
wsd = ['ALL', 'semeval2007', 'semeval2013', 'semeval2015', 'senseval2', 'senseval3']

In [40]:
def wn_first(lemma, pos = None):
    """Return the sense key of `lemma` in the first synset WordNet lists for it.

    Parameters
    ----------
    lemma : str
        Lemma to look up.
    pos : str or None
        WordNet part-of-speech code ('n', 'v', 'a', 'r'), or None to search
        every part of speech. Passed straight through to ``wn.synsets``.

    Returns
    -------
    str
        The key of the first lemma in the first synset whose key starts with
        ``'<lemma>%'``. Returns '' when WordNet has no synset for the
        lemma/pos pair, or when the first synset has no key for this lemma.
    """
    synsets = wn.synsets(lemma, pos)
    if not synsets:
        # Unknown lemma/pos: report "no answer" instead of raising IndexError.
        return ''

    # Sense keys are always lowercase, while wn.synsets() accepts any casing,
    # so the prefix has to be lowercased to match e.g. 'Paris' -> 'paris%...'.
    prefix = '{}%'.format(lemma.lower())
    for candidate in synsets[0].lemmas():
        key = candidate.key()
        if key.startswith(prefix):
            return key
    return ''
    
# Earlier draft of the first-sense lookup, kept for reference only. It maps the
# POS tag itself and asserts that a matching key exists instead of returning ''.
# def wn_first_sense(lemma, postag=None):
#     pos_map = {'VERB': 'v', 'NOUN': 'n', 'ADJ': 'a', 'ADV': 'r'}
#     first_synset = wn.synsets(lemma, pos=pos_map[postag])[0]
#     found = False
#     for lem in first_synset.lemmas():
#         key = lem.key()
#         if key.startswith('{}%'.format(lemma)):
#             found = True
#             break
#     assert found
#     return key

In [6]:
def evaluate(system: list, reference: list):
    """Score one instance: 1.0 if any system answer is a reference answer, else 0.0.

    Both lists hold strings; surrounding whitespace is stripped from every
    entry before the two sides are compared.
    """
    predicted = {answer.strip() for answer in system}
    gold = {answer.strip() for answer in reference}
    return 0.0 if predicted.isdisjoint(gold) else 1.0

In [34]:
# Maps the dataset's POS tags to the one-letter codes handed to wn.synsets().
pos = dict(VERB='v', NOUN='n', ADJ='a', ADV='r')

In [48]:
# Score the WordNet first-sense baseline on every instance of ALL.jsonl,
# collecting the scores both overall and per POS tag.
scores = []
outputs = []
pos_wise = defaultdict(list)
with open('data/jsonl/ALL.jsonl', 'r') as f:
    for line in f:
        items = json.loads(line)
        # Deliberately not named `mfs`: that name already holds the SemCor
        # most-frequent-sense table and must not be overwritten by a string.
        predicted = wn_first(items['lemma'], pos[items['pos']])
        truth = items['sense'].split(';')
        # Evaluate once and reuse the result for both tallies.
        score = evaluate([predicted], truth)
        scores.append(score)
        pos_wise[items['pos']].append(score)

In [41]:
# Write one "<instance id> <predicted sense key>" line per instance of ALL.jsonl.
# The input is opened first so that a missing/unreadable input file does not
# leave behind a freshly truncated, empty output file.
with open('data/jsonl/ALL.jsonl', 'r') as f, \
        open('data/outputs/ALL_mfs.txt', 'w') as fw:
    for line in f:
        items = json.loads(line)
        # Deliberately not named `mfs`: that name already holds the SemCor
        # most-frequent-sense table and must not be overwritten by a string.
        predicted = wn_first(items['lemma'], pos[items['pos']])
        fw.write(f'{items["id"]} {predicted}\n')

In [49]:
# Mean score per POS tag, i.e. the share of instances whose prediction matched a gold sense.
{k: np.mean(v) for k, v in pos_wise.items()}

{'NOUN': 0.676046511627907,
 'ADJ': 0.743455497382199,
 'VERB': 0.5030266343825666,
 'ADV': 0.8092485549132948}

In [50]:
# Mean score over all evaluated instances.
np.mean(scores)

0.6518681924720805