Skip to content

Commit

Permalink
added simpler signature function
Browse files Browse the repository at this point in the history
  • Loading branch information
alvations committed Jul 15, 2017
1 parent 2794749 commit c347053
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 12 deletions.
36 changes: 35 additions & 1 deletion pywsd/lesk.py
Expand Up @@ -16,7 +16,41 @@
from pywsd.cosine import cosine_similarity as cos_sim
from pywsd.utils import lemmatize, porter, lemmatize_sentence, synset_properties

# Stopwords for signature filtering: English stopwords plus punctuation
# tokens, so tokenizer output such as ',' and '.' is dropped as well.
# (Diff residue left the pre-commit line in place; only this one is kept.)
EN_STOPWORDS = stopwords.words('english') + list(string.punctuation)

def signatures(ambiguous_word, pos=None,
               hyperhypo=True, adapted=False,
               remove_stopwords=True, to_lemmatize=True):
    """
    Builds a dictionary mapping each WordNet synset of *ambiguous_word*
    to its "signature": a list of tokens drawn from the synset's
    definition, example sentences and lemma names.

    :param ambiguous_word: Surface word to look up in WordNet.
    :param pos: WordNet POS tag ('a', 'r', 's', 'n', 'v') or None;
        anything else is treated as None (no POS restriction).
    :param hyperhypo: If True, also include lemma names of the synset's
        hypernyms, hyponyms and their instance variants.
    :param adapted: If True (Adapted Lesk), also include lemma names of
        holonyms, meronyms and similar_tos.
    :param remove_stopwords: If True, drop tokens found in EN_STOPWORDS.
    :param to_lemmatize: If True, lemmatize every signature token.
    :return: dict of {Synset: [token, ...]}.
    """
    # Ensure that the POS is supported.
    pos = pos if pos in ['a', 'r', 's', 'n', 'v', None] else None
    # Holds the synset->signature dictionary.
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word, pos=pos):
        signature = []
        # Adds the definition, example sentences and lemma_names.
        signature += word_tokenize(ss.definition())
        signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
        signature += ss.lemma_names()
        # Optional: includes lemma_names of hyper-/hyponyms.
        if hyperhypo:
            # BUG FIX: the original rebound the *hyperhypo* parameter to
            # this set; if an earlier synset had no hyper-/hyponyms, the
            # empty (falsy) set silently disabled this expansion for every
            # subsequent synset. A distinct local name avoids that.
            hyperhypo_synsets = set(ss.hyponyms() + ss.hypernyms() +
                                    ss.instance_hyponyms() + ss.instance_hypernyms())
            signature += set(chain(*[i.lemma_names() for i in hyperhypo_synsets]))
        # Optional: Includes signatures from related senses as in Adapted Lesk.
        if adapted:
            # Includes lemma_names from holonyms, meronyms and similar_tos
            related_senses = set(ss.member_holonyms() + ss.part_holonyms() + ss.substance_holonyms() +
                                 ss.member_meronyms() + ss.part_meronyms() + ss.substance_meronyms() +
                                 ss.similar_tos())
            signature += set(chain(*[i.lemma_names() for i in related_senses]))
        # Optional: removes stopwords.
        if remove_stopwords:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if to_lemmatize:
            signature = [lemmatize(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures


def compare_overlaps_greedy(context, synsets_signatures):
"""
Expand Down
20 changes: 10 additions & 10 deletions pywsd/utils.py
Expand Up @@ -12,17 +12,17 @@

# Maps nltk Synset accessor names to the Python type each one returns.
# (Diff residue had duplicated the last four entries after the closing
# brace, which is a syntax error; the dict is emitted once here.)
SS_PARAMETERS_TYPE_MAP = {'definition': str, 'lemma_names': list,
                          'examples': list, 'hypernyms': list,
                          'hyponyms': list, 'member_holonyms': list,
                          'part_holonyms': list, 'substance_holonyms': list,
                          'member_meronyms': list, 'substance_meronyms': list,
                          'part_meronyms': list, 'similar_tos': list}

def remove_tags(text):
    """
    Removes <tags> in angled brackets from *text*.

    Each tag is replaced by a single space and the remaining whitespace
    is normalized, so words separated only by tags stay separated.

    :param text: Input string possibly containing <...> tags.
    :return: Tag-free string with single-space-separated tokens.
    """
    import re
    # BUG FIX: the original used dict.iteritems() and a bare reduce(),
    # both Python-2-only (reduce moved to functools in py3). A single
    # re.sub pass replaces every tag occurrence with the same result.
    no_tag_text = re.sub(r"<[^>\n]*>", " ", text)
    # Collapse runs of whitespace introduced by the substitutions.
    return " ".join(no_tag_text.split())

def offset_to_synset(offset):
"""
Expand Down Expand Up @@ -86,7 +86,7 @@ def lemmatize(ambiguous_word, pos=None, neverstem=False,
else:
return stem
else:
return lemma
return lemma


def penn2morphy(penntag, returnNone=False):
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,3 +1,3 @@
numpy>=1.8.0
six>=1.9.0
nltk>=3.2.4

0 comments on commit c347053

Please sign in to comment.