Skip to content

Commit

Permalink
Minor updates to accuracy test
Browse files Browse the repository at this point in the history
  • Loading branch information
bjascob committed May 17, 2019
1 parent ca6196e commit ae1a805
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 24 deletions.
61 changes: 39 additions & 22 deletions tests/accuracy/20_TestLemmatizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,50 @@ def __init__(self):
import lemminflect
self.name = 'LemmInflect'
self.version_string = 'LemmInflect version: %s' % lemminflect.__version__
# Force loading dictionary and model so lazy loading doesn't show up in run times
lemmas = lemminflect.getAllLemmas('testing', 'VERB')
lemmas = lemminflect.getAllLemmasOOV('xxtesting', 'VERB')

# Use only the dictionary methods
def getLemmaDictOnly(self, entry, upos):
    """Return the first lemma that lemminflect.getAllLemmas lists under upos.

    The lookup is done on entry.infl.  An empty tuple is returned when the
    result has no (or an empty) value for upos, so callers can test the
    return value for truthiness.
    """
    by_pos = lemminflect.getAllLemmas(entry.infl, upos)
    candidates = by_pos.get(upos, ())
    # The first candidate is the one reported back; empty means "no lemma".
    return candidates[0] if candidates else ()

# Use only the model methods
def getLemmaOOVOnly(self, entry, upos):
    """Return the first lemma that lemminflect.getAllLemmasOOV lists under upos.

    The lookup is done on entry.infl.  An empty tuple is returned when the
    result has no (or an empty) value for upos, so callers can test the
    return value for truthiness.
    """
    by_pos = lemminflect.getAllLemmasOOV(entry.infl, upos)
    candidates = by_pos.get(upos, ())
    # The first candidate is the one reported back; empty means "no lemma".
    return candidates[0] if candidates else ()

# Standard combined method
def getLemma(self, entry, upos):
    """Return the first result of lemminflect.getLemma for entry.infl and upos.

    An empty tuple is returned when lemminflect.getLemma yields nothing, so
    callers can test the return value for truthiness.
    """
    candidates = lemminflect.getLemma(entry.infl, upos)
    return candidates[0] if candidates else ()

# Get the lemmas for every upos (pos_type='a' covers both adv and adj).
# With LemmInflect 0.1.0 and the 119,194-case test set, 88,182 of the words are out-of-vocabulary (OOV).
def getLemmas(self, entry):
possible_lemmas = set()
for upos in entry.upos_list:
lemmas = lemminflect.getLemma(entry.infl, upos)
lemma = lemmas[0] # first one is the most common form
possible_lemmas.add( lemma )
#lemma = self.getLemmaDictOnly(entry, upos)
#lemma = self.getLemmaOOVOnly(entry, upos)
lemma = self.getLemma(entry, upos)
if lemma:
possible_lemmas.add(lemma)
return possible_lemmas

# Spacy
class SpacyLemmatizer(object):
def __init__(self, smodel):
import spacy
self.morphology = spacy.load(smodel).vocab.morphology
self.lemmatizer = spacy.load(smodel).vocab.morphology.lemmatizer
self.name = 'Spacy'
self.version_string = 'Spacy version: %s' % spacy.__version__

Expand All @@ -48,7 +77,7 @@ def getLemmas(self, entry):
for upos in entry.upos_list:
# The 3rd param, morphology=None, only impacts the call to is_base_form()
# so omitting it should only impact trying to lemmatize a lemma.
lemmas = self.morphology.lemmatizer(entry.infl, upos)
lemmas = self.lemmatizer(entry.infl, upos)
lemma = lemmas[0] # See morphology.pyx::lemmatize
possible_lemmas.add( lemma )
return possible_lemmas
Expand Down Expand Up @@ -105,26 +134,24 @@ def getLemmas(self, entry):

def testLemmatizer(tester, lemmatizer, results_dir):
tester.resetTest()
# Loop through the sentences
print('Processing sentences')
print('Processing inflections')
ntests = len(tester)
pb = ProgressBar(ntests)
st = time.time()
for i, entry in enumerate(tester):
if i%10 == 0: pb.update(i)
if i%1000 == 0: pb.update(i)
possible_lemmas = lemmatizer.getLemmas(entry)
tester.addResult(entry, possible_lemmas)
duration = time.time() - st
pb.clear()
print()

# Print some stats
bad_returns = tester.lemma_errors+tester.lemma_no_ret
print(lemmatizer.version_string)
print('{:,} total test cases were {:,} had no returns.'.format(ntests,tester.lemma_no_ret))
print('{:,} total test cases where {:,} had no returns.'.format(ntests,tester.lemma_no_ret))
print('{:.1f} usecs per lemma'.format(int(1e6*duration/ntests)))
print('{:,} incorrect lemmas = {:.1f}% accuracy'.format((bad_returns),
100.*(1-bad_returns/ntests)))
print('{:,} incorrect lemmas = {:.1f}% accuracy'.format((tester.lemma_errors),
100.*(1-tester.lemma_errors/ntests)))
print('Results by pos type')
for i in range(3):
print(' {:8} : {:7,} / {:6,} = {:5.1f}% accuracy'.format(\
Expand All @@ -150,16 +177,6 @@ def testLemmatizer(tester, lemmatizer, results_dir):
results_dir = '/tmp/'
smodel = 'en_core_web_sm'

# Debug return results
if 0:
infl = 'aardvarks'
pos_type = 'N'
entry = Entry(infl, pos_type, [], '') # don't populate results
lemmatizer = SNLPLemmatizer()
lemmas = lemmatizer.getLemmas(entry)
print('%s/%s -> %s' % (entry.infl, entry.pos_type, lemmas))
sys.exit(0)

# Load the corpus to test with
print('Loading corpus ', config.acc_lemma_corp_fn)
tester = LemmatizerTest(config.acc_lemma_corp_fn)
Expand Down
5 changes: 3 additions & 2 deletions tests/accuracy/acclib/LemmatizerTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ def addResult(self, entry, possible_lemmas):
self.van_counts[vidx] += 1
if not possible_lemmas:
self.lemma_no_ret += 1
return
if entry.lemma not in possible_lemmas:
self.van_errors[vidx] += 1
self.lemma_errors += 1
elif entry.lemma not in possible_lemmas:
self.van_errors[vidx] += 1
self.lemma_errors += 1
plemmas_str = '/'.join(sorted(possible_lemmas))
Expand Down

0 comments on commit ae1a805

Please sign in to comment.