# Fuzzy Text Matching

In [1]:
import os, sys
# Locate the project root: the prefix of the current working directory that
# ends with the first occurrence of 'pygents'.
cwd = os.getcwd()
root_marker = 'pygents'
marker_pos = cwd.find(root_marker)
if marker_pos < 0:
    # str.find returns -1 when the marker is absent; slicing with it would
    # silently produce an unrelated 6-character prefix of cwd.
    raise RuntimeError("'%s' not found in working directory %r" % (root_marker, cwd))
project_path = cwd[:marker_pos + len(root_marker)]
# Make the project importable and let relative paths resolve from the root.
if project_path not in sys.path:
    sys.path.append(project_path)
os.chdir(project_path)

# Drop any cached copy of the module so the import below loads it afresh.
sys.modules.pop('pygents.text_matching', None)

from pygents.text_matching import FuzzyMatcher
from pygents.aigents_api import load_lines


[nltk_data] Downloading package punkt to /Users/akolonin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Small hand-made candidate list used to sanity-check the matcher.
fm = FuzzyMatcher([
    'Anton Kolonin',
    'Evgeny Bochkov',
    'Alexey Gluschshenko',
    'International Business Machines',
])

# (query, expected string form of the tuple returned by match()) pairs.
expected_matches = [
    ('Alexey', "('Alexey Gluschshenko', 0.53, 'chars')"),
    ('Alex', "('Alexey Gluschshenko', 0.41, 'chars')"),
    ('International Business', "('International Business Machines', 0.71, 'wordsonly')"),
    ('Interational Busines', "('International Business Machines', 0.84, 'chars')"),
]
for query, expected in expected_matches:
    assert str(fm.match(query)) == expected


In [3]:
# use list of company names from CompanyFacts archive downloaded from https://www.sec.gov/edgar/
with open('CompanyFacts_companies.txt', errors='ignore') as f:
    stripped = (raw.strip() for raw in f)
    # keep only entries longer than one character (this also drops empty lines)
    lines = [name for name in stripped if len(name) > 1]
print(len(lines))
# preview the first 20 entries
for name in lines[:20]:
    print(name)

16267
AAR CORP
ABBOTT LABORATORIES
WORLDS INC.
ACETO CORP
ACME UNITED CORP
ADAMS RESOURCES & ENERGY, INC.
BK Technologies Corp
ADVANCED MICRO DEVICES, INC
BALLY TECHNOLOGIES, INC.
AGNICO EAGLE MINES LIMITED
AIR PRODUCTS AND CHEMICALS, INC.
AKORN INC
SPIRE ALABAMA INC
CECO ENVIRONMENTAL CORP.
Matson,Inc.
ALEXANDERS INC
ALICO, INC.
CHENIERE ENERGY, INC.
ALLEGHENY ENERGY, INC
ALLIED DEFENSE GROUP INC


In [4]:
# Build a FuzzyMatcher over the company names read into `lines` in the previous cell.
cfm = FuzzyMatcher(lines)

In [5]:
# Probe the company matcher with exact, partial and misspelled names.
company_queries = (
    'ABBOTT',
    'ABOTT',
    'ADVANCED MICRO DEVICES, INC',
    'ADVANCED MICRO DEVICES INC.',
    'ADVANCED MICRO DEVICES',
    'ADVANCED MACRO DEVICES',
    'ADVANCED MICRO',
    'MICRO DEVICES',
    'ADVANCED DEVICES',
    'ADVANCED DEVICE',
)
for query in company_queries:
    print(cfm.match(query))


('ABBOTT LABORATORIES', 0.64, 'chars')
('ABBOTT LABORATORIES', 0.61, 'chars')
('ADVANCED MICRO DEVICES, INC', 1.0, 'wordsonly')
('ADVANCED MICRO DEVICES, INC', 1.0, 'wordsonly')
('ADVANCED MICRO DEVICES, INC', 0.82, 'wordsonly')
('ADVANCED MICRO DEVICES, INC', 0.85, 'chars')
('ADVANCED MICRO DEVICES, INC', 0.58, 'wordsonly')
('ADVANCED MICRO DEVICES, INC', 0.58, 'wordsonly')
('ADVANCED MICRO DEVICES, INC', 0.81, 'chars')
('ADVANCED MICRO DEVICES, INC', 0.8, 'chars')


In [6]:
# Load the lexicon, keeping only the first whitespace-separated field of each line.
with open('./data/dict/en/lexicon.txt', errors='ignore') as f:
    fields = [raw.split() for raw in f]
# A blank or whitespace-only line splits into an empty list; skip it rather
# than fail on [0]. Entries of one character are dropped as before.
lines = [parts[0] for parts in fields if parts and len(parts[0]) > 1]
print(len(lines))
print()
# preview the first 20 entries
for word in lines[:20]:
    print(word)


97539

the
of
and
to
in
is
that
for
it
as
was
with
be
by
on
not
he
this
are
or


In [7]:
# Matcher over the lexicon words, restricted to the 'chars' option.
lfm = FuzzyMatcher(lines, ['chars'])

# First probe passes the option explicitly; the rest use the matcher's defaults.
print(lfm.match('abbrebiation', options=['chars'], threshold=0.8))
for typo in ('updat', 'updte', 'elitation'):
    print(lfm.match(typo, threshold=0.8))

def auto_correct_tokens(self, tokens, threshold=0.8):
    """Replace tokens missing from the 'chars' index with their best fuzzy match.

    Args:
        self: matcher object exposing ``idxs['chars']`` (tested with ``in``)
            and ``match(token, threshold=...)`` returning a 3-tuple whose
            first item is the best match or ``None``.
        tokens: iterable of tokens to correct.
        threshold: similarity threshold forwarded to ``match``.

    Returns:
        A list for list input and a tuple for tuple input, both preserving
        token order and duplicates; a set for any other iterable.
    """
    lexicon = self.idxs['chars']
    corrected = []
    for token in tokens:
        if token in lexicon:
            # already a known entry: keep it untouched
            corrected.append(token)
            continue
        best_match, _similarity, _option = self.match(token, threshold=threshold)
        # no candidate at or above the threshold: leave the token as is
        corrected.append(token if best_match is None else best_match)
    if isinstance(tokens, list):
        return corrected
    if isinstance(tokens, tuple):
        # a set here would lose token order and repeated tokens
        return tuple(corrected)
    return set(corrected)

# Run the same misspelled sample through the function defined in this cell
# (list input) and through the matcher's own method (tuple input).
sample_tokens = ['would', 'abbrebiation', 'updat', 'elitation', 'updte']
print(auto_correct_tokens(lfm, sample_tokens))
print(lfm.auto_correct_tokens(tuple(sample_tokens)))


('abbreviation', 0.82, 'chars')
('update', 0.89, 'chars')
(None, None, None)
('elicitation', 0.89, 'chars')
['would', 'abbreviation', 'update', 'elicitation', 'updte']
('would', 'abbreviation', 'update', 'elicitation', 'updte')


In [8]:
# Build the matcher from the text before the first tab of each lexicon line.
en_words = {entry.split('\t')[0].strip() for entry in load_lines('./data/dict/en/lexicon.txt')}
lex_fm = FuzzyMatcher(en_words, ['chars'])
print(lex_fm.auto_correct_tokens(['would', 'abbrebiation', 'updat', 'elitation', 'updte']))


['would', 'abbreviation', 'update', 'elicitation', 'updte']


In [9]:
# Same as the English case, with entries lower-cased before indexing.
ru_words = {entry.split('\t')[0].strip().lower() for entry in load_lines('./data/dict/ru/lexicon.txt')}
lex_fm = FuzzyMatcher(ru_words, ['chars'])
print(lex_fm.auto_correct_tokens(['случитс', 'катстрофа', 'ксяк', 'ашибка']))


['случится', 'катастрофа', 'сяк', 'ошибка']
