In [None]:
import time
import sys
from collections import defaultdict, Counter
import re
import numpy as np

In [380]:
class Trie:
    """Prefix-tree node; child nodes are created lazily the first time they are accessed."""

    def __init__(self):
        # Children keyed by the next letter; defaultdict builds a fresh Trie on a missing key.
        self.children = defaultdict(Trie)
        # Both stay None unless an inserted word ends exactly at this node.
        self.probability = None
        self.word = None

    def insert(self, word, probability):
        """Walk (creating as needed) the path spelled by ``word`` and tag its last node."""
        current = self
        for ch in word:
            current = current.children[ch]
        current.word, current.probability = word, probability

In [385]:
class LevenshteinAutomaton:
    """
    Incremental Levenshtein distance calculator for a fixed target word.

    A state is one row of the edit-distance table: entry ``i`` holds the distance
    between the letters fed so far and the first ``i`` letters of ``word``.
    """
    def __init__(self, word, max_dist):
        self.word = word
        self.max_dist = max_dist

    def start(self):
        """Row for empty input: reaching a prefix of length i costs i edits."""
        return range(len(self.word) + 1)

    def step(self, state, letter):
        """Return the next row after feeding ``letter`` to the row ``state``."""
        row = [state[0] + 1]
        for i in range(1, len(state)):
            # Diagonal move: free when the letter equals the target letter, else 1.
            substitution = state[i - 1] + (letter != self.word[i - 1])
            row.append(min(row[i - 1] + 1, substitution, state[i] + 1))
        return row

    def is_match(self, state):
        """True when the whole target word is within ``max_dist`` of the input so far."""
        return self.max_dist >= state[-1]

    def can_match(self, state):
        """True while at least one entry of the row is still within ``max_dist``."""
        return self.max_dist >= min(state)

In [373]:
class WordSearcher:
    """Collects dictionary words close to a query word and scores each of them."""

    def __init__(self, trie, max_dist=1, alpha=0.1):
        """
        Args:
        trie (Trie): dictionary trie
        max_dist (int): maximum Levenshtein distance
        alpha (float): weight of the language-model term in the score
        """
        self.trie = trie
        self.max_dist = max_dist
        self.alpha = alpha

    def search(self, word):
        """
        Args:
        word (string)

        Return:
        a list of (word, score) for dictionary words within max_dist edits,
        where score = distance - alpha * log(probability)

        Example:
        'corect' =>
        [('direct', 2.8976129598587455),
         ('comet', 3.1622304395971579),
         ('collect', 3.0746835658617675),
         ('correct', 2.0287303329239235),
         ('corrects', 3.3924889488965624),
         ...]
        """
        words = []
        automaton = LevenshteinAutomaton(word, self.max_dist)
        state = automaton.start()
        self.search_recursive(self.trie.children, automaton, state, words)
        return words

    def search_recursive(self, node, automaton, state, words):
        """
        Depth-first helper for ``search``.

        Args:
        node (dict): letter => child trie node for the current position
        automaton: object providing step / is_match / can_match
        state: automaton state reached before consuming any letter of ``node``
        words (list): output list, extended in place with (word, score)
        """
        for letter, child in node.items():
            new_state = automaton.step(state, letter)
            # Only nodes that terminate a dictionary word carry a word/probability.
            if child.word is not None and automaton.is_match(new_state):
                score = new_state[-1] - self.alpha * np.log(child.probability)
                words.append((child.word, score))
            # Prune the subtree once no entry of the row can get back under max_dist.
            if automaton.can_match(new_state):
                self.search_recursive(child.children, automaton, new_state, words)

In [374]:
class SpellChecker:
    """Word-by-word spelling corrector backed by one WordSearcher per language."""

    def __init__(self, rus_dictionary, eng_dictionary, max_dist=1, alpha=0.1, encoding=None):
        """
        Args:
        rus_dictionary (string): path of russian dictionary
        eng_dictionary (string): path of english dictionary
        max_dist (int): maximum Levenshtein distance
        alpha (float): weight of the language-model term in the score
        encoding (string or None): encoding of both dictionary files;
            None keeps the platform default used by open()
        """
        self.searcher = {}
        self.searcher['rus'] = WordSearcher(SpellChecker.get_trie(rus_dictionary, encoding), max_dist, alpha)
        self.searcher['eng'] = WordSearcher(SpellChecker.get_trie(eng_dictionary, encoding), max_dist, alpha)
        self.alpha = alpha

    @staticmethod
    def get_trie(dictionary, encoding=None):
        """
        dictionary (string) => trie (Trie)

        Every distinct lower-cased word of the file is inserted with its
        relative frequency (count / total number of words) as probability.
        """
        # Context manager so the file handle is closed even if reading fails.
        with open(dictionary, encoding=encoding) as source:
            text = source.read().lower()
        cnt = Counter(SpellChecker.get_words(text))
        trie = Trie()
        total = sum(cnt.values())
        for word, count in cnt.items():
            trie.insert(word, count / total)
        return trie

    @staticmethod
    def get_words(text):
        """
        'Я иду гулять.' => ['Я', 'иду', 'гулять']
        """
        return re.findall(r'\w+', text)

    @staticmethod
    def get_punctuations(text):
        """
        'Я иду гулять.' => [' ', ' ', '.']
        """
        return re.findall(r'[^\w]+', text)

    @staticmethod
    def get_case(word, correct_word):
        """
        Re-apply the capitalisation of ``word`` to ``correct_word``.

        шерлок => Шерлок
        оон => ООН
        """
        if word.istitle():
            return correct_word.title()
        if word.isupper():
            return correct_word.upper()
        return correct_word

    @staticmethod
    def get_language(text):
        """
        cat => eng
        кот => rus

        Latin letters vote for 'eng', every other letter for 'rus'; ties
        (including text without letters) resolve to 'rus'.
        """
        eng_count = len(re.findall(r'[a-zA-Z]', text))
        # [^\W\d_] matches letters only, so digits and '_' no longer count as Russian.
        letter_count = len(re.findall(r'[^\W\d_]', text))
        rus_count = letter_count - eng_count
        return 'eng' if rus_count < eng_count else 'rus'

    def check(self, text):
        """
        'стекляный' => 'стеклянный'
        'corect' => 'correct'

        Each run of word characters is replaced by its best-scoring candidate
        (lowest score); everything between words is copied through unchanged,
        and words without any candidate are kept exactly as written.
        """
        searcher = self.searcher[SpellChecker.get_language(text)]

        def fix(match):
            word = match.group()
            candidates = searcher.search(word.lower())
            if not candidates:
                return word
            best = min(candidates, key=lambda candidate: candidate[1])[0]
            return SpellChecker.get_case(word, best)

        # Substituting in place keeps separators where they were, whether the
        # text starts with a word, starts with punctuation, or is empty.
        return re.sub(r'\w+', fix, text)

In [382]:
# Build the checker once; both dictionaries are read and indexed here.
speller = SpellChecker(
    rus_dictionary='../data/rus.txt',
    eng_dictionary='../data/eng.txt',
    max_dist=1,
    alpha=0.1,
)

In [None]:
def spelltest(speller, tests):
    """
    Measure how many known misspellings ``speller`` restores to the intended word.

    Args:
    speller: object with a ``check(text)`` method that returns the corrected text
    tests (dict): correct word => space-separated misspellings of that word

    Return:
    fraction (float) of misspellings whose correction equals the correct word;
    0.0 when ``tests`` contains no misspellings
    """
    total = 0
    corrected = 0
    for target, misspellings in tests.items():
        for wrong in misspellings.split():
            total += 1
            if speller.check(wrong) == target:
                corrected += 1
    return corrected / total if total else 0.0

In [384]:
# NOTE(review): `tests` is only assigned further down in this file (cell In [383]);
# on a fresh kernel that cell has to run before this one.
tests.items()

dict_items([('experience', 'experance experiance'), ('dragged', 'draged'), ('financially', 'financialy'), ('hierarchal', 'hierachial'), ('these', 'thees thess'), ('challenge', 'chalange'), ('brief', 'brif'), ('announcement', 'anouncement'), ('sufficient', 'suficient'), ('accumulated', 'acumulated'), ('regained', 'regined'), ('minuscule', 'miniscule'), ('thoughts', 'thorts'), ('inconvenient', 'inconvienient inconvient inconvinient'), ('approximately', 'aproximatly'), ('particular', 'paerticulaur'), ('stomach', 'stomac stomache stomec stumache'), ('biscuits', 'biscits biscutes biscuts bisquits buiscits buiscuts'), ('benefits', 'benifits'), ('supposedly', 'supposidly'), ('desiccate', 'desicate dessicate dessiccate'), ('permanent', 'perminant'), ('excessively', 'exessively'), ('progresses', 'progressess'), ('months', 'monthes'), ('overall', 'overal'), ('base', 'basse'), ('requested', 'rquested'), ('fails', 'failes'), ('loans', 'lones'), ('investigated', 'investegated'), ('credit', 'creadit

In [383]:
# Spelling test set: key => space-separated string of misspellings of that key.
# NOTE(review): several keys occur more than once in this literal (at least
# 'receive', 'useful', 'particular', 'arrangement', 'description', 'hierarchy',
# 'committee', 'necessary', 'there'). Python keeps only the last value of a
# repeated key, so the earlier misspellings are silently dropped -- e.g. 'useful'
# ends up mapped to 'usful' and loses 'usefull'.
# NOTE(review): a few keys look misspelled themselves ('arrangeing', 'refered',
# 'auguments', 'summarys', 'simular') -- presumably inherited from the source
# corpus; confirm before relying on them.
tests =  {'access': 'acess',
          'accessing': 'accesing',
          'accommodation': 'accomodation acommodation acomodation',
          'account': 'acount',
          'address': 'adress adres',
          'addressable': 'addresable',
          'arranged': 'aranged arrainged',
          'arrangeing': 'aranging',
          'arrangement': 'arragment',
          'articles': 'articals',
          'aunt': 'annt anut arnt',
          'auxiliary': 'auxillary',
          'available': 'avaible',
          'awful': 'awfall afful',
          'basically': 'basicaly',
          'beginning': 'begining',
          'benefit': 'benifit',
          'benefits': 'benifits',
          'between': 'beetween',
          'bicycle': 'bicycal bycicle bycycle',
          'biscuits': 'biscits biscutes biscuts bisquits buiscits buiscuts',
          'built': 'biult',
          'cake': 'cak',
          'career': 'carrer',
          'cemetery': 'cemetary semetary',
          'centrally': 'centraly',
          'certain': 'cirtain',
          'challenges': 'chalenges chalenges',
          'chapter': 'chaper chaphter chaptur',
          'choice': 'choise',
          'choosing': 'chosing',
          'clerical': 'clearical',
          'committee': 'comittee',
          'compare': 'compair',
          'completely': 'completly',
          'consider': 'concider',
          'considerable': 'conciderable',
          'contented': 'contenpted contende contended contentid',
          'curtains': 'cartains certans courtens cuaritains curtans curtians curtions',
          'decide': 'descide',
          'decided': 'descided',
          'definitely': 'definately difinately',
          'definition': 'defenition',
          'definitions': 'defenitions',
          'description': 'discription',
          'desiccate': 'desicate dessicate dessiccate',
          'diagrammatically': 'diagrammaticaally',
          'different': 'diffrent',
          'driven': 'dirven',
          'ecstasy': 'exstacy ecstacy',
          'embarrass': 'embaras embarass',
          'establishing': 'astablishing establising',
          'experience': 'experance experiance',
          'experiences': 'experances',
          'extended': 'extented',
          'extremely': 'extreamly',
          'fails': 'failes',
          'families': 'familes',
          'february': 'febuary',
          'further': 'futher',
          'gallery': 'galery gallary gallerry gallrey',
          'hierarchal': 'hierachial',
          'hierarchy': 'hierchy',
          'inconvenient': 'inconvienient inconvient inconvinient',
          'independent': 'independant independant',
          'initial': 'intial',
          'initials': 'inetials inistals initails initals intials',
          'juice': 'guic juce jucie juise juse',
          'latest': 'lates latets latiest latist',
          'laugh': 'lagh lauf laught lugh',
          'level': 'leval',
          'levels': 'levals',
          'liaison': 'liaision liason',
          'lieu': 'liew',
          'literature': 'litriture',
          'loans': 'lones',
          'locally': 'localy',
          'magnificent': 'magnificnet magificent magnifcent magnifecent magnifiscant magnifisent magnificant',
          'management': 'managment',
          'meant': 'ment',
          'minuscule': 'miniscule',
          'minutes': 'muinets',
          'monitoring': 'monitering',
          'necessary': 'neccesary necesary neccesary necassary necassery neccasary',
          'occurrence': 'occurence occurence',
          'often': 'ofen offen offten ofton',
          'opposite': 'opisite oppasite oppesite oppisit oppisite opposit oppossite oppossitte',
          'parallel': 'paralel paralell parrallel parralell parrallell',
          'particular': 'particulaur',
          'perhaps': 'perhapse',
          'personnel': 'personnell',
          'planned': 'planed',
          'poem': 'poame',
          'poems': 'poims pomes',
          'poetry': 'poartry poertry poetre poety powetry',
          'position': 'possition',
          'possible': 'possable',
          'pretend': 'pertend protend prtend pritend',
          'problem': 'problam proble promblem proplen',
          'pronunciation': 'pronounciation',
          'purple': 'perple perpul poarple',
          'questionnaire': 'questionaire',
          'really': 'realy relley relly',
          'receipt': 'receit receite reciet recipt',
          'receive': 'recieve',
          'refreshment': 'reafreshment refreshmant refresment refressmunt',
          'remember': 'rember remeber rememmer rermember',
          'remind': 'remine remined',
          'scarcely': 'scarcly scarecly scarely scarsely',
          'scissors': 'scisors sissors',
          'separate': 'seperate',
          'singular': 'singulaur',
          'someone': 'somone',
          'sources': 'sorces',
          'southern': 'southen',
          'special': 'speaical specail specal speical',
          'splendid': 'spledid splended splened splended',
          'standardizing': 'stanerdizing',
          'stomach': 'stomac stomache stomec stumache',
          'supersede': 'supercede superceed',
          'there': 'ther',
          'totally': 'totaly',
          'transferred': 'transfred',
          'transportability': 'transportibility',
          'triangular': 'triangulaur',
          'understand': 'undersand undistand',
          'unexpected': 'unexpcted unexpeted unexspected',
          'unfortunately': 'unfortunatly',
          'unique': 'uneque',
          'useful': 'usefull',
          'valuable': 'valubale valuble',
          'variable': 'varable',
          'variant': 'vairiant',
          'various': 'vairious',
          'visited': 'fisited viseted vistid vistied',
          'visitors': 'vistors',
          'voluntary': 'volantry',
          'voting': 'voteing',
          'wanted': 'wantid wonted',
          'whether': 'wether',
          'wrote': 'rote wote',
          'forbidden': 'forbiden',
          'decisions': 'deciscions descisions',
          'supposedly': 'supposidly',
          'embellishing': 'embelishing',
          'technique': 'tecnique',
          'permanently': 'perminantly',
          'confirmation': 'confermation',
          'appointment': 'appoitment',
          'progression': 'progresion',
          'accompanying': 'acompaning',
          'applicable': 'aplicable',
          'regained': 'regined',
          'guidelines': 'guidlines',
          'surrounding': 'serounding',
          'titles': 'tittles',
          'unavailable': 'unavailble',
          'advantageous': 'advantageos',
          'brief': 'brif',
          'appeal': 'apeal',
          'consisting': 'consisiting',
          'clerk': 'cleark clerck',
          'component': 'componant',
          'favourable': 'faverable',
          'separation': 'seperation',
          'search': 'serch',
          'receive': 'recieve',
          'employees': 'emploies',
          'prior': 'piror',
          'resulting': 'reulting',
          'suggestion': 'sugestion',
          'opinion': 'oppinion',
          'cancellation': 'cancelation',
          'criticism': 'citisum',
          'useful': 'usful',
          'humour': 'humor',
          'anomalies': 'anomolies',
          'would': 'whould',
          'doubt': 'doupt',
          'examination': 'eximination',
          'therefore': 'therefoe',
          'recommend': 'recomend',
          'separated': 'seperated',
          'successful': 'sucssuful succesful',
          'apparent': 'apparant',
          'occurred': 'occureed',
          'particular': 'paerticulaur',
          'pivoting': 'pivting',
          'announcing': 'anouncing',
          'challenge': 'chalange',
          'arrangements': 'araingements',
          'proportions': 'proprtions',
          'organized': 'oranised',
          'accept': 'acept',
          'dependence': 'dependance',
          'unequalled': 'unequaled',
          'numbers': 'numbuers',
          'sense': 'sence',
          'conversely': 'conversly',
          'provide': 'provid',
          'arrangement': 'arrangment',
          'responsibilities': 'responsiblities',
          'fourth': 'forth',
          'ordinary': 'ordenary',
          'description': 'desription descvription desacription',
          'inconceivable': 'inconcievable',
          'data': 'dsata',
          'register': 'rgister',
          'supervision': 'supervison',
          'encompassing': 'encompasing',
          'negligible': 'negligable',
          'allow': 'alow',
          'operations': 'operatins',
          'executed': 'executted',
          'interpretation': 'interpritation',
          'hierarchy': 'heiarky',
          'indeed': 'indead',
          'years': 'yesars',
          'through': 'throut',
          'committee': 'committe',
          'inquiries': 'equiries',
          'before': 'befor',
          'continued': 'contuned',
          'permanent': 'perminant',
          'choose': 'chose',
          'virtually': 'vertually',
          'correspondence': 'correspondance',
          'eventually': 'eventully',
          'lonely': 'lonley',
          'profession': 'preffeson',
          'they': 'thay',
          'now': 'noe',
          'desperately': 'despratly',
          'university': 'unversity',
          'adjournment': 'adjurnment',
          'possibilities': 'possablities',
          'stopped': 'stoped',
          'mean': 'meen',
          'weighted': 'wagted',
          'adequately': 'adequattly',
          'shown': 'hown',
          'matrix': 'matriiix',
          'profit': 'proffit',
          'encourage': 'encorage',
          'collate': 'colate',
          'disaggregate': 'disaggreagte disaggreaget',
          'receiving': 'recieving reciving',
          'proviso': 'provisoe',
          'umbrella': 'umberalla',
          'approached': 'aproached',
          'pleasant': 'plesent',
          'difficulty': 'dificulty',
          'appointments': 'apointments',
          'base': 'basse',
          'conditioning': 'conditining',
          'earliest': 'earlyest',
          'beginning': 'begining',
          'universally': 'universaly',
          'unresolved': 'unresloved',
          'length': 'lengh',
          'exponentially': 'exponentualy',
          'utilized': 'utalised',
          'set': 'et',
          'surveys': 'servays',
          'families': 'familys',
          'system': 'sysem',
          'approximately': 'aproximatly',
          'their': 'ther',
          'scheme': 'scheem',
          'speaking': 'speeking',
          'repetitive': 'repetative',
          'inefficient': 'ineffiect',
          'geneva': 'geniva',
          'exactly': 'exsactly',
          'immediate': 'imediate',
          'appreciation': 'apreciation',
          'luckily': 'luckeley',
          'eliminated': 'elimiated',
          'believe': 'belive',
          'appreciated': 'apreciated',
          'readjusted': 'reajusted',
          'were': 'wer where',
          'feeling': 'fealing',
          'and': 'anf',
          'false': 'faulse',
          'seen': 'seeen',
          'interrogating': 'interogationg',
          'academically': 'academicly',
          'relatively': 'relativly relitivly',
          'traditionally': 'traditionaly',
          'studying': 'studing',
          'majority': 'majorty',
          'build': 'biuld',
          'aggravating': 'agravating',
          'transactions': 'trasactions',
          'arguing': 'aurguing',
          'sheets': 'sheertes',
          'successive': 'sucsesive sucessive',
          'segment': 'segemnt',
          'especially': 'especaily',
          'later': 'latter',
          'senior': 'sienior',
          'dragged': 'draged',
          'atmosphere': 'atmospher',
          'drastically': 'drasticaly',
          'particularly': 'particulary',
          'visitor': 'vistor',
          'session': 'sesion',
          'continually': 'contually',
          'availability': 'avaiblity',
          'busy': 'buisy',
          'parameters': 'perametres',
          'surroundings': 'suroundings seroundings',
          'employed': 'emploied',
          'adequate': 'adiquate',
          'handle': 'handel',
          'means': 'meens',
          'familiar': 'familer',
          'between': 'beeteen',
          'overall': 'overal',
          'timing': 'timeing',
          'committees': 'comittees commitees',
          'queries': 'quies',
          'econometric': 'economtric',
          'erroneous': 'errounous',
          'decides': 'descides',
          'reference': 'refereence refference',
          'intelligence': 'inteligence',
          'edition': 'ediion ediition',
          'are': 'arte',
          'apologies': 'appologies',
          'thermawear': 'thermawere thermawhere',
          'techniques': 'tecniques',
          'voluntary': 'volantary',
          'subsequent': 'subsequant subsiquent',
          'currently': 'curruntly',
          'forecast': 'forcast',
          'weapons': 'wepons',
          'routine': 'rouint',
          'neither': 'niether',
          'approach': 'aproach',
          'available': 'availble',
          'recently': 'reciently',
          'ability': 'ablity',
          'nature': 'natior',
          'commercial': 'comersial',
          'agencies': 'agences',
          'however': 'howeverr',
          'suggested': 'sugested',
          'career': 'carear',
          'many': 'mony',
          'annual': 'anual',
          'according': 'acording',
          'receives': 'recives recieves',
          'interesting': 'intresting',
          'expense': 'expence',
          'relevant': 'relavent relevaant',
          'table': 'tasble',
          'throughout': 'throuout',
          'conference': 'conferance',
          'sensible': 'sensable',
          'described': 'discribed describd',
          'union': 'unioun',
          'interest': 'intrest',
          'flexible': 'flexable',
          'refered': 'reffered',
          'controlled': 'controled',
          'sufficient': 'suficient',
          'dissension': 'desention',
          'adaptable': 'adabtable',
          'representative': 'representitive',
          'irrelevant': 'irrelavent',
          'unnecessarily': 'unessasarily',
          'applied': 'upplied',
          'apologised': 'appologised',
          'these': 'thees thess',
          'choices': 'choises',
          'will': 'wil',
          'procedure': 'proceduer',
          'shortened': 'shortend',
          'manually': 'manualy',
          'disappointing': 'dissapoiting',
          'excessively': 'exessively',
          'comments': 'coments',
          'containing': 'containg',
          'develop': 'develope',
          'credit': 'creadit',
          'government': 'goverment',
          'acquaintances': 'aquantences',
          'orientated': 'orentated',
          'widely': 'widly',
          'advise': 'advice',
          'difficult': 'dificult',
          'investigated': 'investegated',
          'bonus': 'bonas',
          'conceived': 'concieved',
          'nationally': 'nationaly',
          'compared': 'comppared compased',
          'moving': 'moveing',
          'necessity': 'nessesity',
          'opportunity': 'oppertunity oppotunity opperttunity',
          'thoughts': 'thorts',
          'equalled': 'equaled',
          'variety': 'variatry',
          'analysis': 'analiss analsis analisis',
          'patterns': 'pattarns',
          'qualities': 'quaties',
          'easily': 'easyly',
          'organization': 'oranisation oragnisation',
          'the': 'thw hte thi',
          'corporate': 'corparate',
          'composed': 'compossed',
          'enormously': 'enomosly',
          'financially': 'financialy',
          'functionally': 'functionaly',
          'discipline': 'disiplin',
          'announcement': 'anouncement',
          'progresses': 'progressess',
          'except': 'excxept',
          'recommending': 'recomending',
          'mathematically': 'mathematicaly',
          'source': 'sorce',
          'combine': 'comibine',
          'input': 'inut',
          'careers': 'currers carrers',
          'resolved': 'resoved',
          'demands': 'diemands',
          'unequivocally': 'unequivocaly',
          'suffering': 'suufering',
          'immediately': 'imidatly imediatly',
          'accepted': 'acepted',
          'projects': 'projeccts',
          'necessary': 'necasery nessasary nessisary neccassary',
          'journalism': 'journaism',
          'unnecessary': 'unessessay',
          'night': 'nite',
          'output': 'oputput',
          'security': 'seurity',
          'essential': 'esential',
          'beneficial': 'benificial benficial',
          'explaining': 'explaning',
          'supplementary': 'suplementary',
          'questionnaire': 'questionare',
          'employment': 'empolyment',
          'proceeding': 'proceding',
          'decision': 'descisions descision',
          'per': 'pere',
          'discretion': 'discresion',
          'reaching': 'reching',
          'analysed': 'analised',
          'expansion': 'expanion',
          'although': 'athough',
          'subtract': 'subtrcat',
          'analysing': 'aalysing',
          'comparison': 'comparrison',
          'months': 'monthes',
          'hierarchal': 'hierachial',
          'misleading': 'missleading',
          'commit': 'comit',
          'auguments': 'aurgument',
          'within': 'withing',
          'obtaining': 'optaning',
          'accounts': 'acounts',
          'primarily': 'pimarily',
          'operator': 'opertor',
          'accumulated': 'acumulated',
          'extremely': 'extreemly',
          'there': 'thear',
          'summarys': 'sumarys',
          'analyse': 'analiss',
          'understandable': 'understadable',
          'safeguard': 'safegaurd',
          'consist': 'consisit',
          'declarations': 'declaratrions',
          'minutes': 'muinutes muiuets',
          'associated': 'assosiated',
          'accessibility': 'accessability',
          'examine': 'examin',
          'surveying': 'servaying',
          'politics': 'polatics',
          'annoying': 'anoying',
          'again': 'agiin',
          'assessing': 'accesing',
          'ideally': 'idealy',
          'scrutinized': 'scrutiniesed',
          'simular': 'similar',
          'personnel': 'personel',
          'whereas': 'wheras',
          'when': 'whn',
          'geographically': 'goegraphicaly',
          'gaining': 'ganing',
          'requested': 'rquested',
          'separate': 'seporate',
          'students': 'studens',
          'prepared': 'prepaired',
          'generated': 'generataed',
          'graphically': 'graphicaly',
          'suited': 'suted',
          'variable': 'varible vaiable',
          'building': 'biulding',
          'required': 'reequired',
          'necessitates': 'nessisitates',
          'together': 'togehter',
          'profits': 'proffits'}