In [1]:
from deeppavlov.models.spelling_correction.levenshtein.searcher_component import LevenshteinSearcherComponent
import numpy as np
# Directory holding the dictionary files; file names are appended to it by plain
# string concatenation, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere
# without editing this value.
DATA_PATH = "/home/alx/Cloud/spell_corr/py_spelling_corrector/data/"


[nltk_data] Downloading package punkt to /home/alx/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to /home/alx/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/alx/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


In [51]:

class LanguageModel:
    """Placeholder language model used to exercise the hypothesis pipeline.

    Tokenization and likelihood estimation are not implemented; sentence
    scoring returns random numbers, so any ranking built on it is arbitrary.
    """

    def tokenize(self, sent_str):
        """Stub: not implemented, always returns None."""
        return None

    def estimate_likelihood(self, sent_str):
        """Stub: not implemented, always returns None."""
        return None

    def score_sentences(self, sentences):
        """Return one random score per sentence.

        :param sentences: sized collection of sentence strings
        :return: 1-D numpy array with ``len(sentences)`` values drawn by
            ``np.random.rand``
        """
        how_many = len(sentences)
        return np.random.rand(how_many)

class Hypothesis():
    """A candidate (partial) correction of a sentence together with its score.

    Instance fields:
      text  -- the hypothesis string built so far
      score -- score assigned by the scorer; ``np.nan`` until the hypothesis is scored
    """

    def __init__(self, text):
        self.text = text
        # NaN marks "not scored yet"
        self.score = np.nan

    def fork_for_each_suffix(self, suffixes):
        """Fork this hypothesis into one new hypothesis per suffix.

        :param suffixes: iterable of strings to append to the current text
        :return: list of new (unscored) Hypothesis objects, in the order of ``suffixes``
        """
        return [Hypothesis(self.text + each_suffix) for each_suffix in suffixes]

    def _has_score(self):
        """Tell whether a real score has been assigned (neither None nor NaN)."""
        score = self.score
        # A truthiness test is wrong here: NaN is truthy and a valid 0.0 score is falsy.
        return score is not None and not np.isnan(score)

    def __str__(self):
        """Render as ``hypothesis: <text>``, followed by the score once one is set."""
        text = "hypothesis: %s" % self.text
        if self._has_score():
            return text + " score: %f" % self.score
        return text

    def __repr__(self):
        """Same rendering as ``__str__`` so lists of hypotheses print readably."""
        return self.__str__()
    
class HypothesesHub():
    """Holds the live set of sentence hypotheses and extends, scores and prunes it."""

    def __init__(self):
        # init with null hypothesis:
        self.hypotheses = [Hypothesis("")]

        self.lm = LanguageModel()

        # hypotheses scoring below 80% of the best score are pruned:
        self.prob_pruning_treshold = 0.8

        # not read by any method of this class; kept so existing attribute access still works
        self.max_score = 0.0

    def get_scores(self):
        """Return the scores of the current hypotheses, in hub order."""
        return [each_hypo.score for each_hypo in self.hypotheses]

    def append_partial_hypotheses(self, partial_candidates):
        """
        For each hypothesis in the hub it appends all candidates, replacing the
        hub content with every (hypothesis, candidate) combination.

        :param partial_candidates: list of strings to append to each hypothesis text
        :return: updated self
        """
        self.hypotheses = [
            forked
            for each_hypothesis in self.hypotheses
            for forked in each_hypothesis.fork_for_each_suffix(partial_candidates)
        ]
        return self

    def score_hypotheses(self):
        """
        Command to run scoring of all hypotheses by language model scoring function.
        Scores are written onto the hypothesis objects in place.

        :return: list of scored hypotheses
        """
        sentences = [each_hypo.text for each_hypo in self.hypotheses]
        scores = self.lm.score_sentences(sentences)

        for num, each_hypo in enumerate(self.hypotheses):
            each_hypo.score = scores[num]
        return self.hypotheses

    def prune_low_prob_hypotheses(self, prob_pruning_treshold=None, max_number_of_hypotheses=100):
        """
        Prunes hypotheses that have a score lower than the threshold value.
        The threshold value depends on the score of the best hypothesis.

        :param prob_pruning_treshold: [0,1] ratio of max probability that is required for a
        hypothesis to be kept in hypotheses hub; None means use self.prob_pruning_treshold
        :param max_number_of_hypotheses: cap applied first; when exceeded only the
        highest-scored hypotheses are kept
        :return: the list of hypotheses left in the hub
        """
        # number check
        if len(self.hypotheses) > max_number_of_hypotheses:
            # prune those which are the worst
            ranked = sorted(self.hypotheses, key=lambda x: x.score, reverse=True)
            self.hypotheses = ranked[:max_number_of_hypotheses]

        # nothing to compare against: max() of an empty list would raise
        if not self.hypotheses:
            return self.hypotheses

        # prune by prob value; compare with None so an explicit 0.0 is honoured
        if prob_pruning_treshold is None:
            prob_pruning_treshold = self.prob_pruning_treshold

        # NOTE(review): the ratio rule assumes non-negative scores; for a negative
        # best score, max * ratio is above max and every hypothesis would be dropped.
        max_score = max(self.get_scores())
        lowest_allowed_prob = max_score * prob_pruning_treshold

        self.hypotheses = [each_hypo for each_hypo in self.hypotheses
                           if each_hypo.score >= lowest_allowed_prob]

        return self.hypotheses


class SpellingCorrectionCandidatesGenerator():
    """Generates correction candidates for single tokens.

    Candidates come from a Levenshtein searcher built over a word-form
    dictionary, plus a small table of rule-based substitutions for frequent
    errors that are too distant for the searcher.
    """

    # score attached to every rule-based substitution
    FREQUENT_ERRORS_DECREMENT_SCORE = -1.0

    # frequent misspelling -> intended form
    FREQUENT_ERRORS = {
        "нить": "нибудь",
        "оч": "очень",
        "што": "что",
        "шо": "что",
        "чо": "что",
        "чё": "что",
        "ваще": "вообще",
        "воще": "вообще",
        "вообщем": "в общем",
        "писят": "пятьдесят",
        "аццкий": "адский",
    }

    # candidates that may be attached to the previous token with a hyphen
    # TODO improve heuristics for hyphen adding?
    # TODO add hyphen after "по"
    HYPHEN_PREFIXABLE = frozenset([
        "то", "таки", "нибудь", "моему", "нашему", "твоему", "любому", "за",
        "другому", "как", "русски", "разному",
    ])

    def __init__(self, path_to_dictionary=None):
        """
        :param path_to_dictionary: text file with one word form per line;
            defaults to "compreno_wordforms.txt" under DATA_PATH
            (alternative vocabulary: "russian_words_vocab.dict")
        """
        if not path_to_dictionary:
            path_to_dictionary = DATA_PATH + "compreno_wordforms.txt"

        # Explicit encoding so reading Cyrillic text does not depend on the locale.
        # NOTE(review): assumes the dictionary file is UTF-8 — confirm.
        with open(path_to_dictionary, "r", encoding="utf-8") as dict_file:
            words_dict = dict_file.read().splitlines()
        self.lsc = LevenshteinSearcherComponent(words=words_dict)

    def gen_candidates(self, token):
        """
        Given a token string generates candidates.

        :param token: str, the (possibly misspelled) token
        :return: tuple (scores, w_forms) of two parallel lists; both are empty when
            the searcher finds nothing and no substitution rule matches
        """
        scored_candidates = self.lsc([[token]])[0][0]
        if scored_candidates:
            # split (score, word) pairs into two parallel lists
            scores, w_forms = (list(column) for column in zip(*scored_candidates))
        else:
            # zip(*[]) yields nothing, so unpacking it would raise
            scores, w_forms = [], []

        # rule based/statistical substitutions with distant levenshtein:
        correction = self.FREQUENT_ERRORS.get(token)
        if correction is not None:
            w_forms.append(correction)
            scores.append(self.FREQUENT_ERRORS_DECREMENT_SCORE)
        return scores, w_forms

    def variate_with_prefixes(self, candidates, prefixes=None):
        """
        Given token candidates this method enriches the space of candidates with
        prefixed variants. By default it prepends a space to every candidate and a
        hyphen only to candidates listed in HYPHEN_PREFIXABLE.

        So ["то"] -> ["то", " то", "-то"]

        :param candidates: list of candidate strings
        :param prefixes: optional list of prefixes; when given, each prefix is
            prepended to every candidate unconditionally, in the given order
        :return: list of candidates enriched with prefixed versions
        """
        result_candidates = []
        for each_candidate in candidates:
            # add raw candidate:
            result_candidates.append(each_candidate)

            if prefixes is not None:
                result_candidates.extend(
                    each_prefix + each_candidate for each_prefix in prefixes)
                continue

            # add space candidate
            result_candidates.append(" " + each_candidate)

            # add hyphen candidates conditionally:
            if each_candidate in self.HYPHEN_PREFIXABLE:
                result_candidates.append("-" + each_candidate)

        return result_candidates
    
class SpellingCorrector():
    """Drives spelling correction of a sentence.

    Combines the candidates generator with a hypotheses hub: tokens are processed
    left to right, and after each token the hub is extended, scored and pruned.
    """

    def __init__(self):
        self.sccg = SpellingCorrectionCandidatesGenerator()

    def lowercase(self, sent_str):
        """Return the sentence converted to lower case."""
        lowered = sent_str.lower()
        return lowered

    def _tokenize(self, sent_str):
        """Split the sentence on whitespace into a list of tokens."""
        tokens = sent_str.split()
        return tokens

    def generate_hypotheses_for_token(self, token):
        """Stub: not implemented, always returns None."""
        return None

    def predict_correct(self, sentence_str):
        """
        Stub for predicting the correction of the sentence; not implemented.

        :param sentence_str:
        :return: None
        """
        return None

    def analyze_sentence(self, sentence):
        """
        Build correction hypotheses for a sentence token by token.

        :param sentence: str, sentence with errors
        :return: list of hypotheses left in the hub after the last token
        """
        hub = HypothesesHub()
        # preprocessing:
        # TODO make lowercasing revertible:
        tokens = self._tokenize(self.lowercase(sentence))

        for position, token in enumerate(tokens):
            # candidate scores are not used at this stage, only the word forms
            _scores, word_forms = self.sccg.gen_candidates(token)
            if position > 0:
                # non-initial tokens may also attach via a space or a hyphen
                word_forms = self.sccg.variate_with_prefixes(word_forms)

            hub = hub.append_partial_hypotheses(word_forms)
            hub.score_hypotheses()
            hub.prune_low_prob_hypotheses()

        return hub.hypotheses

In [52]:
# Build the corrector; its constructor creates a SpellingCorrectionCandidatesGenerator,
# which reads the word-form dictionary from disk.
sc = SpellingCorrector()

In [20]:
# _tokenize is a plain whitespace split, so the original casing is kept here.
tokenized_input = sc._tokenize("Привед ватрушка")
tokenized_input

['Привед', 'ватрушка']

In [28]:
# Query the Levenshtein searcher directly with the capitalised token.
# Presumably the input is a batch of tokenised sentences (list of token lists) —
# confirm against the component's documentation.
sc.sccg.lsc([["Привед"]])

[[[(-8.0, 'Привед')]]]

In [18]:
# NOTE(review): gen_candidates as defined in this notebook returns a
# (scores, w_forms) tuple of two lists; the recorded output below (a list of
# (score, word) pairs) was produced by an earlier version of the method.
scored_candidates = sc.sccg.gen_candidates("привед")
scored_candidates

[(-4.0, 'привез'),
 (-4.0, 'привей'),
 (-4.0, 'привел'),
 (-4.0, 'привес'),
 (-4.0, 'привет'),
 (-4.0, 'приведи'),
 (-4.0, 'приведу'),
 (-4.0, 'приведя'),
 (-4.0, 'привад'),
 (-4.0, 'привод'),
 (-4.0, 'присед'),
 (-4.0, 'при ед'),
 (-4.0, 'пр вед'),
 (-4.0, 'при вед'),
 (-8.0, 'привед')]

In [29]:
# Split (score, word) pairs into two parallel tuples.
# NOTE(review): only valid while scored_candidates is a list of pairs, as in the
# recorded output of the previous cell.
scores, w_forms = (zip(*scored_candidates))

In [30]:
# Inspect the candidate word forms extracted above.
w_forms

('привез',
 'привей',
 'привел',
 'привес',
 'привет',
 'приведи',
 'приведу',
 'приведя',
 'привад',
 'привод',
 'присед',
 'при ед',
 'пр вед',
 'при вед',
 'привед')

In [32]:
# NOTE(review): this call relies on variate_with_prefixes accepting a `prefixes`
# keyword — confirm the class definition in use provides it before re-running.
part_hypos = sc.sccg.variate_with_prefixes(w_forms, prefixes=['-', ' '])
part_hypos

['привез',
 '-привез',
 ' привез',
 'привей',
 '-привей',
 ' привей',
 'привел',
 '-привел',
 ' привел',
 'привес',
 '-привес',
 ' привес',
 'привет',
 '-привет',
 ' привет',
 'приведи',
 '-приведи',
 ' приведи',
 'приведу',
 '-приведу',
 ' приведу',
 'приведя',
 '-приведя',
 ' приведя',
 'привад',
 '-привад',
 ' привад',
 'привод',
 '-привод',
 ' привод',
 'присед',
 '-присед',
 ' присед',
 'при ед',
 '-при ед',
 ' при ед',
 'пр вед',
 '-пр вед',
 ' пр вед',
 'при вед',
 '-при вед',
 ' при вед',
 'привед',
 '-привед',
 ' привед']

In [35]:
# Start from the single empty hypothesis and fork it once per prefixed candidate.
# The explicit reset of `hypotheses` repeats what HypothesesHub.__init__ already does.
hypo_hub = HypothesesHub()
hypo_hub.hypotheses = [Hypothesis("")]
hypo_hub.append_partial_hypotheses(part_hypos)

<__main__.HypothesesHub at 0x7f7f2cbe7128>

In [36]:
# NOTE(review): the recorded output shows default object reprs, so it was produced
# before Hypothesis.__repr__ was defined; a re-run would print the hypothesis text.
hypo_hub.hypotheses

[<__main__.Hypothesis at 0x7f7f2cc31630>,
 <__main__.Hypothesis at 0x7f7f2cc314e0>,
 <__main__.Hypothesis at 0x7f7f2cc316a0>,
 <__main__.Hypothesis at 0x7f7f2cc31668>,
 <__main__.Hypothesis at 0x7f7f2c646668>,
 <__main__.Hypothesis at 0x7f7f2c6464e0>,
 <__main__.Hypothesis at 0x7f7f2c6460b8>,
 <__main__.Hypothesis at 0x7f7f2c646208>,
 <__main__.Hypothesis at 0x7f7f2c646630>,
 <__main__.Hypothesis at 0x7f7f2c6466a0>,
 <__main__.Hypothesis at 0x7f7f2c6466d8>,
 <__main__.Hypothesis at 0x7f7f2c646710>,
 <__main__.Hypothesis at 0x7f7f2c646780>,
 <__main__.Hypothesis at 0x7f7f2c6467b8>,
 <__main__.Hypothesis at 0x7f7f2c6467f0>,
 <__main__.Hypothesis at 0x7f7f2c646a58>,
 <__main__.Hypothesis at 0x7f7f2c70f198>,
 <__main__.Hypothesis at 0x7f7f2c70f1d0>,
 <__main__.Hypothesis at 0x7f7f2c70f208>,
 <__main__.Hypothesis at 0x7f7f2c70f240>,
 <__main__.Hypothesis at 0x7f7f2c70f278>,
 <__main__.Hypothesis at 0x7f7f2c70f2b0>,
 <__main__.Hypothesis at 0x7f7f2c70f2e8>,
 <__main__.Hypothesis at 0x7f7f2c7

In [37]:
# Assign a language-model score to every hypothesis in place; the call returns the
# same list of hypotheses.
hypo_hub.score_hypotheses()

[<__main__.Hypothesis at 0x7f7f2cc31630>,
 <__main__.Hypothesis at 0x7f7f2cc314e0>,
 <__main__.Hypothesis at 0x7f7f2cc316a0>,
 <__main__.Hypothesis at 0x7f7f2cc31668>,
 <__main__.Hypothesis at 0x7f7f2c646668>,
 <__main__.Hypothesis at 0x7f7f2c6464e0>,
 <__main__.Hypothesis at 0x7f7f2c6460b8>,
 <__main__.Hypothesis at 0x7f7f2c646208>,
 <__main__.Hypothesis at 0x7f7f2c646630>,
 <__main__.Hypothesis at 0x7f7f2c6466a0>,
 <__main__.Hypothesis at 0x7f7f2c6466d8>,
 <__main__.Hypothesis at 0x7f7f2c646710>,
 <__main__.Hypothesis at 0x7f7f2c646780>,
 <__main__.Hypothesis at 0x7f7f2c6467b8>,
 <__main__.Hypothesis at 0x7f7f2c6467f0>,
 <__main__.Hypothesis at 0x7f7f2c646a58>,
 <__main__.Hypothesis at 0x7f7f2c70f198>,
 <__main__.Hypothesis at 0x7f7f2c70f1d0>,
 <__main__.Hypothesis at 0x7f7f2c70f208>,
 <__main__.Hypothesis at 0x7f7f2c70f240>,
 <__main__.Hypothesis at 0x7f7f2c70f278>,
 <__main__.Hypothesis at 0x7f7f2c70f2b0>,
 <__main__.Hypothesis at 0x7f7f2c70f2e8>,
 <__main__.Hypothesis at 0x7f7f2c7

In [40]:
# Inspect the score of the first hypothesis.
# NOTE(review): the recorded 1.0 likely predates the random scoring stub, since
# np.random.rand samples from [0, 1) — confirm by re-running.
hypo_hub.hypotheses[0].score

1.0

In [11]:
# Unpack the (scores, word forms) pair returned by gen_candidates.
scores, words = sc.sccg.gen_candidates("привед")

In [29]:
# Inspect the candidate word forms for "привед".
words

('привез',
 'привей',
 'привел',
 'привес',
 'привет',
 'приведи',
 'приведу',
 'приведя',
 'привад',
 'привод',
 'присед',
 'при ед',
 'пр вед',
 'при вед',
 'привед')

In [49]:
# NOTE(review): executed as In [49], i.e. before the In [53] cell below that assigns
# `res`; on a fresh top-to-bottom run `res` is not defined at this point.
len(res)

100

In [53]:
# End-to-end run. Scores come from the placeholder LanguageModel.score_sentences,
# which returns random numbers, so the ranking below is arbitrary and changes
# between runs.
# NOTE(review): the token "вобщем" is not a key of the frequent-error rules in
# gen_candidates (the rule is keyed on "вообщем").
res = sc.analyze_sentence("привед ватрушка вобщем шо тебе сказать")
res

[hypothesis: пр вед ватрушке в общемчтотелескапать score: 0.999347,
 hypothesis: пр вед ватрушке в общемчтотеле слазать score: 0.999336,
 hypothesis: пр вед ватрушке в общем шортеве скачать score: 0.999319,
 hypothesis: приведватрушке вобщемшоннебеслазать score: 0.998875,
 hypothesis: привед ватрушкам вобщем го тесеказать score: 0.998816,
 hypothesis: пр вед ватрушке в общем йо теес казать score: 0.998631,
 hypothesis: привед ватрушкам вобщемоте е слазать score: 0.997880,
 hypothesis: привед ват ушкаобщемшотесе скатать score: 0.997859,
 hypothesis: пр ведватрушкам вобщем иотеве слазать score: 0.997762,
 hypothesis: приведватрушке вобщемшоннебескапать score: 0.997581,
 hypothesis: приведватрушке вобщемшонзебе с казать score: 0.996866,
 hypothesis: пр вед ватрушкевобьем ош требе скакать score: 0.996045,
 hypothesis: пр вед ватрушке в общем йо теесказа ь score: 0.995384,
 hypothesis: привед ват ушкаобщем шло гебескачать score: 0.995212,
 hypothesis: приведватрушки вобьем отекеуказать scor

In [50]:
# Print text and score of every surviving hypothesis.
# NOTE(review): executed as In [50], before the In [53] cell above, so the recorded
# output belongs to an earlier value of `res`.
for each_res in res:
    print(each_res.text, each_res.score)

при вед ватрушке вобщем шовтребескапать 0.9999937537165716
при ведватрушка в общемшоп тебескапать 0.9998266075892084
приведяватрушкивобьемкотеле сказа ь 0.9994121022028675
при вед ватрушкевобьем шовнебе скатать 0.999360361384877
при ведватрушках вобьем шоптеше указать 0.9992974041991589
при ведватрушках вобьемяостебеказать 0.9978808294514523
при ведватрушкахвобьем шолебе скакать 0.9970061430406191
приведя ватрушках общем шон теге скатать 0.996688087234212
при ведватрушкавобьемао ребе смазать 0.9966866641873804
при вед ватрушкам вобщемботебасказа ь 0.9945597773603589
при ведватрушкавобьем шутеще сказа ь 0.9944513122869074
приведяватрушкивобьемфо т бе сказать 0.9936121604780269
при вед ватрушке вобщем шов теве скатать 0.9930758896178744
при ведватрушкавобьемно тезе скакать 0.9923264826330239
при ведват ушка общем котебауказать 0.9919261487977555
приведя ватрушкахобщем шоктеве скатать 0.9913399328988822
приведяватрушкивобьемфо т бе скакать 0.9910095721175272
приведя ватрушках общемотенеск