In [36]:
from deeppavlov.models.spelling_correction.levenshtein.searcher_component import LevenshteinSearcherComponent
import numpy as np
DATA_PATH = "/home/alx/Cloud/spell_corr/py_spelling_corrector/data/"


In [37]:
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import List, Tuple

import kenlm

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.common.log import get_logger


class KenlmElector(Component):
    """Component that chooses a candidate with the highest product of base and language model probabilities

    Args:
         load_path: path to the kenlm model file
         beam_size: beam size for highest probability search

    Attributes:
        lm: kenlm object
        beam_size: beam size for highest probability search
    """
    def __init__(self, load_path: Path, beam_size: int=4, *args, **kwargs):
        self.lm = kenlm.Model(str(expand_path(load_path)))
        self.beam_size = beam_size

    def __call__(self, batch: List[List[List[Tuple[float, str]]]]) -> List[List[str]]:
        """Choose the best candidate for every token

        Args:
            batch: batch of probabilities and string values of candidates for every token in a sentence.
            Ex.:
            [
                [
                    [
                        (-0.0, 'все'),(-4.0, 'вес'), (-4.0, 'вс'), (-4.0, 'всг'),(-4.0, 'вси'),
                        (-4.0, 'вск'),(-4.0, 'всл'),(-4.0, 'овсе')],
                    [
                        (-0.0, 'смешалось'),(-4.0, 'смешало ь'),(-4.0, 'мешалось'),
                        (-4.0, 'вмешалось'),(-4.0, 'с мешалось')],
                    [
                        (-0.0, 'кони'),(-4.0, 'кон'),(-4.0, 'кона'),(-4.0, 'конв'),
                        (-4.0, 'коне'),(-4.0, 'конн'),(-4.0, 'коно'),(-4.0, 'клони')],
                    [
                        (-0.0, 'люди'),(-4.0, 'люд'),(-4.0, 'леди'),(-4.0, 'лю ди'),
                        (-4.0, 'блюди')]
                ]
            ]

        Returns:
            batch of corrected tokenized sentences
        """
        return [self._infer_instance(candidates) for candidates in batch]

    def _infer_instance(self, candidates: List[List[Tuple[float, str]]]):
        candidates = candidates + [[(0, '</s>')]]
        state = kenlm.State()
        self.lm.BeginSentenceWrite(state)
        beam = [(0, state, [])]
        for sublist in candidates:
            new_beam = []
            for beam_score, beam_state, beam_words in beam:
                for score, candidate in sublist:
                    prev_state = beam_state
                    c_score = 0
                    cs = candidate.split()
                    for candidate in cs:
                        state = kenlm.State()
                        c_score += self.lm.BaseScore(prev_state, candidate, state)
                        prev_state = state
                    new_beam.append((beam_score + score + c_score, state, beam_words + cs))
            new_beam.sort(reverse=True)
            beam = new_beam[:self.beam_size]
        score, state, words = beam[0]
        return words[:-1]
    
    ##########################################################################
    def _tokenize(self, sentence):
        return sentence.split()
    
    def estimate_pure_likelihood(self, sentence):
        """Given a sentence it estimates its likelihood without spelling correction fixes"""
        return self.lm.score(sentence)        
    
    def score_sentences(self, sentences):
        """
        Scores batch of sentences
        """
        return [self.lm.score(sentence) for sentence in sentences]
    
    def estimate_likelihood_with_correction_scores(self, tokenized_sentence_with_correction_scores):
        """Given a sentence it estimates its likelihood with spelling correction fixes"""
        #TODO
        
    def kekeke(self, candidates: List[List[Tuple[float, str]]]):
        candidates = candidates + [[(0, '</s>')]]
        state = kenlm.State()
        self.lm.BeginSentenceWrite(state)
        beam = [(0, state, [])]
        for sublist in candidates:
            new_beam = []
            for beam_score, beam_state, beam_words in beam:
                for score, candidate in sublist:
                    prev_state = beam_state
                    c_score = 0
                    cs = candidate.split()
                    for candidate in cs:
                        state = kenlm.State()
                        c_score += self.lm.BaseScore(prev_state, candidate, state)
                        prev_state = state
                    new_beam.append((beam_score + score + c_score, state, beam_words + cs))
            new_beam.sort(reverse=True)
            beam = new_beam[:self.beam_size]
        score, state, words = beam[0]
        return words[:-1]
        

In [54]:

class LanguageModel():
    def tokenize(self, sent_str):
        pass

    def estimate_likelihood(self, sent_str):
        pass

    def score_sentences(self, sentences):
        return np.random.rand(len(sentences))

class TokenHypothesis():
    pass

class SentenceHypothesis():
    pass


class Hypothesis():
    def __init__(self, text, err_score=0.0):
        self.text = text
#         self.score = np.nan
        self.lm_score = 0
        # score for error:
        self.err_score = err_score
        
    def total_score(self):
        return self.lm_score+self.err_score

    def fork_for_each_suffix(self, suffixes, error_scores=None):
        """Given a list of suffixes strings it forks the current hypotheses into several
        hypotheses for each suffix
        
        : param error_scores: score for suffix, usually negative value (logit score), suffix error score is incremented to 
        baseline hypothesis score
        """
        hypotheses_list = []
        for idx, each_suffix in enumerate(suffixes):
            if each_suffix[0]==" ":
                # if suffix starts with space character, then we have a token split
                pass
                hypothesis = Hypothesis
            else:
                # continuation hypothesis
                pass
            hypotheses_list.append(Hypothesis(self.text + each_suffix, self.err_score+error_scores[idx]))

        return hypotheses_list   
    
    def __repr__(self):        
        out_text = " score: %f" % self.total_score()        
        text = "hypothesis: %s" % self.text
        return text + out_text
    
class HypothesesHub():
    def __init__(self):
        # init with null hypothesis:
        self.hypotheses = [Hypothesis("")]
        
        # dummy LM:
#         self.lm = LanguageModel()
        
#         # KenLM Elector as LM:
#         ROOT_PATH = "~/.deeppavlov"
#         DOWNLOADS_PATH = ROOT_PATH + "/downloads"        
#         self.lm = KenlmElector(load_path=DOWNLOADS_PATH+"/language_models/ru_wiyalen_no_punkt.arpa.binary")
        
        self.max_score = 0.0

    def get_scores(self):
        """Returns summarized scores Error + LM score"""
        scores = [each_hypo.total_score() for each_hypo in self.hypotheses]
        return scores

    def append_partial_hypotheses(self, partial_candidates, error_scores=None):
        """
        For each hypothesis in the hub it appends all candidates
        :param partial_candidates:
        :return: updated self
        """
        new_hypotheses = []
        if self.hypotheses:
            for each_hypothesis in self.hypotheses:
                hypos = each_hypothesis.fork_for_each_suffix(partial_candidates, error_scores=error_scores)
                new_hypotheses += hypos
        self.hypotheses = new_hypotheses
        return self

    def score_hypotheses(self):
        """
        Command to run scoring of all hypotheses by language model scoring function
        :return: list of scored hypotheses
        """
        sentences = [each_hypo.text for each_hypo in self.hypotheses]
        scores = self.lm.score_sentences(sentences)

        for num, each_hypo in enumerate(self.hypotheses):
            each_hypo.lm_score = scores[num]
        return self.hypotheses

    def prune_low_prob_hypotheses(self, prob_pruning_treshold=None, max_number_of_hypotheses=200):
        """
        Prunes hypotheses that has probability lower than treshold value.
        treshold value depends on value of max prob hypothesis.

        :param prob_pruning_treshold: [0,1] ratio of max probability that is required for a
        hypothesis to be kept in hypotheses hub
        :return:
        """
        scores = self.get_scores()
        # number check
        filtered_hypotheses = sorted(self.hypotheses, key=lambda x: x.total_score(), reverse=True)        
        if len(self.hypotheses)>max_number_of_hypotheses:
            # prune those which are the worst
            self.hypotheses = filtered_hypotheses[:max_number_of_hypotheses]     
            
        return self.hypotheses


class SpellingCorrectionCandidatesGenerator():
    """
    Generates candidates for words (words) with error scores
    """
    # decrement for frequent errors
    FREQUENT_ERRORS_DECREMENT_SCORE = -4.0
    EXCESSIVE_SPACE_ERROR_DECREMENT_SCORE = -6.0
    
    def __init__(self, path_to_dictionary=None):

        words_dict = []
        if not path_to_dictionary:
            path_to_dictionary = DATA_PATH + "compreno_wordforms.txt"
#             path_to_dictionary = DATA_PATH + "russian_words_vocab.dict"
        
        with open(path_to_dictionary, "r") as dict_file:
            words_dict = dict_file.read().splitlines()
        self.lsc = LevenshteinSearcherComponent(words=words_dict)

    def gen_candidates(self, token):
        """
        Given a token string generates candidates with error scores
        :param token: string with token
        :return:
        """
        #TODO black list support: some tokens should not variate (SB requirement)
        scored_candidates=self.lsc([[token]])[0][0]
        scores, w_forms = zip(*scored_candidates)
        w_forms = list(w_forms)
        scores = list(scores)
        # ############################################################################################
        # here is rule based/statistical substitutions with distant levenshtein can be applied:
                
        if token == "нить":
            w_forms.append("нибудь")
            scores.append(self.FREQUENT_ERRORS_DECREMENT_SCORE)
        elif token in ["оч"]:
            w_forms.append("очень")
            scores.append(self.FREQUENT_ERRORS_DECREMENT_SCORE)
        elif token in ["што", "шо", "чо", "чё"]:
            w_forms.append("что")
            scores.append(self.FREQUENT_ERRORS_DECREMENT_SCORE)
        elif token in ["ваще", "воще"]:
            w_forms.append("вообще")
            scores.append(self.FREQUENT_ERRORS_DECREMENT_SCORE)
        elif token in ["вообщем"]:
            w_forms.append("в общем")
            scores.append(self.FREQUENT_ERRORS_DECREMENT_SCORE)
        elif token in ["писят"]:
            w_forms.append("пятьдесят")
            scores.append(self.FREQUENT_ERRORS_DECREMENT_SCORE)
        elif token in ["аццкий"]:
            w_forms.append("адский")
            scores.append(self.FREQUENT_ERRORS_DECREMENT_SCORE)
        elif token in ["помойму", "помоиму"]:
            w_forms.append("по-моему")
            scores.append(self.FREQUENT_ERRORS_DECREMENT_SCORE)

        return scores, w_forms

    def variate_with_prefixes(self, candidates, error_scores):
        """
        Given a tokens candidates this method enriches the space of candidates with prefixed variants
        by default it prepends prefixes of the space and hyphen to tokens.

        So ["то"] -> ["то", "-то", " то"]

        :param candidates: list of candidate strings
        :param prefixes: list of possible prefixes
        :return: list of candidates enriched with prefixed versions
        """
        
#         prefixes = [" ", "-"]
        result_candidates = []
        result_scores = []
        for idx, each_candidate in enumerate(candidates):
            # add candidate produced by erroneous space problem.
            # Ex.: "при вет" -> "привет":
            result_candidates.append(each_candidate)
            result_scores.append(error_scores[idx] + self.EXCESSIVE_SPACE_ERROR_DECREMENT_SCORE)
            
            # add space candidate (no fix)
            result_candidates.append(" " + each_candidate)
            result_scores.append(error_scores[idx] + 0.0)
            
            # add hyphen candidates conditionally:
            # TODO improve heuristics for hyphen adding?
            # TODO add hyphen after "по"
            if each_candidate in ["то", "таки", "нибудь", "моему", "нашему", "твоему", "любому", "за", "другому", "как",
                                 "русски", "разному"]:
                result_candidates.append("-" + each_candidate)
                result_scores.append(error_scores[idx] + self.FREQUENT_ERRORS_DECREMENT_SCORE)
#         print("result_scores, result_candidates")
#         print(result_scores, result_candidates)
        return result_scores, result_candidates
    
class SpellingCorrector():
    # language_model;
    # error model;
    def __init__(self, sccg=None, language_model=None):
        """
        sccg - spelling corrector hypotheses generator instance
        lang_model - language model instance, by default KenLM
        """
        
        # setup hypotheses generator:
        if not sccg:
            self.sccg = SpellingCorrectionCandidatesGenerator()
        else:
            self.sccg = sccg
        
        # setup language model:
        if not language_model:
            # KenLM Elector as LM:
            ROOT_PATH = "~/.deeppavlov"
            DOWNLOADS_PATH = ROOT_PATH + "/downloads"        
            self.lm = KenlmElector(load_path=DOWNLOADS_PATH+"/language_models/ru_wiyalen_no_punkt.arpa.binary")

        else:
            self.lm = language_model
    
    def score_hypotheses_hub(self, hypotheses_hub):
        """Score hypotheses by LM"""
        return hypotheses_hub.score_hypotheses()
    
    def analyze_sentence(self, sentence):
        """
        Method for analyzing sentence: generating hypotheses and scoring them
        
        :param sentence: str, sentence with errors
        :return: correction hypotheses of the sentence
        """
        self.hypo_hub = HypothesesHub()
        # preprocessing:
        # TODO make lowercasing revertible:
        sentence = self.lowercase(sentence)        
        tokenized_input = self._tokenize(sentence)

        for idx, each_tok in enumerate(tokenized_input):
            # TODO optionally you could pass left and right context
            err_scores, tok_candidates = self.sccg.gen_candidates(each_tok)
            if idx>0:
                # TODO optionally you could pass left and right context
                err_scores, tok_candidates = self.sccg.variate_with_prefixes(tok_candidates, err_scores)
            
            self.hypo_hub = self.hypo_hub.append_partial_hypotheses(tok_candidates, err_scores)
            self.score_hypotheses_hub(self.hypo_hub)
            self.hypo_hub.prune_low_prob_hypotheses()
            
        return self.hypo_hub.hypotheses
    
    def __call__(self, input_sentences_batch):
        """
        Given a batch of sentences it returns a batch of corrected sentences
        """
        outputs = []
        for each_sent in input_sentences_batch:
            hypotheses = self.analyze_sentence(each_sent)
            the_best_hypothesis = hypotheses[0]
            outputs.append(the_best_hypothesis.text)
        return outputs
    
    def lowercase(self, sent_str):
        return sent_str.lower()
    
    def _tokenize(self, sent_str):
        return sent_str.split()

    def predict_correct(self, sentence_str):
        """
        predicts correction of the sentence
        :param sentence_str:
        :return:
        """

        pass

In [6]:
sc = SpellingCorrector()

In [55]:
# init it once because it loads for 2.5 minutes:
sccg = SpellingCorrectionCandidatesGenerator()

In [56]:
sc = SpellingCorrector(sccg=sccg)

In [41]:
# hypotheses = sc.analyze_sentence("когда нибудь чо нить")
hypotheses = sc.analyze_sentence("вот в инете откапал такую интеерсную статейку предлагаю вашему внимани")
# hypotheses = sc.analyze_sentence("давно стоит понять что в этой жизни никто не зделает так как лутче тебе")

# res
for each_hypothesis in hypotheses:
    print(each_hypothesis.text, each_hypothesis.total_score())

result_scores, result_candidates
[-6.0, 0.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0] ['в', ' в', '$', ' $', '%', ' %', '?', ' ?', '£', ' £', '§', ' §', '°', ' °', 'а', ' а', 'б', ' б', 'г', ' г', 'д', ' д', 'е', ' е', 'ж', ' ж', 'и', ' и', 'й', ' й', 'к', ' к', 'л', ' л', 'м', ' м', 'о', ' о', 'п', ' п', 'р', ' р', 'с', ' с', 'т', ' т', 'у', ' у', '

In [53]:
sc(["вот в инете откапал такую интеерсную статейку предлагаю вашему внимани"])

result_scores, result_candidates
[-6.0, 0.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0, -10.0, -4.0] ['в', ' в', '$', ' $', '%', ' %', '?', ' ?', '£', ' £', '§', ' §', '°', ' °', 'а', ' а', 'б', ' б', 'г', ' г', 'д', ' д', 'е', ' е', 'ж', ' ж', 'и', ' и', 'й', ' й', 'к', ' к', 'л', ' л', 'м', ' м', 'о', ' о', 'п', ' п', 'р', ' р', 'с', ' с', 'т', ' т', 'у', ' у', '

['вот в инете откапал такую интересную статейку предлагаю вашему вниманию']

In [20]:
hypotheses[0]

hypothesis: вот в инетеоткапалтакуюинтереснуюстатейку предлагаю вашему вниманию score: -51.936266

In [8]:
hypotheses[0].lm_score

-30.602611541748047

In [18]:
hypotheses[2].err_score

-28.0

In [70]:
hypotheses[15].lm_score

-16.81560516357422

In [74]:
hypotheses

[hypothesis: но когда нибудь score: -25.135276,
 hypothesis: но когда ни будь score: -25.287832,
 hypothesis: чо когда нибудь score: -25.789906,
 hypothesis: что когда нибудь score: -25.997870,
 hypothesis: что когда нибудь score: -25.997870,
 hypothesis: чо когда ни будь score: -26.056871,
 hypothesis: что когда ни будь score: -26.264837,
 hypothesis: что когда ни будь score: -26.264837,
 hypothesis: что когда-нибудь score: -26.422354,
 hypothesis: что когда-нибудь score: -26.422354,
 hypothesis: то когда нибудь score: -26.475263,
 hypothesis: до когда нибудь score: -26.531619,
 hypothesis: то когда ни будь score: -26.742228,
 hypothesis: до когда ни будь score: -26.798586,
 hypothesis: по когда нибудь score: -26.815306,
 hypothesis: о когда нибудь score: -26.992618,
 hypothesis: по когда ни будь score: -27.082272,
 hypothesis: но когда-нибудь score: -27.104317,
 hypothesis: ч когда нибудь score: -27.116168,
 hypothesis: чо когда-нибудь score: -27.241237,
 hypothesis: о когда ни будь 

In [27]:
ROOT_PATH = "~/.deeppavlov"
DOWNLOADS_PATH = ROOT_PATH + "/downloads"        
kenlm = KenlmElector(load_path=DOWNLOADS_PATH+"/language_models/ru_wiyalen_no_punkt.arpa.binary")


-23.836273193359375

In [28]:
kenlm.lm.score("вот в инете откапал такую интеерсную статейку предлагаю вашему внимани")

-63.04133224487305

In [29]:
kenlm.lm.score("вот в инетеоткапалтакуюинтереснуюстатейку предлагаю вашему вниманию")

-27.93626594543457

In [30]:
kenlm.lm.score("вот в инете откопал такую интересную статейку предлагаю вашему вниманию")

-48.60094451904297

In [20]:
tokenized_input = sc._tokenize("Привед ватрушка")
tokenized_input

['Привед', 'ватрушка']

In [28]:
sc.sccg.lsc([["Привед"]])

[[[(-8.0, 'Привед')]]]

In [8]:
scored_candidates = sc.sccg.gen_candidates("привед")
scored_candidates

([-4.0,
  -4.0,
  -4.0,
  -4.0,
  -4.0,
  -4.0,
  -4.0,
  -4.0,
  -4.0,
  -4.0,
  -4.0,
  -4.0,
  -4.0,
  -4.0,
  -8.0],
 ['привез',
  'привей',
  'привел',
  'привес',
  'привет',
  'приведи',
  'приведу',
  'приведя',
  'привад',
  'привод',
  'присед',
  'при ед',
  'пр вед',
  'при вед',
  'привед'])

In [29]:
scores, w_forms = (zip(*scored_candidates))

In [30]:
w_forms

('привез',
 'привей',
 'привел',
 'привес',
 'привет',
 'приведи',
 'приведу',
 'приведя',
 'привад',
 'привод',
 'присед',
 'при ед',
 'пр вед',
 'при вед',
 'привед')

In [32]:
part_hypos = sc.sccg.variate_with_prefixes(w_forms, prefixes=['-', ' '])
part_hypos

['привез',
 '-привез',
 ' привез',
 'привей',
 '-привей',
 ' привей',
 'привел',
 '-привел',
 ' привел',
 'привес',
 '-привес',
 ' привес',
 'привет',
 '-привет',
 ' привет',
 'приведи',
 '-приведи',
 ' приведи',
 'приведу',
 '-приведу',
 ' приведу',
 'приведя',
 '-приведя',
 ' приведя',
 'привад',
 '-привад',
 ' привад',
 'привод',
 '-привод',
 ' привод',
 'присед',
 '-присед',
 ' присед',
 'при ед',
 '-при ед',
 ' при ед',
 'пр вед',
 '-пр вед',
 ' пр вед',
 'при вед',
 '-при вед',
 ' при вед',
 'привед',
 '-привед',
 ' привед']

In [35]:
hypo_hub = HypothesesHub()
hypo_hub.hypotheses = [Hypothesis("")]
hypo_hub.append_partial_hypotheses(part_hypos)

<__main__.HypothesesHub at 0x7f7f2cbe7128>

In [36]:
hypo_hub.hypotheses

[<__main__.Hypothesis at 0x7f7f2cc31630>,
 <__main__.Hypothesis at 0x7f7f2cc314e0>,
 <__main__.Hypothesis at 0x7f7f2cc316a0>,
 <__main__.Hypothesis at 0x7f7f2cc31668>,
 <__main__.Hypothesis at 0x7f7f2c646668>,
 <__main__.Hypothesis at 0x7f7f2c6464e0>,
 <__main__.Hypothesis at 0x7f7f2c6460b8>,
 <__main__.Hypothesis at 0x7f7f2c646208>,
 <__main__.Hypothesis at 0x7f7f2c646630>,
 <__main__.Hypothesis at 0x7f7f2c6466a0>,
 <__main__.Hypothesis at 0x7f7f2c6466d8>,
 <__main__.Hypothesis at 0x7f7f2c646710>,
 <__main__.Hypothesis at 0x7f7f2c646780>,
 <__main__.Hypothesis at 0x7f7f2c6467b8>,
 <__main__.Hypothesis at 0x7f7f2c6467f0>,
 <__main__.Hypothesis at 0x7f7f2c646a58>,
 <__main__.Hypothesis at 0x7f7f2c70f198>,
 <__main__.Hypothesis at 0x7f7f2c70f1d0>,
 <__main__.Hypothesis at 0x7f7f2c70f208>,
 <__main__.Hypothesis at 0x7f7f2c70f240>,
 <__main__.Hypothesis at 0x7f7f2c70f278>,
 <__main__.Hypothesis at 0x7f7f2c70f2b0>,
 <__main__.Hypothesis at 0x7f7f2c70f2e8>,
 <__main__.Hypothesis at 0x7f7f2c7

In [37]:
hypo_hub.score_hypotheses()

[<__main__.Hypothesis at 0x7f7f2cc31630>,
 <__main__.Hypothesis at 0x7f7f2cc314e0>,
 <__main__.Hypothesis at 0x7f7f2cc316a0>,
 <__main__.Hypothesis at 0x7f7f2cc31668>,
 <__main__.Hypothesis at 0x7f7f2c646668>,
 <__main__.Hypothesis at 0x7f7f2c6464e0>,
 <__main__.Hypothesis at 0x7f7f2c6460b8>,
 <__main__.Hypothesis at 0x7f7f2c646208>,
 <__main__.Hypothesis at 0x7f7f2c646630>,
 <__main__.Hypothesis at 0x7f7f2c6466a0>,
 <__main__.Hypothesis at 0x7f7f2c6466d8>,
 <__main__.Hypothesis at 0x7f7f2c646710>,
 <__main__.Hypothesis at 0x7f7f2c646780>,
 <__main__.Hypothesis at 0x7f7f2c6467b8>,
 <__main__.Hypothesis at 0x7f7f2c6467f0>,
 <__main__.Hypothesis at 0x7f7f2c646a58>,
 <__main__.Hypothesis at 0x7f7f2c70f198>,
 <__main__.Hypothesis at 0x7f7f2c70f1d0>,
 <__main__.Hypothesis at 0x7f7f2c70f208>,
 <__main__.Hypothesis at 0x7f7f2c70f240>,
 <__main__.Hypothesis at 0x7f7f2c70f278>,
 <__main__.Hypothesis at 0x7f7f2c70f2b0>,
 <__main__.Hypothesis at 0x7f7f2c70f2e8>,
 <__main__.Hypothesis at 0x7f7f2c7

In [40]:
hypo_hub.hypotheses[0].score

1.0

In [11]:
scores, words = sc.sccg.gen_candidates("привед")

In [29]:
words

('привез',
 'привей',
 'привел',
 'привес',
 'привет',
 'приведи',
 'приведу',
 'приведя',
 'привад',
 'привод',
 'присед',
 'при ед',
 'пр вед',
 'при вед',
 'привед')

In [49]:
len(res)

100

[hypothesis: пр вед ватрушке в общемчтотелескапать score: 0.999347,
 hypothesis: пр вед ватрушке в общемчтотеле слазать score: 0.999336,
 hypothesis: пр вед ватрушке в общем шортеве скачать score: 0.999319,
 hypothesis: приведватрушке вобщемшоннебеслазать score: 0.998875,
 hypothesis: привед ватрушкам вобщем го тесеказать score: 0.998816,
 hypothesis: пр вед ватрушке в общем йо теес казать score: 0.998631,
 hypothesis: привед ватрушкам вобщемоте е слазать score: 0.997880,
 hypothesis: привед ват ушкаобщемшотесе скатать score: 0.997859,
 hypothesis: пр ведватрушкам вобщем иотеве слазать score: 0.997762,
 hypothesis: приведватрушке вобщемшоннебескапать score: 0.997581,
 hypothesis: приведватрушке вобщемшонзебе с казать score: 0.996866,
 hypothesis: пр вед ватрушкевобьем ош требе скакать score: 0.996045,
 hypothesis: пр вед ватрушке в общем йо теесказа ь score: 0.995384,
 hypothesis: привед ват ушкаобщем шло гебескачать score: 0.995212,
 hypothesis: приведватрушки вобьем отекеуказать scor

при вед ватрушке вобщем шовтребескапать 0.9999937537165716
при ведватрушка в общемшоп тебескапать 0.9998266075892084
приведяватрушкивобьемкотеле сказа ь 0.9994121022028675
при вед ватрушкевобьем шовнебе скатать 0.999360361384877
при ведватрушках вобьем шоптеше указать 0.9992974041991589
при ведватрушках вобьемяостебеказать 0.9978808294514523
при ведватрушкахвобьем шолебе скакать 0.9970061430406191
приведя ватрушках общем шон теге скатать 0.996688087234212
при ведватрушкавобьемао ребе смазать 0.9966866641873804
при вед ватрушкам вобщемботебасказа ь 0.9945597773603589
при ведватрушкавобьем шутеще сказа ь 0.9944513122869074
приведяватрушкивобьемфо т бе сказать 0.9936121604780269
при вед ватрушке вобщем шов теве скатать 0.9930758896178744
при ведватрушкавобьемно тезе скакать 0.9923264826330239
при ведват ушка общем котебауказать 0.9919261487977555
приведя ватрушкахобщем шоктеве скатать 0.9913399328988822
приведяватрушкивобьемфо т бе скакать 0.9910095721175272
приведя ватрушках общемотенеск

# Evaluate the model:

In [57]:
DIALOG_DATA_PATH = 'data/dialog/'
# Train data
TRAIN_ERRONEOUS_DATA = DIALOG_DATA_PATH + "source_sents.txt"
TRAIN_GOLDEN_DATA = DIALOG_DATA_PATH + "corrected_sents.txt"

# Test data
TEST_ERRONEOUS_DATA = DIALOG_DATA_PATH + "test_sample_testset.txt"
TEST_GOLDEN_DATA = DIALOG_DATA_PATH + "corr_sample_testset.txt"

In [58]:
with open(TEST_ERRONEOUS_DATA, 'r') as sents_file:
    erroneous_lines = sents_file.readlines()
    erroneous_lines = [line.strip() for line in erroneous_lines]
    
with open(TEST_GOLDEN_DATA, 'r') as sents_file:
    golden_lines = sents_file.readlines()
    golden_lines = [line.strip() for line in golden_lines]

In [62]:
from evaluate import evaluate_spelling_corrector
import datetime as dt

In [60]:
len(erroneous_lines)

2008

In [65]:
start_dt = dt.datetime.now()
hypotheses = sc(erroneous_lines)
fin_dt = dt.datetime.now()
print(fin_dt-start_dt)

0:23:38.088249


In [66]:
results = evaluate_spelling_corrector(erroneous_lines, golden_lines, hypotheses)
print(results['precision'])
print(results['recall'])
print(results['f_measure'])

Precision=54.33 Recall=56.77 FMeasure=55.53
1123 2067 1978
0.5432994678277697
0.5677451971688574
0.5552533992583436


In [68]:
hypotheses

['\ufeffесть у вас оформленный и подписанный мною заказ',
 'вот в инете откапал такую интересную статейку предлагаю вашему вниманию',
 'я на всю жизнь запомню свое первое купание в зимнем ледяноменисее',
 'думаем что не ошибемся если скажем что выставка лучшие фотографии россии 2012 станет одним из самых значимых событий в культурной жизни перми и ее жителей',
 'судьба человека может складываться очень разнообразно в жизни много прекрасных светлых радостных моментов',
 'а я думаю вот если бы хозяин этой лодки тебе ее подарил и ты бы собрал знакомых и устроил на этой лодке большие катание с размахом',
 'например парадный портрет николаяii при орденах и регалиях освященный как икона и снабженный 5-рублевой монетой и роскошным текстом снизу',
 'двубортное кашемировое пальто песочного цвета классической фасона',
 'поговорила с директором и он мне обещал позвонить вечером и сказать берет он меня на работу',
 'съездил потом в рыб оказалось что на нем и царапины купил мази и капли',
 'но даже

In [69]:
from where_is_mistake import where_is_mistake
where_is_mistake(erroneous_lines, golden_lines, [hypotheses], ["naive"], "naive_sc.txt")