In [6]:
from collections import defaultdict
from preprocessing import preprocess
from spell_checker import SpellChecker
from utils.amharic_tokenizer import AmharicSegmenter
from utils.file_reader import read_file, read_lines
from nltk import ngrams
import time, pickle, os

In [7]:
# Input resources, given as paths relative to the current working directory.
dictionary_path = '../data/amharic_dictionary.txt'
corpus_path = '../data/amharic_corpus.txt'
# Shared instances: check() uses `tokenizer` to split text and
# `spell_checker` (built from the dictionary file) to validate words.
tokenizer = AmharicSegmenter()
spell_checker = SpellChecker(dictionary_path)


In [8]:
def check(text):
    """Spell-check ``text`` and report every word the checker rejects.

    The text is preprocessed, printed, split into sentences and then into
    words. Each word that fails ``spell_checker.check_spelling`` is recorded
    together with its sentence, the sentence's token list, the checker's
    suggestions and the word's position in that token list.

    Returns:
        dict with two keys:
          * ``"text"``   - the input exactly as passed in (before preprocessing)
          * ``"errors"`` - mapping ``misspelled word -> details``. Because the
            mapping is keyed by the word itself, a word that is misspelled
            more than once keeps only the details of its last occurrence.
    """
    report = {"text": text, "errors": {}}

    text = preprocess(text)
    print(text)

    for sentence in tokenizer.tokenize_sentence(text):
        words = tokenizer.tokenize(sentence)
        for position, word in enumerate(words):
            if spell_checker.check_spelling(word):
                continue  # correctly spelled, nothing to record
            candidates = spell_checker.suggest_corrections(word)
            report["errors"][word] = {
                'sentence': sentence,
                'tokenized': words,
                'suggestions': candidates,
                'index': position,
            }

    return report

In [9]:
from nltk.util import ngrams
import pickle

class NgramModel:
    """Count-based n-gram model built from a raw text corpus.

    The constructor preprocesses the corpus and splits it into sentences;
    ``train()`` then counts every n-gram of ``ngram_size`` tokens per sentence.
    Lookups never modify the model: querying an unseen n-gram returns a count
    of 0 without adding an entry to ``ngram_counts``.
    """

    def __init__(self, corpus: str, ngram_size=2):
        """Store the corpus, preprocess it and prepare an empty count table.

        Args:
            corpus: raw corpus text.
            ngram_size: number of tokens per n-gram (2 = bigram).
        """
        self.corpus = corpus
        self.tokenizer = AmharicSegmenter()
        self.preprocess()
        self.ngram_size = ngram_size
        self.ngram_counts = defaultdict(int)

    def preprocess(self):
        """Normalise the raw corpus and replace it with its sentence split.

        Called once from ``__init__``. After it runs ``self.corpus`` holds the
        result of ``tokenize_sentence`` rather than the original string, so it
        is not meant to be called a second time.
        """
        # Inside a method body the bare name resolves to the module-level
        # ``preprocess`` function, not to this method.
        self.corpus = preprocess(self.corpus)
        self.corpus = self.tokenizer.tokenize_sentence(self.corpus)

    def train(self):
        """Count all n-grams in the corpus.

        Counts are rebuilt from scratch on every call, so re-running the
        training cell does not double every count.
        """
        counts = defaultdict(int)
        for sentence in self.corpus:
            words = self.tokenizer.tokenize(sentence)
            for ngram in ngrams(words, self.ngram_size):
                counts[ngram] += 1
        self.ngram_counts = counts

    def get_ngram_count(self, ngram):
        """Return how often ``ngram`` (a tuple of tokens) was seen; 0 if never."""
        # .get() instead of [] so that a lookup on the defaultdict does not
        # insert a zero entry; inserted entries would grow the table and
        # change the smoothing denominator on every query.
        return self.ngram_counts.get(ngram, 0)

    def get_ngram_probability(self, ngram):
        """Return count(ngram) / total n-gram count (unsmoothed).

        Returns 0.0 when the model holds no counts (e.g. before ``train()``).
        """
        total_count = sum(self.ngram_counts.values())
        if total_count == 0:
            return 0.0
        return self.get_ngram_count(ngram) / total_count

    def save(self, path):
        """Pickle the whole model to ``path``, appending ``.pkl`` if missing.

        The parent directory is created when it does not exist. A bare file
        name (no directory component) is written to the current directory.
        """
        if not path.endswith('.pkl'):
            path += '.pkl'
        directory = os.path.dirname(path)
        if directory:
            # dirname is '' for a bare file name and os.makedirs('') raises.
            os.makedirs(directory, exist_ok=True)
        # NOTE: the file is a pickle; only load it back from trusted sources.
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    def get_ngram_probability_with_smoothing(self, ngram):
        """Return the add-one smoothed probability of ``ngram``.

        Computed as ``(count + 1) / (total count + number of distinct n-grams
        seen)``. ``ngram`` is a tuple of ``ngram_size`` tokens. Returns 0.0
        when the model holds no counts at all.
        """
        ngram_count = self.get_ngram_count(ngram)
        total_count = sum(self.ngram_counts.values())
        denominator = total_count + len(self.ngram_counts)
        if denominator == 0:
            return 0.0
        return (ngram_count + 1) / denominator
    
    
    

In [10]:
from utils.file_reader import read_file
# Read the corpus file at corpus_path; the result is handed to NgramModel as its corpus.
corpus = read_file(corpus_path)

In [11]:
# Build a bigram model; the constructor preprocesses the corpus and splits it into sentences.
model = NgramModel(corpus, ngram_size=2)

In [12]:
# Count every bigram in the corpus (fills model.ngram_counts).
model.train()

In [21]:
# Pickle the trained model to disk.
# NOTE(review): this cell's execution count (21) is out of sequence with its
# position in the notebook - confirm the notebook still runs top to bottom.
model.save('../models/bigram_model_001.pkl')

In [18]:
def rank_suggestions(dictionary_result, ngram_model=None):
    """Re-order each misspelled word's suggestions by bigram probability.

    For every entry in ``dictionary_result["errors"]`` the suggestions are
    sorted, most probable first, by the probability of the bigram they form
    with a neighbouring word:

    * a word that has a predecessor is scored as ``(previous word, suggestion)``;
    * the first word of a longer sentence is scored as ``(suggestion, next word)``;
    * a one-word sentence has no context, so its suggestions keep their order.

    Suggestions with equal probability keep their original relative order.

    Args:
        dictionary_result: the dict returned by ``check()``; each error entry
            must provide ``"sentence"``, ``"index"``, ``"tokenized"`` and
            ``"suggestions"``.
        ngram_model: object exposing ``get_ngram_probability(ngram)``. When
            omitted, the notebook-level ``model`` is used.

    Returns:
        dict mapping each misspelled word to
        ``{"sentence": ..., "suggestions": [...ranked...]}``.
    """
    if ngram_model is None:
        # Keep one-argument calls working by falling back to the global model.
        ngram_model = model

    ranked = {}

    for wrong_word, details in dictionary_result["errors"].items():
        index = details["index"]
        words = details["tokenized"]
        suggestions = details["suggestions"]

        if index > 0:
            # Middle or last word: use the preceding word as context.
            before_word = words[index - 1]
            suggestions = sorted(
                suggestions,
                key=lambda candidate: ngram_model.get_ngram_probability((before_word, candidate)),
                reverse=True,
            )
        elif len(words) > 1:
            # First word of the sentence: only the following word is available.
            after_word = words[index + 1]
            suggestions = sorted(
                suggestions,
                key=lambda candidate: ngram_model.get_ngram_probability((candidate, after_word)),
                reverse=True,
            )
        # Otherwise the sentence is a single word: nothing to rank against.

        ranked[wrong_word] = {
            'sentence': details["sentence"],
            'suggestions': suggestions,
        }

    return ranked
   

In [13]:
# Demo sentence; the ranked output further down reports ሳይቻሆን and ይችላን as misspelled.
sample_amharic_text = "የቤት ውስጥ ስራ \"የሴቶች ስራ\" ብቻ ሳይቻሆን የሁሉም ሰው ሊሆን ይችላን።"
# check() prints the preprocessed text and returns the per-word error report.
result = check(sample_amharic_text)

የቤት ውስጥ ስራ የሴቶች ስራ ብቻ ሳይቻሆን የሁሉም ሰው ሊሆን ይችላን።


In [19]:
# Re-order each misspelled word's suggestions by bigram probability with a neighbouring word.
rank_suggestions(result)

{'ሳይቻሆን': {'sentence': 'የቤት ውስጥ ስራ የሴቶች ስራ ብቻ ሳይቻሆን የሁሉም ሰው ሊሆን ይችላን።',
  'suggestions': ['ሳይሆን', 'ሳይቻሆን']},
 'ይችላን': {'sentence': 'የቤት ውስጥ ስራ የሴቶች ስራ ብቻ ሳይቻሆን የሁሉም ሰው ሊሆን ይችላን።',
  'suggestions': ['ይችላል', 'ይችላሉ', 'ይችላን']}}

In [20]:
# Spot-check: unsmoothed probability of the bigram formed by ሊሆን and the top-ranked suggestion ይችላል.
model.get_ngram_probability(('ሊሆን','ይችላል'))

4.566887202941682e-05