# In [None]:
import re
import math
from collections import defaultdict

# Path of the training corpus; the whole file is read into memory as UTF-8.
file_path = 'corpus.txt'

# Read mode is the default for open(), so only the encoding is spelled out.
with open(file_path, encoding='utf-8') as f:
    text = f.read()

def train_model(text):
    """Build Laplace-smoothed word and character statistics from a corpus.

    Args:
        text: Raw corpus text. It is lower-cased before words (runs of word
            characters) are extracted; characters are lower-cased one by one.

    Returns:
        A 4-tuple ``(unigram_probs, bigram_probs, char_unigram_probs,
        vocabulary)``:

        * ``unigram_probs`` maps each word to
          ``(count + 1) / (total_words + vocab_size)``.
        * ``bigram_probs`` maps each observed ``(w1, w2)`` pair to
          ``(count + 1) / (count(w1) + vocab_size)``.
        * ``char_unigram_probs`` maps each observed lower-case Russian letter
          to ``(count + 1) / (total_letters + distinct_letters)``.
        * ``vocabulary`` is the set of distinct words.

        An empty corpus yields three empty dicts and an empty set.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    total_words = len(words)

    # Word (unigram) counts.
    unigram_counts = defaultdict(int)
    for word in words:
        unigram_counts[word] += 1

    # Adjacent word pair (bigram) counts; zip yields nothing for fewer than
    # two words, so no explicit length guard is needed.
    bigram_counts = defaultdict(int)
    for pair in zip(words, words[1:]):
        bigram_counts[pair] += 1

    # Letter frequencies, used to score words that are not in the vocabulary.
    # 'ё' (U+0451) sits outside the contiguous 'а'..'я' range (U+0430..U+044F),
    # so it has to be accepted explicitly or it would never be counted.
    char_unigram_counts = defaultdict(int)
    for char in text:
        lowered = char.lower()
        if 'а' <= lowered <= 'я' or lowered == 'ё':
            char_unigram_counts[lowered] += 1

    total_chars = sum(char_unigram_counts.values())

    # Probabilities with add-one (Laplace) smoothing.
    vocab_size = len(unigram_counts)
    unigram_probs = {
        word: (count + 1) / (total_words + vocab_size)
        for word, count in unigram_counts.items()
    }

    bigram_probs = {
        (w1, w2): (count + 1) / (unigram_counts[w1] + vocab_size)
        for (w1, w2), count in bigram_counts.items()
    }

    distinct_chars = len(char_unigram_counts)
    char_unigram_probs = {
        char: (count + 1) / (total_chars + distinct_chars)
        for char, count in char_unigram_counts.items()
    }

    return unigram_probs, bigram_probs, char_unigram_probs, set(unigram_counts)

# Train once on the corpus held in `text` and keep the four results of
# train_model (word, bigram and character probabilities plus the vocabulary)
# as module-level constants.
UNIGRAM_PROBS, BIGRAM_PROBS, CHAR_UNIGRAM_PROBS, VOCABULARY = train_model(text)

def get_word_prob(word, prev_word, unigram_probs, bigram_probs, char_probs):
    """Return the natural-log probability of ``word`` following ``prev_word``.

    Lookup order:

    1. the bigram ``(prev_word, word)`` if ``prev_word`` is not ``None`` and
       the pair is in ``bigram_probs``;
    2. otherwise the unigram entry for ``word`` if present;
    3. otherwise a character-level score: a fixed 1e-10 penalty multiplied by
       the probability of every character, with 1e-10 for characters missing
       from ``char_probs``.

    Args:
        word: Candidate word.
        prev_word: Preceding word, or ``None`` when there is none.
        unigram_probs: Mapping word -> probability.
        bigram_probs: Mapping (previous word, word) -> probability.
        char_probs: Mapping character -> probability.

    Returns:
        The log-probability as a float.

    Raises:
        ValueError: If a looked-up probability is not positive (from
            ``math.log``).
    """
    floor = 1e-10

    if prev_word is not None and (prev_word, word) in bigram_probs:
        return math.log(bigram_probs[(prev_word, word)])

    if word in unigram_probs:
        return math.log(unigram_probs[word])

    # Unknown word: accumulate in log space. Multiplying the raw probabilities
    # underflows to 0.0 for long candidates, and math.log(0.0) then raises
    # ValueError; summing logs gives the same value without that failure.
    return math.log(floor) + sum(
        math.log(char_probs.get(char, floor)) for char in word
    )

def reconstruct_text(text_without_spaces, unigram_probs, bigram_probs, char_probs, vocabulary):
    """Restore the spaces in a string whose words were run together.

    Dynamic programming over prefixes: for each prefix length the best
    log-score is stored together with the start index of its last word and
    that word. Every substring is scored with ``get_word_prob``, using the
    last word of the best split of the preceding prefix as context.

    Args:
        text_without_spaces: Input string to segment.
        unigram_probs: Word probabilities passed to ``get_word_prob``.
        bigram_probs: Word-pair probabilities passed to ``get_word_prob``.
        char_probs: Character probabilities passed to ``get_word_prob``.
        vocabulary: Accepted for the caller's convenience; not read here.

    Returns:
        The chosen words joined by single spaces, or a fixed failure message
        if the full string has no finite score.
    """
    length = len(text_without_spaces)
    unreachable = -math.inf

    # table[k] = (best log-score of the first k characters,
    #             start index of the last word, the last word itself)
    table = [(unreachable, -1, None) for _ in range(length + 1)]
    table[0] = (0, -1, None)

    for end in range(1, length + 1):
        for start in range(end):
            prefix_score, _, context_word = table[start]
            candidate = text_without_spaces[start:end]

            candidate_score = get_word_prob(
                candidate, context_word, unigram_probs, bigram_probs, char_probs
            )

            # Prefixes without a finite score cannot be extended.
            if prefix_score == unreachable:
                continue

            combined = prefix_score + candidate_score
            if combined > table[end][0]:
                table[end] = (combined, start, candidate)

    if table[length][0] == unreachable:
        return "Не удалось восстановить текст."

    # Follow the stored start indices back to the beginning of the string.
    pieces = []
    position = length
    while position > 0:
        _, previous_position, piece = table[position]
        pieces.append(piece)
        position = previous_position

    pieces.reverse()
    return ' '.join(pieces)


if __name__ == '__main__':
    # Sample inputs (text with the spaces removed) to run through the model.
    texts_to_process = ["приветкакдела"]

    # The loop variable is not named `text` so that it does not overwrite the
    # module-level `text` holding the training corpus.
    for sample in texts_to_process:
        result = reconstruct_text(sample, UNIGRAM_PROBS, BIGRAM_PROBS, CHAR_UNIGRAM_PROBS, VOCABULARY)
        print(f"Входной текст: '{sample}'")
        print(f"Восстановленный текст: '{result}'")