In [1]:
import re
from collections import Counter
from typing import List, Tuple

from pymorphy2 import MorphAnalyzer

In [2]:
# Module-level analyzer instance, created once; normalize() looks every word up through it.
morph = MorphAnalyzer()


def normalize(line: str) -> List[str]:
    """Return the normal form of every Cyrillic word found in ``line``.

    A word is a run of Cyrillic letters, optionally continued by
    hyphen-joined Cyrillic parts and optionally wrapped in apostrophes
    (the apostrophes stay part of the token). Each token is lower-cased
    and replaced by the ``normal_form`` of the first parse that the
    module-level ``morph`` analyzer returns for it.

    :param line: raw text of one line.
    :return: normal forms in order of appearance; empty if nothing matched.
    """
    raw_tokens = re.findall(r"('?[а-яА-ЯёЁ][а-яА-ЯёЁ]*(?:-[а-яА-ЯёЁ]+)*'?)", line)
    lemmas = []
    for token in raw_tokens:
        first_parse = morph.parse(token.lower())[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

def get_n_gramm(words: list, n: int = 2):
    """Return every run of ``n`` consecutive items of ``words`` as a tuple.

    :param words: sequence of tokens.
    :param n: window length (defaults to 2, i.e. bigrams).
    :return: list of tuples in left-to-right order; empty when ``words``
        has fewer than ``n`` items.
    """
    windows = []
    for start in range(len(words) - n + 1):
        windows.append(tuple(words[start:start + n]))
    return windows

def get_new_gramms(line_words: List[str], n: int, gramms: dict) -> dict:
    """Count the n-grams of one line whose (n-1)-word prefix is in ``gramms``.

    Every window of ``n`` consecutive words is inspected; it is counted only
    if the tuple of its first ``n - 1`` words is a key of ``gramms``.

    :param line_words: words of a single line.
    :param n: length of the n-grams to build.
    :param gramms: mapping whose keys are (n-1)-word tuples; only key
        membership is used, the values are ignored.
    :return: dict mapping each accepted n-word tuple to the number of times
        it occurs in ``line_words``; empty if nothing qualified.
    """
    prefix_len = n - 1
    new_gramms = {}
    for start in range(len(line_words) - n + 1):
        prefix = tuple(line_words[start:start + prefix_len])
        if prefix in gramms:
            # Extend the known prefix with the word that follows it.
            extended = prefix + (line_words[start + prefix_len],)
            new_gramms[extended] = new_gramms.get(extended, 0) + 1

    return new_gramms

In [3]:
# Read the whole example corpus into memory as a single string.
corpus_path = "./data/example_text.txt"
with open(corpus_path, mode='r', encoding='utf8') as corpus_file:
    text = corpus_file.read()

In [4]:
# Lemmatise the corpus line by line: each entry of normal_text is the list of
# normal forms for one input line (a line with no matching words gives []).
normal_text = [normalize(raw_line) for raw_line in text.split('\n')]

# Gather every pair of adjacent words; pairs never span a line break.
all_2_gramm = [
    bigram
    for lemmas in normal_text
    for bigram in get_n_gramm(lemmas, 2)
]

# Keep only the pairs that occur more than once across the whole text.
dict_2_gramms = {
    bigram: freq
    for bigram, freq in Counter(all_2_gramm).items()
    if freq > 1
}

# Grow repeated n-grams one word at a time, starting from the repeated bigrams.
# dicts_n_gramms[k] holds the k-grams that occur more than once; indices 0 and
# 1 are empty placeholders so that the list index equals the n-gram length.
# gramm_found[i] turns False once line i yields no candidate at some length,
# and that line is skipped for all longer lengths.
gramm_found = [True]*len(normal_text)
dicts_n_gramms = [{}, {}, dict_2_gramms]
# NOTE(review): lengths above (longest line // 10) are never tried; the reason
# for the divisor 10 is not visible here — confirm it is intentional.
longest_line = max((len(normal_line) for normal_line in normal_text), default=0)
for n in range(3, longest_line//10 + 1):
    last_gramms = dicts_n_gramms[n-1]
    level_counts = Counter()
    for i, normal_line in enumerate(normal_text):
        if not gramm_found[i]:
            continue
        new_gramms = get_new_gramms(normal_line, n, last_gramms)
        # Counter.update costs time proportional to new_gramms only, while
        # `+=` rescans the whole accumulated counter on every line.
        level_counts.update(new_gramms)
        gramm_found[i] = bool(new_gramms)
    dict_n_gramms = {name: count for name, count in level_counts.items() if count > 1}
    if not dict_n_gramms:
        # No n-gram of this length repeats, so longer ones are not attempted.
        break
    dicts_n_gramms.append(dict_n_gramms)

# Merge the per-length tables into one counter, then re-key it so that each
# tuple of words becomes a single space-separated phrase.
merged_counts = Counter()
for level_gramms in dicts_n_gramms:
    merged_counts += Counter(level_gramms)

result = Counter()
for gram_words, gram_freq in merged_counts.items():
    result[' '.join(gram_words)] = gram_freq

In [5]:
# Number of distinct phrases (n-grams of every collected length) in result.
len(result)

852878

In [None]:
# How many phrases occur more than three times.
sum(1 for freq in result.values() if freq > 3)

In [None]:
# sorted([n for n, v in result.items() if v == 3], key=lambda x: len(x), reverse=True)

In [None]:
# Every phrase in result, longest string first (length is counted in
# characters of the joined phrase); phrases of equal length are listed in
# ascending lexicographic order.
sorted(result, key=lambda phrase: (-len(phrase), phrase))