In [60]:
import bisect
import re
import sys

from navec import Navec
from scipy.spatial import distance
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [8]:
def create_array_from_file(file):
    """Collect the first whitespace-delimited token of every line.

    Args:
        file: Iterable of text lines (for example an open text file).

    Returns:
        list[str]: One entry per input line, in input order. A line that is
        empty or begins with whitespace contributes an empty string, because
        the text before the first whitespace run is empty.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on modern Python. maxsplit=1 is enough because
    # only the first token is kept.
    return [re.split(r'\s+', line, maxsplit=1)[0] for line in file]

In [9]:
# Enumerate every way to split an abbreviation into consecutive blocks.
def segmentation(abbreviation, n=10, max_len=6):
    """Return all splits of ``abbreviation`` into consecutive blocks.

    Args:
        abbreviation: String to split.
        n: Upper bound on the number of blocks. Once only one block is
            allowed, the remaining text is returned whole, even if it is
            longer than ``max_len``.
        max_len: Maximum length of a block cut off by the splitting loop, and
            the maximum length for which the unsplit string is itself added
            as a candidate.

    Returns:
        list[list[str]]: Every split found; each split is a list of blocks
        that concatenate back to ``abbreviation``.
    """
    n = min(n, len(abbreviation))
    if n == 1 or len(abbreviation) == 1:
        return [[abbreviation]]
    parts_set = []
    for i in range(1, min(max_len + 1, len(abbreviation))):
        prefix = abbreviation[:i]
        # max_len must be forwarded; otherwise a caller-supplied limit would
        # only constrain the first block and the tail would fall back to 6.
        for tail in segmentation(abbreviation[i:], n - 1, max_len):
            parts_set.append([prefix] + tail)
    if len(abbreviation) <= max_len:
        parts_set.append([abbreviation])
    return parts_set

In [11]:
def _words_with_prefix(sorted_words, prefix):
    """Return the contiguous run of entries in ``sorted_words`` starting with ``prefix``."""
    if not prefix:
        # An empty block is treated as having no dictionary expansion.
        return []
    # bisect_left gives the first index whose entry is >= prefix, so a match
    # sitting at index 0 (or an empty dictionary) is handled correctly.
    start = bisect.bisect_left(sorted_words, prefix)
    end = start
    while end < len(sorted_words) and sorted_words[end].startswith(prefix):
        end += 1
    return sorted_words[start:end]


# Collect every meaningful segmentation of an abbreviation.
def get_reasonable_segments(abbreviation, dictionary_path="../dictionaries/dictionary.txt"):
    """Keep the segmentations whose every block is a prefix of a dictionary word.

    Args:
        abbreviation: String to segment (passed to ``segmentation``).
        dictionary_path: UTF-8 text file with one word as the first token of
            each line. The lookup is a binary search, so this assumes the
            file is sorted in Python string order — TODO confirm.

    Returns:
        list[list[list[str]]]: One entry per accepted segmentation. Each entry
        holds, for every block of that segmentation in order, the list of
        dictionary words that start with the block.
    """
    # Context manager so the dictionary file is closed after reading.
    with open(dictionary_path, 'r', encoding='utf-8') as rus_dict:
        rus_dict_array = create_array_from_file(rus_dict)

    list_of_extended_words_lists = []
    for partition in segmentation(abbreviation):
        extended_words_list = []
        for part in partition:
            words = _words_with_prefix(rus_dict_array, part)
            if not words:
                # One block without any expansion rejects the whole partition.
                break
            extended_words_list.append(words)
        else:
            list_of_extended_words_lists.append(extended_words_list)
    return list_of_extended_words_lists

In [12]:
def add_vectors(v, w):
    """Return the element-wise midpoint of two vectors as a list.

    Despite the name, each pair of components is summed and then halved.
    Pairing stops at the end of the shorter input.
    """
    averaged = []
    for left, right in zip(v, w):
        averaged.append((left + right) / 2)
    return averaged

In [13]:
def recursive_checking(part, navec, threashold, idx=0, vec=None):
    """Build phrases by picking one word per position, filtered by cosine distance.

    Args:
        part: List of candidate-word lists, one list per phrase position.
        navec: Word-to-vector lookup; indexing it with an unknown word must
            raise KeyError (such words are skipped).
        threashold: A candidate is dropped when its cosine distance to the
            vector of the phrase built so far (plus a tiny epsilon) exceeds
            this value.
        idx: Position currently being filled.
        vec: Vector of the phrase built so far; ``None`` or an empty sequence
            means there is no context yet, so the first word is accepted
            unconditionally.

    Returns:
        list[list[str]]: All accepted phrases covering positions ``idx`` to
        the end. ``[[]]`` when ``idx`` is already past the last position.
    """
    eps = 1e-9
    if idx == len(part):
        return [[]]
    # Explicit emptiness test: comparing a numpy vector with `[]` via `!=`
    # is an element-wise comparison of mismatched shapes, which warns on old
    # NumPy and raises on current releases.
    has_context = vec is not None and len(vec) > 0
    suitable = []
    for candidate in part[idx]:
        try:
            word_vec = navec[candidate]
        except KeyError:
            continue
        if has_context:
            if distance.cosine(vec, word_vec) + eps > threashold:
                continue
            new_vec = add_vectors(vec, word_vec)
        else:
            new_vec = word_vec
        for tail in recursive_checking(part, navec, threashold, idx + 1, new_vec):
            suitable.append([candidate] + tail)
    return suitable

In [14]:
def segments_union(partition, threashold):
    """Load the Navec embeddings and gather the phrases for every entry.

    Each element of ``partition`` is passed to ``recursive_checking`` together
    with the loaded embeddings and ``threashold``; the resulting phrase lists
    are concatenated in order.
    """
    model_path = '../dataset/fit_language_models/navec_hudlit_v1_12B_500K_300d_100q.tar'
    embeddings = Navec.load(model_path)
    return [
        phrase
        for part in partition
        for phrase in recursive_checking(part, embeddings, threashold)
    ]

In [49]:
def get_num_for_friend(phrases, navec):
    """Score each phrase by how far its last word is from the words before it.

    For every phrase the vector of the first word is merged, via
    ``add_vectors``, with the vectors of the words strictly between the first
    and the last one. The score is the cosine distance between that vector
    and the vector of the last word.

    Returns:
        list[tuple]: ``(phrase, score)`` pairs in input order.
    """
    scored = []
    for phrase in phrases:
        context_vec = navec[phrase[0]]
        for middle_word in phrase[1:-1]:
            context_vec = add_vectors(context_vec, navec[middle_word])
        closing_vec = navec[phrase[-1]]
        scored.append((phrase, distance.cosine(context_vec, closing_vec)))
    return scored

In [56]:
def clasterization(phrases):
    """Cluster phrases by their embedding and keep one phrase per cluster.

    Every phrase is turned into a single vector by merging its word vectors
    left to right with ``add_vectors``. The vectors are standardized and
    clustered with DBSCAN.

    Args:
        phrases: List of phrases, each a list of words known to the Navec
            model loaded here.

    Returns:
        list: Phrases labelled as noise (-1) in input order, followed by one
        representative per cluster: the smallest phrase of the cluster under
        Python's list ordering. Empty input yields an empty list.
    """
    if len(phrases) == 0:
        # Nothing to cluster; avoid loading the model and fitting on no data.
        return []
    path = '../dataset/fit_language_models/navec_hudlit_v1_12B_500K_300d_100q.tar'
    navec = Navec.load(path)
    X = []
    for phrase in phrases:
        # None marks "no word merged yet"; comparing a numpy vector with `[]`
        # via `==` warns on old NumPy and raises on current releases.
        vec = None
        for word in phrase:
            vec = navec[word] if vec is None else add_vectors(vec, navec[word])
        X.append(vec if vec is not None else [])
    X = StandardScaler().fit_transform(X)
    labels = DBSCAN(eps=16, min_samples=1).fit(X).labels_
    clastered_phrases = []
    best_phares_for_label = {}
    for phrase, label in zip(phrases, labels):
        # NOTE(review): with min_samples=1 DBSCAN presumably never assigns the
        # noise label, so this branch may be unreachable — confirm.
        if label == -1:
            clastered_phrases.append(phrase)
        elif label not in best_phares_for_label:
            best_phares_for_label[label] = phrase
        else:
            # NOTE(review): the representative is chosen lexicographically;
            # a score such as get_num_for_friend may have been intended — confirm.
            best_phares_for_label[label] = min(phrase, best_phares_for_label[label])
    clastered_phrases.extend(best_phares_for_label.values())
    return clastered_phrases

In [None]:
def estimation():
    """Placeholder for an evaluation step that has not been written yet.

    A bare signature without a body is a syntax error, so the stub raises
    explicitly until the implementation is added.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("estimation() is not implemented yet")

In [10]:
# parts_set = segmentation('даздрасперма')
# rationality_filtration(parts_set)

In [65]:
# For each dictionary-backed segmentation of 'тд', list the dictionary words that extend every block.
k = get_reasonable_segments('тд')

In [66]:
# Combine the candidate words into phrases, dropping a word when its cosine distance to the phrase built so far exceeds 0.7.
o = segments_union(k, 0.7)

  # This is added back by InteractiveShellApp.init_path()


In [None]:
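# Scratch note kept as a comment (as code it would index a list with a tuple and raise TypeError):
# ['табак', 'тверь'] ['кот', 'трубка']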

In [61]:
# For each dictionary-backed segmentation of 'йюй', list the dictionary words that extend every block.
p = get_reasonable_segments('йюй')

In [62]:
# Build phrases from the 'йюй' segments in `p`; `k` holds the segments for 'тд',
# so passing it here only worked through stale kernel state.
l = segments_union(p, 0.7)

  # This is added back by InteractiveShellApp.init_path()


In [63]:
l

[['йодистый', 'юнкерский', 'йод'],
 ['йодистый', 'юнкерский', 'йодистый'],
 ['йоркшир', 'юго-восточный', 'йоркшир'],
 ['йоркшир', 'юго-восточный', 'йоркширский'],
 ['йоркшир', 'юго-западный', 'йоркшир'],
 ['йоркшир', 'юго-западный', 'йоркширский'],
 ['йоркшир', 'южный', 'йоркшир'],
 ['йоркшир', 'южный', 'йоркширский'],
 ['йоркширский', 'южноамериканский', 'йодистый'],
 ['йоркширский', 'южноамериканский', 'йоркшир'],
 ['йоркширский', 'южноамериканский', 'йоркширский'],
 ['йоркширский', 'южноафриканский', 'йоркшир'],
 ['йоркширский', 'южноафриканский', 'йоркширский'],
 ['йоркширский', 'южнорусский', 'йоркширский'],
 ['йоркширский', 'юморок', 'йоркширский']]

In [64]:
# Cluster the candidate phrases and show one representative phrase per cluster.
clasterization(l)

  if __name__ == '__main__':


[['йодистый', 'юнкерский', 'йод'],
 ['йодистый', 'юнкерский', 'йодистый'],
 ['йоркшир', 'юго-восточный', 'йоркшир'],
 ['йоркширский', 'южноамериканский', 'йодистый']]