In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, LsiModel, TfidfModel, Doc2Vec, FastText
from gensim import corpora
import pandas as pd
import re

In [None]:
# Bokeh powers the interactive scatter plots; install it first if it is missing:
# !pip install bokeh
import bokeh.models as bm, bokeh.plotting as pl, bokeh.palettes as palettes
from bokeh.io import output_notebook
# Configure Bokeh to render its figures inside the notebook.
output_notebook()

In [None]:
def draw_vectors(x, y, radius=10, alpha=0.25, classes=None,
                 width=600, height=400, show=True, tokens=None):
    """Scatter-plot 2-D points with Bokeh, optionally coloured by class.

    Parameters
    ----------
    x, y : sequence
        Point coordinates; both must have the same length.
    radius : number
        Marker size (passed to the glyph as ``size``).
    alpha : float
        Marker opacity.
    classes : sequence, optional
        One label per point. When given, every distinct label gets its own
        viridis colour and a legend entry; otherwise all points are blue.
    width, height : int
        Figure dimensions.
    show : bool
        Display the figure immediately when true.
    tokens : sequence, optional
        One string per point. It is shown in the hover tooltip and appended
        to ``https://`` when a point is tapped. Defaults to empty strings.

    Returns
    -------
    The Bokeh figure that was built.
    """
    # Local import so the function does not depend on a later cell having
    # imported numpy into the notebook namespace.
    import numpy as np

    n_points = len(x)
    if tokens is None:
        # Give the data source a full-length column instead of a bare None.
        tokens = [""] * n_points

    columns = {'x': x, 'y': y, "tokens": tokens}
    if classes is None:
        columns['color'] = ["blue"] * n_points
    else:
        labels = np.unique(classes)
        palette = palettes.viridis(len(labels))
        colour_of = {label: palette[pos] for pos, label in enumerate(labels)}
        columns['color'] = [colour_of[label] for label in classes]
        columns['classes'] = classes
    data_source = bm.ColumnDataSource(columns)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    glyph_options = dict(size=radius, color='color', alpha=alpha, source=data_source)
    if classes is not None:
        # `legend_field` replaces the deprecated generic `legend=` keyword
        # and keeps the same "one entry per distinct column value" behaviour.
        glyph_options['legend_field'] = 'classes'
    fig.circle('x', 'y', **glyph_options)

    fig.add_tools(bm.HoverTool(tooltips=[("tokens", "@" + "tokens")]))

    # Tapping a point opens the point's token as a web address.
    url = "https://@tokens"
    fig.add_tools(bm.TapTool(callback=bm.OpenURL(url=url)))
    if classes is not None:
        fig.legend.location = "top_left"
    if show:
        pl.show(fig)
    return fig

Загружаем модели

In [None]:
# Load the previously saved models from the local 700K/ directory.
lda = LdaModel.load('700K/gensim_lda_model_700K_400_de8')
lsi = LsiModel.load('700K/gensim_lsi_model_700K_de8')
doc2vec = Doc2Vec.load('700K/doc2vec_model_700K_de')
fast = FastText.load('700K/fasttext_700K_de.model')

In [None]:
# Output dimensionality of each model. lda_size and lsi_size are the dense
# vector lengths passed to flatify further down; doc2vec_size and fast_size
# are not referenced by the cells visible in this notebook.
lda_size = lda.num_topics
lsi_size = lsi.num_topics
doc2vec_size = doc2vec.vector_size
fast_size = fast.vector_size

In [None]:
def flatify(pair_list, size):
    """Expand sparse ``(index, value)`` pairs into a dense list of ``size`` floats.

    Positions that do not occur in ``pair_list`` stay at ``0.0``. When an
    index occurs more than once, the last value wins.
    """
    dense = [0.0] * size
    for position, weight in pair_list:
        dense[position] = float(weight)
    return dense

In [None]:
# Tab-separated input table; later cells read its `content` and `domain` columns.
df = pd.read_csv('de_inter_shop.csv', sep='\t')
df.describe()

In [None]:
from nltk import  word_tokenize
import string
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.cistem import Cistem
from nltk.tokenize.toktok import ToktokTokenizer


stemm = Cistem()
# Token pattern: an optionally signed number (with optional decimal part) or a
# run of word characters. Written as a raw string: the pattern is unchanged,
# but '\-', '\.' and '\w' are invalid escapes in an ordinary string literal
# and trigger a warning on recent Python versions.
tokk = RegexpTokenizer(r'\-?[0-9]+(?:\.[0-9]+)?|\w+')
# sw = stop_words.get_stop_words("de")

def stemm_it(t):
    """Tokenise ``t`` with ``tokk`` and return the stem of every token, in order."""
    return [stemm.stem(piece) for piece in tokk.tokenize(t)]

In [None]:
# Normalise the raw text: lower-case it, blank out digits, then collapse the
# resulting whitespace runs into single spaces.
df.content = df.content.apply(lambda x: x.lower())
df.content = df.content.apply(lambda x: re.sub('[0-9]', ' ', x))
# The pattern used to be r's+', which replaced runs of the letter "s" rather
# than whitespace and so mangled the words themselves.
# NOTE(review): if the saved dictionary/models were built with the old
# pattern, re-check that the vocabulary still lines up.
df.content = df.content.apply(lambda x: re.sub(r'\s+', ' ', x))
df.head(2)

In [None]:
# Load the saved gensim dictionary (text format) and the saved TF-IDF model.
loaded_dct = Dictionary.load_from_text("700K/de_dict_700K.dict")
tfidf = TfidfModel.load("700K/tfidfModel_700K")

In [None]:
# Tokenise and stem every document (plain `x.split()` was the earlier
# alternative), turn each token list into a bag-of-words, then re-weight the
# whole corpus with the loaded TF-IDF model.
tokens = df.content.apply(stemm_it)
d2b_vector = list(map(loaded_dct.doc2bow, tokens))
tf_vector = tfidf[d2b_vector]

In [None]:
# Attach the derived features to the frame.
df['tf_vectors'] = tf_vector
df['tokens'] = tokens
# Overwrites `content` with the space-joined stemmed tokens.
df['content'] = df.tokens.apply(lambda r: ' '.join(r))
df.head(2)

In [None]:
# Dense LDA vector per document: the (index, value) pairs returned by lda[...]
# are expanded by flatify into lda_size floats.
df['vector_lda'] = df.tf_vectors.apply(lambda x: flatify(lda[x], lda_size))

In [None]:
# Dense LSI vector per document: the (index, value) pairs returned by lsi[...]
# are expanded by flatify into lsi_size floats.
df['vector_lsi'] = df.tf_vectors.apply(lambda x: flatify(lsi[x], lsi_size))

In [None]:
# FastText vector per document.
# NOTE(review): `content` is the whole space-joined document, so the model is
# queried with one long string as a single key — presumably resolved through
# character n-grams. Confirm this is intended rather than combining per-token vectors.
df['fast'] = df.content.apply(lambda x: fast.wv[x])

In [None]:
# Infer a Doc2Vec vector for each document from its token list.
# NOTE(review): infer_vector may not be deterministic across runs — confirm
# whether reproducibility matters here.
df['doc2vec'] = df.tokens.apply(lambda x: doc2vec.infer_vector(x))

In [None]:
# Remove the cluster-label columns that de_clusters adds to the frame.
# errors='ignore' lets this cell run on a fresh kernel, where those columns
# have not been created yet (plain drop would raise KeyError).
df.drop(['clusters_lda', 'clusters_lsi', 'clusters_d2v', 'clusters_fast'],
        axis=1, inplace=True, errors='ignore')

In [None]:
# Persist the frame as a tab-separated file without the index column.
# NOTE(review): the list/array-valued columns are presumably written as their
# string representation — confirm downstream readers can parse them.
df.to_csv('de_int_shop_cluster.csv', sep='\t', index=False)

In [None]:
# Expand each per-document vector column into a 2-D array (one row per
# document) for the dimensionality-reduction and clustering steps below.
vector_lda = df.vector_lda.apply(pd.Series).values
vector_lsi = df.vector_lsi.apply(pd.Series).values
vector_fast = df.fast.apply(pd.Series).values
vector_doc2vec = df.doc2vec.apply(pd.Series).values

In [None]:
# 2-D t-SNE projection of every embedding matrix.
# NOTE(review): no random_state is passed, so the layouts presumably differ
# between runs; the *_pic results are also not referenced by the later cells
# visible in this notebook.
from sklearn.manifold import TSNE
lda_pic = TSNE(n_components=2).fit_transform(vector_lda)
lsi_pic = TSNE(n_components=2).fit_transform(vector_lsi)
fast_pic = TSNE(n_components=2).fit_transform(vector_fast)
doc2vec_pic = TSNE(n_components=2).fit_transform(vector_doc2vec)

In [None]:
# !pip3 install umap-learn

In [None]:
# UMAP projection of every embedding matrix, one reducer per matrix.
# The *_umap arrays are the plot coordinates handed to de_clusters via
# zip_param below.
# NOTE(review): no random_state is passed — presumably the layouts are not
# reproducible between runs; confirm if that matters.
import umap
reducer1 = umap.UMAP()
lda_umap = reducer1.fit_transform(vector_lda)
reducer2 = umap.UMAP()
lsi_umap = reducer2.fit_transform(vector_lsi)
reducer3 = umap.UMAP()
fast_umap = reducer3.fit_transform(vector_fast)
reducer4 = umap.UMAP()
doc2vec_umap = reducer4.fit_transform(vector_doc2vec)

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import numpy as np

def de_clusters(type_clust, vectors, num_clusters, name_vec, vec_for_draw,
                random_state=None):
    """Cluster ``vectors``, store the labels on ``df`` and plot them.

    Parameters
    ----------
    type_clust : str
        ``'aglo'`` for agglomerative clustering or ``'kmeans'`` for k-means.
    vectors : array-like
        Feature matrix to cluster, one row per row of the global ``df``.
    num_clusters : int
        Number of clusters to produce.
    name_vec : str
        Suffix of the ``clusters_<name_vec>`` column written to ``df``.
    vec_for_draw : 2-D array
        Plot coordinates; column 0 is used as x and column 1 as y.
    random_state : optional
        Seed forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour.

    Raises
    ------
    ValueError
        If ``type_clust`` is not one of the supported names.

    Side effects: adds or overwrites a column on the global ``df`` and shows
    a Bokeh figure through ``draw_vectors``.
    """
    if type_clust == 'aglo':
        # Euclidean distance is the estimator's default, so the explicit
        # `affinity` keyword (deprecated in newer scikit-learn) is not needed.
        agglo = AgglomerativeClustering(n_clusters=num_clusters)
        answer = agglo.fit_predict(vectors)
    elif type_clust == 'kmeans':
        km = KMeans(n_clusters=num_clusters, random_state=random_state)
        km.fit(vectors)
        answer = km.labels_.tolist()
    else:
        # Fail with a clear message instead of an unbound `answer` further down.
        raise ValueError(
            "type_clust must be 'aglo' or 'kmeans', got {!r}".format(type_clust))

    df['clusters_{}'.format(name_vec)] = answer
    x, y = vec_for_draw[:, 0], vec_for_draw[:, 1]
    # The `domain` column supplies the hover text and the tap-to-open target.
    draw_vectors(x, y, classes=answer, alpha=0.7, tokens=df['domain'])

In [None]:
# Clustering configuration. Each row of zip_param holds the positional
# arguments of de_clusters: (type_clust, vectors, num_clusters, name_vec,
# vec_for_draw). The UMAP projections serve as the plot coordinates.
num_clusters = 3
type_clust = 'kmeans'  # de_clusters also recognises 'aglo'
zip_param = [
    [type_clust, vector_lda, num_clusters, 'lda', lda_umap],
    [type_clust, vector_lsi, num_clusters, 'lsi', lsi_umap],
    [type_clust, vector_fast, num_clusters, 'fast', fast_umap],
    [type_clust, vector_doc2vec, num_clusters, 'd2v', doc2vec_umap],
]

In [None]:
# Cluster and plot every embedding, one configuration row at a time.
for cluster_args in zip_param:
    de_clusters(*cluster_args)