In [None]:
import logging

# Print log records as "LEVEL : message" at INFO and above.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# basicConfig() is a no-op when the root logger already has handlers, so the
# level is also set explicitly. setLevel() is used instead of assigning to
# logging.root.level because it additionally invalidates the loggers' cached
# isEnabledFor() results.
logging.getLogger().setLevel(logging.INFO)

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from elasticsearch_dsl import Search, Q

In [None]:
# Address of the Elasticsearch node that the notebook queries.
es_hosts = [{'host': '172.17.0.2'}]
es = Elasticsearch(es_hosts)

In [None]:
# Unfiltered search over the "simplewiki" index (`s` is reassigned in later cells).
s = Search(using=es).index("simplewiki")

In [None]:
# Match query for "dog" against the `_all` field of the "simplewiki" index.
s = Search(using=es, index="simplewiki").query(Q("match", _all="dog"))

In [None]:
from functools import reduce

# One match query per title term, combined below with `|` into a single query.
title_terms = ["dog", "cat", "fox"]
title_queries = [Q("match", title=term) for term in title_terms]

s = Search(using=es, index="simplewiki")
s.query = reduce(lambda left, right: left | right, title_queries)

In [None]:
import pandas as pd

In [None]:
# Iterate over every hit returned by s.scan() and keep only its title and text.
hit_records = ({"title": hit.title, "text": hit.text} for hit in s.scan())
docs = pd.DataFrame(hit_records)

In [None]:
# Number of documents retrieved.
docs.shape[0]

In [None]:
from gensim import models
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    """Split `text` into tokens and drop stopwords.

    Tokenization is delegated to gensim's `simple_preprocess`; any token
    contained in gensim's `STOPWORDS` set is discarded.

    Returns the remaining tokens as a list, in their original order.
    """
    kept = []
    for token in simple_preprocess(text):
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept

In [None]:
# Build the token <-> id mapping from the tokenized document texts.
id2word = Dictionary(map(tokenize, docs["text"]))
# Prune the vocabulary by document frequency (thresholds: no_below / no_above),
# then compact the id space.
id2word.filter_extremes(no_above=0.1, no_below=5)
id2word.compactify()

In [None]:
# Bag-of-words vector for every document, in the same order as `docs`.
corpus = []
for text in docs["text"]:
    corpus.append(id2word.doc2bow(tokenize(text)))

In [None]:
#from sklearn.manifold import TSNE
from MulticoreTSNE import MulticoreTSNE as TSNE

# t-SNE hyperparameters; the same `tsne` object is used for the LDA and the
# LSI embeddings further down.
perplexity = 15
learning_rate = 400

# Equivalent call without n_jobs, for the sklearn TSNE whose import is commented out above:
#tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=learning_rate, random_state=1)
tsne_options = dict(
    n_components=2,
    perplexity=perplexity,
    learning_rate=learning_rate,
    random_state=1,
    n_jobs=8,
)
tsne = TSNE(**tsne_options)

# LDA

In [None]:
from gensim.models import LdaModel

In [None]:
# Fit a 20-topic LDA model with 10 passes over the corpus.
# LDA training is stochastic; a fixed seed (the same value the t-SNE step
# uses) keeps the topics and every downstream plot reproducible across
# Restart & Run All.
model = LdaModel(
    corpus,
    num_topics=20,
    id2word=id2word,
    passes=10,
    random_state=1,
)

In [None]:
# Per-document topic weights from the trained LDA model (second return value unused).
gamma, _ = model.inference(corpus)
# Normalize every row so that a document's topic weights sum to 1.
row_totals = gamma.sum(axis=1, keepdims=True)
doc_topic_dists = gamma / row_totals
topic_labels = ["Topic %d" % topic_no for topic_no in range(doc_topic_dists.shape[1])]
doc_topics_df = pd.DataFrame(doc_topic_dists, columns=topic_labels)

In [None]:
# Project the LDA document-topic distributions down to two dimensions.
tsne_data = tsne.fit_transform(doc_topic_dists)
component_names = ['Component 1', 'Component 2']
tsne_df = pd.DataFrame(tsne_data, index=docs.index, columns=component_names)
# Attach the 2-D coordinates to the documents, aligned on the shared index.
docs_all = pd.concat([docs, tsne_df], axis="columns")

In [None]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [None]:
# Prepare the pyLDAvis data for the LDA model and render it in the notebook.
vis_data = gensimvis.prepare(topic_model=model, corpus=corpus, dictionary=id2word)
pyLDAvis.display(vis_data)

# LSI

In [None]:
from gensim.models import LsiModel
import numpy as np
lsi_model = LsiModel(corpus, id2word=id2word, num_topics=20)

# Densify the per-document LSI vectors into a (documents x topics) matrix.
# lsi_model[corpus] yields sparse (topic_id, weight) lists that may contain
# fewer entries than there are topics, so each weight is written to its own
# topic column and missing topics stay 0. (Previously any document with fewer
# than 20 entries was replaced by an all-zero row.)
# The width comes from the model (projection.k) rather than a hard-coded 20,
# so the matrix always matches the column labels below.
num_lsi_topics = lsi_model.projection.k
doc_lsi_topics = np.zeros((len(corpus), num_lsi_topics))
for doc_no, doc_topics in enumerate(lsi_model[corpus]):
    for topic_no, weight in doc_topics:
        doc_lsi_topics[doc_no, topic_no] = weight
doc_lsi_topics_df = pd.DataFrame(doc_lsi_topics, columns=["Topic %d" % i for i in range(num_lsi_topics)])

In [None]:
# Project the LSI document-topic weights down to two dimensions.
# Note: this reassigns tsne_data, tsne_df and docs_all from the LDA section.
tsne_data = tsne.fit_transform(doc_lsi_topics)
component_names = ['Component 1', 'Component 2']
tsne_df = pd.DataFrame(tsne_data, index=docs.index, columns=component_names)
# Attach the 2-D coordinates to the documents, aligned on the shared index.
docs_all = pd.concat([docs, tsne_df], axis="columns")

# Both: plot the t-SNE embedding (of whichever model, LDA or LSI, was run last)

In [None]:
from bokeh.plotting import (
    figure,
    ColumnDataSource,
    output_notebook,
    output_file,
    show,
    save,
)
from bokeh.models import (
    HoverTool,
    WheelZoomTool,
    PanTool,
    BoxZoomTool,
    ResetTool,
    TapTool,
    SaveTool,
)
from bokeh.palettes import brewer

# Send Bokeh output to the notebook.
output_notebook()

In [None]:
title = "Search Results"

# Every column of docs_all becomes available to the glyphs and tooltips.
source = ColumnDataSource(docs_all)

# Only the document title is shown on hover. To show every column instead,
# build the tooltips from reversed(docs_all.columns).
hover_columns = ["title"]
hover = HoverTool(tooltips=[(name, '@' + name) for name in hover_columns])

tools = [
    hover,
    WheelZoomTool(),
    PanTool(),
    BoxZoomTool(),
    ResetTool(),
    TapTool(),
    SaveTool(),
]

figure_options = dict(
    tools=tools,
    title=title,
    plot_width=800,
    plot_height=800,
    toolbar_location='below',
    toolbar_sticky=False,
)
p = figure(**figure_options)

# One circle per document, positioned at its two t-SNE components.
marker_style = dict(
    size=10,
    line_color='#333333',
    line_width=0.5,
    fill_alpha=0.8,
    color='#333333',
)
p.circle(x='Component 1', y='Component 2', source=source, **marker_style)

show(p)