In [None]:
import pickle
import os

# Switch the working directory to the project root (assumed to be two levels
# above this notebook) before importing from `src` and before using the
# relative `data/` and `outputs/` paths further down.
# The root is resolved to an absolute path once and cached, so re-running this
# cell is idempotent instead of climbing two more directories on every run.
if '_PROJECT_ROOT' not in globals():
    _PROJECT_ROOT = os.path.abspath('../../')
os.chdir(_PROJECT_ROOT)
from src.utils.helper_som import get_minimal_distance_factors

import numpy as np
import pandas as pd

from matplotlib import cm
from bokeh.colors import RGB
from bokeh.models import HoverTool
from bokeh.plotting import ColumnDataSource, figure, output_file, show
from bokeh.io import curdoc, show, output_notebook

# Render Bokeh plots inline in the notebook.
output_notebook()

In [None]:
# Load the sampled pages, their normalised document vectors and the trained SOM.
df_sample = pd.read_pickle(filepath_or_buffer='data/processed/df_sample.pkl')
normed_array_doc_vec = np.load(file='data/processed/doc_vec_norm_sample.npy')
# NOTE(review): unpickling can execute arbitrary code -- only load SOM files
# that come from a trusted source.
with open('data/processed/som.p', 'rb') as infile:
    som = pickle.load(infile)

# Derived parameters: length of a document vector and the two grid dimensions
# (x, y) used below to flatten the neuron coordinates.
n_samples = normed_array_doc_vec.shape[0]
len_vector = normed_array_doc_vec.shape[1]
x, y = get_minimal_distance_factors(n=5 * np.sqrt(n_samples))

# Every sample mapped onto the same neuron is treated as one cluster, i.e.
# each neuron represents a cluster. To label clusters with a single number,
# the two-dimensional neuron coordinates are flattened into a one-dimensional index.
winner_per_sample = [som.winner(doc_vec) for doc_vec in normed_array_doc_vec]
# Transpose so that the first row holds all first coordinates and the second
# row all second coordinates, which is the layout np.ravel_multi_index expects.
winner_coordinates = np.array(winner_per_sample).T
cluster_index = np.ravel_multi_index(multi_index=winner_coordinates, dims=(x, y))

In the following three lines, we retrieve:
- the positions of the neurons on a Euclidean plane that reflects the chosen topology, as the meshgrids `xx` and `yy`, e.g. neuron (1, 4) -> `xx[1, 4]`, `yy[1, 4]`
- the distance map of the weights
- the weights of the neural network

In [None]:
# Weights of the neural network.
weights = som.get_weights()
# Distance map of the weights.
umatrix = som.distance_map()
# Meshgrids holding the neuron positions on the Euclidean plane.
xx, yy = som.get_euclidean_coordinates()

In [None]:
# Factor applied to every Euclidean y coordinate before plotting
# (2 / sqrt(3) * 3 / 4 == sqrt(3) / 2). The same factor is used for the tiles
# and for the sample dots so that both stay aligned.
# NOTE(review): presumably this spaces the hexagon rows correctly -- confirm.
HEX_ROW_SCALE = 2 / np.sqrt(3) * 3 / 4

# Visit every neuron of the grid in row-major order.
n_rows, n_cols = weights.shape[:2]
neuron_indices = [(i, j) for i in range(n_rows) for j in range(n_cols)]

# Centre of each tile and its colour taken from the distance map.
tile_centres_column = [xx[idx] for idx in neuron_indices]
tile_centres_row = [yy[idx] * HEX_ROW_SCALE for idx in neuron_indices]
# cm.viridis returns RGBA float tuples with components in [0, 1].
hex_colours = [cm.viridis(umatrix[idx]) for idx in neuron_indices]

# Plot position of each sample: the Euclidean position of its winning neuron.
weight_x = []
weight_y = []
for doc_vec in normed_array_doc_vec:
    wx, wy = som.convert_map_to_euclidean(xy=som.winner(doc_vec))
    weight_x.append(wx)
    weight_y.append(wy * HEX_ROW_SCALE)

# Convert the matplotlib colours (RGBA float tuples) to bokeh hex strings.
# Only the three colour channels are scaled to 0-255: the alpha channel must
# not be scaled and passed on, because bokeh's RGB expects alpha in [0, 1].
viridis_plt_hex = [(255 * np.array(rgba[:3])).astype(int) for rgba in hex_colours]
viridis_bokeh_hex = [RGB(*(int(channel) for channel in rgb)).to_hex()
                     for rgb in viridis_plt_hex]

## Visualisation
We plot the SOM's hexagonal topology below using Bokeh, so that the plot is interactive: hovering over a dot shows the `base_path` of the page mapped to that neuron, which lets users see which pages belong to which clusters of similar content.

In [None]:
# Also write the plot to a standalone HTML file.
output_file("outputs/som_visualise_bokeh.html")

# NOTE(review): `plot_height` / `plot_width` were removed in Bokeh 3.0 in
# favour of `height` / `width`; switch if the environment is upgraded.
fig = figure(title="SOM: Hexagonal Topology",
             plot_height=800, plot_width=800,
             match_aspect=True,
             tools="wheel_zoom,save,reset")

# One entry per neuron: tile centre and colour.
source_hex = ColumnDataSource(
    data=dict(
        x=tile_centres_column,
        y=tile_centres_row,
        c=viridis_bokeh_hex
    )
)

# One entry per sample: position of its winning neuron and the page's base path.
source_dot = ColumnDataSource(
    data=dict(
        wx=weight_x,
        wy=weight_y,
        bp=df_sample['base_path']
    )
)

# The x and y columns are deliberately swapped for both glyphs, so tiles and
# dots stay aligned with each other.
fig.hex(x='y', y='x', source=source_hex,
        size=50 * (.95 / np.sqrt(3)),
        alpha=.4,
        line_color='gray',
        fill_color='c')

dot_renderer = fig.dot(x='wy', y='wx', source=source_dot,
                       size=30,
                       line_color='black')

# Attach the hover tool to the dots only. The hexagon source has no `bp`
# column, so hovering a tile would otherwise show "base_path: ???".
fig.add_tools(HoverTool(
    renderers=[dot_renderer],
    tooltips=[
        ("base_path", '@bp'),
    ],
    mode="mouse",
    point_policy="follow_mouse"
))

show(fig)