In [1]:
import networkx as nx
import numpy as np
import pandas as pd
from math import sqrt
import csv
import sys

In [2]:
from sklearn.cluster import DBSCAN
from scipy.sparse import *

In [3]:
from tqdm.notebook import tqdm

In [4]:
from py.pyBallMapper_Bokeh import graph_GUI, read_graph_from_list

In [5]:
from matplotlib.colors import ListedColormap
from matplotlib import cm

from bokeh.models import FixedTicker, LinearColorMapper, LogColorMapper, ColorBar, BasicTicker, LogTicker
from matplotlib.colors import to_hex

In [6]:
from bokeh.plotting import figure, show

In [7]:
# to deal with large csv
maxInt = sys.maxsize
decrement = True

while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
        decrement = True

# Let us visualize the BM plots

In [8]:
# table with the coloring functions
coloring_df = pd.read_csv('data/digits_y.csv', sep=' ')
# R indices start from 1...
coloring_df.index = range(1, len(coloring_df)+1)

print(coloring_df.shape)

(1797, 1)


## BM on full digits data

In [9]:
# PLOT THE BM GRAPH o

EPSILON = 50

# adj lists path
GRAPH1_PATH = 'BM_graphs/digits_X/{}_edges'.format(EPSILON)
# point covered by each node path
GRAPH1_POINTS_PATH = 'BM_graphs/digits_X/{}_points_covered_by_landmarks'.format(EPSILON)


###########
# GRAPH 1 #
###########

#Here we adopt standard colour palette
my_red_palette = cm.get_cmap(name='jet') # multicolor
#my_red_palette = cm.get_cmap(name='Reds') # monochrome

# read graph
# ASSUME NODES ARE NUMBERED FROM 1 TO N
G1 = read_graph_from_list(GRAPH1_PATH, GRAPH1_POINTS_PATH,
                          coloring_df[['label']],
                          add_points_covered=False
                          )

# create a GUI with input our BM graph, 
# a dataframe with coloring functions (one value per point in the pointcloud)
# and a color palette
# in this case we use the pointcloud as coloring function
print('creating GUI')
my_fancy_gui = graph_GUI(G1, my_red_palette, 
                         coloring_df[['label']].columns.to_list(),
                         figsize=(800, 600),
                         render_iterations=2000)

my_fancy_gui.color_by_variable('label', MIN_VALUE=0, MAX_VALUE=9)

# add a legend
num_ticks = 10
low = 0
high = 9
color_mapper = LinearColorMapper(palette=[to_hex(my_red_palette(color_id)) 
                                          for color_id in np.linspace(0, 1, num_ticks)], 
                                 low=low-0.5, high=high+0.5)

ticks = [i for i in range(low, high+1, 1)]
color_ticks = FixedTicker(ticks=ticks)

color_bar = ColorBar(color_mapper=color_mapper, 
                     major_label_text_font_size='14pt',
                     label_standoff=12,
                     ticker=color_ticks,
                    )

my_fancy_gui.plot.add_layout(color_bar, 'right')

my_fancy_gui.plot.title = 'BM plot on full digits data - EPSILON {}'.format(EPSILON)

show(my_fancy_gui.plot)

loading edgelist
loading points covered


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


computing coloring


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


creating GUI
color by variable label 
MIN_VALUE: 0.000, MAX_VALUE: 9.000


ValueError: expected an instance of type Title, got BM plot on full digits data - EPSILON 50 of type str

## BM on PCA (10 dimensions) digits data

In [11]:
# PLOT THE BM GRAPH o

EPSILON = 20

# adj lists path
GRAPH1_PATH = 'BM_graphs/digits_X/{}_edges'.format(EPSILON)
# point covered by each node path
GRAPH1_POINTS_PATH = 'BM_graphs/digits_X/{}_points_covered_by_landmarks'.format(EPSILON)


###########
# GRAPH 1 #
###########

#Here we adopt standard colour palette
my_red_palette = cm.get_cmap(name='jet') # multicolor
#my_red_palette = cm.get_cmap(name='Reds') # monochrome

# read graph
# ASSUME NODES ARE NUMBERED FROM 1 TO N
G1 = read_graph_from_list(GRAPH1_PATH, GRAPH1_POINTS_PATH,
                          coloring_df[['label']],
                          add_points_covered=False
                          )

# create a GUI with input our BM graph, 
# a dataframe with coloring functions (one value per point in the pointcloud)
# and a color palette
# in this case we use the pointcloud as coloring function
print('creating GUI')
my_fancy_gui = graph_GUI(G1, my_red_palette, 
                         coloring_df[['label']].columns.to_list(),
                         figsize=(800, 600),
                         render_iterations=2000)

my_fancy_gui.color_by_variable('label', MIN_VALUE=0, MAX_VALUE=9)

# add a legend
num_ticks = 10
low = 0
high = 9
color_mapper = LinearColorMapper(palette=[to_hex(my_red_palette(color_id)) 
                                          for color_id in np.linspace(0, 1, num_ticks)], 
                                 low=low-0.5, high=high+0.5)

ticks = [i for i in range(low, high+1, 1)]
color_ticks = FixedTicker(ticks=ticks)

color_bar = ColorBar(color_mapper=color_mapper, 
                     major_label_text_font_size='14pt',
                     label_standoff=12,
                     ticker=color_ticks,
                    )

my_fancy_gui.plot.add_layout(color_bar, 'right')

my_fancy_gui.plot.title = 'BM on PCA (10 dimensions) digits data - EPSILON {}'.format(EPSILON)

show(my_fancy_gui.plot)

loading edgelist
loading points covered


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


computing coloring


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


creating GUI
color by variable label 
MIN_VALUE: 0.000, MAX_VALUE: 9.000


ValueError: expected an instance of type Title, got BM on PCA (10 dimensions) digits data - EPSILON 20 of type str

# MAPPER ON BM

In [12]:
# mapper on BM using DBscan as clustering algo
# it can use scipy csr sparse matrix to speed up computations
# inputs:
#     origin_BM   ball mapper graph
#     target_pts  pandas dataframe where to pull back elements in the BM
#     EPS         radius for the DBscan algo
#     MIN_SAMPLES min number of elements in a cluster that make it a cluster and not noise
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

def mapper_on_BM(origin_BM, target_pts, EPS, MIN_SAMPLES=1, sparse=False):
    new_graph = nx.Graph()

    # creates a sparse CSR matrix
    if sparse:
        target_pts = csr_matrix(target_pts.values)
    else:
        target_pts = target_pts.values

    for node in tqdm(origin_BM.nodes):
        X = target_pts[origin_BM.nodes[node]['points covered'], :]

        db = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES).fit(X)
        # create a set of unique labels
        labels = set(db.labels_) - {-1} # outliers are not clusters

        print('\n **********')
        print('node {} contains {} points'.format(node, X.shape[0]))
        print('it has been divided in {} clusters'.format(len(labels)))

        # for each cluster
        # add a new vertex to the new graph
        for cluster in labels:
            # print the number of points in the cluster
            print('\t cluster {} has size {}'.format(cluster, (db.labels_ == cluster).sum()))
            # retrives the indeces of the points covered by the cluster
            points_covered_by_cluster = np.array(origin_BM.nodes[node]['points covered'])[np.where(db.labels_
                                                                                     == cluster)].tolist()
            # creates a node
            new_graph.add_node(str(node)+'_'+str(cluster),
                               points_covered=points_covered_by_cluster)

        for neigh in [v for v in nx.neighbors(origin_BM, node) if v > node]:
            neigh_X = target_pts[origin_BM.nodes[neigh]['points covered'], :]

            neigh_db = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES).fit(neigh_X)
            neigh_labels = set(neigh_db.labels_) - {-1} # outliers are not clusters

            # add edges between clusters that belongs to neigh in the original graph
            # if they share at least one element
            for cluster in labels:
                for neigh_cluster in neigh_labels:
                    points_covered_by_cluster = np.array(origin_BM.nodes[node]['points covered'])[np.where(db.labels_
                                                                                             == cluster)].tolist()
                    points_covered_by_neigh=np.array(origin_BM.nodes[neigh]['points covered'])[np.where(neigh_db.labels_
                                                                                          == neigh_cluster)].tolist()
                    if len( set(points_covered_by_cluster)&set(points_covered_by_neigh) ) != 0:
                        new_graph.add_edge(str(node)+'_'+str(cluster), str(neigh)+'_'+str(neigh_cluster) )


    return new_graph

In [None]:
# we will save the mapper_on_BM to disk as pickle files 
# this way we can plot them in a second moment

In [13]:
# Read the mapper_on_BM graph from pickle

def read_graph_from_pickle(GRAPH_PATH,
                           values_df,
                           my_palette):
    # read graph 
    G = nx.read_gpickle(GRAPH_PATH)
    
    MIN_SCALE = 7
    MAX_SCALE = 20

    MAX_NODE_SIZE = 0
    for node in G.nodes:
        if len(G.nodes[node]['points_covered']) > MAX_NODE_SIZE:
            MAX_NODE_SIZE = len(G.nodes[node]['points_covered'])

    for node in G.nodes:
        G.nodes[node]['size'] = len(G.nodes[node]['points_covered'])
        # rescale the size for display
        G.nodes[node]['size rescaled'] = MAX_SCALE*G.nodes[node]['size']/MAX_NODE_SIZE + MIN_SCALE

        G.nodes[node]['color'] = my_palette(0)

        for name, avg in values_df.loc[G.nodes[node]['points_covered']].mean().iteritems():
            G.nodes[node][name] = avg

    return G

# EXAMPLE 1
## handwritten digits to their PCA rapresentation

In [14]:
original_BM_EPSILON = 50
DBSCAN_EPSILON = 15

original_BM = read_graph_from_list('BM_graphs/digits_X/{}_edges'.format(original_BM_EPSILON),
                                   'BM_graphs/digits_X/{}_points_covered_by_landmarks'.format(original_BM_EPSILON),
                                   add_points_covered=True)

digits_PCA = pd.read_csv('data/digits_X_PCA3.csv')
print('data loaded')

print('computing mapper on BM')
print('mapping the BM into a pointcloud of shape {}'.format(digits_PCA.shape))

pullback_to_PCA = mapper_on_BM(original_BM, digits_PCA,
                               EPS=DBSCAN_EPSILON, MIN_SAMPLES=1)

nx.write_gpickle(pullback_to_PCA, 'pullback_digits_to_PCA.gpickle')

loading edgelist
loading points covered


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


computing coloring


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


data loaded
computing mapper on BM
mapping the BM into a pointcloud of shape (1797, 3)


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


 **********
node 1 contains 1180 points
it has been divided in 1 clusters
	 cluster 0 has size 1180

 **********
node 2 contains 1031 points
it has been divided in 1 clusters
	 cluster 0 has size 1031

 **********
node 3 contains 570 points
it has been divided in 1 clusters
	 cluster 0 has size 570

 **********
node 4 contains 813 points
it has been divided in 1 clusters
	 cluster 0 has size 813

 **********
node 5 contains 304 points
it has been divided in 1 clusters
	 cluster 0 has size 304

 **********
node 6 contains 615 points
it has been divided in 1 clusters
	 cluster 0 has size 615

 **********
node 7 contains 382 points
it has been divided in 1 clusters
	 cluster 0 has size 382

 **********
node 8 contains 675 points
it has been divided in 1 clusters
	 cluster 0 has size 675



In [15]:
GRAPH1_PATH = 'pullback_digits_to_PCA.gpickle'

# table with the coloring functions
coloring_df = pd.read_csv('data/digits_y.csv')
coloring_df.index = range(len(coloring_df))

###########
# GRAPH 1 #
###########

#Here we adopt standard colour palette
my_palette = cm.get_cmap(name='jet')

# read graph
# ASSUME NODES ARE NUMBERED FROM 1 TO N
G1 = read_graph_from_pickle(GRAPH1_PATH, coloring_df, my_palette)

for node in G1.nodes:
    G1.nodes[node]['points covered'] = G1.nodes[node]['points_covered']
print('loaded graph with {} nodes and {} edges'.format(len(G1.nodes), len(G1.edges)))

# create a GUI with input our BM graph, 
# a dataframe with coloring functions (one value per point in the pointcloud)
# and a color palette
# in this case we use the pointcloud as coloring function
my_fancy_gui = graph_GUI(G1, my_palette, coloring_df[['label']])
my_fancy_gui.color_by_variable('label', MIN_VALUE=0, MAX_VALUE=9)

# add a discrete colorbar
num_ticks = 10
low = 0
high = 9
color_mapper = LinearColorMapper(palette=[to_hex(my_palette(color_id)) 
                                          for color_id in np.linspace(0, 1, num_ticks)], 
                                 low=low-0.5, high=high+0.5)

ticks = [i for i in range(low, high+1, 1)]
color_ticks = FixedTicker(ticks=ticks)

color_bar = ColorBar(color_mapper=color_mapper, 
                     major_label_text_font_size='14pt',
                     label_standoff=12,
                     ticker=color_ticks,
                    )

my_fancy_gui.plot.add_layout(color_bar, 'right')
my_fancy_gui.plot.title= 'pullback_digits_to_PCA'

loaded graph with 8 nodes and 28 edges
color by variable label 
MIN_VALUE: 0.000, MAX_VALUE: 9.000


ValueError: expected an instance of type Title, got pullback_digits_to_PCA of type str

In [16]:
# creates an html file with the graph 
# and opens it in another tab
show(my_fancy_gui.plot)

# EXAMPLE 2
## the opposite
## handwritten digits in PCA rapresentation to the full dataset

In [18]:
original_BM_EPSILON = 20
DBSCAN_EPSILON = 25

original_BM = read_graph_from_list('BM_graphs/digits_X/{}_edges'.format(original_BM_EPSILON),
                                   'BM_graphs/digits_X/{}_points_covered_by_landmarks'.format(original_BM_EPSILON),
                                   add_points_covered=True)

digits_PCA = pd.read_csv('data/digits_X.csv')

print('data loaded')
print('computing mapper on BM')
print('mapping the BM into a pointcloud of shape {}'.format(digits_PCA.shape))
pullback_to_PCA = mapper_on_BM(original_BM, digits_PCA,
                               EPS=DBSCAN_EPSILON, MIN_SAMPLES=1)

nx.write_gpickle(pullback_to_PCA, 'pullback_digits_PCA_to_full.gpickle')

loading edgelist
loading points covered


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


computing coloring


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


data loaded
computing mapper on BM
mapping the BM into a pointcloud of shape (1797, 64)


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


 **********
node 1 contains 1180 points
it has been divided in 37 clusters
	 cluster 0 has size 937
	 cluster 1 has size 126
	 cluster 2 has size 55
	 cluster 3 has size 1
	 cluster 4 has size 1
	 cluster 5 has size 13
	 cluster 6 has size 11
	 cluster 7 has size 1
	 cluster 8 has size 1
	 cluster 9 has size 1
	 cluster 10 has size 1
	 cluster 11 has size 1
	 cluster 12 has size 1
	 cluster 13 has size 1
	 cluster 14 has size 1
	 cluster 15 has size 1
	 cluster 16 has size 1
	 cluster 17 has size 1
	 cluster 18 has size 1
	 cluster 19 has size 1
	 cluster 20 has size 1
	 cluster 21 has size 1
	 cluster 22 has size 1
	 cluster 23 has size 1
	 cluster 24 has size 1
	 cluster 25 has size 1
	 cluster 26 has size 1
	 cluster 27 has size 1
	 cluster 28 has size 1
	 cluster 29 has size 1
	 cluster 30 has size 4
	 cluster 31 has size 1
	 cluster 32 has size 2
	 cluster 33 has size 1
	 cluster 34 has size 1
	 cluster 35 has size 3
	 cluster 36 has size 1

 **********
node 2 contains 1031 point

In [22]:
GRAPH1_PATH = 'pullback_digits_PCA_to_full.gpickle'

# table with the coloring functions
coloring_df = pd.read_csv('data/digits_y.csv')
coloring_df.index = range(len(coloring_df))

###########
# GRAPH 1 #
###########

#Here we adopt standard colour palette
my_palette = cm.get_cmap(name='jet')

# read graph
# ASSUME NODES ARE NUMBERED FROM 1 TO N
G1 = read_graph_from_pickle(GRAPH1_PATH, coloring_df, my_palette)

for node in G1.nodes:
    G1.nodes[node]['points covered'] = G1.nodes[node]['points_covered']
print('loaded graph with {} nodes and {} edges'.format(len(G1.nodes), len(G1.edges)))

# create a GUI with input our BM graph, 
# a dataframe with coloring functions (one value per point in the pointcloud)
# and a color palette
# in this case we use the pointcloud as coloring function
my_fancy_gui = graph_GUI(G1, my_palette, coloring_df[['label']])
my_fancy_gui.color_by_variable('label', MIN_VALUE=0, MAX_VALUE=9)

# add a discrete colorbar
num_ticks = 10
low = 0
high = 9
color_mapper = LinearColorMapper(palette=[to_hex(my_palette(color_id)) 
                                          for color_id in np.linspace(0, 1, num_ticks)], 
                                 low=low-0.5, high=high+0.5)

ticks = [i for i in range(low, high+1, 1)]
color_ticks = FixedTicker(ticks=ticks)

color_bar = ColorBar(color_mapper=color_mapper, 
                     major_label_text_font_size='14pt',
                     label_standoff=12,
                     ticker=color_ticks,
                    )

my_fancy_gui.plot.add_layout(color_bar, 'right')
#my_fancy_gui.plot.title ='pullback_digits_PCA_to_full'

loaded graph with 225 nodes and 466 edges
color by variable label 
MIN_VALUE: 0.000, MAX_VALUE: 9.000


In [23]:
# creates an html file with the graph 
# and opens it in another tab
show(my_fancy_gui.plot)