## Load libraries

In [4]:
import kmapper as km
# import sklearn

from sklearn.cluster import DBSCAN # clustering algorithm
from sklearn.decomposition import PCA # projection (lens) creation
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import hdbscan

# from sklearn import ensemble
# from sklearn.manifold import MDS

import plotly.graph_objs as go
# from ipywidgets import interactive, HBox, VBox, widgets, interact # ?
# import dash_html_components as html # ?
# import dash_core_components as dcc # ?

from kmapper.plotlyviz import * # static and interactive plots
import psutil # for plotlyviz
import kaleido # for plotlyviz
# import networkx # ?

# import dash # ?
import warnings #? 
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Read data

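`X` is a data frame in which the ten continuous water-quality variables have been standardized (centred on their mean and divided by their standard deviation). The variables YEAR, SEASON, FLDNUM (field number), STRATUM, and SHEETBAR are included as well, left unscaled.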

In [7]:
# Continuous water-quality measurements that get standardized, and the
# identifier / grouping columns that are carried along unscaled.
continuous_cols = ["WDP", "SECCHI", "TEMP", "DO", "TURB",
                   "VEL", "TP", "TN", "SS", "CHLcal"]
passthrough_cols = ["YEAR", "SEASON", "FLDNUM", "STRATUM", "SHEETBAR"]

water20 = pd.read_csv("../../LTRM data/RF interpolation/water_full.csv")
print(water20.head())

X = water20[continuous_cols + passthrough_cols]

# Standard-scale only the continuous columns; every other selected column is
# passed through untouched and appended after the scaled ones.
ct = ColumnTransformer([
        ('somename', StandardScaler(), continuous_cols)
    ], remainder='passthrough')

# The passthrough columns include strings (FLDNUM, STRATUM), so the transformer
# returns one mixed array and every column of the rebuilt frame would be
# object dtype. infer_objects() restores numeric dtypes where possible so the
# scaled columns are real floats again.
X = pd.DataFrame(ct.fit_transform(X),
                 columns = continuous_cols + passthrough_cols).infer_objects()

print(X.head())

   SHEETBAR        DATE   LATITUDE  LONGITUDE         FLDNUM       STRATUM  \
0  41000065  07/26/1993  44.571864 -92.510970  Lake City, MN  Main channel   
1  41000066  07/26/1993  44.575497 -92.518497  Lake City, MN  Main channel   
2  41000067  07/26/1993  44.573718 -92.523549  Lake City, MN  Main channel   
3  41000068  07/26/1993  44.566588 -92.541238  Lake City, MN  Main channel   
4  41000069  07/26/1993  44.568419 -92.548780  Lake City, MN  Main channel   

   LOCATCD     TN     TP  TEMP   DO  TURB   COND   VEL    SS  WDP   CHLcal  \
0  9312103  3.955  0.228  23.0  6.6    28  550.0  0.50  42.3  2.2  9.44875   
1  9312002  4.876  0.229  23.0  6.6    28  554.0  0.72  37.6  8.2  8.24230   
2  9312102  3.955  0.220  22.9  6.3    24  564.0  0.66  34.1  4.3  8.72488   
3  9312003  4.257  0.212  22.9  6.4    28  563.0  0.69  33.4  9.1  8.48359   
4  9312104  4.030  0.237  23.0  6.6    33  556.0  0.68  48.0  6.7  9.52918   

   SECCHI  YEAR  SEASON  
0      40  1993       2  
1      42 

## Define db and hdb mapper functions

In [122]:
def mapper_pca2_db(df, DBSCAN_EPSILON = 10, DBSCAN_MIN_SAMPLES = 20,
                   N_CUBES = [10,10], PERC_OVERLAP = [.25,.25], return_with_df = False, print_pca_info = False):
    """
    Build a Mapper simplicial complex using a 2-component PCA lens and DBSCAN.

    params
    * df is a data frame containing (at least) the ten continuous columns
      WDP, SECCHI, TEMP, DO, TURB, VEL, TP, TN, SS, CHLcal
    * DBSCAN_EPSILON, DBSCAN_MIN_SAMPLES are passed to DBSCAN as eps / min_samples
    * N_CUBES, PERC_OVERLAP are passed to km.Cover as n_cubes / perc_overlap
    * return_with_df, if True, also returns the re-indexed data frame
    * print_pca_info, if True, prints for each principal component the variable
      with the largest absolute loading, that loading, and the explained
      variance ratio

    returns
    * scomplex, or (scomplex, df) when return_with_df is True; the returned df
      has a fresh 0..n-1 index

    The caller's data frame is not modified.
    """

    # Re-index a copy rather than the caller's frame (the previous in-place
    # reset silently changed the object that was passed in).
    df = df.reset_index(drop = True)

    continuous_variables = ["WDP", "SECCHI", "TEMP", "DO", "TURB",
                            "VEL", "TP", "TN", "SS", "CHLcal"]
    X = df[continuous_variables]

    # create instance of clustering alg
    cluster_alg = DBSCAN(eps = DBSCAN_EPSILON, min_samples = DBSCAN_MIN_SAMPLES,
                         metric='euclidean')

    # instantiate kepler mapper object
    mapper = km.KeplerMapper(verbose = 0)

    # defining filter function as projection on to the first 2 component axis
    pca = PCA(n_components = 2)
    lens = pca.fit_transform(X)

    if print_pca_info:
        for j, pc_j in enumerate(pca.components_):
            # position of the loading with the largest absolute value
            idx_magnitude = int(np.argmax(np.abs(pc_j)))

            print("*** PCA", j+1, " ***")
            print("Primary variable: ", continuous_variables[idx_magnitude])
            print("Corresponding component: ", pc_j[idx_magnitude])
            print("Explained variance: ", pca.explained_variance_ratio_[j])

    # The projected array is not used below.
    # NOTE(review): the call is kept because it appears to be what records the
    # projection / scaler entries shown in the graph metadata - confirm before
    # removing it.
    mapper.project(np.array(X), projection=list(range(len(continuous_variables))), scaler=None)

    # Generate the simplicial complex
    scomplex = mapper.map(lens, X,
                          cover=km.Cover(n_cubes = N_CUBES, perc_overlap = PERC_OVERLAP),
                          clusterer = cluster_alg)

    if return_with_df:
        return scomplex, df

    return scomplex

In [8]:
def mapper_pca2_hdb(df, HDB_MIN_CLUSTER = 45, HDB_MIN_SAMPLES = 10, HDB_EPSILON = 1,
                    N_CUBES = [10,10], PERC_OVERLAP = [.25,.25], return_with_df = False, print_pca_info = False):
    """
    Build a Mapper simplicial complex using a 2-component PCA lens and HDBSCAN.

    params
    * df is a data frame containing (at least) the ten continuous columns
      WDP, SECCHI, TEMP, DO, TURB, VEL, TP, TN, SS, CHLcal
    * HDB_MIN_CLUSTER, HDB_MIN_SAMPLES, HDB_EPSILON are passed to hdbscan.HDBSCAN
      as min_cluster_size / min_samples / cluster_selection_epsilon
    * N_CUBES, PERC_OVERLAP are passed to km.Cover as n_cubes / perc_overlap
    * return_with_df, if True, also returns the re-indexed data frame
    * print_pca_info, if True, prints for each principal component the variable
      with the largest absolute loading, that loading, and the explained
      variance ratio

    returns
    * scomplex, or (scomplex, df) when return_with_df is True; the returned df
      has a fresh 0..n-1 index

    The caller's data frame is not modified.
    """

    # Re-index a copy rather than the caller's frame (the previous in-place
    # reset silently changed the object that was passed in).
    df = df.reset_index(drop = True)

    continuous_variables = ["WDP", "SECCHI", "TEMP", "DO", "TURB",
                            "VEL", "TP", "TN", "SS", "CHLcal"]
    X = df[continuous_variables]

    # create instance of clustering alg
    cluster_alg = hdbscan.HDBSCAN(min_cluster_size = HDB_MIN_CLUSTER, min_samples = HDB_MIN_SAMPLES,
                                  cluster_selection_epsilon= HDB_EPSILON, cluster_selection_method = 'eom')

    # instantiate kepler mapper object
    mapper = km.KeplerMapper(verbose = 0)

    # defining filter function as projection on to the first 2 component axis
    pca = PCA(n_components = 2)
    lens = pca.fit_transform(X)

    if print_pca_info:
        for j, pc_j in enumerate(pca.components_):
            # position of the loading with the largest absolute value
            idx_magnitude = int(np.argmax(np.abs(pc_j)))

            print("*** PCA", j+1, " ***")
            print("Primary variable: ", continuous_variables[idx_magnitude])
            print("Corresponding component: ", pc_j[idx_magnitude])
            print("Explained variance: ", pca.explained_variance_ratio_[j])

    # The projected array is not used below.
    # NOTE(review): the call is kept because it appears to be what records the
    # projection / scaler entries shown in the graph metadata - confirm before
    # removing it.
    mapper.project(np.array(X), projection=list(range(len(continuous_variables))), scaler=None)

    # Generate the simplicial complex
    scomplex = mapper.map(lens, X,
                          cover=km.Cover(n_cubes = N_CUBES, perc_overlap = PERC_OVERLAP),
                          clusterer = cluster_alg)

    if return_with_df:
        return scomplex, df

    return scomplex

## DBscan uncolored graph

In [123]:
# DBSCAN-based Mapper complex: n_cubes 75 and perc_overlap 0.5 on each lens
# axis. return_with_df=True also hands back the data frame used for colouring.
db_scomplex, dbscan_df = mapper_pca2_db(
    X,
    DBSCAN_EPSILON = 10,
    DBSCAN_MIN_SAMPLES = 10,
    N_CUBES = [75, 75],
    PERC_OVERLAP = [0.5, 0.5],
    return_with_df = True,
    print_pca_info = False,
)

In [124]:
# Dashboard view of the DBSCAN complex; no colour variable is passed here.
plotlyviz(db_scomplex, graph_layout = "kk", dashboard = True)

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

#### color DBscan

In [126]:
# DBSCAN complex coloured by YEAR.
# NOTE: color_by_var is defined in a later cell of this notebook (under
# "Color the hdb graph"); that cell has to be run before this one.
color_by_var(db_scomplex, dbscan_df, "YEAR")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [127]:
# DBSCAN complex coloured by SEASON.
color_by_var(db_scomplex, dbscan_df, "SEASON")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [128]:
# DBSCAN complex coloured by CHLcal.
color_by_var(db_scomplex, dbscan_df, "CHLcal")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [133]:
# DBSCAN complex coloured by SS.
color_by_var(db_scomplex, dbscan_df, "SS")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [130]:
# DBSCAN complex coloured by TURB.
color_by_var(db_scomplex, dbscan_df, "TURB")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [131]:
# DBSCAN complex coloured by SECCHI.
color_by_var(db_scomplex, dbscan_df, "SECCHI")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [132]:
# DBSCAN complex coloured by WDP.
color_by_var(db_scomplex, dbscan_df, "WDP")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

## HDBscan uncolored graph

In [9]:
# HDBSCAN-based Mapper complex: n_cubes 75 and perc_overlap 0.5 on each lens
# axis. return_with_df=True also hands back the data frame used for colouring.
hdb_scomplex, hdbscan_df = mapper_pca2_hdb(
    X,
    HDB_MIN_CLUSTER = 10,
    HDB_MIN_SAMPLES = 10,
    HDB_EPSILON = 1,
    N_CUBES = [75, 75],
    PERC_OVERLAP = [0.5, 0.5],
    return_with_df = True,
    print_pca_info = False,
)

*** PCA 1  ***
Primary variable:  SS
Corresponding component:  0.46292883376482535
Explained variance:  0.3447943056177738
*** PCA 2  ***
Primary variable:  CHLcal
Corresponding component:  -0.474609974501097
Explained variance:  0.18571896962264153


In [10]:
# Dashboard view of the HDBSCAN complex; no colour variable is passed here.
plotlyviz(hdb_scomplex, graph_layout = "fr", dashboard = True)

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

## Color the hdb graph

In [110]:
def color_by_var(scomplex, df, cont_var_str, title_prefix = "HDBscan", graph_layout = 'kk'):
    """
    Build a hover dashboard of a Mapper graph coloured by one column of df.

    params
    * scomplex is the first return from mapper_pca2_db or mapper_pca2_hdb
    * df is the second return from mapper_pca2_db or mapper_pca2_hdb
    * cont_var_str is the name of the df column used for the colour values,
      the per-member tooltips, and the colorbar title
    * title_prefix is the text placed before the variable name in the plot
      title (pass e.g. "DBscan" when scomplex comes from mapper_pca2_db)
    * graph_layout is forwarded to plotly_graph

    returns
    * the widget returned by hovering_widgets
    """

    kmgraph, mapper_summary, _ = get_mapper_graph(scomplex,
                                                  color_values = df[cont_var_str],
                                                  color_function_name = [cont_var_str])

    # Convert the column once instead of once per node.
    member_values = np.array(df[cont_var_str])

    for node in kmgraph['nodes']:
        # scomplex['nodes'] is a dictionary with keys that are the mapper nodes, cube600_cluster1, cube5_cluster0, etc....
        # node['name'] is the name of the mapper node (such as cube600_cluster1)
        # scomplex['nodes'][node['name']] is a list of row numbers that correspond with df
        node['custom_tooltips'] = member_values[scomplex['nodes'][node['name']]]

    plotly_graph_data = plotly_graph(kmgraph, graph_layout=graph_layout)

    # Separate prefix and variable name (previously rendered as e.g. "HDBscanYEAR").
    title_str = title_prefix + " " + cont_var_str
    layout = plot_layout(title=title_str,
                         width=620, height=570,
                         annotation_text=get_kmgraph_meta(mapper_summary))

    fw_graph = go.FigureWidget(data = plotly_graph_data, layout=layout)
    dashboard = hovering_widgets(kmgraph,
                                 fw_graph,
                                 ctooltips=True, # ctooltips = True, because we assigned a label to each
                                                 # cluster member
                                 member_textbox_width=600)

    # Update the fw_graph colorbar, setting its title:
    fw_graph.data[1].marker.colorbar.title = cont_var_str

    return dashboard
    


In [111]:
# HDBSCAN complex coloured by YEAR.
color_by_var(hdb_scomplex, hdbscan_df, "YEAR")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [112]:
# HDBSCAN complex coloured by SEASON.
color_by_var(hdb_scomplex, hdbscan_df, "SEASON")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [113]:
# HDBSCAN complex coloured by CHLcal.
color_by_var(hdb_scomplex, hdbscan_df, "CHLcal")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [114]:
# HDBSCAN complex coloured by SS.
color_by_var(hdb_scomplex, hdbscan_df, "SS")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [115]:
# HDBSCAN complex coloured by TURB.
color_by_var(hdb_scomplex, hdbscan_df, "TURB")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [116]:
# HDBSCAN complex coloured by SECCHI.
color_by_var(hdb_scomplex, hdbscan_df, "SECCHI")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [117]:
# HDBSCAN complex coloured by WDP.
color_by_var(hdb_scomplex, hdbscan_df, "WDP")

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [None]:
def color_by_var(scomplex, df, cont_var_str, title_prefix = "HDBscan", graph_layout = 'kk'):
    """
    Build a hover dashboard of a Mapper graph coloured by one column of df.

    params
    * scomplex is the first return from mapper_pca2_db or mapper_pca2_hdb
    * df is the second return from mapper_pca2_db or mapper_pca2_hdb
    * cont_var_str is the name of the df column used for the colour values,
      the per-member tooltips, and the colorbar title
    * title_prefix is the text placed before the variable name in the plot
      title (pass e.g. "DBscan" when scomplex comes from mapper_pca2_db)
    * graph_layout is forwarded to plotly_graph

    returns
    * the widget returned by hovering_widgets
    """
    # NOTE(review): a function with this same name is also defined in an
    # earlier cell of this notebook; when both cells are run, this later
    # definition is the one in effect.

    kmgraph, mapper_summary, _ = get_mapper_graph(scomplex,
                                                  color_values = df[cont_var_str],
                                                  color_function_name = [cont_var_str])

    # Convert the column once instead of once per node.
    member_values = np.array(df[cont_var_str])

    for node in kmgraph['nodes']:
        # scomplex['nodes'] is a dictionary with keys that are the mapper nodes, cube600_cluster1, cube5_cluster0, etc....
        # node['name'] is the name of the mapper node (such as cube600_cluster1)
        # scomplex['nodes'][node['name']] is a list of row numbers that correspond with df
        node['custom_tooltips'] = member_values[scomplex['nodes'][node['name']]]

    plotly_graph_data = plotly_graph(kmgraph, graph_layout=graph_layout)

    # Separate prefix and variable name (previously rendered as e.g. "HDBscanYEAR").
    title_str = title_prefix + " " + cont_var_str
    layout = plot_layout(title=title_str,
                         width=620, height=570,
                         annotation_text=get_kmgraph_meta(mapper_summary))

    fw_graph = go.FigureWidget(data = plotly_graph_data, layout=layout)
    dashboard = hovering_widgets(kmgraph,
                                 fw_graph,
                                 ctooltips=True, # ctooltips = True, because we assigned a label to each
                                                 # cluster member
                                 member_textbox_width=600)

    # Update the fw_graph colorbar, setting its title:
    fw_graph.data[1].marker.colorbar.title = cont_var_str

    return dashboard
    


In [81]:
# Eleven-stop green -> yellow -> red colourscale used for the graph below.
pl_brewer = [[0.0, '#006837'],
             [0.1, '#1a9850'],
             [0.2, '#66bd63'],
             [0.3, '#a6d96a'],
             [0.4, '#d9ef8b'],
             [0.5, '#ffffbf'],
             [0.6, '#fee08b'],
             [0.7, '#fdae61'],
             [0.8, '#f46d43'],
             [0.9, '#d73027'],
             [1.0, '#a50026']]

# `scomplex` was not assigned anywhere at notebook level, so this cell only
# worked with leftover kernel state. The recorded graph metadata (HDBSCAN with
# min_cluster_size=10, n_cubes [75, 75], perc_overlap [0.5, 0.5]) matches the
# hdb_scomplex call above, so bind the name explicitly.
# NOTE(review): confirm hdb_scomplex is the intended complex.
scomplex = hdb_scomplex

kmgraph, mapper_summary, colorf_distribution = get_mapper_graph(scomplex,
                                                                colorscale = pl_brewer,
                                                                color_values = X["SEASON"],
                                                                color_function_name = ["season"])



In [82]:
# Attach the SEASON value of every member row to its node as tooltip text.
# scomplex['nodes'] is a dictionary with keys that are the mapper nodes, cube600_cluster1, cube5_cluster0, etc....
# node['name'] is the name of the mapper node (such as cube600_cluster1)
# scomplex['nodes'][node['name']] is a list of row numbers that correspond with X
season_values = np.array(X["SEASON"])  # converted once, not once per node
for node in kmgraph['nodes']:
    node['custom_tooltips'] = season_values[scomplex['nodes'][node['name']]]

In [83]:
plotly_graph_data = plotly_graph(kmgraph, graph_layout='fr', colorscale = pl_brewer)

# The graph is coloured by SEASON and the member tooltips hold SEASON values,
# so the title and colorbar label say "season" (they previously said
# "year" / "stratum").
layout = plot_layout(title='HDBScan, season colored, tool tips is season',
                     width=620, height=570,
                     annotation_text=get_kmgraph_meta(mapper_summary))

fw_graph = go.FigureWidget(data = plotly_graph_data, layout=layout)
fw_hist = node_hist_fig(colorf_distribution)
fw_summary = summary_fig(mapper_summary, height=300)
dashboard = hovering_widgets(kmgraph,
                             fw_graph,
                             ctooltips=True, # ctooltips = True, because we assigned a label to each
                                             # cluster member
                             member_textbox_width=600)

# Update the fw_graph colorbar, setting its title:

fw_graph.data[1].marker.colorbar.title = 'season'

{'custom_meta': {'projection': '[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]',
  'n_cubes': [75, 75],
  'perc_overlap': [0.5, 0.5],
  'clusterer': 'HDBSCAN(cluster_selection_epsilon=1, min_cluster_size=10, min_samples=10)',
  'scaler': 'None'},
 'color_function_name': ['season'],
 'node_color_function': 'mean',
 'n_nodes': 457,
 'n_edges': 1523,
 'n_total': 242622,
 'n_unique': 69244}

In [85]:
# Display the dashboard widget assembled in the cell above.
dashboard

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

## Playing around with the structure of scomplex

In [4]:
# NOTE: remove_cond_bool and scomplex_bool are only accepted by the older
# mapper_pca2_db definition found in the last cell of this notebook, not by
# the version defined near the top. That older cell has to be the most
# recently run definition for this call to work. With scomplex_bool=True the
# older function returns only the simplicial complex.
scomplex_db = mapper_pca2_db(X, DBSCAN_EPSILON = 1, DBSCAN_MIN_SAMPLES = 5, N_CUBES = [100,100], 
                             PERC_OVERLAP = [.5,.5], remove_cond_bool = True, scomplex_bool = True)

In [25]:
# Rebuild the plotly traces and layout from the current kmgraph and
# mapper_summary, then wrap them in a FigureWidget.
plotly_graph_data = plotly_graph(
    kmgraph,
    graph_layout = 'fr',
)

layout = plot_layout(
    title = 'HDBScan',
    width = 620,
    height = 570,
    annotation_text = get_kmgraph_meta(mapper_summary),
)

fw_graph = go.FigureWidget(
    data = plotly_graph_data,
    layout = layout,
)

In [3]:
def mapper_pca2_db(df, DBSCAN_EPSILON = 10, DBSCAN_MIN_SAMPLES = 20, 
                N_CUBES = [10,10], PERC_OVERLAP = [.25,.25],
                remove_cond_bool = False, scomplex_bool = False):
    """
    Older variant of the DBSCAN mapper: builds the simplicial complex from a
    2-component PCA lens and, unless scomplex_bool is set, also writes an HTML
    visualisation.

    params
    * df is a data frame containing the continuous columns WDP, SECCHI, TEMP,
      DO, TURB, VEL, TP, TN, SS, CHLcal (and COND unless remove_cond_bool)
    * DBSCAN_EPSILON, DBSCAN_MIN_SAMPLES are passed to DBSCAN as eps / min_samples
    * N_CUBES, PERC_OVERLAP are passed to km.Cover as n_cubes / perc_overlap
    * remove_cond_bool, if True, leaves COND out of the variables
    * scomplex_bool, if True, returns right after the complex is built

    returns
    * scomplex when scomplex_bool is True
    * otherwise (scomplex, X), where X is the re-indexed frame of the selected
      continuous columns; in that case an HTML file is also written into the
      "mapper outputs" directory

    NOTE(review): this shares its name with the mapper_pca2_db defined near
    the top of the notebook but has a different signature; whichever cell was
    run last is the one in effect.
    """
    import os  # local import: only needed to build the HTML output path

    # for discerning primary variables in PCA
    continuous_variables = ["WDP", "SECCHI", "TEMP", "DO", "TURB", 
                            "VEL", "TP", "TN", "SS", "CHLcal"]

    if not remove_cond_bool:
        continuous_variables.append("COND")

    # Fresh 0..n-1 index so the row numbers stored in scomplex line up with X.
    # Not done in place: the previous in-place reset ran on a column slice of df.
    X = df[continuous_variables].reset_index(drop = True)

    projected_vars = continuous_variables
    projected_var_indices = list(range(len(continuous_variables)))

    # create instance of clustering alg
    cluster_alg = DBSCAN(eps = DBSCAN_EPSILON, min_samples = DBSCAN_MIN_SAMPLES, 
                         metric='euclidean')

    # instantiate kepler mapper object
    mapper = km.KeplerMapper(verbose = 0)

    # defining filter function as projection on to the first 2 component axis
    pca = PCA(n_components = 2)
    lens = pca.fit_transform(X)

    # passed to mapper.visualize below as `lens`, labelled with projected_vars
    summary_variable = mapper.project(np.array(X), projection=projected_var_indices, scaler=None)

    # Generate the simplicial complex
    scomplex = mapper.map(lens, X, 
                          cover=km.Cover(n_cubes = N_CUBES, perc_overlap = PERC_OVERLAP), 
                          clusterer = cluster_alg)  

    if scomplex_bool: 
        return(scomplex)

    # the rest of this is for coloring 

    pl_brewer = [[0.0, '#006837'],
                 [0.1, '#1a9850'],
                 [0.2, '#66bd63'],
                 [0.3, '#a6d96a'],
                 [0.4, '#d9ef8b'],
                 [0.5, '#ffffbf'],
                 [0.6, '#fee08b'],
                 [0.7, '#fdae61'],
                 [0.8, '#f46d43'],
                 [0.9, '#d73027'],
                 [1.0, '#a50026']]

    # colour each row by how far its first PCA coordinate lies from the minimum
    color_values = lens[:,0] - lens[:,0].min() # changes if PCA1 or PCA1 and PCA2
    # can change to other variables
    color_function_name = ['Distance to x-min'] # set name of color function
    my_colorscale = pl_brewer
    kmgraph, mapper_summary, colorf_distribution = get_mapper_graph(scomplex,
                                                                    color_values,
                                                                    color_function_name=color_function_name, 
                                                                    colorscale=my_colorscale)

    plotly_graph_data = plotly_graph(kmgraph, graph_layout='fr', colorscale=my_colorscale, 
                                     factor_size=2.5, edge_linewidth=0.5)

    # The title previously repeated the epsilon value twice with no label.
    plot_title = 'EPSILON {}, MIN_SAMPLES {}'.format(DBSCAN_EPSILON, DBSCAN_MIN_SAMPLES)

    layout = plot_layout(title=plot_title,  
                         width=620, height=570,
                         annotation_text=get_kmgraph_meta(mapper_summary))

    # FigureWidget is responsible for event listeners
    # NOTE(review): fw_graph, fw_summary and dashboard are built but never
    # returned or displayed by this function - confirm whether the dashboard
    # was meant to be returned.
    fw_graph = go.FigureWidget(data=plotly_graph_data, layout=layout)
    fw_summary = summary_fig(mapper_summary, height=300)

    dashboard = hovering_widgets(kmgraph, fw_graph, member_textbox_width=600)

    #Update the fw_graph colorbar, setting its title:
    fw_graph.data[1].marker.colorbar.title = 'dist to<br>x-min'

    # DESIRED FILE PATH, CHANGE TO FIT YOUR LOCAL MACHINE
    directory_path = "mapper outputs"
    # Create the directory if needed and join with a path separator; the file
    # name used to be glued straight onto the directory name.
    os.makedirs(directory_path, exist_ok=True)
    html_file_name = ('Eps_' + str(DBSCAN_EPSILON) + '_MinS_' + str(DBSCAN_MIN_SAMPLES)
                      + '_NCubes_' + str(N_CUBES) + '_PercOvlp_' + str(PERC_OVERLAP) + '.html')
    html_output_path = os.path.join(directory_path, html_file_name)

    mapper.visualize(scomplex, color_values=color_values, color_function_name=color_function_name,
                     path_html=html_output_path, lens = summary_variable, lens_names = projected_vars)

    return scomplex, X