# Color by variables in TDA

## Load libraries

In [1]:
import kmapper as km
# import sklearn

from sklearn.cluster import DBSCAN # clustering algorithm
from sklearn.decomposition import PCA # projection (lens) creation
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
import hdbscan

# from sklearn import ensemble
# from sklearn.manifold import MDS

import plotly.graph_objs as go
# from ipywidgets import interactive, HBox, VBox, widgets, interact # ?
# import dash_html_components as html # ?
# import dash_core_components as dcc # ?

from kmapper.plotlyviz import * # static and interactive plots
import psutil # for plotlyviz
import kaleido # for plotlyviz
# import networkx # ?

# import dash # ?
import warnings #? 
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Read data

X is a rescaled dataframe using robust scaling (mean and standard deviation). I included unscaled variables year, season, field number, stratum, and sheetbar as well.

In [2]:
water20 = pd.read_csv("../../LTRM data/RF interpolation/water_full.csv")

water_df = water20[["WDP", "SECCHI", "TEMP", "DO", "TURB",
             "VEL", "TP", "TN", "SS", "CHLcal", 
             "YEAR", "SEASON", "FLDNUM", "STRATUM", "SHEETBAR", "DATE"]]

ct = ColumnTransformer([
        ('somename', RobustScaler(), ["WDP", "SECCHI", "TEMP", "DO", "TURB",
                                        "VEL", "TP", "TN", "SS", "CHLcal"])
    ], remainder='passthrough')

water_df = pd.DataFrame(ct.fit_transform(water_df), columns = ["WDP", "SECCHI", "TEMP", "DO", "TURB",
                                                 "VEL", "TP", "TN", "SS", "CHLcal", 
                                                 "YEAR", "SEASON", "FLDNUM", "STRATUM", "SHEETBAR", "DATE"])

water_df = water_df.replace({'SEASON': {1: "Spring", 2: "Summer", 3: "Fall", 4: "Winter"}})

water_df.head()

Unnamed: 0,WDP,SECCHI,TEMP,DO,TURB,VEL,TP,TN,SS,CHLcal,YEAR,SEASON,FLDNUM,STRATUM,SHEETBAR,DATE
0,-0.018088,-0.046512,0.572414,-0.72093,0.210526,0.576923,0.481481,0.918033,0.361752,-0.294693,1993,Summer,"Lake City, MN",Main channel,41000065,07/26/1993
1,1.5323,0.0,0.572414,-0.72093,0.210526,1.0,0.488889,1.547131,0.262323,-0.342123,1993,Summer,"Lake City, MN",Main channel,41000066,07/26/1993
2,0.524548,0.023256,0.565517,-0.790698,0.105263,0.884615,0.422222,0.918033,0.18828,-0.323151,1993,Summer,"Lake City, MN",Main channel,41000067,07/26/1993
3,1.764858,-0.093023,0.565517,-0.767442,0.210526,0.942308,0.362963,1.124317,0.173472,-0.332637,1993,Summer,"Lake City, MN",Main channel,41000068,07/26/1993
4,1.144703,0.069767,0.572414,-0.72093,0.342105,0.923077,0.548148,0.969262,0.482336,-0.291531,1993,Summer,"Lake City, MN",Main channel,41000069,07/26/1993


**Add column for time period**

In [3]:
np.linspace(1993, 2020, 4)

array([1993., 2002., 2011., 2020.])

In [4]:
even_time_periods = ['1993-1999', '1996-2002', '1999-2005', '2002-2008', '2005-2011',
                  '2008-2014', '2011-2017', '2014-2020']

eco_time_periods = ['1993-2000', '1998-2004', '2001-2013', '2010-2016', '2014-2020']

for time_period in even_time_periods:
    begin_year = int(time_period[0:4])
    end_year = int(time_period[5:9])
    
    water_df[time_period] = np.where((water_df['YEAR'] >= begin_year) & (water_df['YEAR'] <= end_year), 1, 0)
    
for time_period in eco_time_periods:
    begin_year = int(time_period[0:4])
    end_year = int(time_period[5:9])
    
    water_df[time_period] = np.where((water_df['YEAR'] >= begin_year) & (water_df['YEAR'] <= end_year), 1, 0)

In [5]:
even_time_periods.append("YEAR")
water_df[even_time_periods].sample(frac = 0.0001)

Unnamed: 0,1993-1999,1996-2002,1999-2005,2002-2008,2005-2011,2008-2014,2011-2017,2014-2020,YEAR
71207,0,0,0,0,1,1,0,0,2009
20150,0,0,0,1,1,1,0,0,2008
32570,0,0,1,1,1,0,0,0,2005
32475,0,0,1,1,1,0,0,0,2005
34925,0,0,0,0,1,1,0,0,2009
51915,0,0,0,0,0,0,0,1,2019
35568,0,0,0,0,1,1,0,0,2010
1933,1,1,0,0,0,0,0,0,1998


In [6]:
eco_time_periods.append("YEAR")
water_df[eco_time_periods].sample(frac = 0.0001)

Unnamed: 0,1993-2000,1998-2004,2001-2013,2010-2016,2014-2020,YEAR
35441,0,0,1,1,0,2010
1028,1,0,0,0,0,1996
46981,0,0,1,0,0,2008
48376,0,0,1,1,0,2011
75859,0,0,0,0,1,2018
74811,0,0,0,1,1,2016
68243,0,1,1,0,0,2002
72203,0,0,1,1,0,2011


In [7]:
season_var = ["Spring", "Summer", "Fall", "Winter"]

for season in season_var:
    water_df[season] = np.where((water_df[["SEASON"]] == season), 1, 0)


## Define Mapper functions

In [8]:
def mapper_pca2_db(df, DBSCAN_EPSILON = 10, DBSCAN_MIN_SAMPLES = 20,
                   N_CUBES = [10,10], PERC_OVERLAP = [.25,.25], return_with_df = False, print_pca_info = False):
    """
    """
    
    df.reset_index(drop = True, inplace = True)
    
    X = df[["WDP", "SECCHI", "TEMP", "DO", "TURB", 
                "VEL", "TP", "TN", "SS", "CHLcal"]]
     
    # for discerning primary variables in PCA 
    continuous_variables = ["WDP", "SECCHI", "TEMP", "DO", "TURB", 
                            "VEL", "TP", "TN", "SS", "CHLcal"]
    
    var_to_index = {continuous_variables[i] : i for i in range(len(continuous_variables))}
    projected_vars = continuous_variables
    projected_var_indices = [var_to_index[var] for var in projected_vars]
    
    
    # create instance of clustering alg
    cluster_alg = DBSCAN(eps = DBSCAN_EPSILON, min_samples = DBSCAN_MIN_SAMPLES, 
                         metric='euclidean')

    # instantiate kepler mapper object
    mapper = km.KeplerMapper(verbose = 0)
    
    # defining filter function as projection on to the first 2 component axis
    pca = PCA(n_components = 2)
    lens = pca.fit_transform(X)
    
    if print_pca_info: 
        for j in range(2):
            pc_j = pca.components_[j]
            largest_magnitude = max(abs(pc_j))
            idx_magnitude = np.where(abs(pc_j) == largest_magnitude)[0][0]

            print("*** PCA", j+1, " ***")
            print("Primary variable: ", continuous_variables[idx_magnitude])
            print("Corresponding component: ", pc_j[idx_magnitude])
            print("Explained variance: ", pca.explained_variance_ratio_[j])
    
    summary_variable = mapper.project(np.array(X), projection=projected_var_indices, scaler=None)
    # similar to fit transform
    
    # Generate the simplicial complex
    scomplex = mapper.map(lens, X, 
                          cover=km.Cover(n_cubes = N_CUBES, perc_overlap = PERC_OVERLAP), 
                          clusterer = cluster_alg)
    
    if return_with_df: 
        return(scomplex, df)

    return(scomplex)

## DBscan uncolored graph

In [9]:
db_scomplex, dbscan_df = mapper_pca2_db(water_df, DBSCAN_EPSILON = 1.2, DBSCAN_MIN_SAMPLES = 20,
                                        N_CUBES = [125,125], PERC_OVERLAP = [0.4,0.4], 
                                        return_with_df = True, print_pca_info = False)

In [10]:
plotlyviz(db_scomplex, graph_layout = "fr", dashboard = True)

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

## color DBscan by

even time periods, ecological time periods, and season

In [11]:
mapper = km.KeplerMapper(verbose = 0)

html_output_path = 'even time periods.html'

mapper.visualize(db_scomplex, color_values = water_df[even_time_periods], color_function_name = even_time_periods, 
                path_html = html_output_path, lens_names = even_time_periods)

'<!DOCTYPE html>\n<html>\n\n<head>\n  <meta charset="utf-8">\n  <meta name="generator" content="KeplerMapper">\n  <title>Kepler Mapper | KeplerMapper</title>\n\n  <link rel="icon" type="image/png" href="http://i.imgur.com/axOG6GJ.jpg" />\n\n  <link href=\'https://fonts.googleapis.com/css?family=Roboto+Mono:700,300\' rel=\'stylesheet\' type=\'text/css\'>\n  <style>* {\n  margin: 0;\n  padding: 0;\n}\n\nhtml, body {\n  height: 100%;\n}\n\nbody {\n  font-family: "Roboto Mono", "Helvetica", sans-serif;\n  font-size: 14px;\n}\n\n#logo {\n  width:  85px;\n  height: 85px;\n}\n\n#display {\n  color: #95A5A6;\n  background: #212121;\n}\n\n#header {\n  background: #111111;\n}\n\n#print {\n  color: #000;\n  background: #FFF;\n}\n\nh1 {\n  font-size: 21px;\n  font-weight: 300;\n  font-weight: 300;\n}\n\nh2 {\n  font-size: 18px;\n  padding-bottom: 20px;\n  font-weight: 300;\n}\n\nh3 {\n  font-size: 14px;\n  font-weight: 700;\n  text-transform: uppercase;\n}\n\nh4 {\n  font-size: 13px;\n  font-weigh

In [12]:
html_output_path = 'eco time periods.html'

mapper.visualize(db_scomplex, color_values = water_df[eco_time_periods], color_function_name = eco_time_periods, 
                path_html = html_output_path, lens_names = eco_time_periods)

'<!DOCTYPE html>\n<html>\n\n<head>\n  <meta charset="utf-8">\n  <meta name="generator" content="KeplerMapper">\n  <title>Kepler Mapper | KeplerMapper</title>\n\n  <link rel="icon" type="image/png" href="http://i.imgur.com/axOG6GJ.jpg" />\n\n  <link href=\'https://fonts.googleapis.com/css?family=Roboto+Mono:700,300\' rel=\'stylesheet\' type=\'text/css\'>\n  <style>* {\n  margin: 0;\n  padding: 0;\n}\n\nhtml, body {\n  height: 100%;\n}\n\nbody {\n  font-family: "Roboto Mono", "Helvetica", sans-serif;\n  font-size: 14px;\n}\n\n#logo {\n  width:  85px;\n  height: 85px;\n}\n\n#display {\n  color: #95A5A6;\n  background: #212121;\n}\n\n#header {\n  background: #111111;\n}\n\n#print {\n  color: #000;\n  background: #FFF;\n}\n\nh1 {\n  font-size: 21px;\n  font-weight: 300;\n  font-weight: 300;\n}\n\nh2 {\n  font-size: 18px;\n  padding-bottom: 20px;\n  font-weight: 300;\n}\n\nh3 {\n  font-size: 14px;\n  font-weight: 700;\n  text-transform: uppercase;\n}\n\nh4 {\n  font-size: 13px;\n  font-weigh

In [13]:
html_output_path = 'seasons.html'

mapper.visualize(db_scomplex, color_values = water_df[season_var], color_function_name = season_var, 
                path_html = html_output_path, lens_names = season_var)

'<!DOCTYPE html>\n<html>\n\n<head>\n  <meta charset="utf-8">\n  <meta name="generator" content="KeplerMapper">\n  <title>Kepler Mapper | KeplerMapper</title>\n\n  <link rel="icon" type="image/png" href="http://i.imgur.com/axOG6GJ.jpg" />\n\n  <link href=\'https://fonts.googleapis.com/css?family=Roboto+Mono:700,300\' rel=\'stylesheet\' type=\'text/css\'>\n  <style>* {\n  margin: 0;\n  padding: 0;\n}\n\nhtml, body {\n  height: 100%;\n}\n\nbody {\n  font-family: "Roboto Mono", "Helvetica", sans-serif;\n  font-size: 14px;\n}\n\n#logo {\n  width:  85px;\n  height: 85px;\n}\n\n#display {\n  color: #95A5A6;\n  background: #212121;\n}\n\n#header {\n  background: #111111;\n}\n\n#print {\n  color: #000;\n  background: #FFF;\n}\n\nh1 {\n  font-size: 21px;\n  font-weight: 300;\n  font-weight: 300;\n}\n\nh2 {\n  font-size: 18px;\n  padding-bottom: 20px;\n  font-weight: 300;\n}\n\nh3 {\n  font-size: 14px;\n  font-weight: 700;\n  text-transform: uppercase;\n}\n\nh4 {\n  font-size: 13px;\n  font-weigh

## Color the DB graph

In [120]:
def color_by_var(scomplex, df, var_str):
    """
    params
    * scomplex is the first return from mapper_pca2_db 
    * df is the second return from mapper_pca2_db
    """

    kmgraph, mapper_summary, colorf_distribution = get_mapper_graph(scomplex,
                                                                    color_values = df[var_str],
                                                                    color_function_name = [[var_str]])

    for node in kmgraph['nodes']:
            
        node['custom_tooltips'] = np.array(df[var_str])[scomplex['nodes'][node['name']]]
        # scomplex['nodes'] is a dictionary with keys that are the mapper nodes like cube600_cluster1, cube5_cluster0, etc....
        # [node['name']] is the name of the mapper node (such as cube600_cluster1)
        # scomplex['nodes'][node['name']] is a list of row numbers
        # that correspond with X

    plotly_graph_data = plotly_graph(kmgraph, graph_layout='kk',
                                     edge_linewidth = 0.47, node_linewidth = 0.47)
    
    title_str = "DBscan " + var_str
    layout = plot_layout(title=title_str,
                         width=700, height=700,
                         annotation_text=get_kmgraph_meta(mapper_summary))

    fw_graph = go.FigureWidget(data = plotly_graph_data, layout=layout)
    fw_hist = node_hist_fig(colorf_distribution)
    fw_summary = summary_fig(mapper_summary, height=300)
    dashboard = hovering_widgets(kmgraph,
                                 fw_graph,
                                 ctooltips=True, # ctooltips = True, because we assigned a label to each
                                                 #cluster member
                                 member_textbox_width=600)

    #Update the fw_graph colorbar, setting its title:

    fw_graph.data[1].marker.colorbar.title = var_str
    
    return(dashboard)
    


In [121]:
eco_plots = []

for time_period in eco_time_periods:
    eco_plots.append(color_by_var(db_scomplex, dbscan_df, time_period))

In [122]:
eco_plots[0]

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [123]:
eco_plots[1]

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'none',
              'line': {'color'…

In [109]:
eco_plotly = []

for time_period in eco_time_periods:
    eco_plotly.append(color_by_var(db_scomplex, dbscan_df, time_period))

In [119]:
eco_plotly[0][1]['marker']['opacity']

1.0

## Playing around with the structure of scomplex

In [None]:
scomplex_db = mapper_pca2_db(X, DBSCAN_EPSILON = 1, DBSCAN_MIN_SAMPLES = 5, N_CUBES = [100,100], 
                             PERC_OVERLAP = [.5,.5], remove_cond_bool = True, scomplex_bool = True)

In [None]:
plotly_graph_data = plotly_graph(kmgraph, graph_layout = 'fr')

layout = plot_layout(title = 'HDBScan',
                     width = 620, height = 570,
                     annotation_text = get_kmgraph_meta(mapper_summary))

fw_graph = go.FigureWidget(data=plotly_graph_data, layout=layout)

In [None]:
def mapper_pca2_db(df, DBSCAN_EPSILON = 10, DBSCAN_MIN_SAMPLES = 20, 
                N_CUBES = [10,10], PERC_OVERLAP = [.25,.25],
                remove_cond_bool = False, scomplex_bool = False):
    """
    """
    
    if remove_cond_bool:
        
        X = df[["WDP", "SECCHI", "TEMP", "DO", "TURB", 
                "VEL", "TP", "TN", "SS", "CHLcal"]]
        
    else:
        
        X = df[["WDP", "SECCHI", "TEMP", "DO", "TURB", 
                "VEL", "TP", "TN", "SS", "CHLcal", "COND"]]
    
    # for discerning primary variables in PCA 
    continuous_variables = ["WDP", "SECCHI", "TEMP", "DO", "TURB", 
                            "VEL", "TP", "TN", "SS", "CHLcal"]
    
    if not remove_cond_bool:
        
        continuous_variables.append("COND")
    
    var_to_index = {continuous_variables[i] : i for i in range(len(continuous_variables))}
    projected_vars = continuous_variables
    projected_var_indices = [var_to_index[var] for var in projected_vars]
    
#     if X.shape[0]<10:
#         #print(X)
#         print("Not enough data in ", title, "_size = ", X.shape[0])
#         return(X.shape[0])
    
    # to match up indices in scomplex with the original dataframe X
    X.reset_index(drop = True, inplace = True) 
    
    # create instance of clustering alg
    cluster_alg = DBSCAN(eps = DBSCAN_EPSILON, min_samples = DBSCAN_MIN_SAMPLES, 
                         metric='euclidean')

    # instantiate kepler mapper object
    mapper = km.KeplerMapper(verbose = 0)
    
    # defining filter function as projection on to the first 2 component axis
    pca = PCA(n_components = 2)
    lens = pca.fit_transform(X)
    
#     for j in range(2):
#         pc_j = pca.components_[j]
#         largest_magnitude = max(abs(pc_j))
#         idx_magnitude = np.where(abs(pc_j) == largest_magnitude)[0][0]
        
#         print("*** PCA", j+1, " ***")
#         print("Primary variable: ", continuous_variables[idx_magnitude])
#         print("Corresponding component: ", pc_j[idx_magnitude])
#         print("Explained variance: ", pca.explained_variance_ratio_[j])
    
    summary_variable = mapper.project(np.array(X), projection=projected_var_indices, scaler=None)
    # similar to fit transform
    
    # Generate the simplicial complex
    scomplex = mapper.map(lens, X, 
                          cover=km.Cover(n_cubes = N_CUBES, perc_overlap = PERC_OVERLAP), 
                          clusterer = cluster_alg)  

    if scomplex_bool: 
        return(scomplex)
    
    # the rest of this is for coloring 
    
    pl_brewer = [[0.0, '#006837'],
             [0.1, '#1a9850'],
             [0.2, '#66bd63'],
             [0.3, '#a6d96a'],
             [0.4, '#d9ef8b'],
             [0.5, '#ffffbf'],
             [0.6, '#fee08b'],
             [0.7, '#fdae61'],
             [0.8, '#f46d43'],
             [0.9, '#d73027'],
             [1.0, '#a50026']]

    color_values = lens[:,0] - lens[:,0].min() # changes if PCA1 or PCA1 and PCA2
    # can change to other variables
    color_function_name = ['Distance to x-min'] # set name of color function
    my_colorscale = pl_brewer
    kmgraph, mapper_summary, colorf_distribution = get_mapper_graph(scomplex,
                                                                    color_values,
                                                                    color_function_name=color_function_name, 
                                                                    colorscale=my_colorscale)
    
    plotly_graph_data = plotly_graph(kmgraph, graph_layout='fr', colorscale=my_colorscale, 
                                     factor_size=2.5, edge_linewidth=0.5)
    
    plot_title = str(DBSCAN_EPSILON) + str(DBSCAN_EPSILON) + ', MIN_SAMPLES ' + str(DBSCAN_MIN_SAMPLES) 

    layout = plot_layout(title=plot_title,  
                         width=620, height=570,
                         annotation_text=get_kmgraph_meta(mapper_summary))

    # FigureWidget is responsible for event listeners

    fw_graph = go.FigureWidget(data=plotly_graph_data, layout=layout)
    fw_summary = summary_fig(mapper_summary, height=300)

    dashboard = hovering_widgets(kmgraph, fw_graph, member_textbox_width=600)

    # DESIRED FILE PATH, CHANGE TO FIT YOUR LOCAL MACHINE
    directory_path = "mapper outputs"
    
    #Update the fw_graph colorbar, setting its title:
    fw_graph.data[1].marker.colorbar.title = 'dist to<br>x-min'
    html_output_path = directory_path + 'Eps_' + str(DBSCAN_EPSILON) +'_MinS_' + str(DBSCAN_MIN_SAMPLES) + '_NCubes_' + str(N_CUBES) + '_PercOvlp_' + str(PERC_OVERLAP) + '.html'
    mapper.visualize(scomplex, color_values=color_values, color_function_name=color_function_name,
                     path_html=html_output_path, lens = summary_variable, lens_names = projected_vars)
    
    return scomplex, X