## Load libraries

In [1]:
import kmapper as km
# import sklearn

from sklearn.cluster import DBSCAN # clustering algorithm
from sklearn.decomposition import PCA # projection (lens) creation
from sklearn.preprocessing import StandardScaler
import hdbscan

# from sklearn import ensemble
# from sklearn.manifold import MDS

import plotly.graph_objs as go
# from ipywidgets import interactive, HBox, VBox, widgets, interact # ?
# import dash_html_components as html # ?
# import dash_core_components as dcc # ?

from kmapper.plotlyviz import * # static and interactive plots
import psutil # for plotlyviz
import kaleido # for plotlyviz
# import networkx # ?

# import dash # ?
import warnings #? 
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Read data

In [2]:

water20 = pd.read_csv("../../LTRM data/RF interpolation/water_full.csv")
water20.head()

# Number of rows in the raw table.
n_data = len(water20)

# Pick the ten columns that are passed to the scaler.
X = water20[["WDP", "SECCHI", "TEMP", "DO", "TURB",
             "VEL", "TP", "TN", "SS", "CHLcal"]]

# StandardScaler returns a bare array, so wrap the result back into a
# DataFrame that carries the same column labels as the selection above.
X = pd.DataFrame(StandardScaler().fit_transform(X), columns = list(X.columns))

X.head()

Unnamed: 0,WDP,SECCHI,TEMP,DO,TURB,VEL,TP,TN,SS,CHLcal
0,-0.362348,-0.31958,0.861452,-1.070321,-0.201687,0.38197,0.178906,0.72931,-0.110019,-0.561444
1,1.517428,-0.268383,0.861452,-1.070321,-0.201687,0.923263,0.185471,1.305956,-0.165548,-0.601495
2,0.295574,-0.242785,0.850848,-1.163927,-0.263691,0.775638,0.126391,0.72931,-0.206899,-0.585475
3,1.799394,-0.370777,0.850848,-1.132725,-0.201687,0.84945,0.073876,0.918395,-0.215169,-0.593485
4,1.047484,-0.191589,0.861452,-1.070321,-0.124182,0.824846,0.237986,0.776269,-0.042676,-0.558773


## Define functions

In [3]:
def mapper_pca2_db(df, DBSCAN_EPSILON = 10, DBSCAN_MIN_SAMPLES = 20, 
                N_CUBES = [10,10], PERC_OVERLAP = [.25,.25],
                remove_cond_bool = False, scomplex_bool = False):
    """Run Kepler Mapper on `df` with a 2-component PCA lens and DBSCAN clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "WDP", "SECCHI", "TEMP", "DO", "TURB", "VEL",
        "TP", "TN", "SS", "CHLcal", plus "COND" unless `remove_cond_bool` is True.
        `df` itself is not modified.
    DBSCAN_EPSILON : float
        Passed to DBSCAN as `eps`.
    DBSCAN_MIN_SAMPLES : int
        Passed to DBSCAN as `min_samples`.
    N_CUBES : list
        Passed to `km.Cover` as `n_cubes`.
    PERC_OVERLAP : list
        Passed to `km.Cover` as `perc_overlap`.
    remove_cond_bool : bool
        If True, the "COND" column is left out of the analysis.
    scomplex_bool : bool
        If True, return right after the simplicial complex is built; no plot
        objects are created and no HTML file is written.

    Returns
    -------
    scomplex
        When `scomplex_bool` is True: the object returned by `mapper.map`.
    (scomplex, X)
        When `scomplex_bool` is False: the same object plus the selected
        columns of `df` with a fresh 0..n-1 index.

    Side effects
    ------------
    When `scomplex_bool` is False, an HTML visualization is written into the
    "mapper outputs" directory (created if missing), relative to the current
    working directory.
    """
    import os  # local import: only needed to build the HTML output path

    # Columns used both as clustering input and as the PCA input.
    continuous_variables = ["WDP", "SECCHI", "TEMP", "DO", "TURB", 
                            "VEL", "TP", "TN", "SS", "CHLcal"]
    if not remove_cond_bool:
        continuous_variables.append("COND")

    # Every selected column is also reported in the visualization, so the
    # positional indices are simply 0..len-1 in column order.
    projected_vars = continuous_variables
    projected_var_indices = list(range(len(projected_vars)))

    # Fresh 0..n-1 index so that row ids stored in scomplex line up with X.
    # Non-in-place: avoids mutating a slice of the caller's frame.
    X = df[continuous_variables].reset_index(drop = True)

    # create instance of clustering alg
    cluster_alg = DBSCAN(eps = DBSCAN_EPSILON, min_samples = DBSCAN_MIN_SAMPLES, 
                         metric='euclidean')

    # instantiate kepler mapper object
    mapper = km.KeplerMapper(verbose = 0)

    # filter function (lens): projection onto the first 2 principal components
    pca = PCA(n_components = 2)
    lens = pca.fit_transform(X)

    # Unscaled copy of the selected columns, later handed to visualize() as `lens`.
    summary_variable = mapper.project(np.array(X), projection=projected_var_indices, scaler=None)

    # Generate the simplicial complex
    scomplex = mapper.map(lens, X, 
                          cover=km.Cover(n_cubes = N_CUBES, perc_overlap = PERC_OVERLAP), 
                          clusterer = cluster_alg)  

    if scomplex_bool: 
        return(scomplex)

    # ---- everything below is coloring / visualization ----

    pl_brewer = [[0.0, '#006837'],
             [0.1, '#1a9850'],
             [0.2, '#66bd63'],
             [0.3, '#a6d96a'],
             [0.4, '#d9ef8b'],
             [0.5, '#ffffbf'],
             [0.6, '#fee08b'],
             [0.7, '#fdae61'],
             [0.8, '#f46d43'],
             [0.9, '#d73027'],
             [1.0, '#a50026']]

    # Color each point by its offset from the minimum of the first PCA component.
    color_values = lens[:,0] - lens[:,0].min()
    color_function_name = ['Distance to x-min'] # set name of color function
    my_colorscale = pl_brewer
    kmgraph, mapper_summary, colorf_distribution = get_mapper_graph(scomplex,
                                                                    color_values,
                                                                    color_function_name=color_function_name, 
                                                                    colorscale=my_colorscale)

    plotly_graph_data = plotly_graph(kmgraph, graph_layout='fr', colorscale=my_colorscale, 
                                     factor_size=2.5, edge_linewidth=0.5)

    # Fixed: the title used to print epsilon twice and without a label.
    plot_title = 'EPSILON ' + str(DBSCAN_EPSILON) + ', MIN_SAMPLES ' + str(DBSCAN_MIN_SAMPLES) 

    layout = plot_layout(title=plot_title,  
                         width=620, height=570,
                         annotation_text=get_kmgraph_meta(mapper_summary))

    # FigureWidget is responsible for event listeners
    # NOTE(review): fw_graph, fw_summary and dashboard are built but neither
    # displayed nor returned by this function - confirm whether they are needed.
    fw_graph = go.FigureWidget(data=plotly_graph_data, layout=layout)
    fw_summary = summary_fig(mapper_summary, height=300)

    dashboard = hovering_widgets(kmgraph, fw_graph, member_textbox_width=600)

    # DESIRED FILE PATH, CHANGE TO FIT YOUR LOCAL MACHINE
    directory_path = "mapper outputs"
    os.makedirs(directory_path, exist_ok=True)

    #Update the fw_graph colorbar, setting its title:
    fw_graph.data[1].marker.colorbar.title = 'dist to<br>x-min'

    # Fixed: join directory and file name with a path separator; previously the
    # two were concatenated, so the file was written next to the notebook with
    # "mapper outputs" glued onto its name.
    html_file_name = ('Eps_' + str(DBSCAN_EPSILON) + '_MinS_' + str(DBSCAN_MIN_SAMPLES)
                      + '_NCubes_' + str(N_CUBES) + '_PercOvlp_' + str(PERC_OVERLAP) + '.html')
    html_output_path = os.path.join(directory_path, html_file_name)
    mapper.visualize(scomplex, color_values=color_values, color_function_name=color_function_name,
                     path_html=html_output_path, lens = summary_variable, lens_names = projected_vars)

    return scomplex, X

## Playing around with the structure of scomplex

In [4]:
# Build only the simplicial complex (scomplex_bool = True skips the plotting
# and HTML export) on the standardized table, which has no "COND" column.
scomplex_db = mapper_pca2_db(
    X,
    DBSCAN_EPSILON = 1,
    DBSCAN_MIN_SAMPLES = 5,
    N_CUBES = [100,100],
    PERC_OVERLAP = [.5,.5],
    remove_cond_bool = True,
    scomplex_bool = True,
)

In [7]:
# Print every top-level key of the Mapper output, each followed by the value
# stored under it.
for node, contents in scomplex_db.items():
    print(node)
    print(contents)

nodes
defaultdict(<class 'list'>, {'cube5_cluster0': [22392, 22402, 22406, 22449, 22473, 22486, 22492], 'cube6_cluster0': [2545, 3094, 6468, 6470, 6471, 6472, 6989, 9980, 20783, 22392, 22402, 22406, 22409, 22423, 22425, 22430, 22486, 22492], 'cube7_cluster0': [2545, 6468, 6470, 6471, 6472, 6495, 6976, 6999, 7371, 9980, 20783, 22409, 22423, 22425, 22426, 22430], 'cube9_cluster0': [3092, 8917, 22401, 22412, 22449, 22473], 'cube10_cluster0': [2522, 2556, 3084, 3092, 3094, 3095, 3515, 3528, 3578, 3579, 4486, 4491, 4937, 6496, 6498, 6506, 6983, 6985, 6989, 7030, 7437, 7438, 8456, 8896, 8917, 9448, 9460, 9987, 21874, 22392, 22401, 22410, 22412, 22413, 22414, 22416, 22438, 22439, 22449, 22473, 22483, 22486, 22492, 22494, 26648], 'cube11_cluster0': [346, 2522, 2534, 2545, 2546, 2556, 3084, 3093, 3094, 3095, 3515, 3528, 3578, 3579, 4486, 4491, 4937, 4941, 6427, 6428, 6467, 6470, 6471, 6472, 6496, 6498, 6506, 6975, 6983, 6985, 6989, 7030, 7437, 7438, 7455, 8456, 8896, 9448, 9460, 9980, 9987, 207