### Imports

In [3]:
import kmapper as km
from kmapper.plotlyviz import *
import sklearn
# DBSCAN from sklearn for clustering algorithms
from sklearn.cluster import DBSCAN
# PCA from sklearn for projection/lens creation
from sklearn.decomposition import PCA

# Plotly and Dash
import plotly.graph_objs as go
import dash_html_components as html
import dash_core_components as dcc
import dash
from ipywidgets import interactive, HBox, VBox, widgets, interact
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

### Functions

In [4]:
def cluster_fun(X, DBSCAN_EPSILON = 20, DBSCAN_MIN_SAMPLES = 1, N_CUBES = [7,7], PERC_OVERLAP = [.5,.5]):
    """

    """
    
    
#     keys = list(dict_df.keys())
#     print(keys)
#     X = dict_df.get(keys[0])
    X = X[["PREDICTED_WDP", "PREDICTED_SECCHI", "PREDICTED_TEMP", "PREDICTED_DO", 
           "PREDICTED_TURB","PREDICTED_COND", "PREDICTED_VEL", "PREDICTED_TP", 
           "PREDICTED_TN", "PREDICTED_SS", "PREDICTED_CHLcal"]]
    if X.shape[0]<DBSCAN_MIN_SAMPLES:
        #print(X)
        print("Not enough data to cluster in ", keys, "_size = ", X.shape[0])
        print("DBSCAN_MIN_SAMPLES", DBSCAN_MIN_SAMPLES)
        return([DBSCAN_MIN_SAMPLES, X.shape[0]])
    
    
    db = DBSCAN(eps=20, min_samples=2).fit(X)
    # Number of clusters in labels, ignoring noise if present.
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    
    return(db)

    
    
def mapper_pca_func(X,title, DBSCAN_EPSILON = 20, DBSCAN_MIN_SAMPLES = 1, N_CUBES = 7, PERC_OVERLAP = .5):
    """
    str(df_stratum_season_time_dict_list[1].keys())
    """
    
    # 
#     keys = list(dict_df.keys())
#     print(keys)
#     X = dict_df.get(keys[0])
    X = X[["PREDICTED_WDP", "PREDICTED_SECCHI", "PREDICTED_TEMP", "PREDICTED_DO", 
           "PREDICTED_TURB","PREDICTED_COND", "PREDICTED_VEL", "PREDICTED_TP", 
           "PREDICTED_TN", "PREDICTED_SS", "PREDICTED_CHLcal"]]
    
    if X.shape[0]<10:
        #print(X)
        print("Not enough data in ", keys, "_size = ", X.shape[0])
        return(X.shape[0])

    # defining clustering and kmapper parameters
    
    # create instance of clustering alg
    cluster_alg = sklearn.cluster.DBSCAN(eps=DBSCAN_EPSILON, min_samples=DBSCAN_MIN_SAMPLES, metric='euclidean')

    # Instantiate kepler mapper object
    mapper = km.KeplerMapper(verbose=0)
    
    # defining filter function as projection on to the first 2 component axis
    pca = PCA(n_components=1)
    lens = pca.fit_transform(X)
    
    # Generate the simplicial complex
    scomplex = mapper.map(lens, X, cover=km.Cover(n_cubes=N_CUBES, perc_overlap=PERC_OVERLAP), clusterer=cluster_alg)  


    pl_brewer = [[0.0, '#006837'],
             [0.1, '#1a9850'],
             [0.2, '#66bd63'],
             [0.3, '#a6d96a'],
             [0.4, '#d9ef8b'],
             [0.5, '#ffffbf'],
             [0.6, '#fee08b'],
             [0.7, '#fdae61'],
             [0.8, '#f46d43'],
             [0.9, '#d73027'],
             [1.0, '#a50026']]

    color_values = lens [:,0] - lens[:,0].min()
    my_colorscale = pl_brewer
    kmgraph,  mapper_summary, colorf_distribution = get_mapper_graph(scomplex, 
                                                                     color_values,  
                                                                     color_function_name='Distance to x-min', 
                                                                     colorscale=my_colorscale)

    bgcolor = 'rgba(10,10,10, 0.9)'
    # y_gridcolor = 'rgb(150,150,150)'# on a black background the gridlines are set on  grey

    plotly_graph_data = plotly_graph(kmgraph, graph_layout='fr', colorscale=my_colorscale, 
                                     factor_size=2.5, edge_linewidth=0.5)
    plot_title = title + str(DBSCAN_EPSILON) + str(DBSCAN_EPSILON) + ', MIN_SAMPLES ' + str(DBSCAN_MIN_SAMPLES) 
    # plot_title = 'Pool 13, Summer 1993-1999; Epsilon ' + str(DBSCAN_EPSILON) + ', MIN_SAMPLES ' + str(DBSCAN_MIN_SAMPLES) 
    layout = plot_layout(title=plot_title,  
                         width=620, height=570,
                         annotation_text=get_kmgraph_meta(mapper_summary),  
                         bgcolor=bgcolor)
    
    # FigureWidget is responsible for event listeners

    fw_graph = go.FigureWidget(data=plotly_graph_data, layout=layout)
    fw_hist = node_hist_fig(colorf_distribution, bgcolor=bgcolor)
    fw_summary = summary_fig(mapper_summary, height=300)

    dashboard = hovering_widgets(kmgraph, 
                                 fw_graph, 
                                 bgcolor=bgcolor, 
                                 member_textbox_width=600)

    # DESIRED FILE PATH, CHANGE TO FIT YOUR LOCAL MACHINE
    directory_path = r"Mapper outputs"
    
    #Update the fw_graph colorbar, setting its title:
    fw_graph.data[1].marker.colorbar.title = 'dist to<br>x-min'
    html_output_path = directory_path + "\\" + title + 'PCA_1' + 'all_var_' + 'Eps_' + str(DBSCAN_EPSILON) +'MinS_' + str(DBSCAN_MIN_SAMPLES) + 'NCUBES_' + str(N_CUBES) + 'PEROvLp_' + str(PERC_OVERLAP) + '.html'
    html_output_path = html_output_path.replace(":","_")
    mapper.visualize(scomplex, path_html=html_output_path)
    return(scomplex)

### Load data

In [5]:
predicted_df = pd.read_csv(r"Interpolated Data\allvars_interpolated_3yearsxseason.csv")
predicted_df.drop("Unnamed: 0",axis = 1,inplace = True)
predicted_df.head()

Unnamed: 0,SHEETBAR,DATE,LATITUDE,LONGITUDE,FLDNUM,STRATUM,LOCATCD,TN,TP,TEMP,...,PREDICTED_TP,PREDICTED_TEMP,PREDICTED_DO,PREDICTED_TURB,PREDICTED_COND,PREDICTED_VEL,PREDICTED_SS,PREDICTED_WDP,PREDICTED_CHLcal,PREDICTED_SECCHI
0,45001219,04/24/1995,37.309363,-89.512099,5,1,9551101,,,12.9,...,0.208,12.9,8.5,210.0,417.0,0.566517,158.799943,12.1,12.87946,13.0
1,45001220,04/24/1995,37.311229,-89.514268,5,1,9551100,,,13.0,...,0.208,13.0,8.5,230.0,414.0,0.565424,176.740056,9.4,12.25618,15.0
2,45001221,04/24/1995,37.323359,-89.497913,5,1,9551099,,,12.9,...,0.44675,12.9,8.6,220.0,418.0,0.530415,226.904952,12.7,12.43426,15.0
3,45001222,04/24/1995,37.325091,-89.495577,5,1,9551018,7.961,0.508,12.8,...,0.508,12.8,8.6,210.0,418.0,0.52222,212.720407,14.7,11.36578,15.0
4,45001223,04/24/1995,37.32849,-89.488649,5,1,9551017,4.282,0.11,12.8,...,0.11,12.8,8.6,210.0,418.0,0.493716,182.309561,5.8,11.98906,15.0


In [6]:
### Creating three main time spans and two overlapping time spans,
### a total of five time spans

# defining different time periods
# first decade
time_dec1 = [1993, 1994, 1995, 1997, 1998, 1999, 2000]
# second decade
time_dec2 = [2001, 2002,2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013]
# third decade
time_dec3 = [2014, 2015, 2016, 2017, 2018, 2019, 2020]
# overlap time periods for continuity
time_overlap1 = [1998, 1999, 2000, 2001, 2002,2003, 2004]
time_overlap2 = [2010, 2011, 2012, 2013, 2014, 2015, 2016]
time_list = [time_dec1, time_overlap1, time_dec2, time_overlap2, time_dec3]

time_list_names = ['93-00', '98-04', '01-13', '10-16', '14-20']
# The stratums in my pool
stratum_list = [1, 2] #, 4, 5, 6, 7, 9]
# The seasons we are currently looking at
Season_names = ["SUMMER"] #, "SPRING", "FALL", "WINTER"]


df_stratum_season_time_dict = {}
s=""
for i in range(len(time_list)):
    for j in stratum_list:
        for k in Season_names:
            s= "Stratum " + str(j)+ " " + k + " " + time_list_names[i] + ": "
            df_stratum_season_time_dict[s] = predicted_df[(predicted_df['YEAR'].isin(time_list[i])) &
                                                             (predicted_df['STRATUM'].isin([j])) & 
                                                             (predicted_df['SEASON'].isin([k]))]
            s=""
print(len(df_stratum_season_time_dict))

10


In [58]:
test = "Stratum 1 SUMMER 93-00:"
test.replace(":","_")

'Stratum 1 SUMMER 93-00_'

In [7]:
for key in df_stratum_season_time_dict:
    print(key)

Stratum 1 SUMMER 93-00: 
Stratum 2 SUMMER 93-00: 
Stratum 1 SUMMER 98-04: 
Stratum 2 SUMMER 98-04: 
Stratum 1 SUMMER 01-13: 
Stratum 2 SUMMER 01-13: 
Stratum 1 SUMMER 10-16: 
Stratum 2 SUMMER 10-16: 
Stratum 1 SUMMER 14-20: 
Stratum 2 SUMMER 14-20: 


In [8]:
# dbclus_dict = {}
# for i in df_stratum_season_time_dict_list:
#     dbclus_dict[str(list(i.keys()))]=cluster_fun(i)

NameError: name 'df_stratum_season_time_dict_list' is not defined

In [9]:
dbclus_dict = {}
for key in df_stratum_season_time_dict:
    dbclus_dict[key] = cluster_fun(df_stratum_season_time_dict[key])

Estimated number of clusters: 32
Estimated number of noise points: 48
Estimated number of clusters: 21
Estimated number of noise points: 54
Estimated number of clusters: 20
Estimated number of noise points: 36
Estimated number of clusters: 18
Estimated number of noise points: 40
Estimated number of clusters: 18
Estimated number of noise points: 30
Estimated number of clusters: 11
Estimated number of noise points: 41
Estimated number of clusters: 23
Estimated number of noise points: 44
Estimated number of clusters: 25
Estimated number of noise points: 43
Estimated number of clusters: 25
Estimated number of noise points: 50
Estimated number of clusters: 30
Estimated number of noise points: 47


In [34]:
n_cubes = [5, 10, 15]
perc_overlap = [.35, .45, .55]

mapper_pca_output_dict = {}
for key in df_stratum_season_time_dict:
    for j in n_cubes:
        for k in perc_overlap:
            newkey = key+"ncubes_"+str(j)+"_overlap_"+str(k)
            mapper_pca_output_dict[newkey] = mapper_pca_func(df_stratum_season_time_dict[key],key, N_CUBES = j, PERC_OVERLAP = k)

In [35]:
len(mapper_pca_output_dict)

90

Saving all outputs to json

In [36]:
import json

jsonFile = json.dumps(mapper_pca_output_dict)
f = open("All_graphs.json","w")
f.write(jsonFile)
f.close()

Load mapper data

In [37]:
jsonFilePath = r"All_graphs.json"
jsonFile = open(jsonFilePath,"r")
data = json.load(jsonFile)
jsonFile.close()

In [38]:
len(data.keys())

90

Pick one output to analyze

In [43]:
#key = list(data.keys())[0]
key = "Stratum 2 SUMMER 01-13: ncubes_10_overlap_0.45"
print(key)
output = data[key]
output.keys()

Stratum 2 SUMMER 01-13: ncubes_10_overlap_0.45


dict_keys(['nodes', 'links', 'simplices', 'meta_data', 'meta_nodes'])

Function to return a nested list of connected components

In [127]:
def connected_components(output):
    simplices = output.get("simplices")
    # Pick out the edges
    pairs = [item for item in simplices if len(item)==2]
    # Pick out all nodes
    nodes = output.get("nodes").keys()
    # Unpacks the list of lists to get every node that is connected to at least one other node with an edge
    connected_nodes = [node for sublist in edges for node in sublist]
    # Pick out the nodes that are its own connected component (not a connected node)
    singles = [[node] for node in nodes if node not in connected_nodes]
    
    # Build connected components
    components = []
    for a, b in pairs:
        for component in components:
            if a in component:
                for i, other_component in enumerate(components):
                    if b in other_component and other_component != component: # a, and b are already in different components: merge
                        component.extend(other_component)
                        components[i:i+1] = []
                        break # we don't have to look for other components for b
                else: # b wasn't found in any other component
                    if b not in component:
                        component.append(b)
                break # we don't have to look for other components for a
            if b in component: # a wasn't in in the component 
                component.append(a)
                break # we don't have to look further
        else: # neither a nor b were found
            components.append([a, b])
            
    # Add the singles into the components
    print("Number of singles:",len(singles))
    components.extend(singles)
    
    # Sort the components by size
    components.sort(reverse = True, key = len)
    
    return components

In [128]:
components = connected_components(output)
len(components)
for component in components:
    print(len(component))

Number of singles: 15


In [130]:
components

[['cube0_cluster0',
  'cube1_cluster0',
  'cube2_cluster0',
  'cube2_cluster9',
  'cube3_cluster0',
  'cube3_cluster5',
  'cube3_cluster16',
  'cube4_cluster4',
  'cube5_cluster2'],
 ['cube3_cluster19',
  'cube4_cluster14',
  'cube3_cluster20',
  'cube5_cluster7',
  'cube6_cluster3'],
 ['cube5_cluster5',
  'cube6_cluster2',
  'cube7_cluster0',
  'cube8_cluster0',
  'cube9_cluster7'],
 ['cube4_cluster0', 'cube5_cluster0', 'cube6_cluster0', 'cube6_cluster1'],
 ['cube2_cluster17', 'cube3_cluster14', 'cube4_cluster12'],
 ['cube3_cluster11', 'cube4_cluster7', 'cube3_cluster12'],
 ['cube0_cluster1', 'cube1_cluster6'],
 ['cube0_cluster2', 'cube1_cluster11'],
 ['cube1_cluster1', 'cube2_cluster6'],
 ['cube1_cluster2', 'cube2_cluster7'],
 ['cube1_cluster3', 'cube2_cluster8'],
 ['cube1_cluster4', 'cube2_cluster11'],
 ['cube1_cluster5', 'cube2_cluster12'],
 ['cube1_cluster7', 'cube2_cluster13'],
 ['cube1_cluster8', 'cube2_cluster14'],
 ['cube1_cluster9', 'cube2_cluster15'],
 ['cube1_cluster10', 'c