In [1]:
import time
import numpy as np
import pandas as pd
import sys
import networkx as nx
from sklearn.neighbors import kneighbors_graph
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import math

In [2]:
def pairwise_loop(iterable):
    """Pair every element of a sequence with its direct successor.

    Parameters
    ----------
    iterable : sequence
        Must support slicing (list, tuple, str, ndarray, ...), because the
        successors are obtained via ``iterable[1:]``.

    Returns
    -------
    zip
        Lazy iterator over ``(item_k, item_k+1)`` tuples; it is empty when
        the input holds fewer than two items.
    """
    successors = iterable[1:]
    return zip(iterable, successors)

In [3]:
# NOTE(review): these are absolute, machine-specific paths. On any other machine
# the import below only works if the paths are adjusted or the Modality package
# is installed/importable by other means.
sys.path.append('/home/sebastian/Documents/PhD/pyprojects/')
sys.path.append('/home/sebastian/Documents/PhD/pyprojects/Modality/')

# Project-local clustering class used throughout the notebook
from Modality.SigMA import SigMA

In [4]:
# Input table; the relative file name is resolved against the current working directory
fname = 'cluster_data_uppersco_extravel_covmtx.csv'
data = pd.read_csv(fname)

In [5]:
# Columns that span the clustering feature space (positions + two velocity components)
data_axes = ['X', 'Y', 'Z', 'v_alpha', 'v_delta']
pos_cols = ['X', 'Y', 'Z']
c_pos, c_vel = 1, 5   # See infos on scaling in Kerr+2021 (https://arxiv.org/pdf/2105.09338.pdf)

# Take an explicit copy of the selected columns: scaling in place on the bare
# column selection triggers pandas' SettingWithCopyWarning and leaves it unclear
# whether `data` itself is modified. With .copy(), `data` stays untouched and
# re-running this cell always starts from the unscaled values.
df_cluster = data[data_axes].copy()
# Scale the velocity axes relative to the position axes
df_cluster[['v_alpha', 'v_delta']] *= c_vel



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [6]:
%%time
# Value handed to SigMA below as `knn_rho_max`
max_knn = 70
# Build the SigMA object on the scaled feature table.
# NOTE(review): the neighbour counts (50 / 50 / 20) and beta=0.99 are hard-coded;
# their exact meaning is defined inside SigMA and is not visible in this notebook.
te = SigMA(df_cluster, max_neighbors=50, beta=0.99, knn_initcluster_graph=50, knn_hypotest=20, knn_rho_max=max_knn)


Pass n_neighbors=50 as keyword args. From version 0.25 passing these as positional arguments will result in an error



CPU times: user 51.7 s, sys: 344 ms, total: 52.1 s
Wall time: 17.6 s


In [7]:
%%time
# Run the fit; alpha is presumably the significance level used by SigMA — confirm against SigMA.fit
res = te.fit(alpha=0.01)

CPU times: user 2.25 s, sys: 4.63 ms, total: 2.26 s
Wall time: 2.25 s


## Find path between any two points

In [10]:
# Row and column indices of every non-zero entry of the cluster border matrix
edges, columns = te.cluster_borders.nonzero()
# Positions into `edges`/`columns`, ordered by descending border value
# (argsort is ascending, the [::-1] flips it; assumes the fancy-indexed result
# is a 1-D array — TODO confirm the type of te.cluster_borders).
# Presumably the stored value is the density at the border/saddle — verify in SigMA.
sorted_by_density = np.argsort(te.cluster_borders[edges, columns])[::-1]

In [11]:
data_idx = np.arange(te.data.shape[0])
parents = np.arange(te.t.n_leaves_)
# For every leaf label 0 .. n_leaves_-1: among the points carrying that label,
# take the data index of the one with the largest weight (the leaf's modal point).
# The membership mask of each leaf is built once and reused for both lookups.
cluster_modes = [
    data_idx[in_leaf][np.argmax(te.t.weights_[in_leaf])]
    for in_leaf in (te.t.leaf_labels_ == leaf for leaf in range(te.t.n_leaves_))
]

In [12]:
# Walk over the border entries in the order given by `sorted_by_density` and
# record, for each entry, which cluster mode merges into which.
# Every item of merge_sequence is a tuple (absorbed_mode, absorbing_mode),
# both given as data indices of the respective cluster modes.
merge_sequence = []
ordered_pairs = zip(edges[sorted_by_density], columns[sorted_by_density])
for i, (e, c) in enumerate(ordered_pairs):
    e_rho, c_rho = te.t.weights_[cluster_modes[e]], te.t.weights_[cluster_modes[c]]
    # The mode with the strictly larger weight absorbs the other one;
    # on a tie, e merges into c.
    absorbed, absorber = (c, e) if e_rho > c_rho else (e, c)
    merge_sequence.append((cluster_modes[absorbed], cluster_modes[absorber]))

In [18]:
def shortest_path(merge_sequence, until_idx, source, target):
    """Shortest path between two modes using only the first merges.

    Parameters
    ----------
    merge_sequence : sequence of 2-tuples
        Merge events; each tuple is added as an undirected edge.
    until_idx : int
        Only ``merge_sequence[:until_idx]`` is used to build the graph.
    source, target : hashable
        Start and end node of the requested path.

    Returns
    -------
    list
        Nodes along the shortest path from ``source`` to ``target``
        (inclusive), or an empty list if the two nodes are not connected or
        one of them is not part of the graph yet.
    """
    G = nx.Graph()
    G.add_edges_from(merge_sequence[:until_idx])
    try:
        return nx.shortest_path(G, source=source, target=target)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Only the two "no connection (yet)" cases map to an empty path.
        # Any other exception (including KeyboardInterrupt) propagates
        # instead of being silently swallowed by a bare `except`.
        return []


def source_occurance(merge_sequence, source, target):
    """Locate the merges in which `source` or `target` is the first tuple entry.

    Parameters
    ----------
    merge_sequence : sequence of indexable pairs
        Only element ``[0]`` of every pair is inspected.
    source, target : scalar
        Values compared against the first entries.

    Returns
    -------
    numpy.ndarray
        Ascending positions in `merge_sequence` whose first entry equals
        `source` or `target`.
    """
    first_entries = np.asarray([pair[0] for pair in merge_sequence])
    is_source = first_entries == source
    is_target = first_entries == target
    hit_positions = np.where(is_source | is_target)[0]
    return np.sort(hit_positions)

In [19]:
# Lookup table: data index of a cluster mode -> its position in `cluster_modes`
map_clustermodes_2_saddleidx = dict(zip(cluster_modes, range(len(cluster_modes))))

In [20]:
def path_outline(te, source, target, merge_sequence, map_clustermodes_2_saddleidx):
    """Trace the route from one cluster mode to another through saddle points.

    The merge sequence is replayed up to each merge in which `source` or
    `target` appears as first entry; the first prefix that connects both modes
    defines the path.

    Parameters
    ----------
    te : object
        Must provide ``data`` (indexable by data index) and
        ``cluster_saddle_points`` (keyed by a frozenset of two cluster indices).
    source, target : int
        Data indices of the start and end cluster modes.
    merge_sequence : sequence of 2-tuples
        Merge events between cluster modes.
    map_clustermodes_2_saddleidx : dict
        Maps a mode's data index to the cluster index used in the saddle keys.

    Returns
    -------
    list
        Alternating points ``[mode, saddle, mode, saddle, ..., mode]`` along
        the path; each intermediate mode is listed once.

    Raises
    ------
    ValueError
        If no prefix of the merge sequence connects `source` and `target`.
    """
    for cutoff in source_occurance(merge_sequence, source, target):
        mode_path = shortest_path(merge_sequence, until_idx=cutoff + 1, source=source, target=target)
        if len(mode_path) == 0:
            # Not connected at this stage yet; try the next candidate merge
            continue

        space_path = []
        for hop, (start, end) in enumerate(pairwise_loop(mode_path)):
            saddle_key = frozenset({map_clustermodes_2_saddleidx[start], map_clustermodes_2_saddleidx[end]})
            saddle_pt = te.cluster_saddle_points[saddle_key]
            if hop == 0:
                # Only the very first hop contributes its start point; afterwards
                # the start equals the previous hop's end point.
                space_path.append(te.data[start])
            space_path.extend([saddle_pt, te.data[end]])
        return space_path

    # If no path was found raise error
    raise ValueError(f'Source ({source}) and target ({target}) do not have a connecting path!')
    

## Example usage

In [23]:
# source/target are data indices; every node on the path is looked up in
# map_clustermodes_2_saddleidx, so both must be cluster modes (entries of `cluster_modes`).
path_outline(te=te, source=26350, target=33772, merge_sequence=merge_sequence, map_clustermodes_2_saddleidx=map_clustermodes_2_saddleidx)

[array([  94.50539639,  -26.75910567,   66.54171553, -243.11073934,
          69.69481751]),
 array([ 185.20114341,  -16.8944994 ,   89.97010394, -269.26466764,
          39.06044086]),
 array([ 136.94191959,  -39.62362433,   63.71649851, -181.53778074,
        -149.73861696]),
 array([  67.85526158,   11.40465988,   90.0949484 , -154.2174657 ,
        -217.99981529]),
 array([ 107.30681039,  -28.25023506,   25.86836394,  -89.85204779,
        -201.04284483]),
 array([  69.8047169 ,   19.64529397,   76.87469795, -143.25555434,
        -133.422049  ]),
 array([ 91.49952456, -42.09763771,  46.33520427, -72.73366383,
        -77.16740656]),
 array([  80.10157371,  -24.83735711,   95.82008199,  -94.02590881,
        -170.94131926]),
 array([ 132.81386993,  -14.361417  ,   16.39860176,  -31.80304362,
        -158.34815797]),
 array([ 132.11687609,   20.57386582,   87.8928632 ,  -94.35820965,
        -186.27838552]),
 array([ 143.63609484,  -24.23798521,   53.94326357, -175.26989999,
       