# Initialization

In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# imports
import pandas as pd
import numpy as np
import os
import sys
import pickle
from matplotlib import pyplot as plt
import matplotlib
import boto3

# random seed (seeds numpy's legacy global RNG; the same value is passed on as random_state elsewhere)
seed = 42
np.random.seed(seed)

# local files paths
# The home directory can be overridden through the THESIS_HOME_DIR environment variable;
# when it is not set, the original hard-coded location is used.
local_home_dir_path = os.environ.get('THESIS_HOME_DIR', r'C:\Users\ahershko\OneDrive - Qualcomm\Documents\Thesis')
local_work_dir_path = os.path.join(local_home_dir_path, 'git')
local_code_dir_path = os.path.join(local_work_dir_path, 'code')

# S3 file paths
# S3 keys always use '/' as the separator, so they are joined explicitly with '/'
# instead of os.path.join (which yields '\\' on Windows).
endpoint_url = 'https://s3-west.nrp-nautilus.io'
bucket_name = 'tau-astro'
prefix = 'almogh'
s3_work_dir_path = '/'.join((prefix, 'workdir3'))
s3_saves_dir_path = '/'.join((s3_work_dir_path, 'model_saves'))
s3_data_dir_path = '/'.join((s3_work_dir_path, 'data'))
s3_data_ver_dir_path = '/'.join((s3_data_dir_path, 'HighSNR_12K_V1'))

s3_client = boto3.client("s3", endpoint_url=endpoint_url)

# adding code folder to path (only once, so re-running this cell does not stack duplicates)
if local_code_dir_path not in sys.path:
    sys.path.insert(1, local_code_dir_path)
from s3 import to_s3_npy, to_s3_pkl, from_s3_npy, from_s3_pkl, to_s3_fig

# Load

In [3]:
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
# Shared constants for the cells below.
# N          - dimension of the (N x N) distance matrices
# wl_grid    - the wl_grid vector: 7800 evenly spaced values from 3825.0 to 7725.0
# perplexity - perplexity value handed to every t-SNE run
N = 9000
wl_grid = np.linspace(start=3825.0, stop=7725.0, num=7800)
perplexity = 5

## RF

In [4]:
# RF dir path
RF_save_dir_name = 'simple___2021_11_27___22_09_00___standard_RF_max_depth_10'
s3_rf_save_dir_path = os.path.join(s3_saves_dir_path, 'RF', RF_save_dir_name)


def _rf_s3_key(file_name):
    """Return the S3 key of `file_name` inside the RF save directory, using '/' separators."""
    return os.path.join(s3_rf_save_dir_path, file_name).replace("\\", "/")


# loads - common
I_slice = from_s3_npy(s3_client, bucket_name, _rf_s3_key('I_slice.npy'))
I_train = from_s3_npy(s3_client, bucket_name, _rf_s3_key('I_train.npy'))
X_train_real = from_s3_npy(s3_client, bucket_name, _rf_s3_key('X.npy'))
# NOTE(review): gs.pkl is loaded through a pickle helper - only load it from a trusted bucket.
gs = from_s3_pkl(s3_client, bucket_name, os.path.join(s3_data_ver_dir_path, 'gs.pkl').replace("\\","/"))
# keep only the training indices that fall inside I_slice
I_real_train = I_train[I_train<len(I_slice)]
snr = gs.snMedian.iloc[I_slice[I_real_train]]

# loads - RF
# Positions (into the rows of `snr`) of the N samples kept for the distance matrix.
I_train_NN, I_test_NN = train_test_split(np.arange(snr.shape[0]), train_size=N, random_state=seed)
# Select the (N x N) sub-matrix in one step; np.ix_ avoids the intermediate row-sliced copy.
sim_mat = from_s3_npy(s3_client, bucket_name, _rf_s3_key('sim_mat.npy'))[np.ix_(I_train_NN, I_train_NN)]
D_RF = 1-sim_mat
assert D_RF.shape == (N, N), 'unexpected RF distance matrix shape: {0}'.format(D_RF.shape)
RF_weird_scores = np.mean(D_RF, axis=1)
# init='random' is stated explicitly: a precomputed distance matrix cannot be combined with
# PCA initialisation, which is the default init in newer scikit-learn releases.
RF_sne = TSNE(n_components=2, perplexity=perplexity, metric='precomputed', init='random', verbose=1, random_state=seed).fit_transform(D_RF)

loading from uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2021_11_27___22_09_00___standard_RF_max_depth_10/I_slice.npy
loading from uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2021_11_27___22_09_00___standard_RF_max_depth_10/I_train.npy
loading from uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2021_11_27___22_09_00___standard_RF_max_depth_10/X.npy
loading from uri: s3://tau-astro/almogh/workdir3/data/HighSNR_12K_V1/gs.pkl
loading from uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2021_11_27___22_09_00___standard_RF_max_depth_10/sim_mat.npy
[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 9000 samples in 0.206s...
[t-SNE] Computed neighbors for 9000 samples in 3.085s...
[t-SNE] Computed conditional probabilities for sample 1000 / 9000
[t-SNE] Computed conditional probabilities for sample 2000 / 9000
[t-SNE] Computed conditional probabilities for sample 3000 / 9000
[t-SNE] Computed conditional probabilities for sample 4

## NN

In [5]:
## NN dir path
NN_save_dir_name = 'LongTrain___2022_01_18___12_39_15___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh'
s3_NN_save_dir_path = os.path.join(s3_saves_dir_path, 'NN', NN_save_dir_name)

# Loading Z - the upper triangular part of the distance matrices - and recreating the distances matrix
Z = from_s3_npy(s3_client, bucket_name, os.path.join(s3_NN_save_dir_path, 'Z.npy').replace("\\","/"))
# Z must hold exactly one value per upper-triangular entry (diagonal included).
assert np.size(Z) == N*(N+1)//2, 'Z has {0} values, expected {1} for N={2}'.format(np.size(Z), N*(N+1)//2, N)
upper_tri = np.triu_indices(N)  # computed once and reused for both fills
D_NN = np.zeros(shape=(N,N))
D_NN[upper_tri] = Z
# transposing moves the filled values to the lower triangle; the upper triangle is
# then filled again so the matrix ends up symmetric
D_NN = D_NN.T
D_NN[upper_tri] = Z

# calculating weirdness scores and t-SNE
NN_weird_scores = np.mean(D_NN, axis=1)
# init='random' is stated explicitly: a precomputed distance matrix cannot be combined with
# PCA initialisation, which is the default init in newer scikit-learn releases.
NN_sne = TSNE(n_components=2, perplexity=perplexity, metric='precomputed', init='random', verbose=1, random_state=seed).fit_transform(D_NN)

loading from uri: s3://tau-astro/almogh/workdir3/model_saves/NN/LongTrain___2022_01_18___12_39_15___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh/Z.npy
[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 9000 samples in 0.216s...
[t-SNE] Computed neighbors for 9000 samples in 2.763s...
[t-SNE] Computed conditional probabilities for sample 1000 / 9000
[t-SNE] Computed conditional probabilities for sample 2000 / 9000
[t-SNE] Computed conditional probabilities for sample 3000 / 9000
[t-SNE] Computed conditional probabilities for sample 4000 / 9000
[t-SNE] Computed conditional probabilities for sample 5000 / 9000
[t-SNE] Computed conditional probabilities for sample 6000 / 9000
[t-SNE] Computed conditional probabilities for sample 7000 / 9000
[t-SNE] Computed conditional probabilities for sample 8000 / 9000
[t-SNE] Computed conditional probabilities for sample 9000 / 9000
[t-SNE] Mean sigma: 0.053606
[t-SNE] KL divergence after 250 iterations with early exaggeration: 101.67471

## NN+AWGN

In [6]:
## NN dir path (denoising run)
NN_save_dir_name = 'LongTrainDenoise___2022_01_18___12_39_30___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh'
s3_NN_save_dir_path = os.path.join(s3_saves_dir_path, 'NN', NN_save_dir_name)

# Loading Z - the upper triangular part of the distance matrices - and recreating the distances matrix
Z = from_s3_npy(s3_client, bucket_name, os.path.join(s3_NN_save_dir_path, 'Z.npy').replace("\\","/"))
# Z must hold exactly one value per upper-triangular entry (diagonal included).
assert np.size(Z) == N*(N+1)//2, 'Z has {0} values, expected {1} for N={2}'.format(np.size(Z), N*(N+1)//2, N)
upper_tri = np.triu_indices(N)  # computed once and reused for both fills
D_NN = np.zeros(shape=(N,N))
D_NN[upper_tri] = Z
# transposing moves the filled values to the lower triangle; the upper triangle is
# then filled again so the matrix ends up symmetric
D_NN = D_NN.T
D_NN[upper_tri] = Z

# calculating weirdness scores and t-SNE
NN_AWGN_weird_scores = np.mean(D_NN, axis=1)
# init='random' is stated explicitly: a precomputed distance matrix cannot be combined with
# PCA initialisation, which is the default init in newer scikit-learn releases.
NN_AWGN_sne = TSNE(n_components=2, perplexity=perplexity, metric='precomputed', init='random', verbose=1, random_state=seed).fit_transform(D_NN)

loading from uri: s3://tau-astro/almogh/workdir3/model_saves/NN/LongTrainDenoise___2022_01_18___12_39_30___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh/Z.npy
[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 9000 samples in 0.207s...
[t-SNE] Computed neighbors for 9000 samples in 2.651s...
[t-SNE] Computed conditional probabilities for sample 1000 / 9000
[t-SNE] Computed conditional probabilities for sample 2000 / 9000
[t-SNE] Computed conditional probabilities for sample 3000 / 9000
[t-SNE] Computed conditional probabilities for sample 4000 / 9000
[t-SNE] Computed conditional probabilities for sample 5000 / 9000
[t-SNE] Computed conditional probabilities for sample 6000 / 9000
[t-SNE] Computed conditional probabilities for sample 7000 / 9000
[t-SNE] Computed conditional probabilities for sample 8000 / 9000
[t-SNE] Computed conditional probabilities for sample 9000 / 9000
[t-SNE] Mean sigma: 0.051001
[t-SNE] KL divergence after 250 iterations with early exaggeration: 92

# Plot

## Definitions

In [7]:
# Imports
# datetime / traceback are used by spectra_dmap_callable for its debug log.
from datetime import datetime
import traceback
# holoviews building blocks for the interactive plots (Points, Curve, Spread, DynamicMap)
import holoviews as hv
from holoviews import opts
# Selection1D streams the indices of the selected points to the spectra callback
from holoviews.streams import Selection1D
from bokeh.models import HoverTool
from scipy import stats
import panel as pn
# DataLink ties the selections of two plots together
from holoviews.plotting.links import DataLink
# NOTE(review): opts, HoverTool, stats and pn are not referenced in the cells visible here -
# confirm whether they are still needed.
# Activate the bokeh plotting backend for holoviews.
hv.extension('bokeh')

In [8]:
# snr of the N plotted samples.
# `snr` is a pandas Series carrying the index labels of `gs`; assigning it directly to a
# column would align on those labels instead of on row position. The distance matrices are
# restricted to the rows selected by I_train_NN, so the matching snr values are taken
# positionally and handed over as a plain array.
snr_selected = np.asarray(snr)[I_train_NN]
assert len(snr_selected) == RF_sne.shape[0] == NN_sne.shape[0] == NN_AWGN_sne.shape[0], 'embeddings and snr must have the same number of rows'
sample_index = np.arange(RF_sne.shape[0])

# creating the dataframe for RF
RF_df = pd.DataFrame({
    'feature_1': RF_sne[:,0],
    'feature_2': RF_sne[:,1],
    'score': RF_weird_scores,
    'snr': snr_selected,
    'index': sample_index,
})

# creating the dataframe for NN
NN_df = pd.DataFrame({
    'feature_1': NN_sne[:,0],
    'feature_2': NN_sne[:,1],
    'score': NN_weird_scores,
    'snr': snr_selected,
    'index': sample_index,
})

# creating the dataframe for NN+AWGN
# (the score column uses the NN+AWGN scores, not the plain NN scores)
NN_AWGN_df = pd.DataFrame({
    'feature_1': NN_AWGN_sne[:,0],
    'feature_2': NN_AWGN_sne[:,1],
    'score': NN_AWGN_weird_scores,
    'snr': snr_selected,
    'index': sample_index,
})

# full dataframe - all three embeddings side by side
# NOTE(review): 'score' holds the RF weirdness scores, so every plot coloured by 'score'
# shows the RF score - confirm this is intended.
full_df = pd.DataFrame({
    'RF_feature_1': RF_sne[:,0],
    'RF_feature_2': RF_sne[:,1],
    'NN_feature_1': NN_sne[:,0],
    'NN_feature_2': NN_sne[:,1],
    'NN_AWGN_feature_1': NN_AWGN_sne[:,0],
    'NN_AWGN_feature_2': NN_AWGN_sne[:,1],
    'score': RF_weird_scores,
    'snr': snr_selected,
    'index': sample_index,
})

In [9]:
def points_dmap_callable_inner(src, color_src):
    """
    Build the scatter plot (hv.Points over full_df) of one embedding.

    src       - column prefix of the embedding; the plotted dimensions are
                '<src>_feature_1' and '<src>_feature_2'.
    color_src - name of the full_df column used to colour the points.
    """
    embedding_dims = ['{0}_feature_1'.format(src), '{0}_feature_2'.format(src)]
    scatter = hv.Points(full_df, kdims=embedding_dims)
    # colouring
    scatter = scatter.opts(color=color_src, cmap='jet')
    # selection tools
    scatter = scatter.opts(tools=['tap','box_select','lasso_select'])
    # appearance of selected / unselected points
    scatter = scatter.opts(selection_line_color='black', selection_alpha=0.7, nonselection_alpha=0.1)
    # frame size and colorbar
    scatter = scatter.opts(framewise=True, width=700, height=500, colorbar=True)
    return scatter

def points_dmap_callable(color_src):
    """
    The callable function for the points DynamicMap.

    Creates the RF, NN and NN_AWGN scatter plots coloured by `color_src`,
    links the RF plot to each of the other two with a DataLink, and returns
    the three plots as a single layout (RF + NN + NN_AWGN).
    """
    rf_plot, nn_plot, nn_awgn_plot = [
        points_dmap_callable_inner(source_name, color_src)
        for source_name in ('RF', 'NN', 'NN_AWGN')
    ]
    # the RF plot is the link source for both other plots
    for linked_plot in (nn_plot, nn_awgn_plot):
        DataLink(rf_plot, linked_plot)
    return rf_plot + nn_plot + nn_awgn_plot

def _spectrum_overlay(w, x, x_min_err, x_max_err, label):
    """Build the flux Curve * min/max Spread overlay titled `label`."""
    # decimating max and min by 2 (for some reason, spread is not showing from over ~5000 points)
    D = 2
    w_spread = w[::D].reshape(-1)
    x_spread = x[::D].reshape(-1)
    x_max_err = x_max_err[::D].reshape(-1)
    x_min_err = x_min_err[::D].reshape(-1)
    assert len(w_spread)==len(x_spread)==len(x_max_err)==len(x_min_err), 'length must be equal! shapes are {0}, {1}, {2}, {3}.'.format(w_spread.shape, x_spread.shape,x_max_err.shape, x_min_err.shape)

    curve = hv.Curve((w,x), kdims=['w'],vdims=['flux']).opts(color='black').opts(norm=dict(framewise=True))
    spread = hv.Spread((w_spread,x_spread,x_min_err,x_max_err), kdims=['w'],vdims=['flux', 'yerrneg', 'yerrpos']).opts(fill_alpha=0.5, line_alpha=0).opts(norm=dict(framewise=True))
    flux = curve * spread
    return flux.opts(tools=['hover']).relabel(label).opts(width=800, height=300, show_grid=True)


def spectra_dmap_callable(index):
    """
    The callable function for the spectra DynamicMap.

    Parameters
    ----------
    index : list of int
        Row positions of the currently selected points (empty when nothing is selected).

    Returns
    -------
    A Curve * Spread overlay:
      * no selection     - an all-zero curve labelled 'No Selection';
      * one point        - the row of X_train_real at that position, labelled with the
                           index and the snr / score values shown in the scatter plots;
      * several points   - the NaN-ignoring mean of the selected rows, with a spread
                           reaching from their minimum to their maximum.
    If anything fails, the error and traceback are written to the debug file and an
    all-zero placeholder labelled with the error is returned, so the callback always
    returns a plot.
    """
    # same location as before (<work dir>/debug.txt), built from the configured work dir
    debug_file_path = os.path.join(local_work_dir_path, 'debug.txt')
    with open(debug_file_path,'w') as f:
        f.write('in spectra_dmap_callable - '+datetime.now().strftime("%d/%m/%Y %H:%M:%S")+'\n')
        try:
            w = wl_grid
            if len(index)==0:
                f.write('len==0\n')
                # No Selection
                x = np.zeros(shape=wl_grid.shape)
                label = 'No Selection'
                x_max_err = x
                x_min_err = x
            else:
                f.write('len!=0\n')
                # NOTE(review): `index` holds row positions of the plotted subset; confirm that
                # the rows of X_train_real are in the same order (the distance matrices were
                # restricted with I_train_NN).
                selected = X_train_real[index]
                x = np.nanmean(selected, axis=0)
                if len(index)==1:
                    f.write('len==1\n')
                    # a single point - label it with the values shown in the scatter plots.
                    # The values are read positionally from full_df, the data behind the plots.
                    point = index[0]
                    label = 'index=%s, snr=%f, score=%f' % (point, full_df['snr'].iloc[point], full_df['score'].iloc[point])
                    x_max_err = np.zeros_like(x)
                    x_min_err = np.zeros_like(x)
                else:
                    f.write('len>1\n')
                    # Multiple points - plotting the average with a min/max envelope
                    label = '%d points selected - plotting the average' % len(index)
                    x_max_err = np.nanmax(selected, axis=0)-x
                    x_min_err = x-np.nanmin(selected, axis=0)

            f.write('x_max_err type = {0}\n'.format(str(type(x_max_err))))
            f.write('x_max_err shape = {0}\n'.format(str(x_max_err.shape)))
            flux = _spectrum_overlay(w, x, x_min_err, x_max_err, label)

        except Exception as e:
            f.write('exception!\n')
            f.write(str(e)+'\n')
            tb = traceback.format_exc()
            f.write(tb)
            # Fall back to an empty plot of the same type instead of leaving `flux` undefined.
            zeros = np.zeros(shape=wl_grid.shape)
            flux = _spectrum_overlay(wl_grid, zeros, zeros, zeros, 'Error: %s (see debug.txt)' % e)

    return flux

## Interactive plot

In [10]:
# Column of full_df used to colour the points
color_src = 'score'

# One scatter plot per embedding
RF_points, NN_points, NN_AWGN_points = (
    points_dmap_callable_inner(source_name, color_src)
    for source_name in ('RF', 'NN', 'NN_AWGN')
)

# The RF plot is the link source for both NN plots
NN_dlink = DataLink(RF_points, NN_points)
NN_AWGN_dlink = DataLink(RF_points, NN_AWGN_points)

# The selection made on the RF plot feeds the spectra plot
selection = Selection1D(source=RF_points)
spectra_dmap = hv.DynamicMap(spectra_dmap_callable, kdims=[], streams=[selection])
spectra_dmap.opts(norm=dict(framewise=True))

# Stack the three scatter plots and the spectra plot in a single column
points_row = RF_points + NN_points + NN_AWGN_points
layout = (points_row + spectra_dmap).opts(merge_tools=False)
layout.cols(1)