# Initialization

In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module: importing `display`
# from `IPython.core.display` is deprecated (since IPython 7.14) and removed in later releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# imports
import pandas as pd
import numpy as np
import os
import sys
import pickle
from matplotlib import pyplot as plt
import matplotlib
import boto3

# Reproducibility: seed NumPy's global random generator
seed = 42
np.random.seed(seed)

# Local paths.
# A notebook has no real __file__, so the literal name "__file__" is resolved instead;
# the work dir is three levels above the directory that name resolves in.
local_work_dir_path = os.path.dirname(
    os.path.dirname(
        os.path.dirname(
            os.path.dirname(os.path.realpath("__file__"))
        )
    )
)
local_code_dir_path = os.path.join(local_work_dir_path, 'code')

# S3 endpoint, bucket and key prefixes (keys always use '/' as the separator)
endpoint_url = 'https://s3-west.nrp-nautilus.io'
bucket_name = 'tau-astro'
prefix = 'almogh'
s3_work_dir_path = prefix + '/' + 'workdir3'
s3_saves_dir_path = s3_work_dir_path + '/' + 'model_saves'
s3_data_dir_path = s3_work_dir_path + '/' + 'data'
s3_data_ver_dir_path = s3_data_dir_path + '/' + '100K_V4'

s3_client = boto3.client("s3", endpoint_url=endpoint_url)

# Make the project's code folder importable (needed for the local `s3` helper module)
sys.path.insert(1, local_code_dir_path)
from s3 import to_s3_npy, to_s3_pkl, from_s3_npy, from_s3_pkl, to_s3_fig

In [3]:
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
# NOTE(review): not referenced anywhere else in the visible notebook; presumably the
# t-SNE perplexity (TSNE is imported just above) - confirm it is still needed.
perplexity = 5

# Load

## Dataset

In [4]:
# Test-set data matrix; spectra_dmap_callable selects its rows by the picked indices
X = from_s3_npy(s3_client, bucket_name, s3_data_ver_dir_path + '/' + 'X_test_set.npy')

loading from uri: s3://tau-astro/almogh/workdir3/data/100K_V4/X_test_set.npy


In [5]:
# Grid used as the x axis ('w') of the spectrum plot
wl_grid = from_s3_npy(s3_client, bucket_name, s3_data_ver_dir_path + '/' + 'wl_100K_V4.npy')

loading from uri: s3://tau-astro/almogh/workdir3/data/100K_V4/wl_100K_V4.npy


In [6]:
# Table-like object for the test set (its length and `snMedian` field are used below).
# NOTE(review): from_s3_pkl presumably unpickles the payload - only load from a trusted bucket.
gs = from_s3_pkl(s3_client, bucket_name, s3_data_ver_dir_path + '/' + 'gs_test_V4.pkl')

loading from uri: s3://tau-astro/almogh/workdir3/data/100K_V4/gs_test_V4.pkl


In [7]:
# `snMedian` values of the loaded gs object as a plain Python list
# (fills the 'snr' column of full_df and the single-selection plot label)
snr = gs.snMedian.tolist()

## Large RF

In [8]:
# Weird scores of the large RF on the test set.
# The key is built from the configured s3_saves_dir_path instead of repeating the prefix literally.
Large_RF_weird_scores = from_s3_npy(s3_client, bucket_name, '/'.join([
    s3_saves_dir_path,
    'RF',
    'simple___2022_05_10___11_24_58___100K_V4_full_data_set',
    'weird_scores_hat_test_set.npy',
]))

loading from uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2022_05_10___11_24_58___100K_V4_full_data_set/weird_scores_hat_test_set.npy


In [9]:
# Saved t-SNE output of the large RF for the test set (columns 0 and 1 are plotted).
# The key is built from the configured s3_saves_dir_path instead of repeating the prefix literally.
Large_RF_sne = from_s3_npy(s3_client, bucket_name, '/'.join([
    s3_saves_dir_path,
    'RF',
    'simple___2022_05_10___11_24_58___100K_V4_full_data_set',
    'test_set_tsne.npy',
]))

loading from uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2022_05_10___11_24_58___100K_V4_full_data_set/test_set_tsne.npy


## Small RF

In [10]:
# Weird scores of the small RF on the test set.
# The key is built from the configured s3_saves_dir_path instead of repeating the prefix literally.
RF_weird_scores = from_s3_npy(s3_client, bucket_name, '/'.join([
    s3_saves_dir_path,
    'RF',
    'simple___2022_05_07___18_57_07___100K_V4_training_set',
    'weird_scores_hat_test_set.npy',
]))

loading from uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2022_05_07___18_57_07___100K_V4_training_set/weird_scores_hat_test_set.npy


In [11]:
# Saved t-SNE output of the small RF for the test set (columns 0 and 1 are plotted).
# The key is built from the configured s3_saves_dir_path instead of repeating the prefix literally.
RF_sne = from_s3_npy(s3_client, bucket_name, '/'.join([
    s3_saves_dir_path,
    'RF',
    'simple___2022_05_07___18_57_07___100K_V4_training_set',
    'test_set_tsne.npy',
]))

loading from uri: s3://tau-astro/almogh/workdir3/model_saves/RF/simple___2022_05_07___18_57_07___100K_V4_training_set/test_set_tsne.npy


## NN

In [12]:
# Weird scores and saved t-SNE output of the NN model for the test set.
# Keys are built from the configured s3_saves_dir_path instead of repeating the prefix literally.
NN_weird_scores = from_s3_npy(s3_client, bucket_name, '/'.join([
    s3_saves_dir_path,
    'NN',
    '100K_V4',
    'LongTrainV4___2022_05_09___07_29_04___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh',
    'test_set_weird_scores_truncated_distances.npy',
]))
NN_sne = from_s3_npy(s3_client, bucket_name, '/'.join([
    s3_saves_dir_path,
    'NN',
    '100K_V4',
    'LongTrainV4___2022_05_09___07_29_04___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh',
    'test_set_tsne_truncated_distances.npy',
]))

loading from uri: s3://tau-astro/almogh/workdir3/model_saves/NN/100K_V4/LongTrainV4___2022_05_09___07_29_04___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh/test_set_weird_scores_truncated_distances.npy
loading from uri: s3://tau-astro/almogh/workdir3/model_saves/NN/100K_V4/LongTrainV4___2022_05_09___07_29_04___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh/test_set_tsne_truncated_distances.npy


## NN + AWGN

In [13]:
# Weird scores and saved t-SNE output of the NN + AWGN (denoise) model for the test set.
# Keys are built from the configured s3_saves_dir_path instead of repeating the prefix literally.
NN_AWGN_weird_scores = from_s3_npy(s3_client, bucket_name, '/'.join([
    s3_saves_dir_path,
    'NN',
    '100K_V4',
    'LongTrainDenoiseV4___2022_05_09___07_28_03___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh',
    'test_set_weird_scores_truncated_distances.npy',
]))
NN_AWGN_sne = from_s3_npy(s3_client, bucket_name, '/'.join([
    s3_saves_dir_path,
    'NN',
    '100K_V4',
    'LongTrainDenoiseV4___2022_05_09___07_28_03___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh',
    'test_set_tsne_truncated_distances.npy',
]))

loading from uri: s3://tau-astro/almogh/workdir3/model_saves/NN/100K_V4/LongTrainDenoiseV4___2022_05_09___07_28_03___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh/test_set_weird_scores_truncated_distances.npy
loading from uri: s3://tau-astro/almogh/workdir3/model_saves/NN/100K_V4/LongTrainDenoiseV4___2022_05_09___07_28_03___Kernels_31_Filters_64_32_16_8_4_Hiddens_512_128_tanh/test_set_tsne_truncated_distances.npy


# Plot

## Definitions

In [14]:
# Imports
from datetime import datetime
import traceback
import holoviews as hv
from holoviews import opts
from holoviews.streams import Selection1D
from bokeh.models import HoverTool
from scipy import stats
import panel as pn
from holoviews.plotting.links import DataLink
hv.extension('bokeh')



In [15]:
# Full dataframe: for every model the two embedding columns, then every model's
# weird scores, then the snr values and a running index 0..len(gs)-1.
full_df = pd.DataFrame()

# embedding coordinates: columns 0 and 1 of each model's *_sne array
for _model, _embedding in (
    ('Large_RF', Large_RF_sne),
    ('RF', RF_sne),
    ('NN', NN_sne),
    ('NN_AWGN', NN_AWGN_sne),
):
    full_df[_model + '_feature_1'] = _embedding[:, 0]
    full_df[_model + '_feature_2'] = _embedding[:, 1]

# weird scores, in the same model order
for _model, _scores in (
    ('Large_RF', Large_RF_weird_scores),
    ('RF', RF_weird_scores),
    ('NN', NN_weird_scores),
    ('NN_AWGN', NN_AWGN_weird_scores),
):
    full_df[_model + '_weird_scores'] = _scores

# the loop helpers are not part of the notebook's state
del _model, _embedding, _scores

full_df['snr'] = snr
full_df['index'] = np.arange(len(gs))

In [66]:
# Demo of the zero padding applied to the histogram counts in get_50p_crossings.
# `hist` was never defined at notebook scope (the cell only worked through leftover kernel
# state), so the counts from the recorded output are spelled out to keep the cell runnable.
hist = np.array([207, 66, 14, 6, 2, 0, 0, 0, 0, 1])
np.pad(hist,(1,1))

array([  0, 207,  66,  14,   6,   2,   0,   0,   0,   0,   1,   0],
      dtype=int64)

In [67]:
def get_50p_crossings(x):
    """
    Return the outermost values at which the histogram of `x` crosses 50% of its peak.

    A 10-bin histogram of `x` is computed (numpy default), padded with one empty bin on
    each side, normalized by its maximum, and the positions where the normalized profile
    crosses 0.5 are located by linear interpolation between neighbouring bin centers.

    Parameters
    ----------
    x : array-like of numbers
        The sample (e.g. the flux values of the selected spectra at a single wavelength).
        Non-finite entries (NaN, +/-inf) are ignored; np.histogram would otherwise raise
        ValueError because the auto-detected range is not finite.

    Returns
    -------
    (low, high) : tuple of floats
        The smallest and the largest crossing point. (nan, nan) is returned when `x`
        contains no finite value.
    """
    x = np.asarray(x, dtype=float)
    x = x[np.isfinite(x)]
    if x.size == 0:
        # nothing to build a histogram from
        return np.nan, np.nan

    hist, edges = np.histogram(x)
    de = edges[1]-edges[0]

    # One empty bin on each side guarantees that the profile starts and ends below 50%,
    # so there is always at least one rising and one falling crossing.
    hist = np.pad(hist, (1,1))
    edges = np.concatenate(([edges[0]-de], edges, [edges[-1]+de]))

    # normalized profile, shifted so that the 50% level sits at zero
    hist = hist/hist.max() - 0.5

    flx = (edges[1:]+edges[:-1])/2  # bin centers
    df = flx[1]-flx[0]

    # indices z where the sign changes between bin z and bin z+1
    z = np.where(np.diff(np.sign(hist)))[0]
    # linear interpolation of the zero crossing between centers z and z+1
    cross = (abs(hist[z])/abs(hist[z]-hist[z+1]))*df+flx[z]
    return cross[0], cross[-1]

In [68]:
def points_dmap_callable_inner(src, color_src):
    """
    Build the selectable scatter plot of one model's two embedding columns.

    src: column-name prefix in full_df; the plot uses `<src>_feature_1` and
         `<src>_feature_2` as its key dimensions.
    color_src: value handed to the `color` option (the notebook passes the name of a
               full_df column holding weird scores).

    Returns the configured hv.Points element.
    """
    embedding_dims = [src+'_feature_1', src+'_feature_2']
    scatter = hv.Points(full_df, kdims=embedding_dims)
    # coloring
    scatter = scatter.opts(color=color_src, cmap='jet')
    # selection tools
    scatter = scatter.opts(tools=['tap','box_select','lasso_select'])
    # look of selected vs. non-selected points
    scatter = scatter.opts(selection_line_color='black', selection_alpha=0.7, nonselection_alpha=0.1)
    # sizing and colorbar
    scatter = scatter.opts(framewise=True, width=700, height=500, colorbar=True)
    return scatter

"""
def points_dmap_callable(color_src):
    # The callable function for the points DynamicMap.
    if color_src=='score': # "score" means each has its own score
        RF_points = points_dmap_callable_inner('RF', 'RF_weird_scores')
        NN_points = points_dmap_callable_inner('NN', 'NN_weird_scores')
        NN_AWGN_points = points_dmap_callable_inner('NN_AWGN', 'NN_AWGN_weird_scores')
    else: # otherwise, all have same score
        assert color_src in ('RF_weird_scores','NN_weird_scores','NN_AWGN_weird_scores'), 'invalid color source.'
        RF_points = points_dmap_callable_inner('RF', color_src)
        NN_points = points_dmap_callable_inner('NN', color_src)
        NN_AWGN_points = points_dmap_callable_inner('NN_AWGN', color_src)
    NN_dlink = DataLink(RF_points, NN_points)
    NN_AWGN_dlink = DataLink(RF_points, NN_AWGN_points)
    points_layout = (RF_points+NN_points+NN_AWGN_points)
    points_layout = (RF_points+NN_AWGN_points)
    return points_layout
"""

def spectra_dmap_callable(index):
    """
    The callable function for the spectra DynamicMap.

    Parameters
    ----------
    index : list of int
        Row indices of the currently selected points (delivered by the Selection1D
        stream attached to the DynamicMap). An empty list means "no selection".

    Returns
    -------
    A holoviews overlay (Curve * Spread) over `wl_grid`:
      * no selection  - a flat zero curve with an empty band,
      * one point     - that row of X, labelled with its snr and weird scores,
      * many points   - the NaN-ignoring mean of the selected rows, with a band whose
                        lower/upper bounds come from get_50p_crossings per column.

    Raises
    ------
    Any exception raised while building the plot is written (with its traceback) to
    debug.txt and then re-raised. Previously the exception was swallowed and the function
    failed afterwards with an unrelated UnboundLocalError on `flux`.

    Side effects
    ------------
    Writes index.npy (the last selection) and debug.txt (a trace of the last call) into
    local_work_dir_path; both are debugging aids.
    """
    np.save(os.path.join(local_work_dir_path,'index.npy'), index)
    with open(os.path.join(local_work_dir_path,'debug.txt'),'w') as f:
        f.write('in spectra_dmap_callable - '+datetime.now().strftime("%d/%m/%Y %H:%M:%S")+'\n')
        try:
            w = wl_grid
            if len(index)==0:
                f.write('len==0\n')
                # No Selection - flat zero curve, empty band
                x = np.zeros(shape=wl_grid.shape)
                label = 'No Selection'
                x_max_err = x
                x_min_err = x
            else:
                f.write('len!=0\n')
                x = np.nanmean(X[index], axis=0)
                if len(index)==1:
                    f.write('len==1\n')
                    # a single point - plotting its spectrum, no band
                    label = 'index=%s, snr=%f, RF score=%f, NN score=%f, NN+AWGN score=%f' % (index[0], snr[index[0]], RF_weird_scores[index[0]], NN_weird_scores[index[0]], NN_AWGN_weird_scores[index[0]])
                    x_max_err = np.zeros_like(x)
                    x_min_err = np.zeros_like(x)
                else:
                    f.write('len>1\n')
                    # Multiple points - plotting the average spectrum; the band spans the
                    # 50%-of-peak crossings of the per-column histograms
                    label = '%d points selected - plotting the average' % len(index)
                    cross = np.array([get_50p_crossings(X[index,i]) for i in range(X.shape[1])]).T
                    x_max_err = cross[1,:]-x
                    x_min_err = x-cross[0,:]

            # decimating max and min by 2 (for some reason, spread is not showing from over ~5000 points)
            f.write('x_max_err type = {0}\n'.format(str(type(x_max_err))))
            f.write('x_max_err shape = {0}\n'.format(str(x_max_err.shape)))
            D = 2
            w_spread = w[::D].reshape(-1)
            x_spread = x[::D].reshape(-1)
            x_max_err = x_max_err[::D].reshape(-1)
            x_min_err = x_min_err[::D].reshape(-1)
            assert len(w_spread)==len(x_spread)==len(x_max_err)==len(x_min_err), 'length must be equal! shapes are {0}, {1}, {2}, {3}.'.format(w_spread.shape, x_spread.shape,x_max_err.shape, x_min_err.shape)

            # full-resolution curve overlaid with the decimated error band
            flux = hv.Curve((w,x), kdims=['w'],vdims=['flux']).opts(color='black').opts(norm=dict(framewise=True)) * hv.Spread((w_spread,x_spread,x_min_err,x_max_err), kdims=['w'],vdims=['flux', 'yerrneg', 'yerrpos']).opts(fill_alpha=0.5, line_alpha=0).opts(norm=dict(framewise=True))
            flux = flux.opts(tools=['hover']).relabel(label).opts(width=800, height=300, show_grid=True)

        except Exception as e:
            # keep the trace in the debug file, then surface the real error to the caller
            f.write('exception!\n')
            f.write(str(e)+'\n')
            tb = traceback.format_exc()
            f.write(tb)
            raise

    return flux

# <<< All t-SNE maps are colored by the weird scores of the large RF >>>

## Interactive plot

In [70]:
# Every embedding is colored by the same full_df column
color_src = 'Large_RF_weird_scores'

# One scatter plot per model, created in the order they are laid out
Large_RF_points, RF_points, NN_points, NN_AWGN_points = [
    points_dmap_callable_inner(model_name, color_src)
    for model_name in ('Large_RF', 'RF', 'NN', 'NN_AWGN')
]

# Link the data of every other scatter plot to the large-RF plot (holoviews DataLink)
RF_dlink, NN_dlink, NN_AWGN_dlink = [
    DataLink(Large_RF_points, linked_points)
    for linked_points in (RF_points, NN_points, NN_AWGN_points)
]

# The selection made on the large-RF plot drives the spectrum panel
selection = Selection1D(source=Large_RF_points)
spectra_dmap = hv.DynamicMap(spectra_dmap_callable, kdims=[], streams=[selection])
spectra_dmap.opts(norm=dict(framewise=True))

# Building the full layout: the four scatter plots and the spectrum panel in one column
layout = (
    Large_RF_points
    + RF_points
    + NN_points
    + NN_AWGN_points
    + spectra_dmap
).opts(merge_tools=False)
layout.cols(1)