<a href="https://colab.research.google.com/github/aimalz/CASTORpz/blob/main/Stratifying_Information.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stratifying Information

## Preliminaries

In [None]:
# Colab-only setup: make Google Drive visible under /content/drive.
from google.colab import drive
from astropy.table import Table

drive.mount('/content/drive')

# Drive folder that holds the input FITS catalogs used throughout the notebook.
prepend = '/content/drive/MyDrive/Research/CASTORpz/'

# Sanity check that the Drive is readable: load one catalog and report its row count.
unp = Table.read(prepend + "unperturbed_mags.fits")
print(len(unp))

# Environment setup: install jax and jaxlib pinned to the same version (0.4.16).
# The commented-out lines below are earlier install attempts kept for reference.
# !pip install --upgrade "jax[cuda]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
!pip install jaxlib==0.4.16
!pip install jax==0.4.16

# jaxlib==0.3.22+cuda11.cudnn82

# !pip install jax[cuda]==0.4.17+cuda12.cudnn89 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

# !pip install "jax[cuda]"==0.4.16
# Print the CUDA compiler version available in this runtime.
!nvcc --version

# NOTE(review): astropy, pzflow and corner are installed without version pins,
# so a fresh runtime may resolve different versions than the one used originally.
!pip install astropy pzflow corner

from pzflow import Flow
import jax.numpy as jnp
import pandas as pd
import numpy as np
import pickle
import corner
import matplotlib.pyplot as plt
from astropy.table import Table
from pzflow import Flow, FlowEnsemble
from pzflow.distributions import Uniform
from pzflow.bijectors import Chain, StandardScaler, NeuralSplineCoupling, ColorTransform, InvSoftplus, RollingSplineCoupling, ShiftBounds

## Define Catalogs and Magnitudes

In [None]:
cd /content/drive/MyDrive/Research/CASTORpz

unp = Table.read(prepend+"unperturbed_mags.fits").to_pandas()#.set_index('ID')
mock = Table.read(prepend+"ext_phot.fits").to_pandas()#.set_index('ID')

# Columns that must be present (non-NaN, unmasked) for a galaxy to be kept:
# identifier, redshift, then the magnitude (and, where listed, magnitude-error)
# columns for each survey, in a fixed order.
names_phot = (
    ["ID", "photoz"]
    + [f"LSST_{band}_{suffix}" for band in ("g", "r", "i", "z") for suffix in ("MAG", "MAGERR")]
    + [f"castor_{band}_{suffix}" for band in ("uv", "u", "g") for suffix in ("MAG", "MAGERR")]
    + [f"Euclid_{band}_MAG" for band in ("VIS", "Y", "J", "H")]
    + [f"Roman_{band}_MAG" for band in ("106", "129", "158", "184")]
)



# Observing-strategy combinations that involve Roman; they are listed separately
# and appended after the non-Roman combinations.
Roman_os = ["Roman-only", "Roman+CASTOR", "Roman+Euclid+LSST+CASTOR", "Roman+LSST"]

# Full list of observing-strategy labels: non-Roman combinations first, then Roman ones.
available_os = [
    "CASTOR-only", "LSST-only", "Euclid-only",
    "LSST+CASTOR", "Euclid+CASTOR",
    "Euclid+LSST", "Euclid+LSST+CASTOR",
    *Roman_os,
]
print(available_os)

# Alias of the same list object.
names = available_os

In [None]:
# load data

# ask Bobby about this

def getTrueY(test_cat, mag_col_names, y_col_name):
    """Return one column of a catalog after dropping rows with missing photometry.

    A masked copy of ``test_cat`` is made, so the caller's table is not modified.
    For each column in ``mag_col_names`` in turn, NaN entries are added to that
    column's mask and every masked row is removed from the working copy.

    Parameters
    ----------
    test_cat : table-like accepted by ``astropy.table.Table``
        Input catalog.
    mag_col_names : iterable of str
        Columns that must be non-NaN and unmasked for a row to be kept.
    y_col_name : str
        Column to return from the filtered catalog.

    Returns
    -------
    The ``y_col_name`` column of the filtered catalog, with any remaining
    masked entries replaced via ``.filled()``.
    """
    catalog = Table(test_cat, masked=True, copy=True)
    for name in mag_col_names:
        column = catalog[name]
        # Treat NaNs as missing, in addition to anything already masked.
        column.mask = np.isnan(column.data) | column.mask
        # Keep only the rows where this column is present.
        keep = ~catalog[name].mask
        catalog = catalog[keep]

    return catalog[y_col_name].filled()

In [None]:
# Observing-strategy combinations analysed in the rest of the notebook.
available_os = [
    'LSST-only',
    'CASTOR-only',
    'Euclid+LSST',
    'LSST+CASTOR',
    'Roman+Euclid+LSST',
    'Roman+Euclid+LSST+CASTOR',
]

# Display label for each combination key.
# ('Roman+LSST+CASTOR' -> 'LSST+Roman+CASTOR' is not included yet: currently
# training, will be in Mar_experiments.)
os_combo_labels = dict([
    ('LSST-only', 'LSST'),
    ('CASTOR-only', 'CASTOR'),
    ('LSST+CASTOR', 'LSST+CASTOR'),
    ('Roman+LSST', 'LSST+Roman'),
    ('Roman+Euclid+LSST', 'LSST+Roman+Euclid'),
    ('Roman+Euclid+LSST+CASTOR', 'LSST+Roman+Euclid+CASTOR'),
])

In [None]:
# Attach the Euclid VIS magnitude from `unp` to the `mock` photometry, matching
# rows on 'ID' (default inner join), then convert back to an astropy Table.
mock_cat = mock.merge(unp[['ID', "Euclid_VIS_MAG"]], on='ID')
CASTOR_baseline = Table.from_pandas(mock_cat)

In [None]:
# put data in expected format for TLM

def _true_column(y_col_name):
    """Return `y_col_name` from CASTOR_baseline, filtered on all `names_phot` columns."""
    return getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name=y_col_name)


# Every column below goes through the same row filter (all of `names_phot`).
# The *_MAGERR columns are part of that filter but are not extracted here.

# LSST
LSST_g_mag = _true_column("LSST_g_MAG")
LSST_r_mag = _true_column("LSST_r_MAG")
LSST_i_mag = _true_column("LSST_i_MAG")
LSST_z_mag = _true_column("LSST_z_MAG")

# CASTOR
CASTOR_g_mag = _true_column("castor_g_MAG")
CASTOR_u_mag = _true_column("castor_u_MAG")
CASTOR_uv_mag = _true_column("castor_uv_MAG")

# Euclid
Euclid_VIS_mag = _true_column("Euclid_VIS_MAG")
Euclid_J_mag = _true_column("Euclid_J_MAG")
Euclid_H_mag = _true_column("Euclid_H_MAG")
Euclid_Y_mag = _true_column("Euclid_Y_MAG")

# Roman - only a subset of the Roman filters are available?
# (Roman_062, Roman_087, Roman_146 and Roman_213 are not extracted.)
Roman_106_mag = _true_column('Roman_106_MAG')
Roman_129_mag = _true_column('Roman_129_MAG')
Roman_158_mag = _true_column('Roman_158_MAG')
Roman_184_mag = _true_column('Roman_184_MAG')

# Identifier and redshift ('photoz' column) for the same filtered rows.
ID = _true_column("ID")
z_true = _true_column("photoz")

In [None]:
# Column groups shared between catalogs: one reference magnitude per survey
# plus adjacent-band colors. Column names are kept exactly as written
# (including the spaces in '129 - 106').
_lsst_cols = {
    'r': LSST_r_mag,
    'g-r': LSST_g_mag - LSST_r_mag,
    'r-i': LSST_r_mag - LSST_i_mag,
    'i-z': LSST_i_mag - LSST_z_mag,
}
_castor_color_cols = {
    'uv-u': CASTOR_uv_mag - CASTOR_u_mag,
    'u-g': CASTOR_u_mag - CASTOR_g_mag,
}
_castor_cols = {'g': CASTOR_g_mag, **_castor_color_cols}
_euclid_cols = {
    'v': Euclid_VIS_mag,
    'v-y': Euclid_VIS_mag - Euclid_Y_mag,
    'y-j': Euclid_Y_mag - Euclid_J_mag,
    'j-h': Euclid_J_mag - Euclid_H_mag,
}
_roman_cols = {
    '106': Roman_106_mag,
    '129 - 106': Roman_129_mag - Roman_106_mag,
    '158-129': Roman_158_mag - Roman_129_mag,
    '184-158': Roman_184_mag - Roman_158_mag,
}


def _make_catalog(*column_groups):
    """Build a DataFrame indexed by CASTOR_ID with z_true plus the given column groups, NaN rows dropped."""
    columns = {'CASTOR_ID': ID, 'z_true': z_true}
    for group in column_groups:
        columns.update(group)
    return pd.DataFrame(columns).set_index('CASTOR_ID').dropna()


# One catalog per observing-strategy combination. Note that 'LSST+CASTOR'
# carries only the CASTOR colors, not the CASTOR 'g' magnitude.
catalogs = {
    'LSST+CASTOR': _make_catalog(_lsst_cols, _castor_color_cols),
    'LSST-only': _make_catalog(_lsst_cols),
    'CASTOR-only': _make_catalog(_castor_cols),
    'Euclid-only': _make_catalog(_euclid_cols),
    'Euclid+CASTOR': _make_catalog(_castor_cols, _euclid_cols),
    'Euclid+LSST': _make_catalog(_lsst_cols, _euclid_cols),
    'Euclid+LSST+CASTOR': _make_catalog(_lsst_cols, _euclid_cols, _castor_cols),
    "Roman+Euclid+LSST+CASTOR": _make_catalog(_lsst_cols, _euclid_cols, _castor_cols, _roman_cols),
    "Roman+Euclid+LSST": _make_catalog(_lsst_cols, _euclid_cols, _roman_cols),
}

## Read Pre-trained Flows

In [None]:
# Observing-strategy combinations for which a pre-trained flow ensemble is loaded.
available_os = ['LSST-only',
                'CASTOR-only',
                'Euclid+LSST',
                'LSST+CASTOR',
                'Roman+Euclid+LSST',
                'Roman+Euclid+LSST+CASTOR']

# Load one ensemble per combination. The file names are relative, so they are
# resolved against the notebook's current working directory.
# NOTE(review): these are .pkl files, presumably pickles — only load files from a trusted source.
# The loop variable is named `os_combo` (not `os`) so it does not shadow the
# standard-library module name or leak into the notebook namespace.
flows = {
    os_combo: FlowEnsemble(file=f"Oct_flows_draftpzflow_ensemble_for_{os_combo}.pkl")
    for os_combo in available_os
}

# Per-galaxy log-probability from each ensemble, evaluated on the matching catalog.
just_tav = {
    os_combo: flows[os_combo].log_prob(catalogs[os_combo])
    for os_combo in available_os
}

## Dropping bad flows here (commented out until we have a chance to investigate the flow errors further)

In [None]:
# losses = {}
# for os in available_os:
#   with (open(f"losses_for_{os}.pkl", 'rb')) as openfile:
#     losses[os] = pd.read_pickle(openfile)

# flow_list = list(np.linspace(0, 29, 30).astype(int))
# bad_flows = {}

# for os in available_os:
#   bad_flows_list = []
#   for flow in flow_list:
#     if np.sum(np.isinf(np.array(losses[os]['losses'][f"Flow {flow}"]))) > 0:
#       bad_flows_list.append(flow)
#     bad_flows[os] = bad_flows_list

In [None]:
# just_tav_ensemble = {}
# for os in available_os:
#   just_tav_ensemble[os] = flows[os].log_prob(catalogs[os], returnEnsemble = True)

In [None]:
# new_tav = {}

# for os in available_os:
#   new_tav[os] = np.delete(just_tav_ensemble[os], bad_flows[os], axis = 1)

## Existing Attempts to Stratify by Information Metric

In [None]:
# Absolute per-galaxy change in the metric when Euclid or CASTOR is added to LSST.
delta_tav_Euclid = np.abs(just_tav['LSST-only'] - just_tav['Euclid+LSST'])
delta_tav_CASTOR = np.abs(just_tav['LSST-only'] - just_tav['LSST+CASTOR'])

# Orderings of the galaxies by that change.
# NOTE(review): the `[::-1]` orderings start with the LARGEST change, yet they
# are the ones named `least_improved_*`; the names look swapped. They are left
# unchanged here because other code may depend on them — confirm the intent.
least_improved_Euclid = np.argsort(delta_tav_Euclid)[::-1]
least_improved_CASTOR = np.argsort(delta_tav_CASTOR)[::-1]

most_improved_Euclid = np.argsort(delta_tav_Euclid)
most_improved_CASTOR = np.argsort(delta_tav_CASTOR)

# Overlaid, normalised histograms of the two changes on a common binning.
delta_tav_bins = np.linspace(0, 4, 100)
plt.hist(delta_tav_Euclid, alpha=0.25, label='Euclid + LSST', density=True, bins=delta_tav_bins)
# Fixed: this histogram was previously also labelled 'Euclid + LSST'.
plt.hist(delta_tav_CASTOR, alpha=0.25, label='LSST + CASTOR', density=True, bins=delta_tav_bins)

plt.legend()
plt.xlim(0, 4)
plt.xlabel(r'$\Delta$' + chr(0x05ea)) #r'$\mathbb{E}_{z, x_{phot}} \left[ q_\theta(z | x_{phot}) \right]$')
plt.yticks([])
plt.show()

In [None]:
# Rank galaxies by |TLM| once per combination, then take both ends of the
# ranking: the 1000 largest values ("worst") and the 1000 smallest ("best").
_rank_Euclid = np.argsort(np.abs(just_tav['Euclid+LSST']))
_rank_CASTOR = np.argsort(np.abs(just_tav['LSST+CASTOR']))

worst_galaxies_Euclid = _rank_Euclid[::-1][:1000]
worst_galaxies_CASTOR = _rank_CASTOR[::-1][:1000]

best_galaxies_Euclid = _rank_Euclid[:1000]
best_galaxies_CASTOR = _rank_CASTOR[:1000]

In [None]:
# Color-color scatter of the best and worst LSST+CASTOR galaxies, one panel per
# pair of adjacent colors in `colors_lsst`.
colors_lsst = ['g-r', 'r-i', 'i-z', 'uv-u', 'u-g']

fig, axes = plt.subplots(1, 4, figsize = (18, 4))
fig.suptitle('Best and Worst Galaxies in LSST+CASTOR (by absolute value of TLM)')

for x_color, y_color, ax in zip(colors_lsst[:-1], colors_lsst[1:], axes.reshape(-1)):
  x_vals = np.array(catalogs['LSST+CASTOR'][x_color])
  y_vals = np.array(catalogs['LSST+CASTOR'][y_color])

  # Fixed: the two series are the best and worst galaxies of the same
  # combination, so they are labelled as such (not 'LSST+CASTOR'/'LSST+Euclid').
  ax.scatter(x_vals[best_galaxies_CASTOR], y_vals[best_galaxies_CASTOR], label = 'best', marker = '.')
  ax.scatter(x_vals[worst_galaxies_CASTOR], y_vals[worst_galaxies_CASTOR], label = 'worst', marker = '+', alpha = 0.3)

  ax.set_xlabel(x_color)
  ax.set_ylabel(y_color)

  # Fixed: legend on this panel; plt.legend() only targets the current axes.
  ax.legend()

In [None]:
# Color-color scatter of the best and worst Euclid+LSST galaxies, one panel per
# pair of adjacent LSST colors.
colors_lsst = ['g-r', 'r-i', 'i-z']

fig, axes = plt.subplots(1, 2, figsize = (8, 4))
fig.suptitle('Best and Worst Galaxies in LSST+Euclid (by absolute value of TLM)')

for x_color, y_color, ax in zip(colors_lsst[:-1], colors_lsst[1:], axes.reshape(-1)):
  x_vals = np.array(catalogs['Euclid+LSST'][x_color])
  y_vals = np.array(catalogs['Euclid+LSST'][y_color])

  ax.scatter(x_vals[best_galaxies_Euclid], y_vals[best_galaxies_Euclid], label = 'best', marker = '.')
  ax.scatter(x_vals[worst_galaxies_Euclid], y_vals[worst_galaxies_Euclid], label = 'worst', marker = '+', alpha = 0.3)

  ax.set_xlabel(x_color)
  ax.set_ylabel(y_color)

  # Fixed: legend on this panel; plt.legend() only targets the current axes.
  ax.legend()

In [None]:
# Recompute the 1000 largest-|TLM| ("worst") and 1000 smallest-|TLM| ("best")
# galaxies for each combination from a single ranking per combination.
_rank_Euclid = np.argsort(np.abs(just_tav['Euclid+LSST']))
_rank_CASTOR = np.argsort(np.abs(just_tav['LSST+CASTOR']))

worst_galaxies_Euclid = _rank_Euclid[::-1][:1000]
worst_galaxies_CASTOR = _rank_CASTOR[::-1][:1000]

best_galaxies_Euclid = _rank_Euclid[:1000]
best_galaxies_CASTOR = _rank_CASTOR[:1000]

In [None]:
# The 1000 galaxies with the smallest |TLM| for each combination.
best_galaxies_Euclid, best_galaxies_CASTOR = (
    np.argsort(np.abs(just_tav[combo]))[:1000]
    for combo in ('Euclid+LSST', 'LSST+CASTOR')
)

In [None]:
# Number of galaxies that appear in both selections. The previous `==`
# comparison was element-wise, so it only counted galaxies that sat at the same
# rank in both lists; intersect1d counts shared members regardless of rank.
# NOTE(review): this treats a positional index as the same galaxy in both
# catalogs — confirm that the two catalogs have identical row order.
print(np.intersect1d(worst_galaxies_Euclid, worst_galaxies_CASTOR).size)
print(np.intersect1d(best_galaxies_Euclid, best_galaxies_CASTOR).size)

In [None]:
# Compare where the best LSST+CASTOR galaxies and the best Euclid+LSST galaxies
# fall in color-color space, one panel per pair of adjacent colors.
# NOTE(review): the Euclid-derived indices are applied to the LSST+CASTOR
# catalog, which assumes both catalogs share the same row order — confirm.
colors_lsst = ['g-r', 'r-i', 'i-z', 'uv-u', 'u-g']

fig, axes = plt.subplots(1, 4, figsize = (18, 4))
fig.suptitle('Best Galaxies in CASTOR and Euclid (by absolute value of TLM)')

for x_color, y_color, ax in zip(colors_lsst[:-1], colors_lsst[1:], axes.reshape(-1)):
  x_vals = np.array(catalogs['LSST+CASTOR'][x_color])
  y_vals = np.array(catalogs['LSST+CASTOR'][y_color])

  ax.scatter(x_vals[best_galaxies_CASTOR], y_vals[best_galaxies_CASTOR], label = 'LSST+CASTOR', marker = '.')
  ax.scatter(x_vals[best_galaxies_Euclid], y_vals[best_galaxies_Euclid], label = 'LSST+Euclid', marker = '+', alpha = 0.3)

  ax.set_xlabel(x_color)
  ax.set_ylabel(y_color)

  # Fixed: legend on this panel; plt.legend() only targets the current axes.
  ax.legend()

In [None]:
# Compare where the worst LSST+CASTOR galaxies and the worst Euclid+LSST
# galaxies fall in color-color space, one panel per pair of adjacent colors.
# NOTE(review): the Euclid-derived indices are applied to the LSST+CASTOR
# catalog, which assumes both catalogs share the same row order — confirm.
colors_lsst = ['g-r', 'r-i', 'i-z', 'uv-u', 'u-g']

fig, axes = plt.subplots(1, 4, figsize = (18, 4))
fig.suptitle('Worst Galaxies in CASTOR and Euclid (by absolute value of TLM)')

for x_color, y_color, ax in zip(colors_lsst[:-1], colors_lsst[1:], axes.reshape(-1)):
  x_vals = np.array(catalogs['LSST+CASTOR'][x_color])
  y_vals = np.array(catalogs['LSST+CASTOR'][y_color])

  ax.scatter(x_vals[worst_galaxies_CASTOR], y_vals[worst_galaxies_CASTOR], label = 'LSST+CASTOR', marker = '.')
  ax.scatter(x_vals[worst_galaxies_Euclid], y_vals[worst_galaxies_Euclid], label = 'LSST+Euclid', marker = '+', alpha = 0.3)

  ax.set_xlabel(x_color)
  ax.set_ylabel(y_color)

  # Fixed: legend on this panel; plt.legend() only targets the current axes.
  ax.legend()

In [None]:
# Rank galaxies by the absolute change between LSST-only and LSST+CASTOR, then
# keep the 1000 largest changes ("worst") and the 1000 smallest ("best").
_rank_delta = np.argsort(np.abs(just_tav['LSST-only'] - just_tav['LSST+CASTOR']))
worst_galaxies = _rank_delta[::-1][:1000]
best_galaxies = _rank_delta[:1000]

In [None]:
# Color-color scatter of the galaxies with the smallest ("best") and largest
# ("worst") absolute change between LSST-only and LSST+CASTOR.
colors_lsst = ['g-r', 'r-i', 'i-z', 'uv-u', 'u-g']

fig, axes = plt.subplots(1, 4, figsize = (18, 4))
# Fixed: the previous title was copied from the CASTOR-vs-Euclid figure and did
# not describe this selection.
fig.suptitle('Best and Worst Galaxies in LSST+CASTOR (by absolute change in TLM relative to LSST-only)')

for x_color, y_color, ax in zip(colors_lsst[:-1], colors_lsst[1:], axes.reshape(-1)):
  x_vals = np.array(catalogs['LSST+CASTOR'][x_color])
  y_vals = np.array(catalogs['LSST+CASTOR'][y_color])

  ax.scatter(x_vals[best_galaxies], y_vals[best_galaxies], label = 'best', marker = '.')
  ax.scatter(x_vals[worst_galaxies], y_vals[worst_galaxies], label = 'worst', marker = '+', alpha = 0.3)

  ax.set_xlabel(x_color)
  ax.set_ylabel(y_color)

  # Fixed: legend on this panel; plt.legend() only targets the current axes.
  ax.legend()


## Attempts to Stratify by Clustering

### DBSCAN seems to work better than the t-SNE experiments, so the t-SNE work is left out here; it might be worth returning to.

In [None]:
# attempts to stratify by clustering

from sklearn.datasets import make_blobs

# Synthetic three-blob toy data set with a fixed seed.
# NOTE(review): X and labels_true are not used by the cells visible here.
centers = [[1, 1], [-1, -1], [1, -1]]

X, labels_true = make_blobs(n_samples=750,
                            centers=centers,
                            cluster_std=0.4,
                            random_state=0)

In [None]:
# (g-r, r-i) coordinates of the worst Euclid+LSST galaxies, one row per galaxy.
_euclid_lsst = catalogs['Euclid+LSST']
worst_Euclid = np.array([
    np.array(_euclid_lsst[color])[worst_galaxies_Euclid]
    for color in ('g-r', 'r-i')
]).T

plt.scatter(worst_Euclid[:,0], worst_Euclid[:,1])

In [None]:
from sklearn import metrics
from sklearn.cluster import DBSCAN

# Cluster the worst Euclid+LSST galaxies in the (g-r, r-i) plane.
db = DBSCAN(eps=0.06, min_samples=10)
db.fit(worst_Euclid)
labels = db.labels_

# The label -1 is treated as noise: it is counted separately and is not
# included in the cluster count.
n_noise_ = int(np.sum(labels == -1))
n_clusters_ = len(set(labels) - {-1})

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

In [None]:
# Plot each DBSCAN cluster in its own color; core samples get large markers,
# the remaining members of the cluster get small ones.
unique_labels = set(labels)

# Boolean mask that is True at the indices DBSCAN reports as core samples.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

colors = [plt.cm.Spectral(fraction) for fraction in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = labels == k

    # Core samples first (markersize 14), then non-core members (markersize 6).
    for marker_size, subset_mask in ((14, core_samples_mask), (6, ~core_samples_mask)):
        xy = worst_Euclid[class_member_mask & subset_mask]
        plt.plot(
            xy[:, 0],
            xy[:, 1],
            "o",
            markerfacecolor=tuple(col),
            markeredgecolor="k",
            markersize=marker_size,
        )

plt.title(f"Estimated number of clusters: {n_clusters_}")
plt.show()

## Attempts to Stratify by Quantiles

In [None]:
import seaborn as sns

import scipy.stats

In [None]:
# Absolute change relative to LSST-only when adding Euclid (column 0) or
# CASTOR (column 1), one row per galaxy.
delta_tav_array = np.array([np.abs(just_tav['Euclid+LSST'] - just_tav['LSST-only']), np.abs(just_tav['LSST+CASTOR'] - just_tav['LSST-only'])]).T

tav_df = pd.DataFrame(delta_tav_array, columns = ['Euclid', 'CASTOR'])

# Plot a fixed-size random subsample of the galaxies. The sample size is capped
# at the number of rows (sampling more rows than exist raises), and the seed is
# fixed so the figure is reproducible.
n_kde_points = min(30000, len(tav_df))
joint_grid = sns.jointplot(data=tav_df.sample(n_kde_points, random_state=0), x="Euclid", y="CASTOR",  kind="kde")

# Set limits on the joint axes explicitly rather than via pyplot's current axes.
joint_grid.ax_joint.set_xlim(0, 4)
joint_grid.ax_joint.set_ylim(0, 4)
plt.show()

In [None]:
# Signed change from LSST-only to LSST+CASTOR, paired with the CASTOR uv-u color.
delta_tav_array = np.array([- just_tav['LSST-only'] + just_tav['LSST+CASTOR'], catalogs['LSST+CASTOR']['uv-u']]).T

tav_df_CASTOR = pd.DataFrame(delta_tav_array, columns = ['delta tav', 'uv-u'])

# Fixed: plot the frame built above. The previous call referenced
# `tav_df_Euclid`, which is not defined at this point in the notebook.
sns.jointplot(data=tav_df_CASTOR, x='uv-u', y="delta tav", kind = 'kde')
plt.show()

In [None]:
# 100 evenly spaced probabilities from 0 to 1 (inclusive) for the quantile plots.
quants = np.linspace(0, 1, num=100)
print(quants)

In [None]:
# Quantile-quantile comparison of the signed change (LSST+CASTOR minus
# LSST-only) between galaxies with z_true >= 1.5 and z_true < 1.5.
delta_tav_array = np.array([- just_tav['LSST-only'] + just_tav['LSST+CASTOR'], catalogs['LSST+CASTOR']['u-g'], catalogs['LSST+CASTOR']['g-r'], catalogs['LSST+CASTOR']['z_true']]).T

tav_df_CASTOR = pd.DataFrame(delta_tav_array, columns = ['delta tav', 'u-g', 'g-r', 'z_true'])

tav_df_CASTOR_z_ge_15 = tav_df_CASTOR[tav_df_CASTOR['z_true'] >= 1.5]
tav_df_CASTOR_z_less_15 = tav_df_CASTOR[tav_df_CASTOR['z_true'] < 1.5]


# Quantiles of each subsample at the probabilities in `quants`.
quants_ge_15 = scipy.stats.mstats.mquantiles(tav_df_CASTOR_z_ge_15['delta tav'], quants)
quants_less_15 = scipy.stats.mstats.mquantiles(tav_df_CASTOR_z_less_15['delta tav'], quants)

plt.scatter(quants_less_15, quants_ge_15)

# y = x reference line; points on it mean the two distributions agree.
x = np.linspace(np.min(quants_ge_15), np.max(quants_ge_15), 10)
y = x

plt.plot(x, y)
plt.xlim(0, 4)
plt.ylim(0, 4)

# Raw strings: '\D' in a normal string literal is an invalid escape sequence.
# The rendered text is unchanged.
plt.title(r'quantile-quantile plot of $\Delta$TLM, z > 1.5 vs z < 1.5')
plt.xlabel(r'$\Delta$TLM z < 1.5')
plt.ylabel(r'$\Delta$TLM z > 1.5')


plt.grid()

In [None]:
# Quantile-quantile comparison of the g-r color between galaxies with
# z_true >= 1.5 and z_true < 1.5.
_lsst_castor = catalogs['LSST+CASTOR']
delta_tav_array = np.array([
    just_tav['LSST+CASTOR'] - just_tav['LSST-only'],
    _lsst_castor['u-g'],
    _lsst_castor['g-r'],
    _lsst_castor['z_true'],
]).T

tav_df_CASTOR = pd.DataFrame(delta_tav_array, columns = ['delta tav', 'u-g', 'g-r', 'z_true'])

# Split at the pivot redshift.
_is_high_z = tav_df_CASTOR['z_true'] >= 1.5
_is_low_z = tav_df_CASTOR['z_true'] < 1.5
tav_df_CASTOR_z_ge_15 = tav_df_CASTOR[_is_high_z]
tav_df_CASTOR_z_less_15 = tav_df_CASTOR[_is_low_z]

# Quantiles of g-r in each redshift subsample.
quants_ge_15 = scipy.stats.mstats.mquantiles(tav_df_CASTOR_z_ge_15['g-r'], prob=quants)
quants_less_15 = scipy.stats.mstats.mquantiles(tav_df_CASTOR_z_less_15['g-r'], prob=quants)

plt.scatter(quants_less_15, quants_ge_15)

# y = x reference line over the range of the high-redshift quantiles.
x = np.linspace(quants_ge_15.min(), quants_ge_15.max(), 10)
y = x
plt.plot(x, y)

# NOTE(review): limits of 0-4 hide any negative colors — confirm this is intended.
plt.xlim(0, 4)
plt.ylim(0, 4)
plt.grid()

plt.title('quantile-quantile plot of g-r, z > 1.5 vs z < 1.5')
plt.xlabel('g-r z < 1.5')
plt.ylabel('g-r z > 1.5')

In [None]:
# Quantile-quantile comparison of the u-g color between galaxies with
# z_true >= 1.5 and z_true < 1.5.
_lsst_castor = catalogs['LSST+CASTOR']
delta_tav_array = np.array([
    just_tav['LSST+CASTOR'] - just_tav['LSST-only'],
    _lsst_castor['u-g'],
    _lsst_castor['g-r'],
    _lsst_castor['z_true'],
]).T

tav_df_CASTOR = pd.DataFrame(delta_tav_array, columns = ['delta tav', 'u-g', 'g-r', 'z_true'])

# Split at the pivot redshift.
_is_high_z = tav_df_CASTOR['z_true'] >= 1.5
_is_low_z = tav_df_CASTOR['z_true'] < 1.5
tav_df_CASTOR_z_ge_15 = tav_df_CASTOR[_is_high_z]
tav_df_CASTOR_z_less_15 = tav_df_CASTOR[_is_low_z]

# Quantiles of u-g in each redshift subsample.
quants_ge_15 = scipy.stats.mstats.mquantiles(tav_df_CASTOR_z_ge_15['u-g'], prob=quants)
quants_less_15 = scipy.stats.mstats.mquantiles(tav_df_CASTOR_z_less_15['u-g'], prob=quants)

plt.scatter(quants_less_15, quants_ge_15)

# y = x reference line over the range of the high-redshift quantiles.
x = np.linspace(quants_ge_15.min(), quants_ge_15.max(), 10)
y = x
plt.plot(x, y)

# NOTE(review): limits of 0-4 hide any negative colors — confirm this is intended.
plt.xlim(0, 4)
plt.ylim(0, 4)

plt.title('quantile-quantile plot of u-g, z > 1.5 vs z < 1.5')
plt.grid()

plt.xlabel('u-g z < 1.5')
plt.ylabel('u-g z > 1.5')

In [None]:
# Quantile-quantile comparison of the metric itself (not a difference) between
# LSST-only and LSST+CASTOR. The 'delta tav' column name is kept as-is even
# though it holds the per-combination value here.
_lsst_castor = catalogs['LSST+CASTOR']
_shared_columns = [_lsst_castor['u-g'], _lsst_castor['g-r'], _lsst_castor['z_true']]
_column_names = ['delta tav', 'u-g', 'g-r', 'z_true']

delta_tav_array_abs_LSST = np.array([just_tav['LSST-only'], *_shared_columns]).T
delta_tav_array_CASTOR = np.array([just_tav['LSST+CASTOR'], *_shared_columns]).T

tav_df_abs_LSST = pd.DataFrame(delta_tav_array_abs_LSST, columns = _column_names)
tav_df_abs_CASTOR = pd.DataFrame(delta_tav_array_CASTOR, columns = _column_names)

# Quantiles of each distribution at the probabilities in `quants`
# (no redshift split is applied in this comparison).
quants_LSST = scipy.stats.mstats.mquantiles(tav_df_abs_LSST['delta tav'], prob=quants)
quants_CASTOR = scipy.stats.mstats.mquantiles(tav_df_abs_CASTOR['delta tav'], prob=quants)

plt.scatter(quants_LSST, quants_CASTOR)

# y = x reference line from the smallest LSST quantile to the largest CASTOR one.
x = np.linspace(quants_LSST.min(), quants_CASTOR.max(), 10)
y = x
plt.plot(x, y)

plt.xlim(0, 4)
plt.ylim(0, 4)

plt.title('quantile-quantile plot of TLM, LSST vs LSST+ CASTOR')
plt.xlabel('LSST')
plt.ylabel('LSST + CASTOR')

plt.grid()

In [None]:
# Distribution of the signed change relative to LSST-only, for CASTOR and Euclid.
# `tav_df_CASTOR` comes from the preceding cells. `tav_df_Euclid` was referenced
# here but never defined anywhere in the notebook, so it is built now with the
# same sign convention as the CASTOR frame (combination minus LSST-only).
# NOTE(review): confirm this is the intended definition of the Euclid frame.
tav_df_Euclid = pd.DataFrame({
    'delta tav': np.asarray(just_tav['Euclid+LSST'] - just_tav['LSST-only']),
})

ax = sns.violinplot(x=tav_df_CASTOR["delta tav"])
ax.set_title('LSST+CASTOR vs LSST-only')
plt.show()
ax = sns.boxenplot(x=tav_df_CASTOR["delta tav"])
ax.set_title('LSST+CASTOR vs LSST-only')
plt.show()
ax = sns.violinplot(x=tav_df_Euclid["delta tav"])
ax.set_title('Euclid+LSST vs LSST-only')
plt.show()
ax = sns.boxenplot(x=tav_df_Euclid["delta tav"])
ax.set_title('Euclid+LSST vs LSST-only')
plt.show()

In [None]:
# split by TLM value

# u-g vs g-r density for galaxies whose signed change (LSST+CASTOR minus
# LSST-only) is above 2, and separately for those below 2.
delta_tav_array = np.array([- just_tav['LSST-only'] + just_tav['LSST+CASTOR'], catalogs['LSST+CASTOR']['u-g'], catalogs['LSST+CASTOR']['g-r'], catalogs['LSST+CASTOR']['z_true']]).T

tav_df_CASTOR = pd.DataFrame(delta_tav_array, columns = ['delta tav', 'u-g', 'g-r', 'z_true'])

# `data` is passed by keyword, matching the notebook's other jointplot call,
# and the limits are set on the joint axes directly instead of relying on
# pyplot's current axes.
high_delta_grid = sns.jointplot(data=tav_df_CASTOR[tav_df_CASTOR['delta tav']> 2], x='u-g', y="g-r", kind = 'kde')
high_delta_grid.ax_joint.set_ylim(-1, 3)
high_delta_grid.ax_joint.set_xlim(-1, 3)

low_delta_grid = sns.jointplot(data=tav_df_CASTOR[tav_df_CASTOR['delta tav']< 2], x='u-g', y="g-r", kind = 'kde')
low_delta_grid.ax_joint.set_ylim(-1, 3)
low_delta_grid.ax_joint.set_xlim(-1, 3)

In [None]:
# adding some representative points

# This cell previously used `tav_df_CASTOR_z_cut` before the cell that defines
# it, so it failed on a top-to-bottom run. The same z_true > 1.5 cut is
# therefore applied here, to the `tav_df_CASTOR` frame built in the cell above.
tav_df_CASTOR_z_cut = tav_df_CASTOR[tav_df_CASTOR['z_true'] > 1.5]

# The 15 galaxies with the largest 'delta tav' among those above 2, selected
# once so that `gr` and `ug` always refer to the same rows.
_top_delta = (
    tav_df_CASTOR_z_cut[tav_df_CASTOR_z_cut['delta tav'] > 2]
    .sort_values(by = ['delta tav'], ascending = False)[0:15]
)
gr = _top_delta['g-r']
ug = _top_delta['u-g']

In [None]:
# split by "pivot" redshift

# Same split as above (signed change above / below 2), restricted to galaxies
# with z_true > 1.5.
delta_tav_array = np.array([- just_tav['LSST-only'] + just_tav['LSST+CASTOR'], catalogs['LSST+CASTOR']['u-g'], catalogs['LSST+CASTOR']['g-r'], catalogs['LSST+CASTOR']['z_true']]).T

tav_df_CASTOR = pd.DataFrame(delta_tav_array, columns = ['delta tav', 'u-g', 'g-r', 'z_true'])
tav_df_CASTOR_z_cut = tav_df_CASTOR[tav_df_CASTOR['z_true']> 1.5]

# `data` is passed by keyword, matching the notebook's other jointplot call,
# and all drawing targets the joint axes explicitly instead of pyplot's
# current axes.
high_delta_grid = sns.jointplot(data=tav_df_CASTOR_z_cut[tav_df_CASTOR_z_cut['delta tav']> 2], x='u-g', y="g-r", kind = 'kde')

# Overlay the representative points (`ug`, `gr`) computed in the previous cell.
high_delta_grid.ax_joint.scatter(ug, gr)

high_delta_grid.ax_joint.set_ylim(-1, 3)
high_delta_grid.ax_joint.set_xlim(-1.5, 5.5)

low_delta_grid = sns.jointplot(data=tav_df_CASTOR_z_cut[tav_df_CASTOR_z_cut['delta tav']< 2], x='u-g', y="g-r", kind = 'kde')

low_delta_grid.ax_joint.set_ylim(-1, 3)
low_delta_grid.ax_joint.set_xlim(-1.5, 5.5)