<a href="https://colab.research.google.com/github/bscot/Broadband_tomography_with_CASTOR_and_SPEHREx/blob/main/TheLastMetric.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from pzflow import Flow
import jax.numpy as jnp
import pandas as pd 
import numpy as np

from astropy.table import Table

from pzflow import Flow, FlowEnsemble
from pzflow.distributions import Uniform
from pzflow.bijectors import Chain, StandardScaler, NeuralSplineCoupling


# load data

def getTrueY(test_cat, mag_col_names, y_col_name):
    """Return one column of a catalogue after dropping incomplete rows.

    A row is dropped when, in any column listed in ``mag_col_names``, its
    value is NaN or already masked.

    Parameters
    ----------
    test_cat : table-like
        Anything accepted by ``astropy.table.Table``. It is copied, so the
        caller's table is not modified.
    mag_col_names : iterable of str
        Columns that must all be valid for a row to be kept.
    y_col_name : str
        Name of the column to return.

    Returns
    -------
    The ``y_col_name`` column restricted to the surviving rows, as returned
    by its ``filled()`` method.
    """
    test_cat = Table(test_cat, masked=True, copy=True)

    # Build a single row mask from the raw values and the raw masks, then
    # filter once.  Working on the unmasked data keeps the result independent
    # of how masked-array arithmetic treats entries that are already masked,
    # and avoids re-slicing the whole table for every column.
    bad_rows = np.zeros(len(test_cat), dtype=bool)
    for col in mag_col_names:
        column = test_cat[col]
        bad_rows |= np.isnan(np.ma.getdata(column))
        bad_rows |= np.ma.getmaskarray(column)

    true_y = test_cat[~bad_rows][y_col_name]
    return true_y.filled()

# Read the unperturbed magnitudes and the mock photometry, then keep only the
# objects inside the Euclid VIS flux limit.
# NOTE(review): the cut is computed on `unp` but applied to `mock`, so the two
# tables are presumably row-aligned -- confirm.
EUCLID_VIS_MAG_LIMIT = 24.5

unp = Table.read("unperturbed_mags.fits")
mock = Table.read("ext_phot.fits")
mask = unp["Euclid_VIS_MAG"] <= EUCLID_VIS_MAG_LIMIT
CASTOR_baseline = mock[mask]

# Columns that must be valid for an object to be used: identifier, redshift,
# and a magnitude / magnitude-error pair for every band.
names_phot = [
    "ID", "photoz",
    "LSST_g_MAG", "LSST_g_MAGERR",
    "LSST_r_MAG", "LSST_r_MAGERR",
    "LSST_i_MAG", "LSST_i_MAGERR",
    "LSST_z_MAG", "LSST_z_MAGERR",
    "castor_uv_MAG", "castor_uv_MAGERR",
    "castor_u_MAG", "castor_u_MAGERR",
    "castor_g_MAG", "castor_g_MAGERR",
]

# A single configuration for now; labels and plot colours are keyed by it.
# Spare colours for further entries: "plum", "cornflowerblue", "#2ca02c",
# "gold", "tomato".
available_os = ["baseline"]
names = ["baseline"]
colors = ["k"]

os_names = {key: label for key, label in zip(available_os, names)}
os_colors = {key: colour for key, colour in zip(available_os, colors)}

# put data in expected format for TLM 

def _clean_column(column_name):
    """Fetch one column of CASTOR_baseline, keeping rows valid in every names_phot column."""
    return getTrueY(test_cat=CASTOR_baseline,
                    mag_col_names=names_phot,
                    y_col_name=column_name)


# Every call applies the same row selection, so the arrays stay aligned.

# LSST magnitudes and their errors
LSST_g_mag = _clean_column("LSST_g_MAG")
LSST_r_mag = _clean_column("LSST_r_MAG")
LSST_i_mag = _clean_column("LSST_i_MAG")
LSST_z_mag = _clean_column("LSST_z_MAG")

LSST_g_mag_ERR = _clean_column("LSST_g_MAGERR")
LSST_r_mag_ERR = _clean_column("LSST_r_MAGERR")
LSST_i_mag_ERR = _clean_column("LSST_i_MAGERR")
LSST_z_mag_ERR = _clean_column("LSST_z_MAGERR")

# CASTOR magnitudes and their errors
CASTOR_g_mag = _clean_column("castor_g_MAG")
CASTOR_u_mag = _clean_column("castor_u_MAG")
CASTOR_uv_mag = _clean_column("castor_uv_MAG")

CASTOR_g_mag_ERR = _clean_column("castor_g_MAGERR")
CASTOR_u_mag_ERR = _clean_column("castor_u_MAGERR")
CASTOR_uv_mag_ERR = _clean_column("castor_uv_MAGERR")

# object identifier and the redshift used as the regression target
ID = _clean_column("ID")
z_true = _clean_column("photoz")

#df = pd.DataFrame({'CASTOR_ID': data_array_sorted_CASTOR_wID[:,0] , 'error': data_array_sorted_CASTOR_wID[:,1], 
#                   'g-r': data_array_sorted_CASTOR_wID[:,2] - data_array_sorted_CASTOR_wID[:,3], 
#                   'r-i': data_array_sorted_CASTOR_wID[:,3] - data_array_sorted_CASTOR_wID[:,4], 
#                   'i-z': data_array_sorted_CASTOR_wID[:,4] - data_array_sorted_CASTOR_wID[:,5], 
#                   'true_z': data_array_sorted_CASTOR_wID[:,6]})

# One DataFrame per entry of available_os, holding the identifier, the
# redshift, the r magnitude and five colours.  (An earlier draft stored the
# raw magnitudes and their errors instead of colours.)
catalogs = {}
for os in available_os:

    # this will need to change to accommodate multiple catalogs,
    # e.g. LSST only or LSST + CASTOR: right now every entry is built from
    # the same arrays
    columns = {
        'CASTOR_ID': ID,
        'z_true': z_true,
        'r': LSST_r_mag,
        'g-r': LSST_g_mag - LSST_r_mag,
        'r-i': LSST_r_mag - LSST_i_mag,
        'i-z': LSST_i_mag - LSST_z_mag,
        'uv-u': CASTOR_uv_mag - CASTOR_u_mag,
        'u-g': CASTOR_u_mag - CASTOR_g_mag,
    }
    cat = pd.DataFrame(columns)

    # drop any row that still has a missing value
    catalogs[os] = cat.dropna()

# The flow models z_true (the column that is sampled and transformed)
# conditioned on the r magnitude and five colours.  These names must match
# the catalogue's DataFrame columns exactly; the colour column is "uv-u",
# without spaces.
data_columns = ["z_true"]
conditional_columns = ["r", "u-g", "g-r", "r-i", "i-z", "uv-u"] # different colors than LSST

# first I create a bijector chain
# the first bijection is a standard scaler - but I'm not actually using it for standard scaling
#     I set the mean and std so that it maps the redshift range (0, 3.2) onto (-5, 5), which is
#     the domain of the NeuralSplineCoupling
# the second bijection is a NeuralSplineCoupling. The number of conditions is taken
#     from the column list so the two cannot drift apart
bijector = Chain(
    StandardScaler(np.atleast_1d(1.6), np.atleast_1d(0.32)),
    NeuralSplineCoupling(n_conditions=len(conditional_columns))
)

# I set the latent distribution to a Uniform over (-5, 5)
# this range was chosen to match the NeuralSplineCoupling domain
# I chose a Uniform since all of the redshifts are drawn from a compact domain
# NOTE(review): this is the (input_dim, B) call form, i.e. presumably one
# dimension on (-5, 5); other cells in this file use Uniform((-5, 5)) --
# confirm which form the installed pzflow expects.
latent = Uniform(1, 5)

# create a dictionary that will hold all the ensembles
ensembles = dict()

# create the baseline flows
for os in available_os:

    # save some info with the flow
    info = f"Models z_true conditioned on galaxy colors and r mag from os {os}. K=16"

    # instantiate and save the flow; each ensemble gets its own copy of the
    # column lists
    flowEns = FlowEnsemble(data_columns = list(data_columns),
                           conditional_columns = list(conditional_columns),
                           bijector = bijector,
                           latent = latent,
                           info = info,
                           N = 10)

    ensembles[os] = flowEns
    
    
    
    

import pickle
from pathlib import Path

# NOTE(review): `adam` is not imported anywhere in the visible part of this
# file.  Its `step_size=` keyword looks like the Adam optimizer from JAX's
# `optimizers` module -- confirm which import the installed pzflow expects.

# Both the ensemble and the losses are written into this folder; create it up
# front so the writes below cannot fail on a missing directory.
Path("trained_flows").mkdir(parents=True, exist_ok=True)

for os, ens in ensembles.items():

    # get the data and make an 80/20 train and test set
    # NOTE(review): the split is unseeded, so train/test membership changes
    # between runs even though the training seeds below are fixed.
    cat = catalogs[os]
    cat_train = cat.sample(frac = 0.8)
    cat_test = cat.drop(cat_train.index)

    # train the flow on the given learning rate schedule:
    # three consecutive stages with decreasing step size
    loss1 = ens.train(cat_train, sample_errs=True,
                       optimizer = adam(step_size = 1e-3),
                       epochs = 100, seed = 123)
    loss2 = ens.train(cat_train, sample_errs=True,
                       optimizer = adam(step_size = 2e-4),
                       epochs = 100, seed = 312)
    loss3 = ens.train(cat_train, sample_errs=True,
                       optimizer = adam(step_size = 1e-4),
                       epochs = 50, seed = 231)

    # concatenate the three stages into one loss history per flow
    losses = {fname : # for each flow trained in the ensemble...
                  [float(loss) # save the list of training losses
                   for lossDict in [loss1, loss2, loss3]
                   for loss in lossDict[fname]]
              for fname in loss1}

    # print the train and test loss (mean negative log-probability)
    train_loss = -np.mean(ens.log_prob(cat_train))
    test_loss = -np.mean(ens.log_prob(cat_test))
    print(os, train_loss, test_loss)

    # save the ensemble
    ens.save(f"trained_flows/pzflow_ensemble_for_{os}.pkl")
    # and the losses
    with open(f"trained_flows/losses_for_{os}.pkl", "wb") as file:
        pickle.dump({"losses": losses,
                     "train loss": train_loss,
                     "test loss": test_loss},
                    file)


SyntaxError: ignored

dependencies: pzflow, pandas, jax, numpy, astropy, matplotlib

In [3]:
from pzflow import Flow
import jax.numpy as jnp
import pandas as pd 

import matplotlib.pyplot as plt

ModuleNotFoundError: ignored

In [None]:
from astropy.table import Table

## CASTOR COSMOS synthetic catalogs

In [None]:
# load data

# Read the unperturbed magnitudes and the mock photometry.
unp = Table.read("unperturbed_mags.fits")
mock = Table.read("ext_phot.fits")
# NOTE(review): the mask comes from `unp` but is applied to `mock`, so the two
# tables are presumably row-aligned -- confirm.
mask = unp["Euclid_VIS_MAG"] <= 24.5

CASTOR_baseline = mock[mask] #ignoring training data outside the Euclid flux limit

# Columns that must be valid for an object to be kept.  Every entry needs its
# own comma: two adjacent string literals are silently joined into one name.
names_phot = ["ID", "photoz",
              "LSST_g_MAG", "LSST_g_MAGERR",
              "LSST_r_MAG", "LSST_r_MAGERR",
              "LSST_i_MAG", "LSST_i_MAGERR",
              "LSST_z_MAG", "LSST_z_MAGERR",
              "castor_uv_MAG", "castor_uv_MAGERR",
              "castor_u_MAG", "castor_u_MAGERR",
              "castor_g_MAG", "castor_g_MAGERR"]

# a single configuration for now, with its label and plot colour
available_os = ["baseline"]
names = [
    "baseline",
]

os_names = dict(zip(available_os, names))
colors = ["k"] #, "plum", "cornflowerblue", "#2ca02c", "gold", "tomato"]
os_colors = dict(zip(available_os, colors))

# put data in expected format for TLM 

# Pull every needed column through getTrueY with the same `names_phot`
# filter, so all arrays refer to the same rows.

# LSST magnitudes
LSST_g_mag = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="LSST_g_MAG")
LSST_r_mag = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="LSST_r_MAG")
LSST_i_mag = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="LSST_i_MAG")
LSST_z_mag = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="LSST_z_MAG")

# LSST magnitude errors
LSST_g_mag_ERR = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="LSST_g_MAGERR")
LSST_r_mag_ERR = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="LSST_r_MAGERR")
LSST_i_mag_ERR = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="LSST_i_MAGERR")
LSST_z_mag_ERR = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="LSST_z_MAGERR")

# CASTOR magnitudes: the columns listed in names_phot are the lower-case
# "castor_" ones for the g, u and uv bands, and the u and uv arrays are the
# ones the colour table needs.
CASTOR_g_mag = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="castor_g_MAG")
CASTOR_u_mag = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="castor_u_MAG")
CASTOR_uv_mag = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="castor_uv_MAG")

# CASTOR magnitude errors
CASTOR_g_mag_ERR = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="castor_g_MAGERR")
CASTOR_u_mag_ERR = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="castor_u_MAGERR")
CASTOR_uv_mag_ERR = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="castor_uv_MAGERR")

# object identifier and the redshift used as the regression target
ID = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="ID")
z_true = getTrueY(test_cat=CASTOR_baseline, mag_col_names=names_phot, y_col_name="photoz")

# Assemble a per-object table from columns 0-6 of a 2-D array: an identifier,
# an 'error' value, three colours formed as differences of neighbouring
# columns (presumably g, r, i, z in columns 2-5 -- confirm), and the true
# redshift.
# NOTE(review): `data_array_sorted_CASTOR_wID` is not defined anywhere in the
# visible part of this file, so this statement raises NameError as written.
# The earlier copy of this statement near the top of the file is commented
# out -- confirm whether this table is still needed.
df = pd.DataFrame({'CASTOR_ID': data_array_sorted_CASTOR_wID[:,0] , 'error': data_array_sorted_CASTOR_wID[:,1], 
                   'g-r': data_array_sorted_CASTOR_wID[:,2] - data_array_sorted_CASTOR_wID[:,3], 
                   'r-i': data_array_sorted_CASTOR_wID[:,3] - data_array_sorted_CASTOR_wID[:,4], 
                   'i-z': data_array_sorted_CASTOR_wID[:,4] - data_array_sorted_CASTOR_wID[:,5], 
                   'true_z': data_array_sorted_CASTOR_wID[:,6]})

In [None]:
# load the catalogs
# One DataFrame per entry of available_os: identifier, redshift, r magnitude
# and five colours.  (An earlier draft stored the raw magnitudes and their
# errors instead of colours.)
catalogs = dict()
for os in available_os:

    # this will need to change to accommodate multiple catalogs, e.g. LSST only or LSST + CASTOR

    cat = pd.DataFrame({'CASTOR_ID': ID, 'z_true': z_true,
                             'r': LSST_r_mag,
                             'g-r': LSST_g_mag - LSST_r_mag,
                             'r-i': LSST_r_mag - LSST_i_mag,
                             'i-z': LSST_i_mag - LSST_z_mag,
                             'uv-u': CASTOR_uv_mag - CASTOR_u_mag,
                             'u-g': CASTOR_u_mag - CASTOR_g_mag
                      })

    # drop any row that still has a missing value
    catalogs[os] = cat.dropna()

# Instantiate and Train 

### Does this run 'as is'?

In [None]:
# The flow models z_true (the column that is sampled and transformed)
# conditioned on the r magnitude and five colours.  These names must match
# the catalogue's DataFrame columns exactly; the colour column is "uv-u",
# without spaces.
data_columns = ["z_true"]
conditional_columns = ["r", "u-g", "g-r", "r-i", "i-z", "uv-u"] # different colors than LSST

# first I create a bijector chain
# the first bijection is a standard scaler - but I'm not actually using it for standard scaling
#     I set the mean and std so that it maps the redshift range (0, 3.2) onto (-5, 5), which is
#     the domain of the NeuralSplineCoupling
# the second bijection is a NeuralSplineCoupling. The number of conditions is taken
#     from the column list so the two cannot drift apart
bijector = Chain(
    StandardScaler(np.atleast_1d(1.6), np.atleast_1d(0.32)),
    NeuralSplineCoupling(n_conditions=len(conditional_columns))
)

# I set the latent distribution to a Uniform over (-5, 5)
# this range was chosen to match the NeuralSplineCoupling domain
# I chose a Uniform since all of the redshifts are drawn from a compact domain
# NOTE(review): the first cell of this file calls Uniform(1, 5) instead --
# confirm which call form the installed pzflow expects.
latent = Uniform((-5, 5))

# create a dictionary that will hold all the ensembles
ensembles = dict()

# create the baseline flows
for os in available_os:

    # save some info with the flow
    info = f"Models z_true conditioned on galaxy colors and r mag from os {os}. K=16"

    # instantiate and save the flow; each ensemble gets its own copy of the
    # column lists
    flowEns = FlowEnsemble(data_columns = list(data_columns),
                           conditional_columns = list(conditional_columns),
                           bijector = bijector,
                           latent = latent,
                           info = info,
                           N = 10)

    ensembles[os] = flowEns
    
    
    
    
%%time

import pickle
from pathlib import Path

# NOTE(review): `adam` is not imported anywhere in the visible part of this
# file.  Its `step_size=` keyword looks like the Adam optimizer from JAX's
# `optimizers` module -- confirm which import the installed pzflow expects.

# Both the ensemble and the losses are written into this folder; create it up
# front so the writes below cannot fail on a missing directory.
Path("trained_flows").mkdir(parents=True, exist_ok=True)

for os, ens in ensembles.items():

    # get the data and make an 80/20 train and test set
    # NOTE(review): the split is unseeded, so train/test membership changes
    # between runs even though the training seeds below are fixed.
    cat = catalogs[os]
    cat_train = cat.sample(frac = 0.8)
    cat_test = cat.drop(cat_train.index)

    # train the flow on the given learning rate schedule:
    # three consecutive stages with decreasing step size
    loss1 = ens.train(cat_train, sample_errs=True,
                       optimizer = adam(step_size = 1e-3),
                       epochs = 100, seed = 123)
    loss2 = ens.train(cat_train, sample_errs=True,
                       optimizer = adam(step_size = 2e-4),
                       epochs = 100, seed = 312)
    loss3 = ens.train(cat_train, sample_errs=True,
                       optimizer = adam(step_size = 1e-4),
                       epochs = 50, seed = 231)

    # concatenate the three stages into one loss history per flow
    losses = {fname : # for each flow trained in the ensemble...
                  [float(loss) # save the list of training losses
                   for lossDict in [loss1, loss2, loss3]
                   for loss in lossDict[fname]]
              for fname in loss1}

    # print the train and test loss (mean negative log-probability)
    train_loss = -np.mean(ens.log_prob(cat_train))
    test_loss = -np.mean(ens.log_prob(cat_test))
    print(os, train_loss, test_loss)

    # save the ensemble
    ens.save(f"trained_flows/pzflow_ensemble_for_{os}.pkl")
    # and the losses
    with open(f"trained_flows/losses_for_{os}.pkl", "wb") as file:
        pickle.dump({"losses": losses,
                     "train loss": train_loss,
                     "test loss": test_loss},
                    file)

# from github/aimalz/TheLastMetric/blob/master/training_flows_July23.ipynb

pull the data just to get formatting

In [None]:
# !wget https://storage.googleapis.com/ahw2019/for_malz_and_lanusse.tar.gz
# !tar -xzf for_malz_and_lanusse.tar.gz
# !mv for_malz_and_lanusse dataset



more dataset/readme.txt



# Observing-strategy runs that have a catalogue on disk, in display order.
available_os = [
    "run_1_4_y10",
    "run_4_38_y10",
    "run_10_92_y10",
    "run_4_34_y10",
    "run_7_61_y10",
    "run_9_86_y10",
]
# Human-readable label and plot colour for each run, in the same order.
names = [
    "baseline_v1_5_10yrs",
    "footprint_stuck_rollingv1_5_10yrs",
    "ddf_heavy_nexp2_v1_6_10yrs",
    "footprint_newAv1_5_10yrs",
    "third_obs_pt60v1_5_10yrs",
    "barebones_v1_6_10yrs",
]
colors = ["k", "plum", "cornflowerblue", "#2ca02c", "gold", "tomato"]

# lookup tables keyed by run identifier
os_names = {run: label for run, label in zip(available_os, names)}
os_colors = {run: colour for run, colour in zip(available_os, colors)}


# Column names of the redshift catalogue ...
names_z = ('ID', 'z_true', 'z_phot', 'dz_phot', 'NN', 'N_train')
# ... and of the photometry catalogue: identifier, redshift, then a
# value / error pair for each of six magnitudes and five colours.
names_phot = (
    'ID', 'z_true',
    'u', 'u_err',
    'g', 'g_err',
    'r', 'r_err',
    'i', 'i_err',
    'z', 'z_err',
    'y', 'y_err',
    'u-g', 'u-g_err',
    'g-r', 'g-r_err',
    'r-i', 'r-i_err',
    'i-z', 'i-z_err',
    'z-y', 'z-y_err',
)

# load the catalogs: one merged redshift + photometry table per run
catalogs = dict()
for os in available_os:
    # Both files are whitespace-delimited.  sep=r"\s+" is the documented
    # equivalent of the deprecated delim_whitespace=True.
    # The first row of zphot.cat is skipped (presumably a header -- confirm).
    z_cat = pd.read_csv(f"dataset/{os}/zphot.cat", names=names_z, sep=r"\s+", skiprows=1)
    phot_cat = pd.read_csv(f"dataset/{os}/test.cat", names=names_phot, sep=r"\s+")
    # merge() without `on=` joins on the columns the two tables share
    cat = z_cat.merge(phot_cat)
    catalogs[os] = cat.dropna()

Instantiate the model of the photometric space, train it on the catalogs for LSST with and without CASTOR, and save it (two versions rather than six observing strategies).

In [None]:
# Bijector chain, built from two pieces:
#   1. a StandardScaler that is not used for standard scaling: its mean and
#      std are chosen so that the redshift range (0, 3.2) maps onto (-5, 5),
#      the domain of the NeuralSplineCoupling;
#   2. a NeuralSplineCoupling expecting 6 conditions (the r mag and the
#      galaxy colors).
redshift_scaler = StandardScaler(np.atleast_1d(1.6), np.atleast_1d(0.32))
conditional_spline = NeuralSplineCoupling(n_conditions=6)
bijector = Chain(redshift_scaler, conditional_spline)

# Latent distribution: a Uniform over (-5, 5), matching the spline domain.
# A Uniform is used because the redshifts are drawn from a compact domain.
latent = Uniform((-5, 5))

# One ensemble of 10 flows per observing strategy.
ensembles = {}
for os in available_os:

    # column that is sampled and transformed by the flow
    data_columns = ["z_true"]
    # columns the flow is conditioned on
    conditional_columns = ["r", "u-g", "g-r", "r-i", "i-z", "z-y"]

    # free-text description stored alongside the flow
    info = f"Models z_true conditioned on galaxy colors and r mag from os {os}. K=16"

    flowEns = FlowEnsemble(
        data_columns=data_columns,
        conditional_columns=conditional_columns,
        bijector=bijector,
        latent=latent,
        info=info,
        N=10,
    )
    ensembles[os] = flowEns
    
    
    
    
%%time

import pickle
from pathlib import Path

# NOTE(review): `adam` is not imported anywhere in the visible part of this
# file.  Its `step_size=` keyword looks like the Adam optimizer from JAX's
# `optimizers` module -- confirm which import the installed pzflow expects.

# Both the ensemble and the losses are written into this folder; create it up
# front so the writes below cannot fail on a missing directory.
Path("trained_flows").mkdir(parents=True, exist_ok=True)

for os, ens in ensembles.items():

    # get the data and make an 80/20 train and test set
    # NOTE(review): the split is unseeded, so train/test membership changes
    # between runs even though the training seeds below are fixed.
    cat = catalogs[os]
    cat_train = cat.sample(frac = 0.8)
    cat_test = cat.drop(cat_train.index)

    # train the flow on the given learning rate schedule:
    # three consecutive stages with decreasing step size
    loss1 = ens.train(cat_train, sample_errs=True,
                       optimizer = adam(step_size = 1e-3),
                       epochs = 100, seed = 123)
    loss2 = ens.train(cat_train, sample_errs=True,
                       optimizer = adam(step_size = 2e-4),
                       epochs = 100, seed = 312)
    loss3 = ens.train(cat_train, sample_errs=True,
                       optimizer = adam(step_size = 1e-4),
                       epochs = 50, seed = 231)

    # concatenate the three stages into one loss history per flow
    losses = {fname : # for each flow trained in the ensemble...
                  [float(loss) # save the list of training losses
                   for lossDict in [loss1, loss2, loss3]
                   for loss in lossDict[fname]]
              for fname in loss1}

    # print the train and test loss (mean negative log-probability)
    train_loss = -np.mean(ens.log_prob(cat_train))
    test_loss = -np.mean(ens.log_prob(cat_test))
    print(os, train_loss, test_loss)

    # save the ensemble
    ens.save(f"trained_flows/pzflow_ensemble_for_{os}.pkl")
    # and the losses
    with open(f"trained_flows/losses_for_{os}.pkl", "wb") as file:
        pickle.dump({"losses": losses,
                     "train loss": train_loss,
                     "test loss": test_loss},
                    file)

# from github/aimalz/TheLastMetric/blob/master/MAFVariationalMutualInformationPzFlow.ipynb

open the models of redshift-photometry space and the catalogs

In [None]:
# Load one pre-trained flow per observing strategy.
# NOTE(review): the training cells in this file save to
# "trained_flows/pzflow_ensemble_for_{os}.pkl", whereas this cell reads
# "trained_flows/flow_for_run_{os}.pkl" -- confirm these files exist.
flows = {}
for os in available_os:
    flows[os] = Flow(file=f"trained_flows/flow_for_run_{os}.pkl")
# TODO: need to experiment with different fit parameters because this might be too smooth, also does it account for photometric errors?
# TODO: check that draws from flow look like original data


# load the catalogs
# NOTE(review): the "run_" prefix is added here, so this cell presumably
# expects identifiers without it (e.g. "1_4_y10"); the available_os list
# defined earlier in this file already includes the prefix -- confirm.
catalogs = dict()
for os in available_os:
    # sep=r"\s+" is the documented equivalent of the deprecated
    # delim_whitespace=True
    z_cat = pd.read_csv(f"dataset/run_{os}/zphot.cat", names=names_z, sep=r"\s+", skiprows=1)
    phot_cat = pd.read_csv(f"dataset/run_{os}/test.cat", names=names_phot, sep=r"\s+")
    # merge() without `on=` joins on the columns the two tables share
    cat = z_cat.merge(phot_cat)
    catalogs[os] = cat.dropna()
    
    

evaluate the posteriors for each galaxy (totally forgot we had to do this)

In [None]:
# # this just makes the posteriors for plotting, not sure why it uses so much memory. . .
# tx = np.linspace(0,3.5,100)
# all_logp = {}
# for which_os in available_os:
#   flow = flows[which_os]
#   cat = catalogs[which_os]
#   logp = flow.posterior(flow.info["condition_scaler"](cat), column="z_true", grid=tx)
#   all_logp[which_os] = logp
    
    

In [None]:
# Per-galaxy log-probabilities from each observing strategy's flow, stored by
# strategy; their sum is printed as the summary value.
all_milb = {}
for which_os in available_os:
    phot_cat = catalogs[which_os]
    flow = flows[which_os]

    # apply the scaler stored with the flow before evaluating it
    scaled_cat = flow.info["condition_scaler"](phot_cat)
    mutual_information_lower_bound = flow.log_prob(scaled_cat)

    all_milb[which_os] = mutual_information_lower_bound
    print((os_names[which_os], np.sum(mutual_information_lower_bound)))
# TODO: make this an actual expected value rather than just sum
# also, shouldn't it be sum of exponential of metric value, since it should never penalize a negative value?

KEY PLOT: distribution of metric values for same galaxies with and without CASTOR

In [None]:
# Overlay the histogram of per-galaxy metric values for every observing
# strategy.  pyplot is called through `plt` so the cell does not depend on
# the pylab magic having been run first.
import matplotlib.pyplot as plt

# surprisingly not so different from one another
for which_os in available_os:
    mutual_information_lower_bound = all_milb[which_os].flatten()
    print((np.mean(mutual_information_lower_bound), np.std(mutual_information_lower_bound)))
    plt.hist(mutual_information_lower_bound, bins=np.linspace(-16, 5, 100), alpha=0.5, histtype='step',
             color=os_colors[which_os], label=os_names[which_os], density=False)
# the axis label is shared by all histograms, so set it once
plt.xlabel(r'$\mathbb{E}_{z, x_{phot}} \left[ q_\theta(z | x_{phot}) \right]$')
# plt.xlim(-10., 5.)
plt.legend(loc='upper left')
# plt.semilogy()

KEY PLOT: look at per-galaxy metric values as a function of redshift, then photometry

In [None]:
# One 2-D histogram (redshift vs. per-galaxy metric value) per observing
# strategy, stacked vertically.
# squeeze=False keeps `axs` an array even for a single strategy, where
# plt.subplots would otherwise return a bare Axes that cannot be indexed.
fig, axs = plt.subplots(len(available_os), 1, figsize=(5, 5*len(available_os)), squeeze=False)
axs = axs[:, 0]
for i, which_os in enumerate(available_os):
    # all_milb was evaluated on catalogs[which_os], so take the redshifts
    # from the same table to keep the two arrays row-aligned
    axs[i].hist2d(catalogs[which_os]['z_true'], all_milb[which_os].flatten(), bins=[np.linspace(0., 3., 50), np.linspace(-5., 5., 100)])
    axs[i].set_xlabel('redshift')
    axs[i].set_ylabel(r'$\mathbb{E}_{z, x_{phot}} \left[ q_\theta(z | x_{phot}) \right]$')
    axs[i].set_title(os_names[which_os])
# they're different, but not visibly so
# TODO: plot violins of metric as a function of binned redshift so they're all on one set of axes? or quantiles because outliers? or box/whisker https://matplotlib.org/stable/gallery/pyplots/boxplot_demo_pyplot.html?
# TODO: normalize within redshift bins to get these on one set of axes?
# TODO sort of want sum of metric values in redshift bins, no?

take expected value of the approximated posteriors, which is the mutual information lower bound

In [None]:
# something isn't right about the autocalculation of moments so doing it by hand
def calc_moment(vals, k):
    """Return the k-th raw (non-central) sample moment of ``vals``.

    Computed as ``sum(vals**k) / len(vals)``.

    Parameters
    ----------
    vals : array-like
        Sample values. Lists and tuples are accepted as well as arrays.
    k : int or float
        Order of the moment; ``k=0`` gives 1.0 and ``k=1`` gives the mean.

    Returns
    -------
    float
        The moment as a plain Python float. An empty input yields NaN.
    """
    # convert first so that plain sequences support the element-wise power
    vals = np.asarray(vals)
    n = len(vals)
    outval = np.sum(vals**k) / float(n)
    return float(outval)

# Raw moments of order 0 through 4 of the per-galaxy metric values, one list
# per observing strategy (index in the list == order of the moment).
which_moments = range(0, 5)
moment_res = {
    which_os: [calc_moment(all_milb[which_os], k=order) for order in which_moments]
    for which_os in available_os
}
# print(moment_res)

# from github/aimalz/TheLastMetric/blob/master/FigureTLMvsRedshift.ipynb

KEY PLOT: distribution of TLM values with and without CASTOR including model uncertainty

In [None]:
%pylab inline
from utils import load_data, compute_last_metric
import corner
from pzflow import Flow


# Catalogues plus the strategy identifiers, labels and colours
z_cats, phot_cats, available_os, os_names, os_colors = load_data()


# Ten pre-trained flows per observing strategy, stored in files numbered 1 to 10
flows = {}
for os in available_os:
    flows[os] = [
        Flow(file=f"trained_flows/flow_for_run_{os}_{member}.pkl")
        for member in range(1, 11)
    ]


# Metric for each observing strategy: one row per flow, stacked along axis 0
all_tlm = {}
for which_os in available_os:
    per_flow = [
        compute_last_metric(f, phot_cats[which_os], z_cats[which_os], entropy_nbins=60)
        for f in flows[which_os]
    ]
    all_tlm[which_os] = np.stack(per_flow, axis=0)

    # overall mean, and the scatter of the per-flow means
    per_flow_means = np.mean(all_tlm[which_os], axis=1)
    print((os_names[which_os], np.mean(all_tlm[which_os]), np.std(per_flow_means)))
    
    
    

# Histogram of the per-flow mean metric for every observing strategy, with a
# vertical line at each strategy's overall mean; saved as a PDF.
plt.figure(figsize=(7, 5))
for which_os in available_os:
    colour = os_colors[which_os]
    per_flow_means = np.mean(all_tlm[which_os], axis=1)
    plt.hist(per_flow_means, bins=32, range=[2.95, 3.35], alpha=0.6,
             color=colour, label=os_names[which_os])
    plt.axvline(np.mean(all_tlm[which_os]), color=colour, linewidth=2)
plt.legend(ncol=2)
# the x label is the single character U+05EA
plt.xlabel(chr(0x05ea))
plt.savefig('metrics.pdf', bbox_inches='tight', pad_inches=0)