# Weak-lensing galaxy shape catalogue validation

## Main notebook, set-up, catalogue preparation

### Contents
1. Set-up
2. Load data
3. Matching of stars
4. Select galaxies

In [1]:
# Load (or reload) the IPython autoreload extension and select mode 2, so
# that modules are re-imported automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [None]:
# General library imports
import sys
import os
import numpy as np
from astropy.io import fits

In [None]:
from cs_util import canfar
from sp_validation.io import *
from sp_validation.cat import *
from sp_validation.survey import *
from sp_validation.galaxy import *
from sp_validation.calibration import *

## 1. Set-up

In [None]:
# Load parameters
# Runs params.py and loads its names into the notebook namespace. The
# settings used by the cells below (paths, cuts, `shapes`, `verbose`, ...)
# are presumably defined there -- TODO confirm against params.py.
%run params.py

### Create and open output files and directories

In [None]:
# Set up the output and plot directories (including plot sub-directories)
make_out_dirs(
    output_dir,
    plot_dir,
    plot_subdirs,
    verbose=verbose,
)

# Open the statistics file; the handle is passed to the logging calls below
stats_file = open_stats_file(
    plot_dir,
    stats_file_name,
)

## 2. Load data

### Load merged (final) galaxy catalogue

In [None]:
# Pick the reader from the file extension: NumPy binary (.npy) files are
# loaded with numpy (optionally memory-mapped via `mmap_mode`); any other
# extension is handed to the HDF5 reader.
# Fix: the check previously tested for ".fits" although this branch
# announces and loads a .npy file, so .npy catalogues were sent to the
# HDF5 reader.
extension = os.path.splitext(galaxy_cat_path)[1]
if extension == ".npy":
    print("Loading galaxy .npy file...")
    dd = np.load(galaxy_cat_path, mmap_mode=mmap_mode)
else:
    print("Loading galaxy .hdf5 file...")
    dd = read_hdf5_file(galaxy_cat_path, name, stats_file, param_path=param_list_path)

# Number of objects read; used below to report selection fractions
n_obj = len(dd)
print_stats(f'Read {n_obj} objects from file {galaxy_cat_path}', stats_file, verbose=verbose)

#### Print some quantities to check nothing obvious is wrong with catalogue

In [None]:
# Base name for ellipticity and size keys (column names)
key_base = {
    'ngmix': 'NGMIX',
    'galsim': 'GALSIM_GAL'
}

# PSF keys
key_PSF_ell = {}
key_PSF_size = {}
size_to_fwhm = {}

key_PSF_ell['ngmix'] = 'NGMIX_ELL_PSFo_NOSHEAR'
key_PSF_size['ngmix'] = 'NGMIX_T_PSFo_NOSHEAR'
size_to_fwhm['ngmix'] = T_to_fwhm

key_PSF_ell['galsim'] = 'GALSIM_PSF_ELL_ORIGINAL_PSF'
key_PSF_size['galsim'] = 'GALSIM_PSF_SIGMA_ORIGINAL_PSF'
size_to_fwhm['galsim'] = sigma_to_fwhm

In [None]:
# Summary of the full galaxy catalogue, written to the statistics file
print_stats('Galaxies:', stats_file, verbose=verbose)
n_tot = print_some_quantities(dd, stats_file, verbose=verbose)

# Mean ellipticity for each shape measurement method; -10 is passed as the
# `invalid` value
for sh in shapes:
    print_mean_ellipticity(
        dd, f'{key_base[sh]}_ELL_NOSHEAR', 2, n_tot, stats_file,
        invalid=-10, verbose=verbose,
    )

#### Survey area and potential missing tiles
The approximate observed area is the number of tiles $\times$ 0.25 deg$^2$ (ignoring overlaps and masking).

In [None]:
# Approximate survey area and the tile IDs present in the catalogue
area_deg2, area_amin2, tile_IDs = get_area(
    dd,
    area_tile,
    verbose=verbose,
)

Identify missing tiles by comparing the tile IDs from the catalogue to an external input tile ID file.

In [None]:
# Compare the catalogue tile IDs against the external tile ID list
n_found, n_missing = missing_tiles(
    tile_IDs,
    path_tile_ID,
    path_found_ID,
    path_missing_ID,
    verbose=verbose,
)

### Load star catalogue

In [None]:
# The star catalogue is optional: read it only when a path is configured
if star_cat_path:
    d_star = fits.getdata(
        star_cat_path,
        hdu_star_cat,
    )

In [None]:
# Summary of the star catalogue (only if one was loaded)
if star_cat_path:
    print_stats('Stars:', stats_file, verbose=verbose)
    n_tot = print_some_quantities(d_star, stats_file, verbose=verbose)

    # Mean PSF ellipticity from the two HSM component columns; -10 is
    # passed as the `invalid` value
    print_mean_ellipticity(
        d_star, ['E1_PSF_HSM', 'E2_PSF_HSM'], 1, n_tot, stats_file,
        invalid=-10, verbose=verbose,
    )

## 3. Matching of stars

### Matching of star catalogues
Match the star catalogue `d_star` (selected on individual exposures using the size-magnitude diagram) to the catalogue from the tiles. Some simple criteria, such as SPREAD_CLASS, are used to select stars from the tile catalogue.

This is mainly for testing; this match will not be used later.

#### Match to all objects

In [None]:
# Match the star catalogue to all objects of the galaxy catalogue, using
# the sky coordinates of both (only if a star catalogue was loaded)
if star_cat_path:
    ind_star, mask_area_tiles, n_star_tot = check_matching(
        d_star, dd,
        ['RA', 'DEC'], [col_name_ra, col_name_dec],
        thresh, stats_file,
        name=None, verbose=verbose,
    )

#### Refine: Match to valid, unflagged galaxy sample

In [None]:
# Flags to indicate valid star sample
m_star = {}
ra_star = {}
dec_star = {}
g_star_psf = {}

# `ind_star` and `n_star_tot` are only created by the matching step when a
# star catalogue was given. Guard on `star_cat_path`, like the other star
# cells, so this cell does not fail with a NameError when there is none;
# the dictionaries above are then left empty.
if star_cat_path:

    if 'ngmix' in shapes:
        # Matched objects that are unflagged and have a valid PSF
        # ellipticity (-10 is treated as the invalid value)
        m_star['ngmix'] = (
            (dd['FLAGS'][ind_star] == 0)
            & (dd['IMAFLAGS_ISO'][ind_star] == 0)
            & (dd['NGMIX_MCAL_FLAGS'][ind_star] == 0)
            & (dd['NGMIX_ELL_PSFo_NOSHEAR'][:,0][ind_star] != -10)
        )

        print_stats('ngmix:', stats_file, verbose=verbose)

        ra_star['ngmix'], dec_star['ngmix'], g_star_psf['ngmix'] = match_subsample(
            dd,
            ind_star,
            m_star['ngmix'],
            [col_name_ra, col_name_dec],
            'NGMIX_ELL_PSFo_NOSHEAR',
            n_star_tot,
            stats_file,
            verbose=verbose
        )

    if 'galsim' in shapes:
        # Same selection for galsim, using its own PSF ellipticity column
        # (no method-specific flag column is applied here)
        m_star['galsim'] = (
            (dd['FLAGS'][ind_star] == 0)
            & (dd['IMAFLAGS_ISO'][ind_star] == 0)
            & (dd['GALSIM_PSF_ELL_ORIGINAL_PSF'][:,0][ind_star] != -10)
        )

        print_stats('galsim:', stats_file, verbose=verbose)

        ra_star['galsim'], dec_star['galsim'], g_star_psf['galsim'] = match_subsample(
            dd,
            ind_star,
            m_star['galsim'],
            [col_name_ra, col_name_dec],
            'GALSIM_PSF_ELL_ORIGINAL_PSF',
            n_star_tot,
            stats_file,
            verbose=verbose
        )

In [None]:
# Refine: Match to SPREAD_CLASS samples
# The matching needs `ind_star`, `m_star` and `ra_star`, which are only
# filled when a star catalogue was given, so skip it otherwise (same guard
# as the other star cells).
if not star_cat_path:
    print_stats("No star catalogue in input, skipping star-gal matching", stats_file, verbose=verbose)
elif "SPREAD_CLASS" in dd.dtype.names:
    for sh in shapes:
        print_stats(f'{sh}:', stats_file, verbose=verbose)
        match_spread_class(dd, ind_star, m_star[sh], stats_file, len(ra_star[sh]), verbose=verbose)
else:
    print_stats("No SPREAD_CLASS in input, skipping star-gal matching", stats_file, verbose=verbose)

## Check for objects with invalid PSF

In [None]:
# For each shape method, check the PSF and the galaxy ellipticity columns
# against the value -10
for sh in shapes:
    print(f'{sh}:')
    check_invalid(
        dd,
        [key_PSF_ell[sh], f'{key_base[sh]}_ELL_NOSHEAR'],
        [0, 0], [-10, -10],
        stats_file,
        name=['PSF', 'galaxy ellipticity'], verbose=verbose,
    )

## 4. Select galaxies

### 4.1 Using the spread model parameter
This parameter quantifies the size of an object with respect to the local PSF. Objects with larger spread model are more likely to be galaxies.

#### Common flags and cuts
First, set cuts common to ngmix and galsim:
  - spread model: select objects well larger than the PSF
  - magnitude: cut galaxies that are too faint (= too noisy, likely to be
    artefacts), and too bright (might be too large for postage stamp)
  - flags: cut objects that were flagged as invalid or masked
  - n_epoch: select objects observed on at least one epoch (for safety,
    to avoid potential errors with empty data)

In [None]:
# Flag non-overlapping objects, selected by their sky position
cut_overlap = classification_galaxy_overlap_ra_dec(
    dd,
    ra_key=col_name_ra,
    dec_key=col_name_dec
)

# np.sum avoids a Python-level loop over the (catalogue-sized) mask
n_ok = np.sum(cut_overlap)
print_stats(f"Non-overlapping objects: {n_ok:10d}, {n_ok/n_obj:10.2%}", stats_file, verbose=verbose)

# Final galaxy selection mask, per shape measurement method
m_gal = {}

for sh in shapes:
    # add method-specific cuts
    if sh == 'ngmix':
        classification_method = classification_galaxy_ngmix
    elif sh == 'galsim':
        classification_method = classification_galaxy_galsim
    else:
        # Fail loudly instead of silently reusing the method of a previous
        # iteration (or hitting a NameError on the first one)
        raise ValueError(f"Unknown shape measurement method '{sh}'")

    # Cuts common to all methods: overlap, magnitude range, flags, number of
    # epochs and (optionally) spread model
    cut_common = classification_galaxy_base(
        dd,
        cut_overlap,
        gal_mag_bright=gal_mag_bright,
        gal_mag_faint=gal_mag_faint,
        flags_keep=flags_keep,
        n_epoch_min=n_epoch_min,
        do_spread_model=do_spread_model,
    )
    m_gal[sh] = classification_method(
        dd,
        cut_common,
        stats_file,
        verbose=verbose,
    )

    n_ok = np.sum(cut_common)
    print_stats(f"{sh}: objects after common cut: {n_ok:10d}, {n_ok/n_obj:10.2%}", stats_file, verbose=verbose)

    # MKDEBUG for debugging calibrate_comprehensive
    # Fix: label the line with the current method instead of a hard-coded
    # "ngmix" (output is unchanged for sh == 'ngmix')
    n_ok = np.sum(m_gal[sh])
    print_stats(f"common & {sh} = galaxy selection: {n_ok:10d}, {n_ok/n_obj:10.2%}", stats_file, verbose=verbose)


