# quality_assurance

> Work in progress. This notebook also contains diagnostics that are still under development; they will be moved elsewhere once complete.

In [None]:
from abc import ABC, abstractmethod

import dask
import pandas as pd
import xarray as xr

from qagmire.data import (
    get_lr_l2_stack_files,
    read_class_spec,
    read_class_table,
    read_galaxy_table,
)

## Diagnostics

Now let's look at some diagnostic tests.

### Line flux check

This section reproduces the weaveio [line_flux_check](https://github.com/bamford/QAG/blob/master/diagnostics/line_flux_check.py) diagnostic.

In [None]:
def line_wavelengths(galaxy_table, class_table):
    """Compute the expected observed wavelength of every line.

    Each entry of ``galaxy_table["LINE"]`` is split on ``"_"`` into two
    components, a species label and a rest wavelength. The rest wavelength is
    converted to float and scaled by ``1 + class_table["Z"]``.

    Returns the product of ``(1 + Z)`` and the rest wavelengths.
    """
    line_names = galaxy_table["LINE"].astype(str)
    # The transpose moves the split axis to the front, so unpacking yields one
    # array per name component: (species, rest wavelength).
    _species, rest_wl = line_names.str.split(dim="", sep="_").T
    observed_wl = (1 + class_table["Z"]) * rest_wl.astype(float)
    return observed_wl

In [None]:
def wavelength_boundaries(class_spec):
    """Find the wavelength coverage and the null-flux gap of each band.

    For each of the bands ``"B"`` and ``"R"`` the wavelengths are read from
    ``class_spec["LAMBDA_<band>"]`` and the flux from
    ``class_spec["FLUX_RR_<band>"]``.

    Returns a ``(boundaries, gaps)`` pair of dicts keyed by band, each value
    being ``{"low": ..., "high": ...}``:

    * ``boundaries``: min and max wavelength at which the flux is not null.
    * ``gaps``: min and max wavelength at which the flux is null, considering
      only wavelengths strictly inside the band's search window
      (4000-6000 for ``"B"``, 6000-9000 for ``"R"``).
    """
    # Per-band window in which a null-flux gap is searched for.
    band_windows = {"B": (4000, 6000), "R": (6000, 9000)}

    def extent(values, dim):
        # Smallest and largest remaining value along the wavelength axis.
        return {"low": values.min(dim=dim), "high": values.max(dim=dim)}

    boundaries = {}
    gaps = {}
    for band, (lower, upper) in band_windows.items():
        axis = f"LAMBDA_{band}"
        wavelength = class_spec[axis]
        flux_missing = class_spec[f"FLUX_RR_{band}"].isnull()

        # Keep only wavelengths with null flux that lie inside the window.
        gap_wl = wavelength.where(
            flux_missing & (wavelength > lower) & (wavelength < upper)
        )
        gaps[band] = extent(gap_wl, axis)

        # Keep only wavelengths that actually carry flux.
        observed_wl = wavelength.where(~flux_missing)
        boundaries[band] = extent(observed_wl, axis)
    return boundaries, gaps

In [None]:
class Diagnostics(ABC):
    """Base class for a named collection of diagnostic tests.

    Subclasses implement `tests`, returning a list of test descriptions;
    `run` evaluates them and stacks the results along a new ``test`` dimension.
    """

    def run(self, **kwargs):
        """Evaluate all tests and return the combined, computed result.

        Any keyword arguments are forwarded unchanged to `tests`, so callers
        can pass whatever options the subclass accepts.

        Returns the per-test arrays concatenated along a ``test`` dimension
        labelled with the test names, after computing them with dask.
        """
        tests = self.tests(**kwargs)
        test_names = [t["name"] for t in tests]
        test_array = [t["test"] for t in tests]
        detail = xr.concat(test_array, pd.Index(test_names, name="test"))
        # dask.compute returns a tuple with one entry per argument.
        detail = dask.compute(detail)[0]
        return detail

    @abstractmethod
    def tests(self, **kwargs):
        """Return the list of tests to run.

        Each entry is a dict with the keys ``"name"``, ``"description"`` and
        ``"test"``. The body below is a template showing the expected layout.
        """
        return [
            {
                "name": "a_short_name",
                "description": "The question that the test answers",
                "test": None,
            },  # a boolean DataArray including dimensions `filename` and `APS_ID`
        ]

In [None]:
class LineFluxCheck(Diagnostics):
    """Checks that measured line fluxes are consistent with spectral coverage."""

    def tests(self, **kwargs):
        """Build the line-flux tests.

        Keyword arguments are passed to `get_lr_l2_stack_files` to select the
        input files. Each returned dict holds a ``"name"``, a ``"description"``
        and a boolean ``"test"`` array that is True where the check fails.
        """
        stack_files = get_lr_l2_stack_files(**kwargs)

        class_spec = read_class_spec(stack_files)
        galaxy_table = read_galaxy_table(stack_files)
        class_table = read_class_table(stack_files)

        expected_wl = line_wavelengths(galaxy_table, class_table)
        boundaries, gaps = wavelength_boundaries(class_spec)

        def strictly_between(limits):
            # True where the expected wavelength lies inside (low, high).
            return (expected_wl > limits["low"]) & (expected_wl < limits["high"])

        def outside(limits):
            # True where the expected wavelength lies below low or above high.
            return (expected_wl < limits["low"]) | (expected_wl > limits["high"])

        flux_missing = galaxy_table["LINES"].sel(QTY="FLUX", drop=True).isnull()
        flux_present = ~flux_missing

        in_red_gap = strictly_between(gaps["R"])
        in_blue_gap = strictly_between(gaps["B"])

        # A line is off the spectrum only if it misses both bands.
        off_spectrum = outside(boundaries["B"]) & outside(boundaries["R"])
        on_spectrum = ~(in_blue_gap | in_red_gap) & ~off_spectrum

        # Flagged when either band has no non-null lower boundary.
        spectrum_missing = (
            boundaries["B"]["low"].isnull() | boundaries["R"]["low"].isnull()
        )

        checks = (
            (
                "line_in_null_spectrum",
                "Do non-null line fluxes appear in completely null spectra?",
                flux_present & spectrum_missing,
            ),
            (
                "line_in_blue_chip_gap",
                "Do non-null line fluxes appear in the blue chip gap?",
                flux_present & in_blue_gap,
            ),
            (
                "line_in_red_chip_gap",
                "Do non-null line fluxes appear in the red chip gap?",
                flux_present & in_red_gap,
            ),
            (
                "line_off_spectrum",
                "Do non-null line fluxes appear outside the observed wavelength range?",
                flux_present & off_spectrum,
            ),
            (
                "null_line_on_spectrum",
                "Do null line fluxes appear in an observed wavelength range?",
                flux_missing & on_spectrum,
            ),
        )
        return [
            {"name": name, "description": description, "test": mask}
            for name, description, mask in checks
        ]

In [None]:
# Run every line-flux test with default arguments and keep the computed result.
detail = LineFluxCheck().run()

Locating and converting where necessary: 100%|██████████| 17/17 [00:00<00:00, 11491.24it/s]
Reading netCDF files... took 2.85 s.
Locating and converting where necessary: 100%|██████████| 17/17 [00:00<00:00, 18088.07it/s]
Reading netCDF files... took 2.54 s.
Locating and converting where necessary: 100%|██████████| 17/17 [00:00<00:00, 16070.13it/s]
Reading netCDF files... took 2.73 s.


  return np.nanmin(x_chunk, axis=axis, keepdims=keepdims)
  return np.nanmin(x_chunk, axis=axis, keepdims=keepdims)


In [None]:
# Use `obid` as the dimension in place of `filename`, then drop `filename`.
detail = detail.swap_dims({"filename": "obid"}).drop_vars("filename")

In [None]:
def fails_summary(da, top=None):
    """Tabulate fail counts with one column per test plus a total.

    `da` is converted to a dataframe with its ``test`` level unstacked into
    columns under the top-level label ``"fails"``. A ``("fails", "total")``
    column holding the row sums is added, and rows are sorted by that total in
    descending order. If `top` is given, only the first `top` rows are
    returned.
    """
    table = da.to_dataframe(name="fails").unstack("test")
    total_col = ("fails", "total")
    table.loc[:, total_col] = table.sum(axis="columns")
    table = table.sort_values(by=total_col, ascending=False)
    return table if top is None else table.head(top)

In [None]:
# Count fails for each obid by summing over the APS_ID and LINE dimensions.
per_obid = detail.sum(dim=["APS_ID", "LINE"])
fails_summary(per_obid)

Unnamed: 0_level_0,fails,fails,fails,fails,fails,fails
test,line_in_null_spectrum,line_in_blue_chip_gap,line_in_red_chip_gap,line_off_spectrum,null_line_on_spectrum,total
obid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
3170,0,15,37,261,17500,17813
3189,0,20,26,267,17481,17794
3133,0,10,22,288,17428,17748
3191,0,5,7,97,17226,17335
3175,0,52,44,709,12132,12937
3380,0,32,46,674,12159,12911
3434,0,37,63,747,11182,12029
3346,0,22,94,1684,6496,8296
3217,0,37,87,1781,6353,8258
3372,0,144,178,2695,3727,6744


In [None]:
# Count fails for each line by summing over the obid and APS_ID dimensions.
per_line = detail.sum(dim=["obid", "APS_ID"])
fails_summary(per_line)

Unnamed: 0_level_0,fails,fails,fails,fails,fails,fails
test,line_in_null_spectrum,line_in_blue_chip_gap,line_in_red_chip_gap,line_off_spectrum,null_line_on_spectrum,total
LINE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
[ArIII]_7135.67,0,0,55,4343,6604,11002
[SII2]_6730.68,0,0,53,3583,6626,10262
[SII]_6716.31,0,0,51,3576,6625,10252
[NII]_6583.34,0,0,47,3361,6634,10042
Ha_6562.80,0,0,52,3313,6633,9998
[OI]_6300.20,0,0,73,2776,6638,9487
[OII]_3726.03,0,178,0,0,8491,8669
HeI_5875.60,0,1,136,1837,6655,8629
[OII]_3728.73,0,172,0,0,8454,8626
[NeIII]_3967.40,0,81,229,0,8185,8495


In [None]:
# Count fails for each APS_ID by summing over the obid and LINE dimensions,
# and show only the 20 rows with the most fails.
per_fibre = detail.sum(dim=["obid", "LINE"])
fails_summary(per_fibre, top=20)

Unnamed: 0_level_0,fails,fails,fails,fails,fails,fails
test,line_in_null_spectrum,line_in_blue_chip_gap,line_in_red_chip_gap,line_off_spectrum,null_line_on_spectrum,total
APS_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
628,0,0,0,6,343,349
629,0,0,0,3,340,343
341,0,0,0,0,332,332
144,0,0,0,3,324,327
436,0,2,2,12,310,326
722,0,0,0,14,304,318
437,0,6,0,15,296,317
434,0,6,0,15,296,317
242,0,0,0,21,280,301
339,0,6,0,18,277,301
