# Add Atlas to Database
## Parameters
The next code block sets parameters that are used throughout the remainder of the notebook.

In [None]:
# pylint: disable=invalid-name,missing-module-docstring

# User-editable parameters for this notebook. Later cells read these names
# directly, so they must keep their names.

# atlas data in table format (csv or tsv) to be deposited
# (the atlas generation cell treats a ".csv" suffix as csv and anything else as tab-delimited)
atlas_file_name = "/global/homes/b/bkieft/metabolomics_data/20250519_JGI_KS_510532_OilMisc_final_EXP120B_HILICZ_USHXG01827/mjblow_JGI-HILIC_0_0/Targeted/JGI-HILIC_20250519_JGI_KS_510532_OilMisc_final_EXP120B_HILICZ_USHXG01827/ISTDsEtcV7-POS/CompoundAtlas__510532_OilMisc_final_HILIC_ISTDsEtcV7_positive_polynomial_510532_OilMisc_final_JGI-HILIC_ISTDsEtcV7-POS_0_mjblow_0_0.csv"

# name to assign to the atlas when it is deposited
atlas_name = "510532_OilMisc_final_HILIC_ISTDsEtcV7_positive_polynomial_510532_OilMisc_final_JGI-HILIC_ISTDsEtcV7-POS_0_mjblow_0_0"

# use 'positive' or 'negative'
polarity = "positive"

# overrides any mz_tolerance value in the atlas
# NOTE(review): units are not stated in this notebook (presumably ppm) - confirm
mz_tolerance = 5

# choose to sort the atlas by RT and MZ for analysis
# (when True, the sort cell calls sort_atlas_table on the "rt_peak" and "mz" columns)
sort_atlas = True

# is the atlas to be deposited an internal standard atlas (i.e., with unlabeled and labeled compounds)?
# (passed through to sort_atlas_table; it has no other use in this notebook)
istd_atlas = True

# run a check to see if the number of compounds in deposited atlas matches number in input atlas
# defaulted to False because it can take a non-trivial amount of time for larger atlases
run_retrieval_check = False

############ The rest of this block contains project independent parameters

# to use an older version of the metatlas source code, set this to a commit id,
# branch name, or tag. If None, then use the "main" branch.
source_code_version_id = None

# Threshold for how much status information metatlas functions print in the notebook
# levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
log_level = "INFO"

## Initialization

In [None]:
# pylint: disable=wrong-import-position,import-error,missing-class-docstring
import logging  # noqa: E402
from pathlib import Path  # noqa: E402
from IPython.display import Markdown, display  # noqa: E402
import pandas as pd

class StopExecution(Exception):
    """Exception raised to halt the notebook without a rendered traceback."""

    def _render_traceback_(self):
        # IPython consults this hook when displaying an exception; handing back
        # None (nothing to render) keeps the cell output free of a stack trace.
        return None

# Validate the user-supplied parameters once, up front, with an actionable
# message. This replaces the former bare asserts, which are stripped under
# `python -O` and made the explicit check unreachable for atlas_file_name and
# atlas_name.
if atlas_file_name is None or atlas_name is None or polarity is None:
    raise SystemExit("Exiting notebook due to unset variables in cell 1.")

logger = logging.getLogger("metatlas.jupyter")

# Kernel spec written to the user's Jupyter kernels directory when no suitable
# kernel is found. "{connection_file}" is left as a literal placeholder in the
# spec; it is not formatted here.
kernel_def = """{"argv":["shifter","--entrypoint","--image=ghcr.io/biorack/metatlas/metatlas_shifter:latest","/usr/local/bin/python","-m",
                 "ipykernel_launcher","-f","{connection_file}"],"display_name": "Metatlas Targeted","language": "python",
                 "metadata": { "debugger": true }}"""
kernel_file_name = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted" / "kernel.json"
try:
    has_root_kernel = Path("/root/.local/share/jupyter/kernels/papermill/kernel.json").is_file()
except PermissionError:
    # /root is not readable for this user: treat the root kernel as absent.
    has_root_kernel = False
if not has_root_kernel and not kernel_file_name.is_file():
    kernel_file_name.parent.mkdir(parents=True, exist_ok=True)
    with kernel_file_name.open(mode="w", encoding="utf-8") as f:
        # kernel_def is one string, so write() it in a single call; writelines()
        # would iterate over it character by character.
        f.write(kernel_def)
    logger.critical('CRITICAL: Notebook kernel has been installed. Set kernel to "Metatlas Targeted" and re-run notebook.')
    raise StopExecution
try:
    from metatlas.tools import notebook  # noqa: E402
except ImportError as err:
    logger.critical('CRITICAL: Set notebook kernel to "Metatlas Targeted" and re-run notebook.')
    raise StopExecution from err
# Use the same suffix rule as the atlas generation cell (endswith, not a
# substring match, so ".csv" elsewhere in the path does not hide the warning).
if not atlas_file_name.endswith(".csv"):
    # Emitted at WARNING level: this runs before notebook.setup(), and unless the
    # kernel has already configured handlers, Python's default logging drops
    # records below WARNING, so the former info() call could go unseen.
    logger.warning('Warning: if you are sorting this CSV before depositing the atlas, you must input a filename in Cell 1 with the suffix ".csv" or amend the sort function in the notebook')
# NOTE(review): presumably configures logging and selects the metatlas source
# version from these two parameters - confirm against metatlas.tools.notebook.
notebook.setup(log_level, source_code_version_id)
# These imports intentionally stay after notebook.setup() (hence the E402 waivers).
from metatlas.plots.dill2plots import make_atlas_from_spreadsheet  # noqa: E402
from metatlas.io.metatlas_get_data_helper_fun import make_atlas_df,sort_atlas_table  # noqa: E402

## Atlas sort

In [None]:
# Pick the file to deposit: the file name returned by sort_atlas_table when
# sorting is requested, otherwise the input file as given.
# The flag is compared with `== True` (not plain truthiness) so the selection
# is exactly as before for non-boolean values such as the string "False".
deposit_atlas_file_name = (
    sort_atlas_table(atlas_file_name, "rt_peak", "mz", istd_atlas)
    if sort_atlas == True  # noqa: E712
    else atlas_file_name
)

## Atlas generation

In [None]:
%%time
# Build the atlas from the chosen spreadsheet and show its identifiers.
# Log calls pass values as lazy %-style arguments instead of concatenating
# strings: the rendered message is identical, but a non-str value no longer
# raises TypeError and formatting is skipped when INFO is disabled.
logger.info("Reading in atlas from %s and depositing to MySQL database at NERSC", deposit_atlas_file_name)
# A ".csv" suffix selects the "csv" reader; every other file is handed over as "tab".
filetype = "csv" if deposit_atlas_file_name.endswith(".csv") else "tab"
atlas = make_atlas_from_spreadsheet(
    deposit_atlas_file_name,
    atlas_name,
    filetype=filetype,
    polarity=polarity,
    store=True,  # NOTE(review): presumably what persists the atlas to the database - confirm
    mz_tolerance=mz_tolerance,
)
logger.info("Making atlas df from %s for downstream checks", atlas.name)
atlas_df = make_atlas_df(atlas)
display(Markdown(f"### Atlas unique_id: {atlas.unique_id}"))
display(Markdown(f"### Atlas name: {atlas.name}"))

## Deposit check

In [None]:
%%time
# Optional post-deposit sanity check: fetch the atlas back by its unique id and
# compare its row count with the row count of the file that was deposited.
# The flag is compared with `== True` (not plain truthiness) so a non-boolean
# value such as the string "False" does not switch the check on.
if run_retrieval_check == True:  # noqa: E712

    # Imported only when the check is enabled; pandas (pd) is already imported
    # in the initialization cell, so it is not re-imported here.
    from metatlas.datastructures.utils import get_atlas

    def atlas_id_to_df(atlas_unique_id: str) -> pd.DataFrame:
        """Retrieve atlas from database using unique id and create DataFrame from compound identification data.

        Args:
            atlas_unique_id: unique id of the atlas to look up with get_atlas.

        Returns:
            The DataFrame produced by make_atlas_df for the retrieved atlas.
        """
        return make_atlas_df(get_atlas(atlas_unique_id))

    # Ensure atlas can be retrieved from database
    retrieved_atlas_df = atlas_id_to_df(atlas.unique_id)

    # Convert input atlas to df, using the same suffix rule as the deposit step:
    # ".csv" is comma separated, anything else is read as tab-delimited. Reading
    # a tab-delimited atlas with the default comma separator can fail or
    # mis-split rows when a field contains a comma.
    input_sep = "," if deposit_atlas_file_name.endswith(".csv") else "\t"
    input_atlas_df = pd.read_csv(deposit_atlas_file_name, sep=input_sep)

    # Check dataframe dims against expectations
    if input_atlas_df.shape[0] == retrieved_atlas_df.shape[0]:
        logger.info('Input and deposited atlas have the same number of compounds.')
    else:
        # A mismatch is a problem the user must see, so log it at WARNING level.
        logger.warning('Warning! Input and deposited atlas do not have the same number of compounds.')