# Add Atlas to Database
## Parameters
The next code block sets parameters that are used throughout the remainder of the notebook.

In [15]:
# pylint: disable=invalid-name,missing-module-docstring

# REQUIRED: atlas data in CSV file format.
# The initialization cell exits the notebook if this is left as None.
csv_atlas_file_name = None

# REQUIRED: name to assign to the atlas.
# The initialization cell exits the notebook if this is left as None.
atlas_name = None

# REQUIRED: use 'positive' or 'negative'.
# The initialization cell exits the notebook if this is left as None.
polarity = None

# overrides any mz_tolerance value in the CSV file
mz_tolerance = 5

# Choose whether to sort the atlas by RT ('rt_peak') and then MZ ('mz') for analysis.
# When True, a sorted copy of the CSV is written and that copy is deposited.
sort_atlas = True

# Is the atlas to be deposited an internal standard atlas (i.e., with unlabeled and labeled compounds)?
# Only used when sort_atlas is True: rows whose 'label' contains 'unlabeled' are then
# ordered as if their rt_peak were 0.1 larger.
istd_atlas = True

# Run a check to see if the number of compounds in the deposited atlas matches the number in the input atlas.
# Defaults to False because it can take a non-trivial amount of time for larger atlases.
run_retrieval_check = False

############ The rest of this block contains project independent parameters

# To use an older version of the metatlas source code, set this to a commit id,
# branch name, or tag. If None, then use the "main" branch.
source_code_version_id = None

# Threshold for how much status information metatlas functions print in the notebook.
# Levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
log_level = "INFO"

## Initialization

In [None]:
# pylint: disable=wrong-import-position,import-error,missing-class-docstring
import logging  # noqa: E402
from pathlib import Path  # noqa: E402
from IPython.display import Markdown, display  # noqa: E402
import pandas as pd

class StopExecution(Exception):
    """Exception raised to stop running the notebook at the current cell.

    NOTE(review): `_render_traceback_` is presumably the hook IPython consults
    when displaying an exception, so that no traceback is shown -- confirm
    against the IPython version in the kernel image.
    """

    def _render_traceback_(self):
        """Provide no traceback content; always returns None."""
        return None


logger = logging.getLogger("metatlas.jupyter")

# Jupyter kernel spec: launches ipykernel through shifter using the metatlas_shifter image.
kernel_def = """{"argv":["shifter","--entrypoint","--image=ghcr.io/biorack/metatlas/metatlas_shifter:latest","/usr/local/bin/python","-m",
                 "ipykernel_launcher","-f","{connection_file}"],"display_name": "Metatlas Targeted","language": "python",
                 "metadata": { "debugger": true }}"""
kernel_file_name = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted" / "kernel.json"

# Probing under /root can be forbidden for the current user; treat that as "no root kernel".
try:
    has_root_kernel = Path("/root/.local/share/jupyter/kernels/papermill/kernel.json").is_file()
except PermissionError:
    has_root_kernel = False

# Install the per-user kernel spec on first use, then stop so the user can switch kernels.
if not has_root_kernel and not kernel_file_name.is_file():
    kernel_file_name.parent.mkdir(parents=True, exist_ok=True)
    # kernel_def is one string, so write it in a single call
    # (writelines() would iterate over it character by character).
    kernel_file_name.write_text(kernel_def, encoding="utf-8")
    logger.critical('CRITICAL: Notebook kernel has been installed. Set kernel to "Metatlas Targeted" and re-run notebook.')
    raise StopExecution

# metatlas is only importable from the "Metatlas Targeted" kernel.
try:
    from metatlas.tools import notebook  # noqa: E402
except ImportError as err:
    logger.critical('CRITICAL: Set notebook kernel to "Metatlas Targeted" and re-run notebook.')
    raise StopExecution from err

# Check if any required variables are None
if csv_atlas_file_name is None or atlas_name is None or polarity is None:
    raise SystemExit("Exiting notebook due to unset variables in cell 1.")

if ".csv" not in csv_atlas_file_name:
    # Emitted at WARNING level: this runs before notebook.setup(), and with Python's
    # default logging configuration only WARNING and above are shown, so an INFO
    # record here would most likely be dropped.
    # NOTE(review): assumes importing metatlas.tools.notebook does not itself configure logging.
    logger.warning('Warning: if you are sorting this CSV before depositing the atlas, you must input a filename in Cell 1 with the suffix ".csv" or amend the sort function in the notebook')

notebook.setup(log_level, source_code_version_id)

# These imports intentionally come after notebook.setup().
# NOTE(review): presumably so the requested source_code_version_id is the code that gets imported -- confirm.
from metatlas.plots.dill2plots import make_atlas_from_spreadsheet  # noqa: E402
from metatlas.io.metatlas_get_data_helper_fun import make_atlas_df  # noqa: E402

In [13]:
def sort_atlas_csv(input_csv, column1, column2, istd_atlas):
    """
    Read the atlas CSV and return its rows sorted by two columns in ascending order.

    For an internal standard atlas (istd_atlas is truthy) that has a 'label'
    column, rows whose label contains 'unlabeled' are ordered as if their
    'rt_peak' were 0.1 larger. The offset is applied only to a temporary copy
    used to compute the row order; the values in the returned DataFrame are
    exactly those read from the CSV.

    Parameters:
    - input_csv: Path to the input CSV file (anything accepted by pandas.read_csv).
    - column1: The first column to sort by.
    - column2: The second column to sort by.
    - istd_atlas: Whether to apply the 'unlabeled' rt_peak offset while sorting.

    Returns:
    - A DataFrame with the rows in sorted order; row index labels from the
      input are preserved.
    """
    # Offset added to rt_peak of 'unlabeled' rows for ordering purposes only.
    unlabeled_rt_offset = 0.1

    df = pd.read_csv(input_csv)
    sort_columns = [column1, column2]

    if not istd_atlas:
        return df.sort_values(by=sort_columns, ascending=[True, True])

    if 'label' not in df.columns:
        logger.warning("Warning: The 'label' column is missing. Not sorting with heavy isotope compound first even though 'istd_atlas' is set to True.")
        return df.sort_values(by=sort_columns, ascending=[True, True])

    # Casting to str and na=False make missing or non-string labels count as
    # "not unlabeled" instead of raising a TypeError.
    is_unlabeled = df['label'].astype(str).str.contains('unlabeled', regex=False, na=False)
    if not is_unlabeled.any():
        logger.warning("Warning: The designation 'unlabeled' does not appear in the 'label' column. Only set 'istd_atlas' to True if this is an internal standard atlas with isotopic labeling.")

    # Compute the row order on a shifted copy rather than adding and then
    # subtracting the offset on the real data: that float round trip does not
    # always restore the original rt_peak value exactly.
    shifted_rt = df['rt_peak'].where(~is_unlabeled, df['rt_peak'] + unlabeled_rt_offset)
    sort_keys = df.assign(rt_peak=shifted_rt)
    row_order = sort_keys.sort_values(by=sort_columns, ascending=[True, True]).index

    return df.loc[row_order]

In [None]:
# Optionally write a sorted copy of the atlas CSV, then record which file will be deposited.
if sort_atlas:

    # Build the output name from the file suffix only. A plain
    # str.replace(".csv", "_sorted.csv") returns the input name unchanged when
    # ".csv" is absent (so the input file would be overwritten) and also
    # rewrites any ".csv" found in directory names.
    csv_suffix = ".csv"
    if csv_atlas_file_name.lower().endswith(csv_suffix):
        csv_atlas_file_sorted_name = csv_atlas_file_name[: -len(csv_suffix)] + "_sorted.csv"
    else:
        csv_atlas_file_sorted_name = csv_atlas_file_name + "_sorted.csv"

    sorted_df = sort_atlas_csv(csv_atlas_file_name, "rt_peak", "mz", istd_atlas)

    logger.info('Writing sorted atlas to %s and selecting for db deposit', csv_atlas_file_sorted_name)
    sorted_df.to_csv(csv_atlas_file_sorted_name, index=False)

    input_atlas_file_name = csv_atlas_file_sorted_name

else:

    input_atlas_file_name = csv_atlas_file_name

    logger.info('Notice: atlas data not sorted, retaining %s for db deposit', csv_atlas_file_name)


## Atlas generation

In [None]:
%%time
# Deposit the selected atlas file, then build a DataFrame from the stored atlas
# and show its identifiers.
assert csv_atlas_file_name is not None
assert atlas_name is not None

deposit_message = 'Reading in atlas from ' + input_atlas_file_name + ' and depositing to MySQL database at NERSC'
logger.info(deposit_message)
atlas = make_atlas_from_spreadsheet(
    input_atlas_file_name,
    atlas_name,
    filetype="csv",
    polarity=polarity,
    store=True,
    mz_tolerance=mz_tolerance,
)

df_message = 'Making atlas df from ' + atlas.name + ' for downstream checks'
logger.info(df_message)
atlas_df = make_atlas_df(atlas)

# Render the unique id and the name as markdown headings, in that order.
for field_label, field_value in (("unique_id", atlas.unique_id), ("name", atlas.name)):
    display(Markdown(f"### Atlas {field_label}: {field_value}"))

In [None]:
# Optional sanity check: re-read the deposited atlas and compare its row count
# with the CSV that was deposited.
if run_retrieval_check:

    # Imported here so the dependency is only loaded when the check is requested.
    # pandas (pd) is already imported in the initialization cell.
    from metatlas.datastructures.utils import get_atlas

    def atlas_id_to_df(atlas_unique_id: str) -> pd.DataFrame:
        """Retrieve atlas from database using unique id and create DataFrame from compound identification data."""
        return make_atlas_df(get_atlas(atlas_unique_id))

    # Ensure atlas can be retrieved from database
    retrieved_atlas_df = atlas_id_to_df(atlas.unique_id)

    # Convert input atlas to df
    input_atlas_df = pd.read_csv(input_atlas_file_name)

    # Check dataframe dims against expectations (row counts only)
    if input_atlas_df.shape[0] == retrieved_atlas_df.shape[0]:
        logger.info('Input and deposited atlas have the same number of compounds.')
    else:
        # A mismatch is a problem report, so log it at WARNING level: it stays
        # visible even when log_level is raised above INFO.
        logger.warning('Warning! Input and deposited atlas do not have the same number of compounds.')