# 0. Parameters

In [None]:
# pylint: disable=invalid-name,missing-module-docstring

# A list of experiment IDs to pull LCMS runs from
# if None, then the value of the "experiment" variable will be used
run_batches = None

# The name of a workflow defined in the configuration file
workflow_name = None

# The name of an analysis within the workflow
analysis_name = None

# source atlas' "unique_id" field in the database
source_atlas_unique_id = None

# if copy_atlas is True, then generate an atlas specifically for this analysis_number
# should only be set to False if an analysis will not be modifying the atlas or RT ranges
copy_atlas = None

# one of 'positive' or 'negative'
polarity = None

# an integer, increment if you need to redo your analysis
# will be appended to your username to create analysis_id
analysis_number = None

# experiment ID that must match the parent folder containing the LCMS output files
# An example experiment ID is '20201116_JGI-AK_LH_506489_SoilWarm_final_QE-HF_HILICZ_USHXG01530'
experiment = None

# list of substrings that will group together when creating groups
# this provides additional grouping beyond the default grouping on field #12
groups_controlled_vocab = None

# The following include/exclude groups/lcmsruns dictionaries are used to define
# the filtering of groups or lcmsruns at different steps of the analysis
# pipeline.
#
# Each of the values within the dictionary should either be None or a list of
# strings. If a None value is supplied below, then the value will be loaded
# from the configuration file (config_file_name). If a list value is provided
# below, then the list will be utilized (overriding the value in the
# configuration file).
#
# The value associated with the 'always' key will be appended to each of the
# other values within the same dictionary. This appending occurs after all
# dictionary values given in this notebook have been merged with the values
# given in the configuration file.
#
# If the configuration file does not define a key-value pair and the below
# dictionary has None, then no filtering will be performed.

# A group is used only if its name has a substring match against the
# corresponding list of strings below.
include_groups = {
    "always": None,
    "gui": None,
    "qc_outputs": None,
    "ids_spreadsheet": None,
    "chromatograms": None,
    "data_sheets": None,
    "box_plots": None,
}

# Groups whose names contain any of the substrings in the corresponding list
# are excluded. Generally you will want to list the polarities you are not
# using, such as ['NEG', 'FPS'] for a positive polarity analysis.
exclude_groups = {
    "always": None,
    "gui": None,
    "qc_outputs": None,
    "ids_spreadsheet": None,
    "chromatograms": None,
    "data_sheets": None,
    "box_plots": None,
}

# An LCMS run is used only if its name contains one or more of the
# substrings in the corresponding list.
include_lcmsruns = {
    "always": None,
    "gui": None,
    "qc_outputs": None,
    "ids_spreadsheet": None,
    "chromatograms": None,
    "data_sheets": None,
    "box_plots": None,
}

# An LCMS run is removed if its name contains one or more of the
# substrings in the corresponding list.
exclude_lcmsruns = {
    "always": None,
    "gui": None,
    "qc_outputs": None,
    "ids_spreadsheet": None,
    "chromatograms": None,
    "data_sheets": None,
    "box_plots": None,
}

# Create outputs used to QC the run
generate_qc_outputs = None

# thresholds for filtering out compounds with weak MS1 signals
# set to None to disable a filter
num_points = None
peak_height = None

# if True, the post_annotation() function will remove atlas rows marked
# 'Remove' before generating output files
filter_removed = None

# list of tuples, each containing a color name string and a substring pattern.
# Lines in the EIC plot will be colored by the first substring pattern
# that has a match within the name of the hdf5_file. The order they are
# listed in your list is the order they are displayed in the overlays
# (first is front, last is back). Named colors available in matplotlib
# are here: https://matplotlib.org/3.1.0/gallery/color/named_colors.html
# or use hexadecimal values '#000000'. Lines default to black.
line_colors = None

# Set to False to disable check that all compounds have either been
# removed or rated within the annotation GUI before generating outputs.
require_all_evaluated = None

# If True, then create the main set of outputs
generate_analysis_outputs = None

# Groups to be excluded when generating the post annotation outputs:
exclude_groups_for_analysis_outputs = None

# include MSMS fragment ions in the output documents?
# has no effect if generate_analysis_outputs is False
export_msms_fragment_ions = None

# Setting this to True will remove the cache of MSMS hits
# if you don't see MSMS data for any of your compounds in RT adjuster GUI,
# then you might want to try setting this to True. However, it will
# make your notebook take significantly longer to run.
# The cache is per experiment, so clearing the cache will impact other
# notebooks for this same experiment.
clear_cache = None

# This value will always be automatically passed in from the RT_Alignment
# notebook and you should not manually set this parameter.
rt_alignment_number = None

# The rest of this block contains project independent parameters

# Configuration file location
config_file_name = None

# to use an older version of the metatlas source code, set this to a commit id,
# branch name, or tag. If None, then use the "main" branch.
source_code_version_id = None

# Full path to the directory where you want this notebook to store data.
# A subdirectory will be auto created within this directory for each project.
# You can place this anywhere on cori's filesystem, but placing it within your
# global home directory is recommended so that you do not need to worry about
# your data being purged. Each project will take on the order of 100 MB.
project_directory = None

# ID from Google Drive URL for base output folder.
# The default value is the ID that corresponds to 'JGI_Metabolomics_Projects'.
google_folder = None

# maximum number of CPUs to use
# when running on jupyter.nersc.gov, you are not allowed to set this above 4
max_cpus = None

# Threshold for how much status information metatlas functions print in the notebook
# levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
log_level = None

In [None]:
# pylint: disable=wrong-import-position,import-error,missing-class-docstring
import logging  # noqa: E402
from pathlib import Path  # noqa: E402


class StopExecution(Exception):
    """Exception raised to halt the notebook; it renders no traceback text."""

    def _render_traceback_(self):
        """Return None, i.e. supply no traceback lines for this exception."""
        return None


# Snapshot the notebook's global names as they exist at this point, skipping
# names that start with "_" and the listed IPython/builtin helper names.
# This must run before the names below are defined so they are not captured.
parameters = {k: v for k, v in globals().items() if k[0] != "_" and k not in ["In", "Out", "get_ipython", "exit", "quit", "open"]}
logger = logging.getLogger("metatlas.jupyter")
# Contents of the kernel.json for the "Metatlas Targeted" kernel. The text is
# written out verbatim: "{connection_file}" is intentionally left as a literal
# placeholder and is never formatted in this notebook.
kernel_def = """{"argv":["shifter","--entrypoint","--image=doejgi/metatlas_shifter:latest","/usr/local/bin/python","-m",
                 "ipykernel_launcher","-f","{connection_file}"],"display_name": "Metatlas Targeted","language": "python",
                 "metadata": { "debugger": true }}"""
kernel_file_name = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted" / "kernel.json"
try:
    has_root_kernel = Path("/root/.local/share/jupyter/kernels/papermill/kernel.json").is_file()
except PermissionError:
    # /root is not readable by this user; treat the root kernel as absent
    has_root_kernel = False
if not has_root_kernel and not kernel_file_name.is_file():
    # No usable kernel definition found: install one for the current user and
    # stop so that the user can switch kernels before re-running.
    kernel_file_name.parent.mkdir(parents=True, exist_ok=True)
    with kernel_file_name.open(mode="w", encoding="utf-8") as f:
        # kernel_def is a single string, so write() is the right call;
        # writelines() would iterate over it one character at a time.
        f.write(kernel_def)
    logger.critical('CRITICAL: Notebook kernel has been installed. Set kernel to "Metatlas Targeted" and re-run notebook.')
    raise StopExecution
try:
    from metatlas.tools import notebook, config  # noqa: E402
except ImportError as err:
    # metatlas is not importable from the currently selected kernel
    logger.critical('CRITICAL: Set notebook kernel to "Metatlas Targeted" and re-run notebook.')
    raise StopExecution from err
# Resolve the configuration, workflow and analysis from the captured names,
# then initialise the notebook using the resolved log level and source version.
configuration, workflow, analysis = config.get_config(parameters)
notebook.setup(analysis.parameters.log_level, analysis.parameters.source_code_version_id)

import getpass  # noqa: E402
import pandas as pd  # noqa: E402
from IPython.display import display, HTML  # noqa: E402
from metatlas.plots import dill2plots as dp  # noqa: E402
from metatlas.datastructures import metatlas_objects as metob  # noqa: E402
from metatlas.datastructures.analysis_identifiers import AnalysisIdentifiers  # noqa: E402
from metatlas.targeted.process import pre_annotation, annotation_gui, post_annotation  # noqa: E402

# Let the notebook container use the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))
# Raise pandas display limits so that large tables are not truncated
pd.set_option("display.max_rows", 5000)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_colwidth", 100)
# Fail early, with a message naming the missing parameter, if a required
# notebook parameter was left unset.
assert experiment is not None, "The 'experiment' parameter must be set"
assert source_atlas_unique_id is not None, "The 'source_atlas_unique_id' parameter must be set"
assert project_directory is not None, "The 'project_directory' parameter must be set"
# When no batches are given, pull LCMS runs from the main experiment only
if run_batches is None:
    run_batches = [experiment]
%matplotlib widget
# Login name of the current user; used later when building the group version identifier
username = getpass.getuser()

# 1. Set output directory

In [None]:
! pwd

In [None]:
# Build a preliminary set of analysis identifiers from the notebook parameters.
# username, lcmsruns and all_groups are passed as None here. The resulting
# object supplies the output directory below and, in later cells, the
# include/exclude filters used while creating and selecting groups.
initial_ids = AnalysisIdentifiers(
    project_directory=project_directory,
    experiment=experiment,
    configuration=configuration,
    workflow=workflow_name,
    analysis=analysis_name,
    analysis_number=analysis_number,
    source_atlas_unique_id=source_atlas_unique_id,
    username=None,
    lcmsruns=None,
    all_groups=None,
)
output_dir = initial_ids.output_dir

# 2. Create Groups
## Find your files

In [None]:
# Look up the files for every experiment in run_batches (name="%" is the
# search pattern passed through; presumably a match-all wildcard — confirm).
files = dp.get_metatlas_files(experiment=run_batches, name="%", most_recent=True)
# Tabulate the results and show a few identifying columns for review
df = metob.to_dataframe(files)
display(df[["experiment", "name", "username", "acquisition_time"]])

In [None]:
# Number of files returned by the lookup above
len(files)

# OPTION A: Automated Group Maker

In [None]:
# STEP 1: View the groups

initial_ids.set_output_state(analysis.parameters, "gui")

files = dp.get_metatlas_files(experiment=run_batches, name="%", most_recent=True)

# Substrings that mark special run types. An entry may start with "_": the
# leading "_" takes part in the (case-insensitive) substring match against the
# file name but is stripped from the resulting short name.
# NOTE(review): the `groups_controlled_vocab` notebook parameter is not
# consulted here — confirm whether this hard-coded list is intended.
controlled_vocab = ["QC", "InjBl", "ISTD"]
version_identifier = f"{username}_0_{analysis_number}"
file_dict = {}
groups_dict = {}
# WARNING - This currently does not filter based on initial_ids.include_lcmsruns
for f in files:
    if any(exclude_string in f.name for exclude_string in initial_ids.exclude_lcmsruns):
        continue
    # File name up to the first "." and its "_"-separated fields
    k = f.name.split(".")[0]
    fields = k.split("_")
    prefix = "_".join(fields[:11])
    # First controlled vocabulary entry found in the file name, if any
    k_lower = k.lower()
    vocab_match = next((s for s in controlled_vocab if s.lower() in k_lower), None)
    if vocab_match is not None:
        short_name = vocab_match.lstrip("_")
    else:
        # No controlled vocabulary match: group on field #12 of the file name
        short_name = fields[12]
    group_name = f"{prefix}_{version_identifier}_{short_name}"
    short_name = f"{fields[9]}_{short_name}"  # Prepending POL to short_name
    file_dict[k] = {"file": f, "group": group_name, "short_name": short_name}
    groups_dict[group_name] = {"items": [], "name": group_name, "short_name": short_name}
# Attach each retained file to its group in a single pass over the files
for file_value in file_dict.values():
    groups_dict[file_value["group"]]["items"].append(file_value["file"])
# One row per retained file. The columns are given explicitly so that the
# table is still well formed when no files were retained.
df = pd.DataFrame(
    [
        {"filename": file_stem, "group": file_info["group"], "short_name": file_info["short_name"]}
        for file_stem, file_info in file_dict.items()
    ],
    columns=["filename", "group", "short_name"],
)
df.head(100)

In [None]:
# STEP 2: create the groups variable, if the above looks OK

groups = []
for group_label, group_info in groups_dict.items():
    # Build one Group per entry of groups_dict and list its member files
    new_group = metob.Group(
        name=group_label,
        items=group_info["items"],
        short_name=group_info["short_name"],
    )
    groups.append(new_group)
    for member in new_group.items:
        print(new_group.name, new_group.short_name, member.name)
    # Blank line between groups
    print()

In [None]:
# STEP 3 Option A: store the contents of the groups variable in the DB
# (currently only the long group name is stored)
metob.store(groups)

# 3. Select groups of files to operate on

In [None]:
# Select the groups whose names match the search string, applying the
# include/exclude group filters held by initial_ids.
groups = dp.select_groups_for_analysis(
    name=f"{experiment}%",  # <- edit text search string here
    most_recent=True,
    remove_empty=True,
    include_list=initial_ids.include_groups,
    exclude_list=initial_ids.exclude_groups,
)
print("sorted groups")
# Order the groups by name and list them with their position
groups = sorted(groups, key=lambda group: group.name)
for position, selected_group in enumerate(groups):
    print(position, selected_group.name)

In [None]:
# To view metadata about your groups, run this cell: it converts the
# selected groups into a dataframe, which the notebook displays.
metob.to_dataframe(groups)

# 4. Load data, generating metatlas_dataset

Reads data from .h5 files and generates a metatlas_dataset instance. 

In [None]:
# Generate the metatlas_dataset from the notebook parameters, the resolved
# configuration/workflow/analysis, the files found in section 2 and the
# groups selected in section 3.
metatlas_dataset = pre_annotation(
    experiment=experiment,
    rt_alignment_number=rt_alignment_number,
    analysis_number=analysis_number,
    source_atlas_unique_id=source_atlas_unique_id,
    configuration=configuration,
    workflow=workflow,
    analysis=analysis,
    lcmsruns=files,
    all_groups=groups,
)

# 5. Annotation GUI
If you are re-running this notebook and do not need to make additional changes to RT min/max bounds, then you can skip running the next code cell. Skipping will save you from calculating MSMS hits twice.

In [None]:
# Open the annotation GUI on the dataset; line colors come from the analysis
# parameters. compound_idx=0 is presumably the first compound shown — confirm.
agui = annotation_gui(data=metatlas_dataset, compound_idx=0, width=15, height=3, colors=analysis.parameters.line_colors)

# 6. Generate standard outputs and upload to Google Drive

In [None]:
# Run the post-annotation step on the dataset (per this section's heading:
# generate the standard outputs and upload them to Google Drive).
post_annotation(data=metatlas_dataset, configuration=configuration, workflow=workflow, analysis=analysis)