# Parameters

The next code block sets parameters that are used throughout the remainder of the notebook.

In [None]:
# Polarity to analyze: one of 'positive' or 'negative'
polarity = 'positive'

# Type of output to generate: one of 'ISTDsEtc' or 'FinalEMA-HILIC'
output_type = 'FinalEMA-HILIC'

# An integer; increment it if you need to redo your analysis.
# It is appended to your username to create analysis_id.
analysis_number = 0

# Experiment ID. It must match the parent folder containing the LCMS output files
# and must consist of 9 fields when split on '_'.
# An example experiment ID is '20201116_JGI-AK_LH_506489_SoilWarm_final_QE-HF_HILICZ_USHXG01530'
experiment = 'REPLACE ME'

# Exclude files with names containing any of the substrings in this list. E.g., ['peas', 'beans']
exclude_files = []

# Exclude groups with names containing any of the substrings in this list.
# Groups of the other polarity ('POS' or 'NEG') are excluded automatically later,
# so you shouldn't list them here.
exclude_groups = ['QC','InjBl']

# Thresholds for filtering out compounds with weak MS1 signals
# (passed to filter_compounds_by_signal as num_points and peak_height).
num_points_passing = 5
peak_height_passing = 4e5

# Include MSMS fragment ions in the output documents?
export_msms_fragment_ions = False

# List of substrings that will group files together when creating groups.
# This provides additional grouping beyond the default grouping on field #12.
groups_controlled_vocab = ['QC','InjBl','ISTD']

# List of tuples, each containing a color name and a substring pattern.
# Lines in the EIC plot will be colored by the first substring pattern
# that has a match within the name of the hdf5_file. The order they are
# listed in your list is the order they are displayed in the overlays
# (first is front, last is back). Named colors available in matplotlib
# are here: https://matplotlib.org/3.1.0/gallery/color/named_colors.html
# or use hexadecimal values such as '#000000'. Lines default to black.
rt_adjuster_color_list = [('red','ExCtrl'),
                          ('green','TxCtrl'),
                          ('blue','InjBl')]

# The rest of this block contains project-independent parameters

# Full path to the directory where you have cloned the metatlas git repo.
# If you ran the 'git clone ...' command in your home directory on Cori,
# then you'll want '/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metatlas'
# where the uppercase letters are replaced based on your NERSC username.
metatlas_repo_path = '/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metatlas'


# Full path to the directory where you want this notebook to store data.
# A subdirectory will be auto-created within this directory for each project.
# You can place this anywhere on Cori's filesystem, but placing it within your
# global home directory is recommended so that you do not need to worry about
# your data being purged. Each project will take on the order of 100 MB.
project_directory = '/global/homes/FIRST-INITIAL-OF-USERNAME/USERNAME/metabolomics_projects'

# Maximum number of CPUs to use.
# When running on jupyter.nersc.gov, you are not allowed to set this above 4.
max_cpus = 4

# Threshold for how much status information metatlas functions print in the notebook.
# Levels are 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.
log_level = 'INFO'

In [None]:
%matplotlib notebook

import sys, os
# Turn off HDF5 file locking; set here so it is in place before the metatlas
# modules are imported below.
os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'

# Put the cloned metatlas repo first on the import path.
sys.path.insert(0, metatlas_repo_path)

# NOTE(review): 'dataset' appears to be imported only as a probe that the
# expected Jupyter kernel is active -- confirm.
try:
    import dataset
except ModuleNotFoundError as err:
    print('Could not find dataset module. Please check that the kernel is set to "metatlas py3".')
    # Chain the original error so the traceback shows which import failed.
    raise ValueError('Invalid kernel setting in Jupyter Notebook.') from err

if not os.path.exists(metatlas_repo_path):
    print('Directory set for metatlas_repo_path parameter does not exist.')
    raise ValueError('Invalid metatlas_repo_path parameter in Jupyter Notebook.')

try:
    from metatlas.tools import fastanalysis as fa
except ModuleNotFoundError as err:
    print('Could not find metatlas module. In the Parameters block, please check the value of metatlas_repo_path.')
    raise ValueError('Invalid metatlas_repo_path parameter in Jupyter Notebook.') from err
from metatlas.plots import dill2plots as dp
from metatlas.io import metatlas_get_data_helper_fun as ma_data
from metatlas.datastructures import metatlas_objects as metob
from metatlas.datastructures import metatlas_dataset as mads
from metatlas.tools.logging import activate_logging

import getpass
import logging
import numpy as np
import pandas as pd
import pickle
import time
from functools import partial
from importlib import reload
from pathlib import Path
from IPython.core.display import display, HTML

# Validate the user-supplied parameters before any work is done.
# Raises ValueError with a message naming the offending parameter.
if polarity not in ('positive', 'negative'):
    raise ValueError('Parameter polarity is not one of "positive" or "negative".')

# The MS1 filtering thresholds (num_points_passing, peak_height_passing) are
# taken from the Parameters block unchanged, whatever the output_type is.
if output_type not in ('ISTDsEtc', 'FinalEMA-HILIC'):
    raise ValueError('Parameter output_type is not one of "ISTDsEtc" or "FinalEMA-HILIC".')

# Compare case-insensitively so the 'REPLACE ME' placeholder is recognized.
if experiment.strip().lower() == 'replace me':
    raise ValueError('Parameter experiment has not been set.')
if len(experiment.split('_')) != 9:
    raise ValueError('Parameter experiment does not contain 9 fields when split on "_".')

# Configure metatlas logging at the requested level and get the notebook's logger.
activate_logging(console_level=log_level)
logger = logging.getLogger('metatlas.jupyter')

# analysis_id is the current username followed by analysis_number.
username = getpass.getuser()
analysis_id = f"{username}{analysis_number}"
output_dir = os.path.join(project_directory, experiment, analysis_id, output_type)

# Short identifier built from experiment fields 0 and 3 plus the analysis_id.
experiment_fields = experiment.split('_')
short_experiment_analysis_id = '_'.join([experiment_fields[0], experiment_fields[3], analysis_id])

for directory in (project_directory, output_dir):
    os.makedirs(directory, exist_ok=True)

# set notebook to have minimal side margins
display(HTML("<style>.container { width:100% !important; }</style>"))

# Let pandas show large tables in full.
for option, value in (
    ('display.max_rows', 5000),
    ('display.max_columns', 500),
    ('display.max_colwidth', 100),
):
    pd.set_option(option, value)

logger.info(
    "experiment=%s, analysis_id=%s, short_experiment_analysis_id=%s",
    experiment,
    analysis_id,
    short_experiment_analysis_id,
)
logger.info("output_dir=%s", output_dir)

# LCMS filenaming convention

### You must assign your raw files into experimental groups for analysis.  These are used for downstream statistics and for selection of specific groups for filtering to subsets of files for analysis (Ex. just pos or just neg).

The groups are created from common file headers and the unique group names. The convention our lab group uses for filenames is as follows: 
***
DATE_NORTHENLABINITIALS_COLLABINITIALS_PROJ_EXP_SAMPSET_SYSTEM_COLUMN-method_SERIAL_POL_ACQ_SAMPLENUMBER_SAMPLEGROUP_REP_OPTIONAL_SEQ 

Ex.:20180105_SK_AD_ENIGMA_PseudoInt_R2ADec2017_QE119_50454_123456_POS_MSMS_001_Psyringae-R2A-30C-20hr_Rep01_NA_Seq001.raw
***
The common header consists of the fields 0-10: DATE_NORTHENLABINITIALS_COLLABINITIALS_PROJ_EXP_SAMPSET_SYSTEM_COLUMN-method_SERIAL_POL_ACQ 

The sample group name is commonly field # 12 (between underscore 11 and 12) -0 indexed-
# Find your files
1. On the first line of the block below, set the 'experiment' and 'name' variables to find your files.  These fields require wildcards for partial string searches
2. 'Experiment' is the folder name within global/project/projectdirs/metatlas/raw_data, that will be emailed to you when the files are uploaded to NERSC.  You can also look in the raw_data directory for the NERSC user who uploaded your files; your experiment folder should be in there.
3. 'name' is a string that will match a subset of your files within that folder.  

In [None]:
# Look up the LCMS runs for this experiment; '%' is used as the name wildcard.
files = dp.get_metatlas_files(
    experiment=experiment,
    name='%',
    most_recent=True,
)
df = metob.to_dataframe(files)
logger.info(
    "Number of LCMS output files matching '%s' is: %d.",
    experiment,
    len(files),
)
# Preview the first rows of the file table.
df.head()

# Make Groups
This will attempt to create groups in an automated fashion (rather than filling out a spreadsheet with a list of files and group names).  If your files are all in one folder at NERSC, you can use this option.  If not, use option B below.

A long group name consisting of the common header + either controlled vocab value or field #12 along with a short group name (just controlled vocab or field #12) will be stored in a local variable.  The short group names can be used on plots.


1. STEP 1: View the groups
    1. Pick an experiment folder to look for files in on the metob.retrieve function
    2. Enter controlled vocabulary for control files to put select files into groups when control string may be in a different field (not #12) or as a randomly placed substring within a field (ex. if 'InjBl' is included in your controlled vocab list, files like _InjBl-MeOH_ and _StartInjBl_ will group together)
    3. If your group name is not between _ 11 and 12 you can adjust those values in the split commands below.  All other (non-controlledvocab) groups will be created from that field.
2. STEP 2: Create the groups variable after checking the output from STEP 1
3. STEP 3: <br />
    Option A: If the group names and short names look fine, store the groups once you have confirmed, by running STEPS 1 and 2 and checking their output, that your files are in the correct groups.<br />
    Option B (optional): If you would like to edit the groups, uncomment the options B-I and B-II. Run Option B-I to export a prefilled tab infosheet. Edit the file and then run Option B-II to import the new groups and save it. 

In [None]:
#STEP 1: View the groups
# Builds two dicts from the experiment's files:
#   file_dict:   filename (without extension) -> {'file', 'group', 'short_name'}
#   groups_dict: long group name -> {'items', 'name', 'short_name'}
# and displays a table of filename / group / short_name for review.
files = dp.get_metatlas_files(experiment=experiment, name='%', most_recent=True)
file_dict = {}
groups_dict = {}
for f in files:
    # skip files whose name contains any of the excluded substrings
    if any(substring in f.name for substring in exclude_files):
        continue
    k = f.name.split('.')[0]
    tokens = k.split('_')  # split once and reuse
    prefix = '_'.join(tokens[:11])
    polarity_field = tokens[9]
    # first controlled vocab term found in the filename (case-insensitive), if any
    k_lower = k.lower()
    vocab_match = next((s for s in groups_controlled_vocab if s.lower() in k_lower), None)
    # otherwise the group label is field #12 of the filename
    group_label = vocab_match.lstrip('_') if vocab_match is not None else tokens[12]
    group_name = '%s_%s_%s' % (prefix, analysis_id, group_label)
    short_name = polarity_field + '_' + group_label  # Prepending POL to short_name
    file_dict[k] = {'file': f, 'group': group_name, 'short_name': short_name}
    groups_dict[group_name] = {'items': [], 'name': group_name, 'short_name': short_name}

df = pd.DataFrame(file_dict).T
df.index.name = 'filename'
df.reset_index(inplace=True)
# errors='ignore' keeps this cell from failing when no files matched
# (the 'file' column does not exist in that case)
df.drop(columns=['file'], inplace=True, errors='ignore')

# Attach each file to its group in a single pass over the files.
for file_value in file_dict.values():
    groups_dict[file_value['group']]['items'].append(file_value['file'])
df.head(100)

In [None]:
#STEP 2: create the groups variable, if the above looks OK
# One metob.Group is built per entry of groups_dict; each member file is printed
# next to its group's long and short names, with a blank line between groups.
groups = []
for long_name, spec in groups_dict.items():
    new_group = metob.Group(
        name=long_name,
        items=spec['items'],
        short_name=spec['short_name'],
    )
    groups.append(new_group)
    for member in new_group.items:
        print(new_group.name, new_group.short_name, member.name)
    print()

In [None]:
# STEP 3 Option A: store the content of the groups variable (created in STEP 2) in the DB
# (currently only the long group name is stored)
metob.store(groups)

## Make data frame of short filenames and samplenames
Uncomment the below 2 blocks to make short file names and sample names.<br>
This creates a dataframe and a csv file which can be edited, exported and imported. 

In [None]:
# Make short_filename and short_samplename
# Builds short_names_df (indexed by full_filename, with columns sample_treatment,
# short_filename and short_samplename) and writes it to short_names.csv in output_dir.
files = metob.retrieve('lcmsruns', experiment=experiment, username='*')
# '_'-delimited filename fields that make up each short name
short_filename_delim_ids = [0, 2, 4, 5, 7, 9, 14]
short_samplename_delim_ids = [9, 12, 13, 14]

# Collect one record per file and build the DataFrame once, rather than
# growing it row by row.
rows = []
for f in files:
    full_filename = f.name.split('.')[0]
    tokens = full_filename.split('_')
    rows.append({
        'full_filename': full_filename,
        'sample_treatment': str(tokens[12]),  # delim 12
        'short_filename': '_'.join(str(tokens[i]) for i in short_filename_delim_ids),
        'short_samplename': '_'.join(str(tokens[i]) for i in short_samplename_delim_ids),
        'last_modified': pd.to_datetime(f.last_modified, unit='s'),
    })

# Passing the columns explicitly keeps the sort below working even when no files matched.
short_names_df = pd.DataFrame(
    rows,
    columns=['full_filename', 'sample_treatment', 'short_filename', 'short_samplename', 'last_modified'],
)
# When a filename occurs more than once, keep only its most recently modified entry.
short_names_df.sort_values(by='last_modified', inplace=True)
short_names_df.drop(columns=['last_modified'], inplace=True)
short_names_df.drop_duplicates(subset=['full_filename'], keep='last', inplace=True)
short_names_df.set_index('full_filename', inplace=True)
short_names_df.to_csv(os.path.join(output_dir, 'short_names.csv'), sep=',', index=True)

# Select groups of files to operate on

Here, you will assign your database groups to a local variable which will be used downstream in the notebook for analyzing your data with an atlas.

1. In the block below, fill out the fields for name, include_list and exclude_list using text strings from the group names you created in the previous step.  The include/exclude lists do not need wildcards.  Name is a string unique to all of your groups (ex. fields 0-11 of your filenames)

### Typically, you will run one polarity at a time.

In [None]:
# Select the stored groups for this experiment, leaving out the excluded groups
# and the groups of the opposite polarity.
# A new list is built instead of appending to exclude_groups, so re-running this
# cell (or re-running it after changing polarity) does not accumulate 'POS'/'NEG'
# entries in the user's parameter.
other_polarity = 'NEG' if polarity == 'positive' else 'POS'
exclude_list = [*exclude_groups, other_polarity]
groups = dp.select_groups_for_analysis(name=experiment + '%',
                                       most_recent=True,
                                       remove_empty=True,
                                       include_list=[], exclude_list=exclude_list)
print("sorted groups")
groups = sorted(groups, key=lambda x: x.name)
for i, a in enumerate(groups):
    print(i, a.name)
metob.to_dataframe(groups)

# Select Atlas to use

1. The first block will retrieve a list of atlases matching the 'name' string that you enter.  Also, you must enter your username.
2. The next block will select one from the list, using the index number.  Make sure to enter the index number for the atlas you want to use for your analysis by setting in this line: atlas_idx = 0

In [None]:
# List the current user's atlases whose name contains
# '_<POL>_<short_experiment_analysis_id>', each with its index and
# last-modified time.
name_query = f"%_{polarity[:3].upper()}_{short_experiment_analysis_id}%"
# Reuse name_query rather than repeating the pattern in the call.
atlases = metob.retrieve('atlases', name=name_query, username=username)
for i, a in enumerate(atlases):
    print(i, a.name, pd.to_datetime(a.last_modified, unit='s'))

In [None]:
# Index of the atlas to use, taken from the list of retrieved atlases.
atlas_idx = 0
selected_atlas = atlases[atlas_idx]
metatlas_dataset = mads.MetatlasDataset(selected_atlas, groups, max_cpus=max_cpus)
# Write the data sources tables for the selected groups and atlas into output_dir.
ma_data.make_data_sources_tables(groups, metatlas_dataset.atlas, output_dir)

# Optional: Filter atlas for compounds with no or low signals

In [None]:
# Filter the dataset's compounds using the num_points_passing and
# peak_height_passing thresholds.
metatlas_dataset.filter_compounds_by_signal(num_points=num_points_passing, peak_height=peak_height_passing)

In [None]:
# Open the RT adjuster on the dataset, starting at the first compound
# (compound_idx=0). Lines are colored according to rt_adjuster_color_list.
a = dp.adjust_rt_for_selected_compound(metatlas_dataset, msms_hits=metatlas_dataset.hits,
                                       color_me=rt_adjuster_color_list,
                                       compound_idx=0, alpha=0.5, width=18, height=3)

# Export results files
### Filter out compounds with ms1_notes of 'remove'

In [None]:
# Filter out the compounds whose ms1_notes are 'remove' before exporting.
metatlas_dataset.filter_compounds_ms1_notes_remove()

### Export Atlas to a Spreadsheet

The peak flags that you set and selected from the rt adjuster radio buttons will be saved in a column called id_notes

In [None]:
# The export file name is '<POL>_<atlas name>_export' inside output_dir.
atlas_export_basename = f"{polarity[:3].upper()}_{metatlas_dataset.atlas.name}_export"
export_atlas_filename = os.path.join(output_dir, atlas_export_basename)
atlas_identifications = dp.export_atlas_to_spreadsheet(
    metatlas_dataset.atlas,
    export_atlas_filename,
)
logger.info("Exported atlas to file: %s.", export_atlas_filename)

### Export MSMS match scores, stats sheets, and final identification table

This block creates a number of files:

1. compound_scores.csv
2. stats_table.tab
3. filtered and unfiltered peak heights, areas, msms scores, mz centroid, mz ppm error, num of fragment matches, rt delta, rt peak
4. final identification sheet that is formatted for use as a supplemental table for manuscript submission.  You will need to manually complete some columns.  Please discuss with Ben, Katherine, Daniel or Suzie before using for the first time.

The kwargs below set the filtering thresholds for the parameters indicated.

In [None]:
# Thresholds used by fa.test_scores_df to decide whether a compound passes.
kwargs = {
    'min_intensity': 1e4,  # strict = 1e5, loose = 1e3
    'rt_tolerance': .5,  # >= shift of median RT across all files for given compound to reference
    'mz_tolerance': 20,  # strict = 5, loose = 25; >= ppm of median mz across all files for given compound relative to reference
    # strict = 0.6, loose = 0.3 <= highest compound dot-product score across all files for given compound relative to reference
    'min_msms_score': .6,
    'allow_no_msms': True,
    # strict = 3 and 0.1, loose = 1, 0.01 number of matching mzs when calculating max_msms_score
    # and ratio of second highest to first highest intensity of matching sample mzs
    'min_num_frag_matches': 1,
    'min_relative_frag_intensity': .001,
}
polarity_prefix = polarity[:3].upper()

scores_df = fa.make_scores_df(metatlas_dataset, metatlas_dataset.hits)
scores_df['passing'] = fa.test_scores_df(scores_df, **kwargs)

pass_atlas_df, fail_atlas_df, pass_dataset, fail_dataset = fa.filter_atlas_and_dataset(
    scores_df, metatlas_dataset.atlas_df, metatlas_dataset, column='passing')

fa.make_stats_table(input_dataset=metatlas_dataset, msms_hits=metatlas_dataset.hits,
                    output_loc=output_dir, min_peak_height=1e5, use_labels=True,
                    min_msms_score=0.01, min_num_frag_matches=1,
                    include_lcmsruns=[], exclude_lcmsruns=['QC'], polarity=polarity_prefix)

# to_csv does not create missing directories, so make sure the target exists.
stats_tables_dir = os.path.join(output_dir, 'stats_tables')
os.makedirs(stats_tables_dir, exist_ok=True)
scores_df.to_csv(os.path.join(stats_tables_dir, polarity_prefix + '_compound_scores.csv'))

### Export EIC chromatograms as individual pdfs for each compound

1.  There are three options for formatting your EIC output using the "group =" line below:
    1. 'page' will print each sample group on a new page of a pdf file
    2. 'index' will label each group with a letter
    3. None will print all of the groups on one page with very small subplot labels
2. The Y axis scale can be shared across all files using share_y = True or set to the max within each file using share_y = False
3. To use short names for plots, short_names_df should be provided as input. Additionally the header column to be used for short names should be provided as follows (short_names_df=short_names_df, short_names_header='short_samplename'). Header options are sample_treatment, short_filename, short_samplename. These are optional parameters

In [None]:
# Plot layout options for the EIC export.
group = 'index'  # 'page' or 'index' or None
save = True
share_y = True

# Export the EIC chromatograms, labeling samples with their short sample names.
dp.make_chromatograms(
    input_dataset=metatlas_dataset,
    include_lcmsruns=[],
    exclude_lcmsruns=['InjBl', 'QC', 'Blank', 'blank'],
    group=group,
    share_y=share_y,
    save=save,
    output_loc=output_dir,
    short_names_df=short_names_df,
    short_names_header='short_samplename',
    polarity=polarity[:3].upper(),
)

### Export MSMS mirror plots as individual pdfs for each compound

1. use_labels = True will use the compound names you provided in your atlas, if you set it to false, the compounds will be named with the first synonym available from pubchem which could be a common name, iupac name, cas number, vendor part number, etc. 
2.  The include and exclude lists will match partial strings in filenames, do not use wildcards.
3. If short_names_df is provided as input, short_samplename is used for plots.

In [None]:
# Export the MSMS identification figures for the dataset into output_dir.
dp.make_identification_figure_v2(
    input_dataset=metatlas_dataset,
    msms_hits=metatlas_dataset.hits,
    use_labels=True,
    include_lcmsruns=[],
    exclude_lcmsruns=['InjBl', 'QC', 'Blank', 'blank'],
    output_loc=output_dir,
    short_names_df=short_names_df,
    polarity=polarity[:3].upper(),
)

### Data Sheets
1. To include short names in the output, short_names_df should be provided as input to make_output_dataframe. 
2. ylabel is optional

In [None]:
# Bind the arguments shared by every data sheet; only fieldname varies per call.
output_dataframe = partial(
    dp.make_output_dataframe,
    input_dataset=metatlas_dataset,
    include_lcmsruns=[],
    exclude_lcmsruns=[],
    output_loc=os.path.join(output_dir, polarity[:3].upper() + '_data_sheets'),
    short_names_df=short_names_df,
    polarity=polarity[:3].upper(),
    use_labels=True,
)
# One data sheet per field, generated in the order listed.
peak_height, peak_area, mz_peak, rt_peak, mz_centroid, rt_centroid = [
    output_dataframe(fieldname=fieldname)
    for fieldname in ('peak_height', 'peak_area', 'mz_peak', 'rt_peak', 'mz_centroid', 'rt_centroid')
]

### Box plots

In [None]:
# One set of box plots per data sheet, each written to '<POL>_boxplot_<field>' in output_dir.
for sheet, field, axis_label in (
    (rt_peak, 'rt_peak', "RT Peak"),
    (peak_height, 'peak_height', "Peak Height"),
    (mz_centroid, 'mz_centroid', "MZ Centroid"),
):
    boxplot_dir = os.path.join(output_dir, polarity[:3].upper() + '_boxplot_' + field)
    dp.make_boxplot_plots(sheet, output_loc=boxplot_dir, ylabel=axis_label)

### Export MSMS fragment Ions

In [None]:
# Export, for each compound, the MSMS fragment ions of the spectrum with the most
# intense precursor found inside the compound's RT window, to spectra_1pct_450cut.csv.
if export_msms_fragment_ions:
    intensity_fraction = 0.01  # keep fragments above this fraction of the most intense fragment
    min_mz = 450.0  # minimum m/z to export in msms
    max_mz = -40.0  # offset added to the precursor m/z to get the upper m/z bound (0.5 is a good number)
    scale_intensity = True
    data = []
    for compound_index in range(len(metatlas_dataset[0])):
        # Find the file holding the most intense precursor within the RT window.
        max_intensity = 0
        for file_index in range(len(metatlas_dataset)):
            try:
                compound = metatlas_dataset[file_index][compound_index]
                msms = compound['data']['msms']['data']
                pk_idx = msms['precursor_intensity'].argmax()
                pk = msms['precursor_intensity'][pk_idx]
                precursor_mz = msms['precursor_MZ'][pk_idx]
                rt = msms['rt'][pk_idx]
                rt_ref = compound['identification'].rt_references[-1]
                if pk > max_intensity and rt_ref.rt_min < rt < rt_ref.rt_max:
                    good_index = file_index
                    max_intensity = pk
                    final_mz = precursor_mz  # save this for filtering below
            except Exception:
                # Best effort: skip files without usable MSMS data for this compound.
                # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
                continue

        mz = None
        intensity = None
        spectra = None
        if max_intensity > 0:
            msms = metatlas_dataset[good_index][compound_index]['data']['msms']['data']
            idx = np.argwhere(msms['precursor_intensity'] == max_intensity).flatten()
            fragment_mz = msms['mz'][idx]
            fragment_intensity = msms['i'][idx]
            cutoff = intensity_fraction * fragment_intensity.max()
            conditions = (
                (fragment_intensity > cutoff)
                & (fragment_mz > min_mz)
                & (fragment_mz < (final_mz + max_mz))
            )
            if conditions.any():
                keep_idx = np.argwhere(conditions).flatten()
                mz = '[' + ', '.join('%.2f' % x for x in fragment_mz[keep_idx]) + ']'
                if scale_intensity:
                    # Scale so the most intense fragment (before filtering) becomes 1e5.
                    fragment_intensity = fragment_intensity / fragment_intensity.max()
                    fragment_intensity = fragment_intensity * 1e5
                    fragment_intensity = fragment_intensity.astype(int)
                intensity = '[' + ', '.join('%d' % x for x in fragment_intensity[keep_idx]) + ']'
                spectra = '[' + mz + ', ' + intensity + ']'

        # file_index is left over from the loop above, i.e. it is the last file.
        # NOTE(review): assumes the identification name is the same in every file -- confirm.
        data.append({
            'name': metatlas_dataset[file_index][compound_index]['identification'].name,
            'spectrum': spectra,
            'mz': mz,
            'intensity': intensity,
        })
    # Explicit columns keep the selection below valid even when there are no compounds.
    data = pd.DataFrame(data, columns=['name', 'spectrum', 'mz', 'intensity'])
    data[['name', 'mz', 'intensity']].to_csv(os.path.join(output_dir, 'spectra_1pct_450cut.csv'), index=False)
    # to look at it type this:
    data.head(20)