###  Jupyter notebook for feature demonstration of `metabolinks` Python module

#### Metabolinks Github home: https://github.com/aeferreira/metabolinks

Install metabolinks by running

`pip install metabolinks`

## Compound taxonomy annotation of MassTRIX results by automatic retrieval of BRITE/KEGG and LIPIDMAPS data.

This notebook demonstrates compound taxonomy annotation of MassTRIX Results.

LipidMaps classes and BRITE/KEGG taxonomy data are inserted as new columns in a results table retrieved from the MassTRIX server.

Optionally, a blacklist file can be supplied so that the BRITE identifiers it lists are ignored.

In [None]:
import os
import time

from metabolinks.dataio import read_MassTRIX
from metabolinks.taxonomy import insert_taxonomy

### Read, clean and check a MassTRIX file with identification results.

This file is in the MassTRIX "compact" format (one peak, several compounds per line)

In [None]:
# Name of the MassTRIX results file that the cells below read with read_MassTRIX().
testfile_name = 'MassTRIX_output.tsv'
def cleanup_cols(df, isotopes=True, uniqueID=True, columns=None):
    """Return `df` without the 'uniqueID', isotope-presence and extra columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is not modified; a new frame is returned.
    isotopes : bool, default True
        If true, drop the isotope-presence columns ("C13", "O18", "N15",
        "S34", "Mg25", "Mg26", "Fe54", "Fe57", "Ca44", "Cl37", "K41").
    uniqueID : bool, default True
        If true, drop the "uniqueID" column.
    columns : str or iterable of str, optional
        Additional column name(s) to drop. A single name may be passed as a
        plain string.

    Returns
    -------
    pandas.DataFrame
        A new frame without the selected columns.

    Raises
    ------
    KeyError
        If any selected column is missing from `df` (default behaviour of
        ``DataFrame.drop``).
    """
    iso_names = (
        "C13",
        "O18",
        "N15",
        "S34",
        "Mg25",
        "Mg26",
        "Fe54",
        "Fe57",
        "Ca44",
        "Cl37",
        "K41",
    )
    col_names = []
    if uniqueID:
        col_names.append("uniqueID")
    if isotopes:
        col_names.extend(iso_names)
    if columns is not None:
        # A bare string would be split into single characters by extend(),
        # so wrap it to treat it as one column name.
        if isinstance(columns, str):
            columns = [columns]
        col_names.extend(columns)
    return df.drop(col_names, axis=1)

# Read the MassTRIX table and strip the uniqueID and 'isotopes' columns in one step.
df = cleanup_cols(read_MassTRIX(testfile_name))

print(f"File {testfile_name} was read")

# Expected: 15 entries and 24 - 12 = 12 columns.
df.info()
df.head()

### Annotate results with compound taxonomy.

(Use *trace=True* as an argument to activate tracing of annotations)

In [None]:
print('Starting annotating...\n')

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()

# `df` is rebound to the annotated table returned by insert_taxonomy().
df = insert_taxonomy(df)

elapsed_time = time.perf_counter() - start_time
m, s = divmod(elapsed_time, 60)
print("Finished in " + "%02dm%02ds" % (m, s))

In [None]:
# Check the result: overall structure first, then the first rows of the
# identifier and classification columns.
df.info()

checked_columns = [
    'trans_KEGG_Ids',
    'trans_LipidMaps_Ids',
    'Major Class',
    'Class',
    'Secondary Class',
    'Tertiary Class',
    'KNApSAcK',
]
df[checked_columns].head(10)

### Export the annotated dataframe into a MS-Excel file

Use the same name as the MassTRIX .tsv data file, replacing the extension with '_compound_taxa.xlsx'

In [None]:
# Strip the extension with splitext() instead of slicing off a fixed number
# of characters, so the output name stays correct for any extension length.
out_fname = os.path.splitext(testfile_name)[0] + '_compound_taxa.xlsx'

df.to_excel(out_fname, header=True, index=False)

print(f"File {out_fname} created")

### Annotate results with compound taxonomy but, this time, use a black list of BRITE ids.

#### Also, use *trace=True* as an argument to activate tracing of annotations

In [None]:
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() differences can.
# The timer is started before reading, so the reported time includes the
# file read as well as the annotation.
start_time = time.perf_counter()

print('Starting annotating...\n')

# Start again from a freshly read and cleaned table.
df = read_MassTRIX(testfile_name)
df = cleanup_cols(df)

print(f"File {testfile_name} was read\n")

black_list_fname = 'blacklist.txt'

# Annotate using the blacklist file, with tracing enabled.
df = insert_taxonomy(df, brite_blacklist=black_list_fname, trace=True)

elapsed_time = time.perf_counter() - start_time
m, s = divmod(elapsed_time, 60)
print("Finished in " + "%02dm%02ds" % (m, s))

In [None]:
# Check the result: first rows of the identifier and classification columns.
checked_columns = [
    'trans_KEGG_Ids',
    'trans_LipidMaps_Ids',
    'Major Class',
    'Class',
    'Secondary Class',
    'Tertiary Class',
    'KNApSAcK',
]
df[checked_columns].head(10)