## Example for the retrieval of BRITE/KeGG and LIPIDMAPS compound taxonomy.

In [None]:
import time

from metabolinks import read_MassTRIX
from metabolinks.taxonomy import insert_taxonomy, load_local_kegg_db

### Read, clean and check the example MassTRIX file.

This file is in the MassTRIX "compact" format (one peak per line, with several candidate compounds listed on that line).

In [None]:
# Start the clock here: the annotation cell further down reads start_time
# to report how long the whole run took.
start_time = time.time()

testfile_name = '../example_data/MassTRIX_output.tsv'

# Read the MassTRIX results and clean up the columns in one go
# (cleanup_cols removes the uniqueID and 'isotopes' columns).
df = read_MassTRIX(testfile_name).cleanup_cols()

print("File %s was read\n" % testfile_name)

# Expected summary: 15 entries and 24 - 12 = 12 columns.
df.info()

# Last expression of the cell: shows the first rows of the table.
df.head()

In [None]:
# Switch between a local KeGG DB and the KeGG Rest API service.
# IMPORTANT: set this to False to fetch KeGG records from KeGG Rest API service
use_local_kegg_db = True

if use_local_kegg_db:
    kegg_db = load_local_kegg_db('../metabolinks/dbs/kegg_db.txt')
else:
    # No local DB is handed to the annotation step.
    kegg_db = None

### Annotate results with compound taxonomy.

(Use *trace=True* as an argument to activate tracing of annotations)

In [None]:
print('Starting annotating...\n')

# Annotate the table, handing over whatever KeGG DB object was selected earlier.
df = insert_taxonomy(df, local_kegg_db=kegg_db)

# Measured from start_time, which was set in the cell that read the input
# file, so the figure covers everything run since then.
elapsed_time = time.time() - start_time
m, s = divmod(elapsed_time, 60)  # split into whole minutes and leftover seconds
print("Finished in %02dm%02ds" % (m, s))

In [None]:
# Check the result: summary first, then the id and taxonomy columns.
df.info()

taxonomy_cols = [
    'trans_KEGG_Ids',
    'trans_LipidMaps_Ids',
    'Major Class',
    'Class',
    'Secondary Class',
    'Tertiary Class',
    'KNApSAcK',
]

# Last expression of the cell: shows the first 10 rows of those columns.
df[taxonomy_cols].head(10)

### Export the annotated dataframe into a MS-Excel file

Use the same name as the MassTRIX .tsv data file, replacing the file extension with '_raw.xlsx'.

In [None]:
import os.path

# Build the output name from the data file name: drop its extension and
# append '_raw.xlsx'. splitext is used instead of slicing off the last four
# characters so that extensions of any length (or no extension at all) do
# not eat into the base name.
out_fname = os.path.splitext(testfile_name)[0] + '_raw.xlsx'

# Write the annotated table with a header row and without the index column.
df.to_excel(out_fname, header=True, index=False)

print("File {} created".format(out_fname))

### Annotate results with compound taxonomy but, this time, use a black list of BRITE ids.

#### Also, use *trace=True* as an argument to activate tracing of annotations

In [None]:
# Restart the clock: this run is timed on its own, file reading included.
start_time = time.time()

print('Starting annotating...\n')

# Begin again from a freshly read table
# (cleanup_cols removes the uniqueID and 'isotopes' columns).
df = read_MassTRIX(testfile_name).cleanup_cols()

print("File %s was read\n" % testfile_name)

# File holding the black list of BRITE ids.
black_list_fname = '../example_data/blacklist.txt'

# Same annotation as before, now with the black list and tracing switched on.
df = insert_taxonomy(
    df,
    brite_blacklist=black_list_fname,
    local_kegg_db=kegg_db,
    trace=True,
)

elapsed_time = time.time() - start_time
m, s = divmod(elapsed_time, 60)  # split into whole minutes and leftover seconds
print("Finished in %02dm%02ds" % (m, s))

In [None]:
# Check the result: look at the id and taxonomy columns only.
taxonomy_cols = [
    'trans_KEGG_Ids',
    'trans_LipidMaps_Ids',
    'Major Class',
    'Class',
    'Secondary Class',
    'Tertiary Class',
    'KNApSAcK',
]

# Last expression of the cell: shows the first 10 rows of those columns.
df[taxonomy_cols].head(10)