## Example of retrieving the BRITE/KEGG and LIPID MAPS compound taxonomy.

In [None]:
from __future__ import print_function
import os
import time

import pandas as pd

from masstrix import read_MassTRIX
from annotations import annotate_df, load_local_kegg_db

### Read, clean and check the example MassTRIX file.

This file is in the MassTRIX "compact" format (one peak per line, with several candidate compounds on that line).

In [None]:
# Announce the run and start the wall-clock timer used by the timing cell below.
print('Starting...\n')
start_time = time.time()

# Load the example MassTRIX results into a dataframe.
testfile_name = 'example_data/MassTRIX_output.tsv'
df = read_MassTRIX(testfile_name)
print("File {} was read\n".format(testfile_name))

# Remove the columns at positions 9 to 20, which are not needed.
# Original author's note: indexing df.columns with a list of column *names*
# (df.columns[['name', 'name 2', ..., 'name n']]) was tried and does not work,
# so the columns are selected by position instead.
df.drop(df.columns[list(range(9, 21))], axis=1, inplace=True)

# Check the result: per the original note, expect 15 entries and
# 24 - 12 = 12 columns.
df.info()
df.head()

In [None]:
# Default: no local database. This value is kept only if the assignment
# below is commented out.
kegg_db = None

# Use a local KEGG DB, loaded from the file 'dbs/kegg_db.txt'.
# IMPORTANT: Comment out the next line to fetch KEGG records from the KEGG REST API service instead
kegg_db = load_local_kegg_db('dbs/kegg_db.txt')

### Call the main driver function.

(Pass *trace=True* as an additional argument to activate tracing of the annotations.)

In [None]:
# Run the annotation driver over the whole dataframe. kegg_db is forwarded
# as the local KEGG database (it is None when the local DB cell is disabled).
print('Starting annotating...\n')
df = annotate_df(df, local_kegg_db=kegg_db)

In [None]:
# Wall-clock time since start_time was recorded, split into minutes and seconds.
elapsed_time = time.time() - start_time
m, s = divmod(elapsed_time, 60)

# %02d truncates the float values to zero-padded whole minutes and seconds.
print("Finished in " + ("%02dm%02ds" % (m, s)))

In [None]:
# Check the result: print the dataframe summary, then show the first 10 rows.
df.info()
df.head(10)

### Export the annotated dataframe to an MS Excel file

Use the same name as the MassTRIX .tsv data file, replacing the '.tsv' extension with '_raw.xlsx'.

In [None]:
# Derive the output name from the input name: strip the input file's
# extension, whatever its length, and append '_raw.xlsx'. Slicing off a fixed
# 4 characters would mangle the name for any extension other than a dot plus
# three characters (or for a name with no extension at all).
out_fname = os.path.splitext(testfile_name)[0] + '_raw.xlsx'

# Write the annotated dataframe with a header row and without the index column.
df.to_excel(out_fname, header=True, index=False)

print("File {} created".format(out_fname))