## Example for the retrieval of BRITE/KEGG and LIPIDMAPS compound taxonomy.

In [1]:
import os
import time

from metabolinks import read_MassTRIX

### Read, clean and check a MassTRIX file with identification results.

This file is in the MassTRIX "compact" format (one peak per line, with several candidate compounds per peak).

In [2]:
# Timestamp taken before the file is read.
start_time = time.time()

testfile_name = '../example_data/MassTRIX_output.tsv'

# Load the MassTRIX identification results, then drop the columns
# that are not needed for the taxonomy annotation.
df = read_MassTRIX(testfile_name)
df = df.cleanup_cols() # removes uniqueID and 'isotopes' columns

print ("File {} was read\n".format(testfile_name))

# Enforce the expected table size instead of relying on a visual check
# of the info() summary: 15 entries and 12 columns after cleanup.
assert df.shape == (15, 12), \
    "expected 15 entries and 12 columns, got {}".format(df.shape)
df.info()
df.head()

File ../example_data/MassTRIX_output.tsv was read

<class 'metabolinks.masstrix.MassTRIXResults'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 12 columns):
raw_mass                      15 non-null float64
peak_height                   15 non-null float64
corrected_mass                15 non-null float64
npossible                     15 non-null int64
KEGG_mass                     15 non-null object
ppm                           15 non-null object
KEGG_cid                      15 non-null object
KEGG_formula                  15 non-null object
KEGG_name                     15 non-null object
KEGG Pathways                 15 non-null object
KEGG Pathways descriptions    15 non-null object
Compound in Organism(X)       15 non-null object
dtypes: float64(3), int64(1), object(8)
memory usage: 1.5+ KB


Unnamed: 0,raw_mass,peak_height,corrected_mass,npossible,KEGG_mass,ppm,KEGG_cid,KEGG_formula,KEGG_name,KEGG Pathways,KEGG Pathways descriptions,Compound in Organism(X)
0,133.08577,0.0,132.078494,26,133.085920710401#133.085920710401#133.08592071...,-1.1324306198618#-1.1324306198618#-1.132430619...,C03264#C03499#C06103#C07834#HMDB00317#HMDB0040...,C6H12O3#C6H12O3#C6H12O3#C6H12O3#C6H12O3#C6H12O...,D-2-Hydroxyisocaproate ([M+H]+)#Ethyl (R)-3-hy...,null#null#ko00930#null#null#null#null#null#nul...,null#null#;Caprolactam degradation#null#null#n...,null#null#null#null#null#null#null#null#null#n...
1,149.01145,0.0,110.048291,3,149.011172241631#149.011172241631#149.01117224...,1.86400688677955#1.86400688677955#1.8640068867...,C02560#C05130#HMDB03905,C5H6N2O#C5H6N2O#C5H6N2O,N-Acetylimidazole ([M+K39]+)#Imidazole-4-aceta...,null#ko00340#null,null#;Histidine metabolism#null,null#null#null
2,177.00634,0.0,138.043181,4,177.006086861191#177.006086861191#177.00608686...,1.43011153725101#1.43011153725101#1.4301115372...,C00785#C02126#HMDB00301#HMDB02730,C6H6N2O2#C6H6N2O2#C6H6N2O2#C6H6N2O2,Urocanate;Urocanic acid ([M+K39]+)#4-Nitroanil...,ko00340;ko01100#null#null#null,;Histidine metabolism;Metabolic pathways#null#...,null#null#null#null
3,189.04265,0.0,150.079491,1,189.0424724,0.939629703,C17512,C8H10N2O,N-Methylanthranilamide;N-Methyl-2-aminobenzami...,,,
4,203.05828,0.0,164.095121,3,203.057968967081#203.058122434051#203.05812243...,1.53174211364742#0.775964166523288#0.775964166...,C00604#C11224#HMDB12246,C12H8N2#C9H12N2O#C9H12N2O,"1,10-Phenanthroline;o-Phenanthroline ([M+Na]+)...",null#null#null,null#null#null,null#null#null


### Annotate results with compound taxonomy.

(Use *trace=True* as an argument to activate tracing of annotations)

In [3]:
print ('Starting annotating...\n')

# Restart the timer inside this cell so the reported duration covers the
# annotation step only and does not depend on when an earlier cell was run.
start_time = time.time()

# Adds the taxonomy columns to the results table.
df = df.insert_taxonomy()

# Report the elapsed wall-clock time as minutes and seconds.
elapsed_time = time.time() - start_time
m, s = divmod(elapsed_time, 60)
print ("Finished in " + "%02dm%02ds" % (m, s))

Starting annotating...


Loading local DBs
7% done
13% done
20% done
27% done
33% done
40% done
47% done
53% done
60% done
67% done
73% done
80% done
87% done
93% done
100% done

Done! 78 ids processed. 60 DB lookups
Finished in 00m05s


In [4]:
# Check the result: print the table summary, then show the first ten
# rows of the taxonomy-related columns only.
taxonomy_columns = [
    'trans_KEGG_Ids',
    'trans_LipidMaps_Ids',
    'Major Class',
    'Class',
    'Secondary Class',
    'Tertiary Class',
    'KNApSAcK',
]

df.info()
df[taxonomy_columns].head(10)

<class 'metabolinks.masstrix.MassTRIXResults'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 19 columns):
raw_mass                      15 non-null float64
peak_height                   15 non-null float64
corrected_mass                15 non-null float64
npossible                     15 non-null int64
KEGG_mass                     15 non-null object
ppm                           15 non-null object
KEGG_cid                      15 non-null object
KEGG_formula                  15 non-null object
KEGG_name                     15 non-null object
KEGG Pathways                 15 non-null object
KEGG Pathways descriptions    15 non-null object
Compound in Organism(X)       15 non-null object
trans_KEGG_Ids                15 non-null object
trans_LipidMaps_Ids           15 non-null object
Major Class                   15 non-null object
Class                         15 non-null object
Secondary Class               15 non-null object
Tertiary Class                15 non-null object
KNAp

Unnamed: 0,trans_KEGG_Ids,trans_LipidMaps_Ids,Major Class,Class,Secondary Class,Tertiary Class,KNApSAcK
0,C03264#C03499#C06103#C07834#HMDB00317#HMDB0040...,LMFA01050402#C03499#LMFA01050015#C07834#HMDB00...,Anatomical Therapeutic Chemical (ATC) classifi...,Fatty Acyls [FA]#N NERVOUS SYSTEM#FA Fatty acyls,Fatty Acids and Conjugates [FA01]#N05 PSYCHOLE...,N05C HYPNOTICS AND SEDATIVES#Hydroxy fatty aci...,
1,C02560#C05130#C05130,C02560#C05130#HMDB03905,,,,,
2,C00785#C02126#C00785#HMDB02730,C00785#C02126#HMDB00301#HMDB02730,,,,,
3,C17512,C17512,,,,,
4,C00604#C11224#HMDB12246,C00604#C11224#HMDB12246,Pesticides [BR:br08007],Obsolete pesticides,Herbicides,Phenylurea herbicides,
5,C01056#C03043#C16569#HMDB01240#HMDB01329#HMDB0...,C01056#C03043#C16569#HMDB01240#HMDB01329#HMDB0...,Pesticides [BR:br08007],Pesticides,Herbicides,Uracil herbicides,
6,C05636#C05638#C12033#C05638#HMDB13319,C05636#C05638#C12033#HMDB04076#HMDB13319,,,,,
7,C06537#C14742#C15042,C06537#C14742#C15042,Carcinogens [BR:br08008],Group 3: Not classifiable as to its carcinogen...,Compounds,C14742 N-Nitrosodiphenylamine,
8,C01748#C06538#C13311,C01748#C06538#C13311,Phytochemical compounds [BR:br08003],Alkaloids,Alkaloids derived from tryptophan and anthrani...,Indole alkaloids,Plantae
9,C14274#C14718,C14274#C14718,,,,,


### Export the annotated dataframe into a MS-Excel file

Use the same name as the MassTRIX .tsv data file, replacing the '.tsv' extension with '_raw.xlsx'.

In [5]:
# Derive the output name from the input name by stripping its extension,
# whatever its length, rather than slicing off a fixed four characters.
out_fname = os.path.splitext(testfile_name)[0] + '_raw.xlsx'

# Write the annotated table with a header row and without the index column.
df.to_excel(out_fname, header=True, index=False)

print ("File {} created".format(out_fname))

File ../example_data/MassTRIX_output_raw.xlsx created


### Annotate results with compound taxonomy again, this time using a blacklist of BRITE ids.

#### Also, use *trace=True* as an argument to activate tracing of annotations

In [6]:
# Second run, timed within this single cell: re-read the file, then annotate
# with a BRITE blacklist file and with tracing of each annotation enabled.
start_time = time.time()

print('Starting annotating...\n')

# Start again from the file on disk; cleanup_cols() removes the uniqueID
# and 'isotopes' columns.
df = read_MassTRIX(testfile_name).cleanup_cols()

print("File {} was read\n".format(testfile_name))

black_list_fname = '../example_data/blacklist.txt'
df = df.insert_taxonomy(trace=True, brite_blacklist=black_list_fname)

# Report the elapsed wall-clock time as minutes and seconds.
elapsed_time = time.time() - start_time
m, s = divmod(elapsed_time, 60)
print("Finished in " + "%02dm%02ds" % (m, s))

Starting annotating...

File ../example_data/MassTRIX_output.tsv was read


Loading local DBs

++++++++ PEAK +++++++++++++

---- compound: C03264 (C03264, C03264)
(look up C03264)
(look up LMFA01050402)
LIPIDMAPS id: LMFA01050402
('Lipids [LM]', 'Fatty Acyls [FA]', 'Fatty Acids and Conjugates [FA01]', 'Hydroxy fatty acids [FA0105]')

---- compound: C03499 (C03499, C03499)
(look up C03499)
No BRITE, no LIPIDMAPS

---- compound: C06103 (C06103, C06103)
(look up C06103)
from BRITE (with id br08002):
('Lipids [BR:br08002]', 'FA  Fatty acyls', 'FA01 Fatty Acids and Conjugates', 'FA0105 Hydroxy fatty acids')

---- compound: C07834 (C07834, C07834)
(look up C07834)
from BRITE (with id br08303):
('null', 'br08303', '', '')
BRITE class br08303 in blacklist. Skipped

---- compound: HMDB00317 (HMDB00317, HMDB00317)
No BRITE, no LIPIDMAPS

---- compound: HMDB00409 (HMDB00409, HMDB00409)
No BRITE, no LIPIDMAPS

---- compound: HMDB00525 (HMDB00525, HMDB00525)
No BRITE, no LIPIDMAPS

---- compound: H

Plantae

---- compound: C13311 (C13311, C13311)
(look up C13311)
No BRITE, no LIPIDMAPS
60% done

DATA:
trans_KEGG_Ids            : C01748#C06538#C13311
trans_LipidMaps_Ids       : C01748#C06538#C13311
Major Class               : Phytochemical compounds [BR:br08003]
Class                     : Alkaloids
Secondary Class           : Alkaloids derived from tryptophan and anthranilic acid
Tertiary Class            : Indole alkaloids
KNApSAcK                  : Plantae

++++++++ PEAK +++++++++++++

---- compound: C14274 (C14274, C14274)
(look up C14274)
No BRITE, no LIPIDMAPS

---- compound: C14718 (C14718, C14718)
(look up C14718)
No BRITE, no LIPIDMAPS
67% done

DATA:
trans_KEGG_Ids            : C14274#C14718
trans_LipidMaps_Ids       : C14274#C14718
Major Class               : 
Class                     : 
Secondary Class           : 
Tertiary Class            : 
KNApSAcK                  : 

++++++++ PEAK +++++++++++++

---- compound: C11082 (C11082, C11082)
(look up C11082)
No BRITE, n

In [7]:
# check result
df[['trans_KEGG_Ids',
    'trans_LipidMaps_Ids',
    'Major Class', 'Class',
    'Secondary Class',
    'Tertiary Class',
    'KNApSAcK']].head(10)

Unnamed: 0,trans_KEGG_Ids,trans_LipidMaps_Ids,Major Class,Class,Secondary Class,Tertiary Class,KNApSAcK
0,C03264#C03499#C06103#C07834#HMDB00317#HMDB0040...,LMFA01050402#C03499#LMFA01050015#C07834#HMDB00...,Lipids [BR:br08002]#Lipids [LM],Fatty Acyls [FA]#FA Fatty acyls,Fatty Acids and Conjugates [FA01]#FA01 Fatty A...,Hydroxy fatty acids [FA0105]#FA0105 Hydroxy fa...,
1,C02560#C05130#C05130,C02560#C05130#HMDB03905,,,,,
2,C00785#C02126#C00785#HMDB02730,C00785#C02126#HMDB00301#HMDB02730,,,,,
3,C17512,C17512,,,,,
4,C00604#C11224#HMDB12246,C00604#C11224#HMDB12246,,,,,
5,C01056#C03043#C16569#HMDB01240#HMDB01329#HMDB0...,C01056#C03043#C16569#HMDB01240#HMDB01329#HMDB0...,,,,,
6,C05636#C05638#C12033#C05638#HMDB13319,C05636#C05638#C12033#HMDB04076#HMDB13319,,,,,
7,C06537#C14742#C15042,C06537#C14742#C15042,,,,,
8,C01748#C06538#C13311,C01748#C06538#C13311,Phytochemical compounds [BR:br08003],Alkaloids,Alkaloids derived from tryptophan and anthrani...,Indole alkaloids,Plantae
9,C14274#C14718,C14274#C14718,,,,,
