This notebook converts the BN10 metabolomics data to tidy format. It reads the raw data obtained from Julian's link (`data/raw/bn10/BN10_newIDs.csv`) and processes it into the `data/clean/bn10.tidy_metabolomics.feather` file.

This notebook will be ported over to a script in the src/data/ folder!

I'm going to try to match the steps I did in `2018-07-03.bn10_metabolomics.tidy_data.ipynb` as much as possible.

In [1]:
import pandas as pd
import feather

In [2]:
# Input: raw wide-format BN10 table. Output: tidy table written as feather
# at the end of the notebook.
fname = '../../data/raw/bn10/BN10_newIDs.csv'
ftidy = '../../data/clean/bn10.tidy_metabolomics.feather'

# The previously recorded output of this cell ends with a truncated warning,
# presumably pandas' DtypeWarning about mixed-type columns (TODO confirm).
# low_memory=False makes pandas read the file in one pass and infer each
# column's dtype from all of its values instead of chunk by chunk.
df = pd.read_csv(fname, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Peek at the first rows of the raw, wide-format table
df.head(n=5)

Unnamed: 0,Compound,mz,RT,Method,Compound.ID,HMDB.ID...representative.,Metabolite,Number.of.Members,Cluster.Name,Cluster.Number,...,dg.0001,dg.0008,dh.0001,dh.0010,di.0001,di.0009,dj.0001,dj.0016,dk.0001,dk.0003
0,C18n_cmp.QI01,355.2424,7.76,C18n,cmp.QI01,Internal Standard,PGE2-d4,3,PGE2-d4__,759,...,296533.0,294045.0,286796.0,319931.0,298972.0,292705.0,300217.0,303752.0,299045.0,289756.0
1,C18n_cmp.QI02,313.2382,9.83,C18n,cmp.QI02,HMDB04705,12_13-diHOME,3,12_13-diHOME__,495,...,186940.0,228675.0,471212.0,597694.0,147788.0,94566.0,1323674.0,342138.0,277745.0,206769.0
2,C18n_cmp.QI03,313.2382,10.02,C18n,cmp.QI03,HMDB04704,9_10-diHOME,2,9_10-diHOME__,855,...,150893.0,194845.0,354305.0,398334.0,114068.0,166261.0,796338.0,454767.0,414934.0,323602.0
3,C18n_cmp.QI04,295.2274,11.29,C18n,cmp.QI04,HMDB04667,13-HODE,3,13-HODE__,533,...,77153.0,258187.0,182365.0,152416.0,28902.0,98580.0,355470.0,722006.0,108549.0,147500.0
4,C18n_cmp.QI05,335.2226,11.0,C18n,cmp.QI05,,5_6 diHETE,1,5_6 diHETE__,5202,...,12800.0,16872.0,4310.0,8021.0,1838.0,21655.0,189.0,1360.0,6891.0,26992.0


In [4]:
# Show every column name on a single comma-separated line
print(*df.columns, sep=', ')

Compound, mz, RT, Method, Compound.ID, HMDB.ID...representative., Metabolite, Number.of.Members, Cluster.Name, Cluster.Number, Cluster.major.ion, potential.adduct, aa.0163, ab.0140, ac.0002, ad.0002, ad.0005, ae.0001, ae.0003, ae.0007, ae.0024, ae.0069, af.0003, af.0060, ag.0001, ag.0005, ah.0002, ah.0028, ai.0002, aj.0001, ak.0001, ak.0010, al.0002, am.0001, am.0003, am.0006, am.0014, am.0025, am.0052, am.0070, am.0111, am.0224, an.0001, an.0002, an.0004, an.0013, an.0025, an.0030, an.0082, ao.0001, ao.0004, ao.0025, ao.0049, ao.0085, ao.0090, ap.0001, ap.0037, aq.0004, aq.0015, ar.0039, as.0033, at.0004, at.0044, au.0002, av.0006, av.0107, aw.0001, aw.0036, ax.0001, ax.0090, ay.0001, az.0001, ba.0001, ba.0003, ba.0010, ba.0018, ba.0025, ba.0066, ba.0113, bc.0002, bc.0003, bc.0013, bc.0018, bc.0029, bc.0042, bc.0068, bc.0105, bd.0033, be.0001, bf.0003, bf.0108, bh.0001, bh.0003, bh.0012, bh.0025, bh.0050, bh.0066, bh.0087, bh.0123, bi.0001, bi.0026, bj.0001, bj.0004, bj.0014, bj.0032,

# Convert data to tidy format

First, let's melt the wide table so that each row holds a single sample-intensity pair (one measurement of one compound in one sample).

In [5]:
# Columns that describe a compound; they are repeated on every melted row.
id_columns = [
    'Compound', 'mz', 'RT', 'Method',
    'Compound.ID', 'HMDB.ID...representative.',
    'Metabolite', 'Number.of.Members',
    'Cluster.Name', 'Cluster.Number',
    'Cluster.major.ion', 'potential.adduct',
]

# Every remaining column is treated as a sample: its name goes to 'sample'
# and its value to 'intensity'.
tidydf = pd.melt(
    df,
    id_vars=id_columns,
    var_name='sample',
    value_name='intensity',
)
print(tidydf.shape)
tidydf.head()

(8574995, 14)


Unnamed: 0,Compound,mz,RT,Method,Compound.ID,HMDB.ID...representative.,Metabolite,Number.of.Members,Cluster.Name,Cluster.Number,Cluster.major.ion,potential.adduct,sample,intensity
0,C18n_cmp.QI01,355.2424,7.76,C18n,cmp.QI01,Internal Standard,PGE2-d4,3,PGE2-d4__,759,Major,,aa.0163,289686.0
1,C18n_cmp.QI02,313.2382,9.83,C18n,cmp.QI02,HMDB04705,12_13-diHOME,3,12_13-diHOME__,495,Major,,aa.0163,191459.0
2,C18n_cmp.QI03,313.2382,10.02,C18n,cmp.QI03,HMDB04704,9_10-diHOME,2,9_10-diHOME__,855,Major,,aa.0163,196193.0
3,C18n_cmp.QI04,295.2274,11.29,C18n,cmp.QI04,HMDB04667,13-HODE,3,13-HODE__,533,Major,,aa.0163,82445.0
4,C18n_cmp.QI05,335.2226,11.0,C18n,cmp.QI05,,5_6 diHETE,1,5_6 diHETE__,5202,Unclustered,,aa.0163,32595.0


In [6]:
# Clean up column names
tidydf = tidydf.rename(
    columns={'Compound': 'compound',
             'RT': 'rt', 
             'Method': 'method',
             'Compound.ID': 'compound_id',
             'HMDB.ID...representative.': 'hmdb_id',
             'Metabolite': 'metabolite', 
             'Number.of.Members': 'n_members',
             'Cluster.Name': 'cluster_name',
             'Cluster.Number': 'cluster_number', 
             'Cluster.major.ion': 'cluster_major_ion', 
             'potential.adduct': 'potential_adduct'})
tidydf.head()

Unnamed: 0,compound,mz,rt,method,compound_id,hmdb_id,metabolite,n_members,cluster_name,cluster_number,cluster_major_ion,potential_adduct,sample,intensity
0,C18n_cmp.QI01,355.2424,7.76,C18n,cmp.QI01,Internal Standard,PGE2-d4,3,PGE2-d4__,759,Major,,aa.0163,289686.0
1,C18n_cmp.QI02,313.2382,9.83,C18n,cmp.QI02,HMDB04705,12_13-diHOME,3,12_13-diHOME__,495,Major,,aa.0163,191459.0
2,C18n_cmp.QI03,313.2382,10.02,C18n,cmp.QI03,HMDB04704,9_10-diHOME,2,9_10-diHOME__,855,Major,,aa.0163,196193.0
3,C18n_cmp.QI04,295.2274,11.29,C18n,cmp.QI04,HMDB04667,13-HODE,3,13-HODE__,533,Major,,aa.0163,82445.0
4,C18n_cmp.QI05,335.2226,11.0,C18n,cmp.QI05,,5_6 diHETE,1,5_6 diHETE__,5202,Unclustered,,aa.0163,32595.0


In [7]:
# Split sample ID into donor and sample number
tidydf[['donor', 'sample_number']] = tidydf['sample'].str.split('.', expand=True)
tidydf.head()

Unnamed: 0,compound,mz,rt,method,compound_id,hmdb_id,metabolite,n_members,cluster_name,cluster_number,cluster_major_ion,potential_adduct,sample,intensity,donor,sample_number
0,C18n_cmp.QI01,355.2424,7.76,C18n,cmp.QI01,Internal Standard,PGE2-d4,3,PGE2-d4__,759,Major,,aa.0163,289686.0,aa,163
1,C18n_cmp.QI02,313.2382,9.83,C18n,cmp.QI02,HMDB04705,12_13-diHOME,3,12_13-diHOME__,495,Major,,aa.0163,191459.0,aa,163
2,C18n_cmp.QI03,313.2382,10.02,C18n,cmp.QI03,HMDB04704,9_10-diHOME,2,9_10-diHOME__,855,Major,,aa.0163,196193.0,aa,163
3,C18n_cmp.QI04,295.2274,11.29,C18n,cmp.QI04,HMDB04667,13-HODE,3,13-HODE__,533,Major,,aa.0163,82445.0,aa,163
4,C18n_cmp.QI05,335.2226,11.0,C18n,cmp.QI05,,5_6 diHETE,1,5_6 diHETE__,5202,Unclustered,,aa.0163,32595.0,aa,163


## Clean up any molecules measured by multiple methods

In [11]:
# Sanity check: how many different methods per molecule we care about?
# Molecules we care about, grouped by class
scfas = ['propionate', 'butyrate', 'isovalerate']
primary_bile = ['cholate', 'chenodeoxycholate']
secondary_bile = ['deoxycholate', 'lithocholate']

all_mlcls = [*scfas, *primary_bile, *secondary_bile]

# List each distinct (metabolite, method) pair among those molecules, so a
# molecule measured by more than one method shows up on several rows.
is_of_interest = tidydf['metabolite'].isin(all_mlcls)
(
    tidydf.loc[is_of_interest, ['metabolite', 'method']]
    .drop_duplicates()
    .sort_values(by='metabolite')
)

Unnamed: 0,metabolite,method
19934,butyrate,HILn
51,chenodeoxycholate,C18n
56,cholate,C18n
52,deoxycholate,C18n
19962,isovalerate,HILn
50,lithocholate,C18n
19966,lithocholate,HILn
19980,propionate,HILn


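Lithocholate was measured with two methods: which one do I want? Well, since all the other bile acids are measured on the C18 column, let's be consistent and use that here too. To do that, I'll relabel the HILn measurement as `lithocholate-HILn`, so that `lithocholate` refers only to the C18n measurement.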

In [13]:
# Locate the rows holding the HILn measurement of lithocholate ...
is_litho_hiln = (tidydf['metabolite'] == 'lithocholate') & (tidydf['method'] == 'HILn')
litho_hiln_idx = tidydf.index[is_litho_hiln]

# ... and give them a distinct name, so that plain 'lithocholate' no longer
# matches the HILn rows.
tidydf.loc[litho_hiln_idx, 'metabolite'] = 'lithocholate-HILn'

In [14]:
# Re-run the sanity check: distinct (metabolite, method) pairs for the
# molecules of interest after the relabeling.
(
    tidydf.loc[tidydf['metabolite'].isin(all_mlcls), ['metabolite', 'method']]
    .drop_duplicates()
    .sort_values(by='metabolite')
)

Unnamed: 0,metabolite,method
19934,butyrate,HILn
51,chenodeoxycholate,C18n
56,cholate,C18n
52,deoxycholate,C18n
19962,isovalerate,HILn
50,lithocholate,C18n
19980,propionate,HILn


Okay good, that worked.

# Write to file

In [18]:
# Dimensions (rows, columns) of the tidy table just before it is written out
tidydf.shape

(8574995, 16)

In [19]:
# Save the tidy table in feather format at the path held in `ftidy`
feather.write_dataframe(tidydf, ftidy)