In [1]:
from typing import Optional

import pandas as pd

from deeplcms_functions import inspect_database, utils

# Inspecting the list of available metabolomics studies from the Metabolomics Workbench website

In [2]:
datasets = inspect_database.return_metabolomics_workbench_studies()

In [4]:
inspect_database.filter_and_sort_datasets(datasets, min_samples=200).head(20)

Unnamed: 0,Study ID,Study Title,Species,Institute,Analysis,Released,Version,Samples,format,file_size_number,file_size_metric
353,ST002482,Non-targeted screening of natural products fro...,Alternaria / Aspergillus / Botrytis / Anthosto...,Agriculture and Agri-Food Canada,LC-MS#,2023-07-02,1,325,mzML,1.7,G
84,ST002790,Community metabolomes reflect taxon-specific f...,Marine plankton,University of Washington,LC-MS#,2023-08-07,1,232,mzML,3.1,G
387,ST002432,Metabolic impacts of metformin to seasonal inf...,Homo sapiens,The Jackson Laboratory for Genomic Medicine,LC-MS#,2023-01-20,1,360,mzML,3.9,G
617,ST002168,Multi-omics analyses of 398 foxtail millet acc...,Foxtail millet (Setaria italica),Shanxi Agricultural University,LC-MS#,2022-05-31,1,1088,mzML,4.9,G
853,ST001926,Modular evolution of the Drosophila metabolome,Drosophila melanogaster,University of Washington,LC-MS#,2022-02-02,1,261,mzXML,5.2,G
773,ST002008,Glycine betaine uptake and metabolism in marin...,Natural mixed marine microbial community,University of Washington,LC-MS#,2022-01-17,1,433,mzML,5.4,G
664,ST002120,Feasibility of detecting AC and SCC using UPLC...,Homo sapiens,Ocean University of China,LC-MS#,2022-07-20,1,229,raw,7.4,G
61,ST002832,Resource competition predicts assembly of in v...,Bacteroides thetaiotaomicron,Stanford University,LC-MS#,2023-09-14,1,402,raw,8.4,G
555,ST002233,Batch variation of large scale LC-MS metabolom...,Homo sapiens,The Jackson Laboratory for Genomic Medicine,LC-MS#,2022-11-01,1,268,raw,9.5,G
738,ST002044,An observational study of cardiovascular patie...,Homo sapiens,Translational Health Science And Technology In...,LC-MS#,2022-02-08,1,286,mzML,10.8,G


# Taking a look at ST002432

In [6]:
sample_list = (
    pd.read_html(
        "https://www.metabolomicsworkbench.org/data/subject_fetch.php?STUDY_ID=ST001618&STUDY_TYPE=MS&RESULT_TYPE=5"
    )[1]
    .rename(columns=lambda x: x.replace(" ", "_").lower().split(":")[0])
    .drop(columns=["subject_name", "sample_data"])
    .assign(sample_name=lambda df: df.sample_name.str.split(".", expand=True)[0])
)
sample_list

Unnamed: 0,mb_sample_id,sample_name,phenotype
0,SA137303,U_10,Non-User
1,SA137263,U_101,Non-User
2,SA137264,U_103,Non-User
3,SA137265,U_106,Non-User
4,SA137262,U_113,Non-User
...,...,...,...
331,SA137457,U_90,User
332,SA137458,U_91,User
333,SA137460,U_92,User
334,SA137459,U_97,User


In [7]:
for column in sample_list:
    print(sample_list[column].value_counts(), end=3 * "\n")

mb_sample_id
SA137303    1
SA137369    1
SA137510    1
SA137552    1
SA137445    1
           ..
SA137333    1
SA137327    1
SA137326    1
SA137337    1
SA137416    1
Name: count, Length: 336, dtype: int64


sample_name
U_10     1
U_368    1
U_379    1
U_376    1
U_374    1
        ..
SP_32    1
SP_31    1
SP_30    1
SP_3     1
U_99     1
Name: count, Length: 336, dtype: int64


phenotype
User           218
Non-User        80
Study_Pools     38
Name: count, dtype: int64




When first checking the data, we noticed a mismatch in the number of files listed on the Metabolomics Workbench website. They claimed there were 360 mzML files, but after downloading and unzipping, we ended up with over 500 files. Since we're unsure about the diagnosis for the extra samples, we'll exclude them from our study by removing them.

In [6]:
%%script echo skipping

data_path = utils.Configuration.RAW_DATA_PATH.joinpath("ST002432_IIV_Metformin")

MS_files = pd.Series([item.stem for item in list(data_path.rglob("*.mzML"))])
print(
    f" Number of raw data files dowloaded from Metabolomics Workbench : {len(MS_files)}"
)
print(
    f" Number of unique raw data files amongst them : {(MS_files.str[:15].value_counts().sort_values(ascending=False).values == 1).sum()}"
)

Couldn't find program: 'echo'


In [7]:
%%script echo skipping

# identify files that are unique to the ones saved on the hard drive, we will delete these
files_to_delete = MS_files[~MS_files.isin(sample_list.sample_name)]

# delete the surplus files
[
    item.unlink()
    for item in list(data_path.rglob("*.mzML"))
    if item.stem in list(files_to_delete)
]

Couldn't find program: 'echo'


In [8]:
data_path = utils.Configuration.RAW_DATA_PATH.joinpath("ST002432_IIV_Metformin")

MS_files = pd.Series([item.stem for item in list(data_path.rglob("*.mzML"))])
print(
    f" Number of raw data files dowloaded from Metabolomics Workbench : {len(MS_files)}"
)

MS_files[MS_files.isin(sample_list.sample_name)]

 Number of raw data files dowloaded from Metabolomics Workbench : 355


0      YW_20201206_033_20201208153824
1      YW_20201206_034_20201208154344
2      YW_20201206_035_20201208154904
3      YW_20201206_036_20201208155425
4      YW_20201206_037_20201208155945
                    ...              
350                   YW_20201210_252
351                   YW_20201210_253
352                   YW_20201210_254
353                   YW_20201210_255
354                   YW_20201210_256
Length: 355, dtype: object

As observed, we've saved 355 files on disk, while the Metabolomics Workbench website lists 360, indicating 5 missing files. Not a problem. Our plan is to filter out those 5 missing sample IDs from our sample list and then save the updated sample list to disk.

In [10]:
final_sample_list = sample_list[sample_list.sample_name.isin(MS_files)]

final_sample_list.to_parquet(
    f"{utils.Configuration.RAW_DATA_PATH.joinpath('sample_list')}.parquet.gzip",
    compression="gzip",
)

In [11]:
final_sample_list

Unnamed: 0,mb_sample_id,sample_name,visit,treatment
0,SA243004,YW_20201206_071,V1,Metformin
1,SA243001,YW_20201206_072,V1,Metformin
2,SA243002,YW_20201206_089,V1,Metformin
3,SA243010,YW_20201206_090,V1,Metformin
4,SA242999,YW_20201206_101,V1,Metformin
...,...,...,...,...
355,SA243335,YW_20201210_186,V6,Placebo
356,SA243340,YW_20201210_233,V6,Placebo
357,SA243345,YW_20201210_234,V6,Placebo
358,SA243343,YW_20201210_255,V6,Placebo
