In [1]:
from typing import Optional

import DeepLCMS_utils as utils
import pandas as pd
from DeepLCMS_data import inspect_database

# Inspecting the list of available metabolomics studies from the Metabolomics Workbench website

In [2]:
datasets = inspect_database.return_metabolomics_workbench_studies()

In [3]:
inspect_database.filter_and_sort_datasets(datasets, min_samples=200).head(5)

Unnamed: 0,Study ID,Study Title,Species,Institute,Analysis,Released,Version,Samples,format,file_size_number,file_size_metric
349,ST002482,Non-targeted screening of natural products fro...,Alternaria / Aspergillus / Botrytis / Anthosto...,Agriculture and Agri-Food Canada,LC-MS#,2023-07-02,1,325,mzML,1.7,G
80,ST002790,Community metabolomes reflect taxon-specific f...,Marine plankton,University of Washington,LC-MS#,2023-08-07,1,232,mzML,3.1,G
383,ST002432,Metabolic impacts of metformin to seasonal inf...,Homo sapiens,The Jackson Laboratory for Genomic Medicine,LC-MS#,2023-01-20,1,360,mzML,3.9,G
612,ST002168,Multi-omics analyses of 398 foxtail millet acc...,Foxtail millet (Setaria italica),Shanxi Agricultural University,LC-MS#,2022-05-31,1,1088,mzML,4.9,G
848,ST001926,Modular evolution of the Drosophila metabolome,Drosophila melanogaster,University of Washington,LC-MS#,2022-02-02,1,261,mzXML,5.2,G


# Taking a look at ST002432

In [4]:
sample_list = (
    pd.read_html(
        "https://www.metabolomicsworkbench.org/data/subject_fetch.php?STUDY_ID=ST002432&STUDY_TYPE=MS&RESULT_TYPE=5"
    )[1]
    .rename(columns=lambda x: x.replace(" ", "_").lower().split(":")[0])
    .drop(columns=["subject_name", "sample_data"])
    .assign(sample_name=lambda df: df.sample_name.str.split(".", expand=True)[0])
)

In [5]:
for column in sample_list:
    print(sample_list[column].value_counts(), end=3 * "\n")

mb_sample_id
SA243004    1
SA243246    1
SA243260    1
SA243244    1
SA243262    1
           ..
SA243103    1
SA243106    1
SA243113    1
SA243091    1
SA243329    1
Name: count, Length: 360, dtype: int64


sample_name
YW_20201206_071    1
YW_20201206_112    1
YW_20201206_096    1
YW_20201206_095    1
YW_20201206_086    1
                  ..
YW_20201210_247    1
YW_20201210_238    1
YW_20201210_237    1
YW_20201210_192    1
YW_20201210_256    1
Name: count, Length: 360, dtype: int64


visit
V1    60
V2    60
V3    60
V4    60
V5    60
V6    60
Name: count, dtype: int64


treatment
Metformin    192
Placebo      168
Name: count, dtype: int64




When first checking the data, we noticed a mismatch in the number of files listed on the Metabolomics Workbench website. They claimed there were 360 mzML files, but after downloading and unzipping, we ended up with over 500 files. Since we're unsure about the diagnosis for the extra samples, we'll exclude them from our study by removing them.

In [6]:
%%script echo skipping

data_path = utils.Configuration.RAW_DATA_PATH.joinpath("ST002432_IIV_Metformin")

MS_files = pd.Series([item.stem for item in list(data_path.rglob("*.mzML"))])
print(
    f" Number of raw data files dowloaded from Metabolomics Workbench : {len(MS_files)}"
)
print(
    f" Number of unique raw data files amongst them : {(MS_files.str[:15].value_counts().sort_values(ascending=False).values == 1).sum()}"
)

Couldn't find program: 'echo'


In [7]:
%%script echo skipping

# identify files that are unique to the ones saved on the hard drive, we will delete these
files_to_delete = MS_files[~MS_files.isin(sample_list.sample_name)]

# delete the surplus files
[
    item.unlink()
    for item in list(data_path.rglob("*.mzML"))
    if item.stem in list(files_to_delete)
]

Couldn't find program: 'echo'


In [8]:
data_path = utils.Configuration.RAW_DATA_PATH.joinpath("ST002432_IIV_Metformin")

MS_files = pd.Series([item.stem for item in list(data_path.rglob("*.mzML"))])
print(
    f" Number of raw data files dowloaded from Metabolomics Workbench : {len(MS_files)}"
)

MS_files[MS_files.isin(sample_list.sample_name)]

 Number of raw data files dowloaded from Metabolomics Workbench : 355


0      YW_20201206_033_20201208153824
1      YW_20201206_034_20201208154344
2      YW_20201206_035_20201208154904
3      YW_20201206_036_20201208155425
4      YW_20201206_037_20201208155945
                    ...              
350                   YW_20201210_252
351                   YW_20201210_253
352                   YW_20201210_254
353                   YW_20201210_255
354                   YW_20201210_256
Length: 355, dtype: object

As observed, we've saved 355 files on disk, while the Metabolomics Workbench website lists 360, indicating 5 missing files. Not a problem. Our plan is to filter out those 5 missing sample IDs from our sample list and then save the updated sample list to disk.

In [10]:
final_sample_list = sample_list[sample_list.sample_name.isin(MS_files)]

final_sample_list.to_parquet(
    f"{utils.Configuration.RAW_DATA_PATH.joinpath('sample_list')}.parquet.gzip",
    compression="gzip",
)

In [11]:
final_sample_list

Unnamed: 0,mb_sample_id,sample_name,visit,treatment
0,SA243004,YW_20201206_071,V1,Metformin
1,SA243001,YW_20201206_072,V1,Metformin
2,SA243002,YW_20201206_089,V1,Metformin
3,SA243010,YW_20201206_090,V1,Metformin
4,SA242999,YW_20201206_101,V1,Metformin
...,...,...,...,...
355,SA243335,YW_20201210_186,V6,Placebo
356,SA243340,YW_20201210_233,V6,Placebo
357,SA243345,YW_20201210_234,V6,Placebo
358,SA243343,YW_20201210_255,V6,Placebo
