In [None]:
from typing import Optional

import pandas as pd

from deeplcms_functions import inspect_database, utils

# Inspecting the list of available metabolomics studies from the Metabolomics Workbench website

In [None]:
# Retrieve the study listing through the project helper and keep it in `datasets`
# for the filtering step below. NOTE(review): the helper lives in
# deeplcms_functions.inspect_database; presumably it queries the Metabolomics
# Workbench website — confirm against its definition.
datasets = inspect_database.return_metabolomics_workbench_studies()

In [None]:
# Preview the first 20 rows returned by the helper when called with min_samples=200
# (presumably studies with at least 200 samples, in the helper's sort order —
# confirm against the helper's definition).
inspect_database.filter_and_sort_datasets(datasets, min_samples=200).head(20)

# Taking a look at ST001618

In [None]:
# Parse every HTML table on the ST001618 subject page; the table at index 1 is the one used.
subject_page_tables = pd.read_html(
    "https://www.metabolomicsworkbench.org/data/subject_fetch.php?STUDY_ID=ST001618&STUDY_TYPE=MS&RESULT_TYPE=5"
)
samples_raw = subject_page_tables[1]

# Normalise the headers (spaces -> underscores, lower-case, cut at the first ":")
# and discard the two columns that are not needed.
samples_tidy = samples_raw.rename(
    columns=lambda header: header.replace(" ", "_").lower().split(":")[0]
).drop(columns=["subject_name", "sample_data"])

# Keep only the part of each sample name that precedes the first ".".
samples_tidy = samples_tidy.assign(
    sample_name=samples_tidy["sample_name"].str.split(".", expand=True)[0]
)

# filtered out the study pools samples: only names containing "U" are kept
sample_list = samples_tidy.loc[lambda frame: frame["sample_name"].str.contains("U")]
sample_list

In [None]:
# Print the value frequencies of every column, leaving blank lines between columns.
column_separator = "\n" * 3
for column_name in sample_list.columns:
    frequencies = sample_list[column_name].value_counts()
    print(frequencies, end=column_separator)

When we first checked the data, we noticed a mismatch between the number of files listed on the Metabolomics Workbench website and the number we actually obtained: the website lists 360 mzML files, but after downloading and unzipping we ended up with over 500 files. Since we are unsure of the diagnosis for the extra samples, we exclude them from our study.

In [None]:
# Folder containing the unzipped study download, under the configured raw-data path.
data_path = utils.Configuration.RAW_DATA_PATH.joinpath("ST001618_Opium_study_LC_MS")

# Stems (file name without the extension) of all .mzML files found recursively.
MS_files = pd.Series([mzml_file.stem for mzml_file in data_path.rglob("*.mzML")])

# How many 15-character name prefixes occur exactly once among the files.
prefix_occurrences = MS_files.str[:15].value_counts()
n_prefixes_seen_once = (prefix_occurrences == 1).sum()

print(
    f" Number of raw data files dowloaded from Metabolomics Workbench : {len(MS_files)}"
)
print(f" Number of unique raw data files amongst them : {n_prefixes_seen_once}")

As observed, we've saved 355 files on disk, while the Metabolomics Workbench website lists 360, indicating 5 missing files. Not a problem. Our plan is to filter out those 5 missing sample IDs from our sample list and then save the updated sample list to disk.

In [None]:
# Keep only the samples whose raw file is actually on disk, then persist the list.
# `MS_files` holds the stems of the downloaded mzML files; a sample is kept when
# its `sample_name` equals one of those stems.
has_raw_file = sample_list.sample_name.isin(MS_files)
if not has_raw_file.any():
    # Guard against a naming mismatch silently producing (and saving) an empty list.
    raise ValueError(
        "No sample name matches a downloaded mzML file stem; "
        "check the download folder and the file naming."
    )
print(
    f" Number of samples without a raw data file on disk : {(~has_raw_file).sum()}"
)

# Rebinding is idempotent: re-running this cell on the filtered list drops nothing.
sample_list = sample_list[has_raw_file]

# Same path construction as the read-back cell, so both always point at one file.
sample_list.to_parquet(
    utils.Configuration.RAW_DATA_PATH.joinpath("sample_list.parquet.gzip"),
    compression="gzip",
)

In [None]:
# Read the saved sample list back from the raw-data folder and display it.
pd.read_parquet(utils.Configuration.RAW_DATA_PATH / "sample_list.parquet.gzip")

In [None]:
# Sanity check on naming: the number of MS file stems that appear in the sample
# list's `sample_name` column must equal the number of rows in the sample list.
# An AssertionError is raised here when the two disagree.
n_files_in_sample_list = MS_files.isin(sample_list["sample_name"]).sum()
assert n_files_in_sample_list == len(sample_list)