In [1]:
from typing import Optional

import pandas as pd

In [2]:
def return_metabolomics_workbench_studies(results_per_page: int = 2000) -> pd.DataFrame:
    """
    Fetches the table of available studies from the Metabolomics Workbench website.

    The study summary page is requested sorted by Study ID in descending order
    and parsed with ``pd.read_html``.

    Parameters:
        results_per_page (int): Value sent as the ``ResultsPerPage`` query
            parameter, i.e. the number of study rows requested (default: 2000).

    Returns:
        pd.DataFrame: The table found at index 2 of the parsed page.

    Example:
    >>> datasets = return_metabolomics_workbench_studies()
    """
    url = (
        "https://www.metabolomicsworkbench.org/data/DRCCStudySummary.php"
        "?Mode=StudySummary&SortBy=Study%20ID&AscDesc=desc"
        f"&ResultsPerPage={results_per_page}"
    )
    # read_html returns one DataFrame per <table> on the page; index 2 is
    # presumably the study listing -- re-check if the page layout changes.
    return pd.read_html(url)[2]


# Download the study listing when this cell runs; `datasets` is the frame
# passed to the filtering step further down.
datasets = return_metabolomics_workbench_studies()

In [3]:
def filter_and_sort_datasets(
    df: pd.DataFrame, min_samples: int = 1, max_samples: Optional[int] = None
) -> pd.DataFrame:
    """
    Filters the study table to LC-MS studies and sorts them by download size.

    The "Download(* : Contains raw data)" column is parsed into three new
    columns and then removed:

    * ``format``: the word following "Data format:".
    * ``file_size_number``: numeric part of the size token (e.g. 1.7 for "1.7G").
    * ``file_size_metric``: first letter of the size unit (e.g. "G").

    Rows are kept only when all of the following hold:

    * a data format could be extracted,
    * ``Samples`` is strictly greater than ``min_samples`` (and strictly less
      than ``max_samples`` when it is given),
    * ``Analysis`` equals "LC-MS#",
    * the size unit is not "T",
    * the format is neither "d" nor "wiff".

    The result is sorted by file size ascending (units are taken into account,
    so "900M" comes before "1.7G"), with ties broken by ``Samples`` descending.

    Parameters:
        df (pd.DataFrame): The DataFrame to be filtered and sorted.
        min_samples (int): Exclusive lower bound on the number of samples (default: 1).
        max_samples (int, optional): Exclusive upper bound on the number of
            samples; no upper bound when None (default: None).

    Returns:
        pd.DataFrame: Filtered and sorted DataFrame.

    Example:
    >>> filtered_data = filter_and_sort_datasets(df, min_samples=50, max_samples=100)
    """
    # Multipliers used only to order rows; unknown or missing units count as 1.
    unit_multipliers = {"B": 1.0, "K": 1e3, "M": 1e6, "G": 1e9, "T": 1e12, "P": 1e15}

    renamed = df.rename(columns={"Download(* : Contains raw data)": "file_size"})
    download_text = renamed["file_size"]

    # Size token such as "1.7G" or "5G". The decimal part is optional so that
    # single-digit integer sizes are not lost.
    size_token = download_text.str.extract(r"(\d+(?:\.\d+)?[a-zA-Z]+)", expand=False)

    parsed = (
        renamed.drop(columns="file_size")
        .assign(
            format=download_text.str.extract(r"Data format:(\w+)", expand=False),
            file_size_number=size_token.str.extract(
                r"(\d+(?:\.\d+)?)", expand=False
            ).astype("float"),
            file_size_metric=size_token.str.extract(r"([a-zA-Z])", expand=False),
        )
        .sort_values(by="Samples", ascending=False)
        .dropna(subset=["format"])
    )

    keep = (
        parsed["Samples"].gt(min_samples)
        & parsed["Analysis"].eq("LC-MS#")
        & parsed["file_size_metric"].ne("T")
        & ~parsed["format"].isin(["d", "wiff"])
    )
    if max_samples is not None:
        keep &= parsed["Samples"].lt(max_samples)
    kept = parsed.loc[keep]

    # Normalise sizes to a common unit so mixed M/G values order correctly.
    multiplier = (
        kept["file_size_metric"]
        .map(lambda u: unit_multipliers.get(u.upper(), 1.0) if isinstance(u, str) else 1.0)
        .astype("float")
    )
    return (
        kept.assign(_size_bytes=kept["file_size_number"] * multiplier)
        .sort_values(by=["_size_bytes", "Samples"], ascending=[True, False])
        .drop(columns="_size_bytes")
    )


# Show the first 20 rows of the filtered, sorted table for studies with more
# than 300 samples (the lower bound is exclusive).
filter_and_sort_datasets(datasets, min_samples=300).head(20)

Unnamed: 0,Study ID,Study Title,Species,Institute,Analysis,Released,Version,Samples,format,file_size_number,file_size_metric
349,ST002482,Non-targeted screening of natural products fro...,Alternaria / Aspergillus / Botrytis / Anthosto...,Agriculture and Agri-Food Canada,LC-MS#,2023-07-02,1,325,mzML,1.7,G
383,ST002432,Metabolic impacts of metformin to seasonal inf...,Homo sapiens,The Jackson Laboratory for Genomic Medicine,LC-MS#,2023-01-20,1,360,mzML,3.9,G
612,ST002168,Multi-omics analyses of 398 foxtail millet acc...,Foxtail millet (Setaria italica),Shanxi Agricultural University,LC-MS#,2022-05-31,1,1088,mzML,4.9,G
768,ST002008,Glycine betaine uptake and metabolism in marin...,Natural mixed marine microbial community,University of Washington,LC-MS#,2022-01-17,1,433,mzML,5.4,G
57,ST002832,Resource competition predicts assembly of in v...,Bacteroides thetaiotaomicron,Stanford University,LC-MS#,2023-09-14,1,402,raw,8.4,G
629,ST002151,"Integrative Exposomic, Transcriptomic, Epigeno...",Homo sapiens,EPA,LC-MS#,2022-05-09,1,344,mzdata,12.7,G
1295,ST001370,Metabolomic profiling of Canadian species of A...,Alternaria sp.,Agriculture and Agri-Food Canada,LC-MS#,2020-06-01,1,338,raw,13.2,G
2,ST002944,Longitudinal polar fecal metabolomics of mice ...,Mus musculus,Ghent University,LC-MS#,2023-11-01,1,329,raw,21.5,G
1140,ST001618,Metabolomics Analysis: Opioid Addiction Projec...,Homo sapiens,University of North Carolina at Chapel Hill,LC-MS#,2022-05-02,1,336,raw,35.3,G
1242,ST001430,Metabolic dynamics and prediction og gestation...,Homo sapiens,Stanford University,LC-MS#,2020-07-24,1,781,mzXML,41.7,G
