Allow nested event loops.

In [1]:
import nest_asyncio
# Allow event loops to be nested for the rest of this session.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop while the tasks below start their own — confirm.
nest_asyncio.apply()

Source and raw dataset path setup.

Using the standalone storage convention.

In [7]:
from pathlib import Path

# Root of the dataset, located under the current user's home directory.
datasets_path = Path.home().joinpath("Datasets", "nifd-testing")

# The conversion input (sourcedata) and output (rawdata) trees both sit
# directly under the dataset root.
sourcedata_path = datasets_path.joinpath("sourcedata")
rawdata_path = datasets_path.joinpath("rawdata")

print(f"Converting from {sourcedata_path} to {rawdata_path}.")

Converting from /Users/ghislain.vaillant/Datasets/nifd-testing/sourcedata to /Users/ghislain.vaillant/Datasets/nifd-testing/rawdata.


Defining our first task.

In [23]:
from os import PathLike
from typing import Tuple
from pydra.mark import annotate, task


@task
@annotate({"return": {"clinical_data": PathLike, "imaging_data": PathLike}})
def get_collection_files(sourcedata: PathLike):
    """Locate the clinical CSV file and the imaging ZIP archive of a collection.

    Only the top level of ``sourcedata`` is searched (no recursion).

    Parameters
    ----------
    sourcedata : PathLike
        Directory holding the downloaded collection.

    Returns
    -------
    tuple
        ``(clinical_data, imaging_data)``: the path of the first ``*.csv`` file
        and the path of the first ``*.zip`` file found in ``sourcedata``.

    Raises
    ------
    FileNotFoundError
        If ``sourcedata`` contains no ``*.csv`` file or no ``*.zip`` file.
    """
    from pathlib import Path

    sourcedata_path = Path(sourcedata)

    def first_match(pattern):
        # Sort the matches so the selection is reproducible when several files
        # match: glob() yields entries in an arbitrary, filesystem-dependent
        # order.
        matches = sorted(sourcedata_path.glob(pattern))
        if not matches:
            # A bare next() on an empty glob would surface as an opaque
            # StopIteration; name the missing pattern and directory instead.
            raise FileNotFoundError(
                f"No file matching '{pattern}' found in {sourcedata_path}"
            )
        return matches[0]

    return first_match("*.csv"), first_match("*.zip")

Testing our first task.

In [48]:
# Give the task instance its own name: rebinding `task` here would shadow the
# pydra `task` decorator imported earlier and break any later `@task` usage
# that does not re-import it.
collection_files_task = get_collection_files(sourcedata=sourcedata_path)
res1 = collection_files_task()

res1

Result(output=Output(clinical_data=PosixPath('/Users/ghislain.vaillant/Datasets/nifd-testing/sourcedata/NIFD_TESTING_8_09_2021.csv'), imaging_data=PosixPath('/Users/ghislain.vaillant/Datasets/nifd-testing/sourcedata/NIFD_TESTING.zip')), runtime=None, errored=False)

In [79]:
from os import PathLike
from pandas import DataFrame
from pydra.mark import annotate, task


@task
@annotate(
    {
        "return": {
            "participant_metadata": DataFrame,
            "session_metadata": DataFrame,
            "image_data_ids": list,
        }
    }
)
def extract_metadata(clinical_data: PathLike):
    """Extract participant metadata, session metadata and the images to convert.

    Column names are normalised to lower case with spaces replaced by
    underscores. After normalisation the following columns are used:
    ``subject``, ``visit``, ``sex``, ``group``, ``age``, ``modality``,
    ``format`` and ``acq_date``. The index is read from the
    ``Image Data ID`` column.

    Parameters
    ----------
    clinical_data : PathLike
        Path to the clinical data CSV file.

    Returns
    -------
    tuple
        ``(participant_metadata, session_metadata, image_data_ids)`` where

        * ``participant_metadata`` is indexed by ``participant_id`` and holds
          the ``sex`` and ``group`` columns, taken from the earliest visit;
        * ``session_metadata`` is indexed by ``(participant_id, session_id)``
          and holds the ``age`` column;
        * ``image_data_ids`` is a list with one image identifier per
          ``(participant_id, session_id)``: the earliest acquired image whose
          modality is ``"MRI"`` and format is ``"DCM"``.
    """
    from pandas import read_csv

    def normalise(label):
        return label.lower().replace(" ", "_")

    # Read and sanitise clinical data.
    dataframe = (
        read_csv(
            clinical_data,
            index_col="Image Data ID",
            parse_dates=["Acq Date", "Downloaded"],
        )
        .rename_axis(index=normalise)
        .rename(columns=normalise)
        .convert_dtypes()
    )

    # Normalise participant and session identifiers.
    dataframe["participant_id"] = "sub-" + dataframe.subject.str.replace(
        "_", "", regex=False
    )
    dataframe["session_id"] = "ses-" + dataframe.visit.astype("string")

    # Extract participant-level metadata, i.e. sex and group.
    # groupby(sort=...) is a boolean flag that only orders the group keys; it
    # does not order rows inside a group. Rows are therefore sorted explicitly
    # so that first() reads from the earliest visit. Sorting on `visit` rather
    # than `session_id` avoids lexicographic ordering ("ses-10" < "ses-2")
    # when visits are numeric. The stable sort keeps the file order for ties.
    participant_metadata = (
        dataframe.sort_values(by="visit", kind="stable")
        .groupby(by="participant_id", sort=True)
        .first()
    )[["sex", "group"]]

    # Extract session-level metadata, i.e. age.
    session_metadata = (
        dataframe.groupby(by=["participant_id", "session_id"], sort=True)
        .first()
    )[["age"]]

    # Extract images to convert: the earliest acquired MRI DICOM image of each
    # session. As above, the ordering by acquisition date must be done
    # explicitly before grouping.
    is_mri_dicom = (dataframe.modality == "MRI") & (dataframe.format == "DCM")
    first_images = (
        dataframe[is_mri_dicom]
        .reset_index()
        .sort_values(by="acq_date", kind="stable")
        .groupby(by=["participant_id", "session_id"], sort=True)
        .first()
    )
    # Select the column as a Series: squeezing a one-row frame would yield a
    # scalar, which has no tolist().
    image_data_ids = first_images["image_data_id"].tolist()

    return participant_metadata, session_metadata, image_data_ids


# Give the task instance its own name: rebinding `task` here would shadow the
# pydra `task` decorator imported at the top of this cell.
extract_metadata_task = extract_metadata(clinical_data=res1.output.clinical_data)
res2 = extract_metadata_task()

res2

Result(output=Output(participant_metadata=               sex    group
participant_id             
sub-1S0006       M  Patient
sub-2S0004       M  Patient
sub-3S0004       F  Patient, session_metadata=                           age
participant_id session_id     
sub-1S0006     ses-1        62
               ses-2        62
               ses-3        62
sub-2S0004     ses-1        66
               ses-2        66
sub-3S0004     ses-1        57
               ses-2        57
               ses-3        58, image_data_ids=['I216349', 'I227721', 'I245466', 'I272054', 'I272124', 'I710045', 'I709764', 'I709741']), runtime=None, errored=False)

In [80]:
from os import PathLike
from pandas import DataFrame
from pydra.mark import task


@task
def write_metadata(
    rawdata: PathLike,
    participant_metadata: DataFrame,
    session_metadata: DataFrame,
) -> None:
    """Write participant and session metadata as tab-separated files.

    Creates ``<rawdata>/participants.tsv`` and, for every participant found in
    ``session_metadata``, ``<rawdata>/<participant_id>/<participant_id>_sessions.tsv``.
    Missing values are written as ``n/a``. Existing files are overwritten.

    Parameters
    ----------
    rawdata : PathLike
        Root directory of the output dataset; created if it does not exist.
    participant_metadata : DataFrame
        Table written as-is to ``participants.tsv``.
    session_metadata : DataFrame
        Table with a ``participant_id`` index level; that level is dropped
        from each per-participant sessions file.
    """
    from pathlib import Path

    rawdata_dir = Path(rawdata)
    # The output root may not exist yet on a first conversion; without this,
    # writing participants.tsv fails before any sub-directory is created.
    rawdata_dir.mkdir(parents=True, exist_ok=True)

    # Write participants.tsv
    # Paths (not open text handles) are passed to to_csv so that pandas opens
    # the files itself with its default encoding and newline handling.
    participant_metadata.to_csv(
        rawdata_dir / "participants.tsv", sep="\t", na_rep="n/a"
    )

    # Write sessions.tsv
    for participant_id, session_group in session_metadata.groupby("participant_id"):
        participant_dir = rawdata_dir / str(participant_id)
        participant_dir.mkdir(parents=True, exist_ok=True)

        session_group.droplevel("participant_id").to_csv(
            participant_dir / f"{participant_id}_sessions.tsv",
            sep="\t",
            na_rep="n/a",
        )


# Give the task instance its own name: rebinding `task` here would shadow the
# pydra `task` decorator imported at the top of this cell.
write_metadata_task = write_metadata(
    rawdata=rawdata_path,
    participant_metadata=res2.output.participant_metadata,
    session_metadata=res2.output.session_metadata,
)
res3 = write_metadata_task()