diff --git a/CODEOWNERS b/CODEOWNERS index c245f8bebc..5dbad3bf60 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -13,4 +13,8 @@ /src/04-modality-specific-files/01-magnetic-resonance-imaging-data.md @chrisgorgo /src/04-modality-specific-files/03-electroencephalography.md @sappelhoff @ezemikulan /src/04-modality-specific-files/04-intracranial-electroencephalography.md @ezemikulan +/src/05-derivatives/03-imaging.md @effigies +/src/05-derivatives/04-structural-derivatives.md @edickie @ahoopes +/src/05-derivatives/05-functional-derivatives.md @effigies +/src/05-derivatives/06-diffusion-derivatives.md @francopestilli @oesteban @Lestropie /src/99-appendices/06-meg-file-formats.md @monkeyman192 diff --git a/Pipfile b/Pipfile index 095002b9a7..c84893295e 100644 --- a/Pipfile +++ b/Pipfile @@ -6,6 +6,7 @@ name = "pypi" [packages] mkdocs = "==1.0.4" mkdocs-material = "==4.1.2" +pymdown-extensions = "==6.0.0" mkdocs-branchcustomization-plugin = "~=0.1.3" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index a83830cdb9..56a59462a6 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "7669df4f1b94769b64a7e7bca89c22ded77a2fa9073f5f25c990ebc389b1bd5d" + "sha256": "921dc75c77c4cb90cd28e0f16e8bb3fe0ad35761c165c977a3219d888ce74b15" }, "pipfile-spec": 6, "requires": { @@ -115,13 +115,6 @@ "index": "pypi", "version": "==4.1.2" }, - "pep562": { - "hashes": [ - "sha256:58cb1cc9ee63d93e62b4905a50357618d526d289919814bea1f0da8f53b79395", - "sha256:d2a48b178ebf5f8dd31709cc26a19808ef794561fa2fe50ea01ea2bad4d667ef" - ], - "version": "==1.0" - }, "pygments": { "hashes": [ "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b", @@ -131,10 +124,11 @@ }, "pymdown-extensions": { "hashes": [ - "sha256:27953f071d37b63d418738f75d847d824c0e4430e93f085cfdd9f8dc08a8c5c3", - "sha256:328b9e114925729e0789558a94325be8e7ca9e0323ed2a2b705d9bc1de4d2716" + "sha256:25b0a7967fa697b5035e23340a48594e3e93acb10b06d74574218ace3347d1df", + 
"sha256:6cf0cf36b5a03b291ace22dc2f320f4789ce56fbdb6635a3be5fadbf5d7694dd" ], - "version": "==6.2" + "index": "pypi", + "version": "==6.0" }, "pyyaml": { "hashes": [ diff --git a/mkdocs.yml b/mkdocs.yml index 5a2650f1ee..3af074092d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -9,6 +9,7 @@ extra_javascript: markdown_extensions: - toc: anchorlink: true + - pymdownx.superfences plugins: - search - branchcustomization: @@ -32,8 +33,12 @@ nav: - Physiological and other continuous recordings: 04-modality-specific-files/06-physiological-and-other-continuous-recordings.md - Behavioral experiments (with no MRI): 04-modality-specific-files/07-behavioral-experiments.md - Genetic Descriptor: 04-modality-specific-files/08-genetic-descriptor.md - - Longitudinal and multi-site studies: 05-longitudinal-and-multi-site-studies.md - - BIDS Extension Proposals: 06-extensions.md + - Derivatives: + - BIDS Derivatives: 05-derivatives/01-introduction.md + - Common data types and metadata: 05-derivatives/02-common-data-types.md + - Imaging data types: 05-derivatives/03-imaging.md + - Longitudinal and multi-site studies: 06-longitudinal-and-multi-site-studies.md + - BIDS Extension Proposals: 07-extensions.md - Appendix: - Contributors: 99-appendices/01-contributors.md - Licenses: 99-appendices/02-licenses.md diff --git a/src/01-introduction.md b/src/01-introduction.md index 8ee919c691..667fdbb8dd 100644 --- a/src/01-introduction.md +++ b/src/01-introduction.md @@ -46,7 +46,7 @@ different backgrounds. The BIDS specification can be extended in a backwards compatible way and will evolve over time. This is accomplished through community-driven BIDS Extension Proposals (BEPs). For more information about the BEP process, see -[Extending the BIDS specification](06-extensions.md). +[Extending the BIDS specification](07-extensions.md). 
## Citing BIDS diff --git a/src/02-common-principles.md b/src/02-common-principles.md index a00d1e597c..496f2f1b45 100644 --- a/src/02-common-principles.md +++ b/src/02-common-principles.md @@ -120,39 +120,34 @@ in the appendix. ## Source vs. raw vs. derived data -BIDS in its current form is designed to harmonize and describe raw (unprocessed -or minimally processed due to file format conversion) data. During analysis such -data will be transformed and partial as well as final results will be saved. +BIDS was originally designed to describe and apply consistent naming conventions +to raw (unprocessed or minimally processed due to file format conversion) data. +During analysis such data will be transformed and partial as well as final results +will be saved. Derivatives of the raw data (other than products of DICOM to NIfTI conversion) MUST be kept separate from the raw data. This way one can protect the raw data from accidental changes by file permissions. In addition it is easy to -distinguish partial results from the raw data and share the latter. Similar -rules apply to source data which is defined as data before harmonization and/or -file format conversion (for example E-Prime event logs or DICOM files). - -This specification currently does not go into details of recommending a -particular naming scheme for including different types of source data (raw event -logs, parameter files, etc. before conversion to BIDS) and data derivatives -(correlation maps, brain masks, contrasts maps, etc.). However, in the case that -these data are to be included: - -1. These data MUST be kept in separate `sourcedata` and `derivatives` folders - each with a similar folder structure as presented below for the BIDS-managed - data. For example: - `derivatives/fmriprep/sub-01/ses-pre/sub-01_ses-pre_mask.nii.gz` or +distinguish partial results from the raw data and share the latter. +See [Storage of derived datasets](#storage-of-derived-datasets) for more on +organizing derivatives. 
+ +Similar rules apply to source data, which is defined as data before +harmonization, reconstruction, and/or file format conversion (for example, E-Prime event logs or +DICOM files). This specification currently does not go into details of +recommending a particular naming scheme for including different types of +source data (raw event logs, parameter files, etc. before conversion to BIDS). +However, in the case that these data are to be included: + +1. These data MUST be kept in a separate `sourcedata` folder with a similar + folder structure as presented below for the BIDS-managed data. For example: `sourcedata/sub-01/ses-pre/func/sub-01_ses-pre_task-rest_bold.dicom.tgz` or `sourcedata/sub-01/ses-pre/func/MyEvent.sce`. -1. A README file SHOULD be found at the root of the `sourcedata` or the - `derivatives` folder (or both). This file should describe the nature of the - raw data or the derived data. In the case of the existence of a - `derivatives` folder, we RECOMMEND including details about the software - stack and settings used to generate the results. Inclusion of non-imaging - objects that improve reproducibility are encouraged (scripts, settings - files, etc.). - -1. We RECOMMEND including the PDF print-out with the actual sequence parameters - generated by the scanner in the `sourcedata` folder. +1. A README file SHOULD be found at the root of the `sourcedata` folder or the + `derivatives` folder, or both. + This file should describe the nature of the raw data or the derived data. + We RECOMMEND including the PDF print-out with the actual sequence + parameters generated by the scanner in the `sourcedata` folder. Alternatively one can organize their data in the following way @@ -167,15 +162,120 @@ my_dataset/ sub-02/ ... derivatives/ + pipeline_1/ + pipeline_2/ ... ``` -In this example **only the `rawdata` subfolder needs to be a BIDS compliant -dataset**.
This specification does not prescribe anything about the contents of -`sourcedata` and `derivatives` folders in the above example - nor does it -prescribe the `sourcedata`, `derivatives`, or `rawdata` folder names. The above -example is just a convention that can be useful for organizing raw, source, and -derived data while maintaining BIDS compliancy of the raw data folder. +In this example, where `sourcedata` and `derivatives` are not nested inside +`rawdata`, **only the `rawdata` subfolder** needs to be a BIDS-compliant +dataset. +The subfolders of `derivatives` MAY be BIDS-compliant derivatives datasets +(see [Non-compliant derivatives](#non-compliant-derivatives) for further discussion). +This specification does not prescribe anything about the contents of `sourcedata` +folders in the above example - nor does it prescribe the `sourcedata`, +`derivatives`, or `rawdata` folder names. +The above example is just a convention that can be useful for organizing raw, +source, and derived data while maintaining BIDS compliancy of the raw data +folder. When using this convention it is RECOMMENDED to set the `SourceDatasets` +field in `dataset_description.json` of each subfolder of `derivatives` to: + +```JSON +{ + "SourceDatasets": [ {"URL": "file://../../rawdata/"} ] +} +``` + +### Storage of derived datasets + +Derivatives can be stored/distributed in two ways: + +1. Under a `derivatives/` subfolder in the root of the source BIDS dataset + folder to make a clear distinction between raw data and results of data + processing. A data processing pipeline will typically have a dedicated directory + under which it stores all of its outputs. Different components of a pipeline can, + however, also be stored under different subfolders. There are few restrictions on + the directory names; it is RECOMMENDED to use the format `<pipeline>-<variant>` in + cases where it is anticipated that the same pipeline will output more than one variant (e.g., + `AFNI-blurring`, `AFNI-noblurring`, etc.).
For the sake of consistency, the + subfolder name SHOULD be the `GeneratedBy.Name` field in + `dataset_description.json`, optionally followed by a hyphen and a suffix (see + [Derived dataset and pipeline description][derived-dataset-description]). + + Example of derivatives with one directory per pipeline: + + ```Plain + /derivatives/fmriprep-v1.4.1/sub-0001 + /derivatives/spm/sub-0001 + /derivatives/vbm/sub-0001 + ``` + + Example of a pipeline with split derivative directories: + + ```Plain + /derivatives/spm-preproc/sub-0001 + /derivatives/spm-stats/sub-0001 + ``` + + Example of a pipeline with nested derivative directories: + + ```Plain + /derivatives/spm-preproc/sub-0001 + /derivatives/spm-preproc/derivatives/spm-stats/sub-0001 + ``` + + +1. As a standalone dataset independent of the source (raw or derived) BIDS + dataset. + This way of specifying derivatives is particularly useful when the source + dataset is provided with read-only access, for publishing derivatives as + independent bodies of work, or for describing derivatives that were created + from more than one source dataset. + The `sourcedata/` subdirectory MAY be used to include the source dataset(s) + that were used to generate the derivatives. + Likewise, any code used to generate the derivatives from the source data + MAY be included in the `code/` subdirectory. + + Example of a derivative dataset including the raw dataset as source: + + ```Plain + my_processed_data/ + code/ + processing_pipeline-1.0.0.img + hpc_submitter.sh + ... + sourcedata/ + dataset_description.json + participants.tsv + sub-01/ + sub-02/ + ... + dataset_description.json + sub-01/ + sub-02/ + ... + ``` + +Throughout this specification, if a section applies particularly to derivatives, +then Case 1 will be assumed for clarity in templates and examples, but removing +`/derivatives/` from the template name will provide the equivalent for +Case 2.
+In both cases, every derivatives dataset is considered a BIDS dataset and must +include a `dataset_description.json` file at the root level (see +[Dataset description][dataset-description]). +Consequently, files should be organized to comply with BIDS to the full extent +possible (that is, unless explicitly contradicted for derivatives). +Any subject-specific derivatives should be housed within each subject’s directory; +if session-specific derivatives are generated, they should be deposited under a +session subdirectory within the corresponding subject directory; and so on. + +### Non-compliant derivatives + +Nothing in this specification should be interpreted to disallow the +storage/distribution of non-compliant derivatives of BIDS datasets. +In particular, if a BIDS dataset contains a `derivatives/` sub-directory, +the contents of that directory may be a heterogeneous mix of BIDS Derivatives +datasets and non-compliant derivatives. ## The Inheritance Principle @@ -509,3 +609,10 @@ meaning of file names and setting requirements on their contents or metadata. Validation and parsing tools MAY treat the presence of non-standard files and directories as an error, so consult the details of these tools for mechanisms to suppress warnings or provide interpretations of your file names.
+ +[]: <> (################) +[]: <> (Link definitions) +[]: <> (################) + +[dataset-description]: 03-modality-agnostic-files.md#dataset-description +[derived-dataset-description]: 03-modality-agnostic-files.md#derived-dataset-and-pipeline-description diff --git a/src/03-modality-agnostic-files.md b/src/03-modality-agnostic-files.md index 9c711f18fc..43b21518a9 100644 --- a/src/03-modality-agnostic-files.md +++ b/src/03-modality-agnostic-files.md @@ -18,6 +18,7 @@ Every dataset MUST include this file with the following fields: | ------------------------------------------------------------------------------| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | Name | REQUIRED. Name of the dataset. | | BIDSVersion | REQUIRED. The version of the BIDS standard that was used. | +| DatasetType | RECOMMENDED. The interpretation of the dataset. MUST be one of `"raw"` or `"derivative"`. For backwards compatibility, the default value is `"raw"`. | | License | RECOMMENDED. The license for the dataset. The use of license name abbreviations is RECOMMENDED for specifying a license (see [Appendix II](./99-appendices/02-licenses.md)). The corresponding full license text MAY be specified in an additional `LICENSE` file. | | Authors | OPTIONAL. List of individuals who contributed to the creation/curation of the dataset. | | Acknowledgements | OPTIONAL. Text acknowledging contributions of individuals or institutions beyond those listed in Authors or Funding.
| @@ -32,7 +33,8 @@ Example: ```JSON { "Name": "The mother of all experiments", - "BIDSVersion": "1.0.1", + "BIDSVersion": "1.4.0", + "DatasetType": "raw", "License": "CC0", "Authors": [ "Paul Broca", @@ -55,6 +57,68 @@ Example: } ``` +#### Derived dataset and pipeline description + +As for any BIDS dataset, a `dataset_description.json` file MUST be found at the +top level of a derived dataset: +`<dataset>/derivatives/<pipeline_name>/dataset_description.json` + +In addition to the keys for raw BIDS datasets, +derived BIDS datasets include the following REQUIRED and RECOMMENDED +`dataset_description.json` keys: + +| **Key name** | **Description** | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| GeneratedBy | REQUIRED. List of [objects][object] with at least one element. | +| SourceDatasets | RECOMMENDED. A list of [objects][object] specifying the locations and relevant attributes of all source datasets. Valid fields in each object include `URL`, `DOI`, and `Version`. | + +Each object in the `GeneratedBy` list includes the following REQUIRED, RECOMMENDED +and OPTIONAL keys: + +| **Key name** | **Description** | +| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Name | REQUIRED. Name of the pipeline or process that generated the outputs. Use `"Manual"` to indicate the derivatives were generated by hand, or adjusted manually after an initial run of an automated pipeline. | +| Version | RECOMMENDED. Version of the pipeline. | +| Description | OPTIONAL. Plain-text description of the pipeline or process that generated the outputs. RECOMMENDED if `Name` is `"Manual"`. | +| CodeURL | OPTIONAL. URL where the code used to generate the derivatives may be found.
| +| Container | OPTIONAL. [Object][object] specifying the location and relevant attributes of software container image used to produce the derivative. Valid fields in this object include `Type`, `Tag` and `URI`. | + +Example: + +```JSON +{ + "Name": "FMRIPREP Outputs", + "BIDSVersion": "1.4.0", + "DatasetType": "derivative", + "GeneratedBy": [ + { + "Name": "fmriprep", + "Version": "1.4.1", + "Container": { + "Type": "docker", + "Tag": "poldracklab/fmriprep:1.4.1" + } + }, + { + "Name": "Manual", + "Description": "Re-added RepetitionTime metadata to bold.json files" + } + ], + "SourceDatasets": [ + { + "DOI": "10.18112/openneuro.ds000114.v1.0.1", + "URL": "https://openneuro.org/datasets/ds000114/versions/1.0.1", + "Version": "1.0.1" + } + ] +} +``` + +If a derived dataset is stored as a subfolder of the raw dataset, then the `Name` field +of the first `GeneratedBy` object MUST be a substring of the derived dataset folder name. +That is, in a directory `<dataset>/derivatives/<pipeline>[-<variant>]/`, the first +`GeneratedBy` object should have a `Name` of `<pipeline>`. + ### `README` In addition a free form text file (`README`) describing the dataset in more diff --git a/src/04-modality-specific-files/02-magnetoencephalography.md b/src/04-modality-specific-files/02-magnetoencephalography.md index fbe4865655..5fe8e74f34 100644 --- a/src/04-modality-specific-files/02-magnetoencephalography.md +++ b/src/04-modality-specific-files/02-magnetoencephalography.md @@ -1,6 +1,6 @@ # Magnetoencephalography -Support for Magnetoencephalography (MEG) was developed as a [BIDS Extension Proposal](../06-extensions.md#bids-extension-proposals). +Support for Magnetoencephalography (MEG) was developed as a [BIDS Extension Proposal](../07-extensions.md#bids-extension-proposals).
Please cite the following paper when referring to this part of the standard in context of the academic literature: diff --git a/src/04-modality-specific-files/03-electroencephalography.md b/src/04-modality-specific-files/03-electroencephalography.md index a409960036..bb4c239608 100644 --- a/src/04-modality-specific-files/03-electroencephalography.md +++ b/src/04-modality-specific-files/03-electroencephalography.md @@ -1,6 +1,6 @@ # Electroencephalography -Support for Electroencephalography (EEG) was developed as a [BIDS Extension Proposal](../06-extensions.md#bids-extension-proposals). +Support for Electroencephalography (EEG) was developed as a [BIDS Extension Proposal](../07-extensions.md#bids-extension-proposals). Please cite the following paper when referring to this part of the standard in context of the academic literature: diff --git a/src/04-modality-specific-files/04-intracranial-electroencephalography.md b/src/04-modality-specific-files/04-intracranial-electroencephalography.md index e7e65517d1..18b16f9db6 100644 --- a/src/04-modality-specific-files/04-intracranial-electroencephalography.md +++ b/src/04-modality-specific-files/04-intracranial-electroencephalography.md @@ -1,6 +1,6 @@ # Intracranial Electroencephalography -Support for Intracranial Electroencephalography (iEEG) was developed as a [BIDS Extension Proposal](../06-extensions.md#bids-extension-proposals). +Support for Intracranial Electroencephalography (iEEG) was developed as a [BIDS Extension Proposal](../07-extensions.md#bids-extension-proposals). 
Please cite the following paper when referring to this part of the standard in context of the academic literature: diff --git a/src/04-modality-specific-files/08-genetic-descriptor.md b/src/04-modality-specific-files/08-genetic-descriptor.md index ad24b06fb4..52dbc823a8 100644 --- a/src/04-modality-specific-files/08-genetic-descriptor.md +++ b/src/04-modality-specific-files/08-genetic-descriptor.md @@ -1,7 +1,7 @@ # Genetic Descriptor Support for genetic descriptors was developed as a [BIDS Extension -Proposal](../06-extensions.md#bids-extension-proposals). +Proposal](../07-extensions.md#bids-extension-proposals). The extension was primarily developed by Cyril Pernet and Clara Moreau with contributions from Tom Nichols and Jessica Turner. diff --git a/src/05-derivatives/01-introduction.md b/src/05-derivatives/01-introduction.md new file mode 100644 index 0000000000..9615fe814a --- /dev/null +++ b/src/05-derivatives/01-introduction.md @@ -0,0 +1,78 @@ +# BIDS Derivatives + +Derivatives are outputs of common processing pipelines, capturing data and +meta-data sufficient for a researcher to understand and (critically) reuse those +outputs in subsequent processing. +Standardizing derivatives is motivated by use cases where formalized +machine-readable access to processed data enables higher level processing. + +The following sections cover additions to and divergences from "raw" BIDS. +Placement and naming conventions for derived datasets are addressed in +[Storage of derived datasets][storage], and dataset-level metadata is included +in [Derived dataset and pipeline description][derived-dataset-description]. + +## Metadata conventions + +- Unless specified otherwise, individual sidecar JSON files and all metadata + fields within are OPTIONAL. However, the appropriate use of these files and + pertinent fields is very valuable and thus encouraged. 
Moreover, for some + types of files, there may be one or more required metadata fields, in which + case at least one metadata file containing that field must be located + somewhere within the file’s hierarchy (per [the Inheritance + Principle](../02-common-principles.md#the-inheritance-principle)). + +- When chaining derivative pipelines, any JSON fields that were specified as + mandatory in the input files SHOULD be propagated forward in the output + file’s JSON provided they remain valid. Non-required JSON fields MAY be + propagated, and are highly useful, but it is the pipeline’s responsibility + to ensure that the values are still relevant and appropriate to the type of + output data. + +## File naming conventions + +- Filenames that are permissible for a raw BIDS data type have a privileged + status. Any modification of raw files must use a modified filename that does + not conflict with the raw filename. Further, any files created as part of a + derivative dataset must not match a permissible filename of a valid raw + dataset. Stated equivalently, if any filename in a derivative dataset has a + name permissible for a raw BIDS data, then that file must be an identical + copy of that raw file. + +- Each Derivatives filename MUST be of the form: + `[_keyword-]_.` + (where `` could either be an `` or a `