From 6e4b86d07376c1d256d76fe3a3fd32f366f5ba43 Mon Sep 17 00:00:00 2001
From: Christian Busse <christian.busse@dkfz-heidelberg.de>
Date: Fri, 15 Feb 2019 19:57:11 +0100
Subject: [PATCH] Include recent MiAIRR refinements

The recent changes introduced via the `miairr_refinement` branch
(PR #155) will potentially conflict with the changes in CRWG-API.

Resolve conflicts.
---
 AIRR_Minimal_Standard_Data_Elements.tsv | 30 ++++++++++++-------------
 specs/airr-schema.yaml                  | 12 +++++-----
 specs/miairr.yaml                       | 30 ++++++++++++-------------
 3 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/AIRR_Minimal_Standard_Data_Elements.tsv b/AIRR_Minimal_Standard_Data_Elements.tsv
index d73538d71..d97d8db42 100644
--- a/AIRR_Minimal_Standard_Data_Elements.tsv
+++ b/AIRR_Minimal_Standard_Data_Elements.tsv
@@ -1,25 +1,25 @@
 MiAIRR data set / subset	MiAIRR field designation	Data type	Content format	MiAIRR content definition	Field value example	AIRR Formats WG field name
 1 / study	Study	string	Free text	Unique ID assigned by study registry	PRJNA001	study_id
 1 / study	Study title	string	Free text	Descriptive study title	Effects of sun light exposure of the Treg repertoire	study_title
-1 / study	Study type	string	Controlled vocabulary	Generic study design	Placebo controlled phase 3 clinical trial	study_description
+1 / study	Study type	string	{"ontology": "NCIT", "top_node": "Study Design", "draft": true}	Generic study design	Case-Control Study	study_description
 1 / study	Study inclusion/exclusion criteria	string	Free text	List of criteria for inclusion/exclusion for the study	Include: Clinical P. falciparum infection; Exclude: Seropositive for HIV	inclusion_exclusion_criteria
 1 / study	Grant funding agency	string	Free text	Funding agencies and grant numbers	NIH, award number R01GM987654	grants
-1 / study	Contact information (data collection)	string	Free text	Full contact information of the corresponding author, i.e. who is legally responsible for data collection and release. This should include an e-mail address.	p.stibbons@unseenu.edu	collected_by
-1 / study	Lab name	string	Free text	Department of corresponding author	Stibbons Lab	lab_name
-1 / study	Lab address	string	Free text	Institutional address of corresponding author	School of Medicine, Unseen University, Ankh-Morpork, Disk World	lab_address
-1 / study	Contact information (data deposition)	string	Free text	Full contact information of the submitter, i.e. the person deposition the data	Dr. P. Stibbons	submitted_by
+1 / study	Contact information (data collection)	string	Free text	Full contact information of the data collector, i.e. the person who is legally responsible for data collection and release. This should include an e-mail address.	Dr. P. Stibbons, p.stibbons@unseenu.edu	collected_by
+1 / study	Lab name	string	Free text	Department of data collector	Department for Planar Immunology	lab_name
+1 / study	Lab address	string	Free text	Institution and institutional address of data collector	School of Medicine, Unseen University, Ankh-Morpork, Disk World	lab_address
+1 / study	Contact information (data deposition)	string	Free text	Full contact information of the data depositor, i.e. the person submitting the data to a repository. This is supposed to be a short-lived and technical role until the submission is released.	Adrian Turnipseed, a.turnipseed@unseenu.edu	submitted_by
 1 / study	Relevant publications	string	Valid PubMed ID	Publications describing the rationale and/or outcome of the study	PMID85642	pub_ids
 1 / subject	Subject ID	string	Free text	Subject ID assigned by submitter, unique within study	SUB856413	subject_id
 1 / subject	Synthetic library	boolean	TRUE/FALSE	TRUE for libraries in which the diversity has been synthetically generated (e.g. phage display)	FALSE	synthetic
-1 / subject	Organism	string	Controlled vocabulary	Binomial designation of subject's species	Homo sapiens	organism
-1 / subject	Sex	string	/(male|female|pooled|hermaphrodite|intersex|not collected|not applicable)/	Biological sex of subject	female	sex
+1 / subject	Organism	string	{"ontology": "NCBITAXON", "top_node": "Gnathostomata", "draft": false}	Species of subject (using binomial nomenclature)	Homo sapiens	organism
+1 / subject	Sex	string	{"controlled_vocabulary": ["male", "female", "pooled", "hermaphrodite", "intersex", "not collected", "not applicable"]}	Biological sex of subject	female	sex
 1 / subject	Age	string	Time duration and unit	Absolute age of subject at time point `Age event`	65 a	age
-1 / subject	Age event	string	Free text	Event in the study schedule to which `Age` refers to. For NCBI BioSample this MUST be `sampling`. For other implementations submitters need to be aware that there is currently no mechanism to encode to potential delta between `Age event` and `Sample collection time`, hence the chosen events should be in temporal proximity.	enrollment	age_event
+1 / subject	Age event	string	Free text	Event in the study schedule to which `Age` refers. For NCBI BioSample this MUST be `sampling`. For other implementations submitters need to be aware that there is currently no mechanism to encode the potential delta between `Age event` and `Sample collection time`, hence the chosen events should be in temporal proximity.	enrollment	age_event
 1 / subject	Ancestry population	string	Free text	Broad geographic origin of ancestry (continent)	list of continents, mixed or unknown	ancestry_population
 1 / subject	Ethnicity	string	Free text	Ethnic group of subject (defined as cultural/language-based membership)	English, Kurds, Manchu, Yakuts (and other fields from Wikipedia)	ethnicity
 1 / subject	Race	string	Free text	Racial group of subject (as defined by NIH)	White, American Indian or Alaska Native, Black, Asian, Native Hawaiian or Other Pacific Islander, Other 	race
 1 / subject	Strain name	string	Free text	Non-human: designation of the strain or breed of animal used	C57BL/6J	strain_name
-1 / subject	Relation to other subjects	string	Free text	Subject ID to which `Relation type` refers to	SUB1355648	linked_subjects
+1 / subject	Relation to other subjects	string	Free text	Subject ID to which `Relation type` refers	SUB1355648	linked_subjects
 1 / subject	Relation type	string	Free text	Relation between subject and `linked_subjects`, can be genetic or environmental (e.g.exposure)	father, daughter, household	link_type
 1 / diag. & intervent.	Study group description	string	Free text	Designation of study arm to which the subject is assigned to	control	study_group_description
 1 / diag. & intervent.	Diagnosis	string	Free text	Diagnosis of subject	Multiple myeloma	disease_diagnosis
@@ -38,26 +38,26 @@ MiAIRR data set / subset	MiAIRR field designation	Data type	Content format	MiAIR
 2 / sample	Collection time event	string	Free text	Event in the study schedule to which `Sample collection time` relates to	Primary vaccination	collection_time_point_reference
 2 / sample	Biomaterial provider	string	Free text	Name and address of the entity providing the sample	Tissues-R-Us, Tampa, FL, USA	biomaterial_provider
 3 / process (cell)	Tissue processing	string	Free text	Enzymatic digestion and/or physical methods used to isolate cells from sample	Collagenase A/Dnase I digested, followed by Percoll gradient	tissue_processing
-3 / process (cell)	Cell subset	string	Controlled vocabulary	Commonly-used designation of isolated cell population	Class-switched Memory B cells	cell_subset
+3 / process (cell)	Cell subset	string	{"ontology": "CL", "top_node": "lymphocyte", "draft": true}	Commonly-used designation of isolated cell population	class switched memory B cell	cell_subset
 3 / process (cell)	Cell subset phenotype	string	Free text	List of cellular markers and their expression levels used to isolate the cell population	CD19+ CD38+ CD27+ IgM- IgD-	cell_phenotype
 3 / process (cell)	Single-cell sort	boolean	TRUE/FALSE	TRUE if single cells were isolated into separate compartments	FALSE	single_cell
 3 / process (cell)	Number of cells in experiment	integer	Number	Total number of cells that went into the experiment	1000000	cell_number
-3 / process (cell)	Number of cells per sequencing reaction	integer	Number	Number of cells for each biological repicate	50000	cells_per_reaction
+3 / process (cell)	Number of cells per sequencing reaction	integer	Number	Number of cells for each biological replicate	50000	cells_per_reaction
 3 / process (cell)	Cell storage	boolean	TRUE/FALSE	TRUE if cells were cryo-preserved between isolation and further processing	TRUE	cell_storage
 3 / process (cell)	Cell quality	string	Free text	Relative amount of viable cells after preparation and (if applicable) thawing	90% viability as determined by 7-AAD	cell_quality
 3 / process (cell)	Cell isolation / enrichment procedure	string	Free text	Description of the procedure used for marker-based isolation or enrich cells	Cells were stained with fluorochrome labeled antibodies and then sorted on a FlowMerlin (CE) cytometer	cell_isolation
 3 / process (cell)	Processing protocol	string	Free text	Description of the methods applied to the sample including cell preparation/ isolation/enrichment and nucleic acid extraction. This should closely mirror the Materials and methods section in the manuscript	Stimulated wih anti-CD3/anti-CD28	cell_processing_protocol
-3 / process (nucl. acid)	Target substrate	string	/(DNA|RNA)/	The class of nucleic acid that was used as primary starting material for the following procedures	RNA	template_class
+3 / process (nucl. acid)	Target substrate	string	{"controlled_vocabulary": ["DNA", "RNA"]}	The class of nucleic acid that was used as primary starting material for the following procedures	RNA	template_class
 3 / process (nucl. acid)	Target substrate quality	string	Free text	Description and results of the quality control performed on the template material	RIN 9.2	template_quality
 3 / process (nucl. acid)	Template amount	string	Free text	Amount of template that went into the process	1000 ng	template_amount
-3 / process (nucl. acid)	Library generation method	string	Controlled vocabulary	Generic type of library generation	Oligo-dT primed 5' RACE	library_generation_method
+3 / process (nucl. acid)	Library generation method	string	{"controlled_vocabulary": ["PCR", "RT(RHP)+PCR", "RT(oligo-dT)+PCR", "RT(oligo-dT)+TS+PCR", "RT(oligo-dT)+TS(UMI)+PCR", "RT(specific)+PCR", "RT(specific)+TS+PCR", "RT(specific)+TS(UMI)+PCR", "RT(specific+UMI)+PCR", "RT(specific+UMI)+TS+PCR", "RT(specific)+TS", "other"]}	Generic type of library generation	RT(oligo-dT)+TS(UMI)+PCR	library_generation_method
 3 / process (nucl. acid)	Library generation protocol	string	Free text	Description of processes applied to substrate to obtain a library that is ready for sequencing	cDNA was generated using	library_generation_protocol
 3 / process (nucl. acid)	Protocol IDs	string	Free text	When using a library generation protocol from a commercial provider, provide the protocol version number	v2.1 (2016-09-15)	library_generation_kit_version
 3 / process (nucl. acid [pcr])	Target locus for PCR	string	Free text	Designation of the target locus according to standard gene nomencleature	Constant region vs. V region amplification	pcr_target_locus
 3 / process (nucl. acid [pcr])	Forward PCR primer target location	string	Free text	Position of the most distal nucleotide templated by the forward primer or primer mix	IGHV, +23	forward_pcr_primer_target_location
 3 / process (nucl. acid [pcr])	Reverse PCR primer target location	string	Free text	Position of the most proximal nucleotide templated by the reverse primer or primer mix	IGHG, +57	reverse_pcr_primer_target_location
-3 / process (nucl. acid)	Complete sequences	string	/(partial|complete|complete+untemplated)/	To be considered `complete`, the procedure used for library construction MUST generate sequences that 1) include the first V segment codon that encodes the mature polypeptide chain (i.e. after the leader sequence) and 2) include the last complete codon of the J segment (i.e. 1 bp 5' of the J->C splice site) and 3) provide sequence information for all positions between 1) and 2). To be considered `complete & untemplated`, the sections of the sequences defined in points 1) to 3) of the previous sentence MUST be untemplated, i.e. MUST NOT overlap with the primers used in library preparation.	partial	complete_sequences
-3 / process (nucl. acid)	Physical linkage of different loci	string	Controlled vocabulary	Describes the mode of linkage if a method was used which physically links nucleic acids derived from distinct loci in a single-cell context	IGH-IGK/IGL-head/head	physical_linkage
+3 / process (nucl. acid)	Complete sequences	string	{"controlled_vocabulary": ["partial", "complete", "complete+untemplated"]}	To be considered `complete`, the procedure used for library construction MUST generate sequences that 1) include the first V segment codon that encodes the mature polypeptide chain (i.e. after the leader sequence) and 2) include the last complete codon of the J segment (i.e. 1 bp before the J->C splice site) and 3) provide sequence information for all positions between 1) and 2). To be considered `complete & untemplated`, the sections of the sequences defined in points 1) to 3) of the previous sentence MUST be untemplated, i.e. MUST NOT overlap with the primers used in library preparation.	partial	complete_sequences
+3 / process (nucl. acid)	Physical linkage of different loci	string	{"controlled_vocabulary": ["none", "hetero_head-head"]}	Describes the mode of linkage if a method was used which physically links nucleic acids derived from distinct loci in a single-cell context	hetero_head-head	physical_linkage
 3 / process (sequencing)	Total reads passing QC filter	integer	Number	Number of usable reads for analysis	10365118	total_reads_passing_qc_filter
 3 / process (sequencing)	Sequencing platform	string	Free text	Designation of sequencing instrument used	Alumina LoSeq 1000	sequencing_platform
 3 / process (sequencing)	Read lengths	integer	Array of numbers	Read length in bases for each direction	[300,300]	read_length
diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml
index 2d4680699..d8f7c6b05 100644
--- a/specs/airr-schema.yaml
+++ b/specs/airr-schema.yaml
@@ -39,19 +39,19 @@ Study:
             x-miairr: true
         collected_by:
             type: string
-            description: Full contact information of the corresponding author, i.e. who is legally responsible for data collection and release. This should include an e-mail address.
+            description: Full contact information of the data collector, i.e. the person who is legally responsible for data collection and release. This should include an e-mail address.
             x-miairr: true
         lab_name:
             type: string
-            description: Department of corresponding author
+            description: Department of data collector
             x-miairr: true
         lab_address:
             type: string
-            description: Institutional address of corresponding author
+            description: Institution and institutional address of data collector
             x-miairr: true
         submitted_by:
             type: string
-            description: Full contact information of the submitter, i.e. the person deposition the data
+            description: Full contact information of the data depositor, i.e. the person submitting the data to a repository. This is supposed to be a short-lived and technical role until the submission is released.
             x-miairr: true
         pub_ids:
             type: string
@@ -94,7 +94,7 @@ Subject:
             x-miairr: true
         age_event:
             type: string
-            description: Event in the study schedule to which `Age` refers to. For NCBI BioSample this MUST be `sampling`. For other implementations submitters need to be aware that there is currently no mechanism to encode to potential delta between `Age event` and `Sample collection time`, hence the chosen events should be in temporal proximity.
+            description: Event in the study schedule to which `Age` refers. For NCBI BioSample this MUST be `sampling`. For other implementations submitters need to be aware that there is currently no mechanism to encode the potential delta between `Age event` and `Sample collection time`, hence the chosen events should be in temporal proximity.
             x-miairr: true
         ancestry_population:
             type: string
@@ -114,7 +114,7 @@ Subject:
             x-miairr: true
         linked_subjects:
             type: string
-            description: Subject ID to which `Relation type` refers to
+            description: Subject ID to which `Relation type` refers
             x-miairr: true
         link_type:
             type: string
diff --git a/specs/miairr.yaml b/specs/miairr.yaml
index cc1bfaa58..809ec13d8 100644
--- a/specs/miairr.yaml
+++ b/specs/miairr.yaml
@@ -14,8 +14,8 @@ study_description:
    miairr_set: 1
    miairr_subset: study
    miairr_name: Study type
-   format: controlled vocabulary
-   example: Placebo controlled phase 3 clinical trial
+   format: {"ontology": "NCIT", "top_node": "Study Design", "draft": true}
+   example: Case-Control Study
 inclusion_exclusion_criteria:
    miairr_set: 1
    miairr_subset: study
@@ -33,13 +33,13 @@ collected_by:
    miairr_subset: study
    miairr_name: Contact information (data collection)
    format: free text
-   example: p.stibbons@unseenu.edu
+   example: Dr. P. Stibbons, p.stibbons@unseenu.edu
 lab_name:
    miairr_set: 1
    miairr_subset: study
    miairr_name: Lab name
    format: free text
-   example: Department of Planar Immunology
+   example: Department for Planar Immunology
 lab_address:
    miairr_set: 1
    miairr_subset: study
@@ -51,7 +51,7 @@ submitted_by:
    miairr_subset: study
    miairr_name: Contact information (data deposition)
    format: free text
-   example: Dr. P. Stibbons
+   example: Adrian Turnipseed, a.turnipseed@unseenu.edu
 pub_ids:
    miairr_set: 1
    miairr_subset: study
@@ -74,13 +74,13 @@ organism:
    miairr_set: 1
    miairr_subset: subject
    miairr_name: Organism
-   format: controlled vocabulary
+   format: {"ontology": "NCBITAXON", "top_node": "Gnathostomata", "draft": false}
    example: Homo sapiens
 sex:
    miairr_set: 1
    miairr_subset: subject
    miairr_name: Sex
-   format: /(male|female|pooled|hermaphrodite|intersex|not collected|not applicable)/
+   format: {"controlled_vocabulary": ["male", "female", "pooled", "hermaphrodite", "intersex", "not collected", "not applicable"]}
    example: female
 age:
    miairr_set: 1
@@ -236,8 +236,8 @@ cell_subset:
    miairr_set: 3
    miairr_subset: process (cell)
    miairr_name: Cell subset
-   format: controlled vocabulary
-   example: Class-switched Memory B cells
+   format: {"ontology": "CL", "top_node": "lymphocyte", "draft": true}
+   example: class switched memory B cell
 cell_phenotype:
    miairr_set: 3
    miairr_subset: process (cell)
@@ -290,7 +290,7 @@ template_class:
    miairr_set: 3
    miairr_subset: process (nucleic acid)
    miairr_name: Target substrate
-   format: /(DNA|RNA)/
+   format: {"controlled_vocabulary": ["DNA", "RNA"]}
    example: RNA
 template_quality:
    miairr_set: 3
@@ -308,8 +308,8 @@ library_generation_method:
    miairr_set: 3
    miairr_subset: process (nucleic acid)
    miairr_name: Library generation method
-   format: controlled vocabulary
-   example: Oligo-dT primed 5' RACE
+   format: {"controlled_vocabulary": ["PCR", "RT(RHP)+PCR", "RT(oligo-dT)+PCR", "RT(oligo-dT)+TS+PCR", "RT(oligo-dT)+TS(UMI)+PCR", "RT(specific)+PCR", "RT(specific)+TS+PCR", "RT(specific)+TS(UMI)+PCR", "RT(specific+UMI)+PCR", "RT(specific+UMI)+TS+PCR", "RT(specific)+TS", "other"]}
+   example: RT(oligo-dT)+TS(UMI)+PCR
 library_generation_protocol:
    miairr_set: 3
    miairr_subset: process (nucleic acid)
@@ -344,14 +344,14 @@ complete_sequences:
    miairr_set: 3
    miairr_subset: process (nucleic acid)
    miairr_name: Complete sequences
-   format: /(partial|complete|complete+untemplated)/
+   format: {"controlled_vocabulary": ["partial", "complete", "complete+untemplated"]}
    example: partial
 physical_linkage:
    miairr_set: 3
    miairr_subset: process (nucleic acid)
    miairr_name: Physical linkage of different loci
-   format: controlled vocabulary
-   example: IGH-IGK/IGL-head/head
+   format: {"controlled_vocabulary": ["none", "hetero_head-head"]}
+   example: hetero_head-head
 total_reads_passing_qc_filter:
    miairr_set: 3
    miairr_subset: process (nucleic acid)