Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update validator for generic assay binary and categorical data #7973

Merged
merged 1 commit into from
Oct 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions core/src/main/scripts/importer/allowed_data_types.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,6 @@ PROTEIN_LEVEL Z-SCORE protein_quantification_zscores
GENESET_SCORE GSVA-SCORE gsva_scores
GENESET_SCORE P-VALUE gsva_pvalues
GENERIC_ASSAY LIMIT-VALUE *
GENERIC_ASSAY BINARY *
GENERIC_ASSAY CATEGORICAL *
STRUCTURAL_VARIANT SV structural_variants
38 changes: 34 additions & 4 deletions core/src/main/scripts/importer/cbioportal_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ class MetaFileTypes(object):
GENE_PANEL_MATRIX = 'meta_gene_panel_matrix'
GSVA_SCORES = 'meta_gsva_scores'
GSVA_PVALUES = 'meta_gsva_pvalues'
GENERIC_ASSAY = 'meta_generic_assay'
GENERIC_ASSAY_CONTINUOUS = 'meta_generic_assay_continuous'
GENERIC_ASSAY_BINARY = 'meta_generic_assay_binary'
GENERIC_ASSAY_CATEGORICAL = 'meta_generic_assay_categorical'
STRUCTURAL_VARIANT = 'meta_structural_variants'
SAMPLE_RESOURCES = 'meta_resource_sample'
PATIENT_RESOURCES = 'meta_resource_patient'
Expand Down Expand Up @@ -257,7 +259,7 @@ class MetaFileTypes(object):
'show_profile_in_analysis_tab': True,
'geneset_def_version': True
},
MetaFileTypes.GENERIC_ASSAY: {
MetaFileTypes.GENERIC_ASSAY_CONTINUOUS: {
'cancer_study_identifier': True,
'genetic_alteration_type': True,
'generic_assay_type': True,
Expand All @@ -271,6 +273,30 @@ class MetaFileTypes(object):
'pivot_threshold_value': False,
'value_sort_order': False
},
MetaFileTypes.GENERIC_ASSAY_BINARY: {
'cancer_study_identifier': True,
'genetic_alteration_type': True,
'generic_assay_type': True,
'datatype': True,
'stable_id': True,
'profile_name': True,
'profile_description': True,
'data_filename': True,
'show_profile_in_analysis_tab': True,
'generic_entity_meta_properties': False
},
MetaFileTypes.GENERIC_ASSAY_CATEGORICAL: {
'cancer_study_identifier': True,
'genetic_alteration_type': True,
'generic_assay_type': True,
'datatype': True,
'stable_id': True,
'profile_name': True,
'profile_description': True,
'data_filename': True,
'show_profile_in_analysis_tab': True,
'generic_entity_meta_properties': False
},
MetaFileTypes.STRUCTURAL_VARIANT: {
'cancer_study_identifier': True,
'genetic_alteration_type': True,
Expand Down Expand Up @@ -325,7 +351,9 @@ class MetaFileTypes(object):
MetaFileTypes.GENE_PANEL_MATRIX: "org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap",
MetaFileTypes.GSVA_SCORES: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.GSVA_PVALUES: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.GENERIC_ASSAY: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.GENERIC_ASSAY_CONTINUOUS: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.GENERIC_ASSAY_BINARY: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.GENERIC_ASSAY_CATEGORICAL: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.STRUCTURAL_VARIANT: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.SAMPLE_RESOURCES: "org.mskcc.cbio.portal.scripts.ImportResourceData",
MetaFileTypes.PATIENT_RESOURCES: "org.mskcc.cbio.portal.scripts.ImportResourceData",
Expand Down Expand Up @@ -594,7 +622,9 @@ def get_meta_file_type(meta_dictionary, logger, filename):
("MUTSIG", "Q-VALUE"): MetaFileTypes.MUTATION_SIGNIFICANCE,
("GENESET_SCORE", "GSVA-SCORE"): MetaFileTypes.GSVA_SCORES,
("GENESET_SCORE", "P-VALUE"): MetaFileTypes.GSVA_PVALUES,
("GENERIC_ASSAY", "LIMIT-VALUE"): MetaFileTypes.GENERIC_ASSAY
("GENERIC_ASSAY", "LIMIT-VALUE"): MetaFileTypes.GENERIC_ASSAY_CONTINUOUS,
("GENERIC_ASSAY", "BINARY"): MetaFileTypes.GENERIC_ASSAY_BINARY,
("GENERIC_ASSAY", "CATEGORICAL"): MetaFileTypes.GENERIC_ASSAY_CATEGORICAL
}
result = None
if 'genetic_alteration_type' in meta_dictionary and 'datatype' in meta_dictionary:
Expand Down
112 changes: 83 additions & 29 deletions core/src/main/scripts/importer/validateData.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@
cbioportal_common.MetaFileTypes.GENE_PANEL_MATRIX:'GenePanelMatrixValidator',
cbioportal_common.MetaFileTypes.GSVA_SCORES:'GsvaScoreValidator',
cbioportal_common.MetaFileTypes.GSVA_PVALUES:'GsvaPvalueValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY:'GenericAssayValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY_CONTINUOUS:'GenericAssayContinuousValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY_BINARY:'GenericAssayBinaryValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY_CATEGORICAL:'GenericAssayCategoricalValidator',
cbioportal_common.MetaFileTypes.STRUCTURAL_VARIANT:'StructuralVariantValidator',
cbioportal_common.MetaFileTypes.SAMPLE_RESOURCES:'SampleResourceValidator',
cbioportal_common.MetaFileTypes.PATIENT_RESOURCES:'PatientResourceValidator',
Expand Down Expand Up @@ -4334,6 +4336,7 @@ def __init__(self, *args, **kwargs):
REQUIRED_HEADERS = ['ENTITY_STABLE_ID']
OPTIONAL_HEADERS = []
UNIQUE_COLUMNS = ['ENTITY_STABLE_ID']
NULL_VALUES = ["NA"]

def parseFeatureColumns(self, nonsample_col_vals):
"""Check the IDs in the first column."""
Expand All @@ -4343,21 +4346,15 @@ def parseFeatureColumns(self, nonsample_col_vals):
extra={'line_number': self.line_number,
'column_number': 1,
'cause': nonsample_col_vals[0]})
return None

def checkValue(self, value, col_index):
"""Check a value in a sample column."""
stripped_value = value.strip()
if stripped_value not in self.NULL_VALUES and not self.checkFloat(stripped_value):
self.logger.error("Value is neither a real number nor " + ', '.join(self.NULL_VALUES),
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return value

class GenericAssayValidator(GenericAssayWiseFileValidator):
class GenericAssayContinuousValidator(GenericAssayWiseFileValidator):

""" Validator for files containing generic assay values.
""" Validator for files containing generic assay limit continuous values.
"""
def __init__(self, *args, **kwargs):
"""Initialize the instance attributes of the data file validator."""
super(GenericAssayContinuousValidator, self).__init__(*args, **kwargs)

# (1) Natural positive number (not 0)
# (2) Number may be prefixed by ">" or "<"; f.i. ">n" means that the real value lies beyond value n.
Expand All @@ -4367,56 +4364,113 @@ class GenericAssayValidator(GenericAssayWiseFileValidator):
# (1) Cell contains a value without decimals and is not prefixed by ">"; value appears to be truncated but lacks ">" truncation indicator
def checkValue(self, value, col_index):
"""Check a value in a sample column."""
stripped_value = value.strip()
# if the value is prefixed with '>' or '<' remove this prefix
# prior to evaluation of the numeric value
hasTruncSymbol = re.match("^[><]", stripped_value)
stripped_value = re.sub(r"^[><]\s*","", stripped_value)

# do not check null values
# 'NA' is an allowed value. No further validations apply.
if stripped_value in self.NULL_VALUES:
return

# value is not defined (empty cell)
stripped_value = value.strip()
if stripped_value == "":
self.logger.error("Cell is empty. A response value value is expected. Use 'NA' to indicate missing values.",
if len(stripped_value) == 0:
self.logger.error("Cell is empty. A value is expected. Use 'NA' to indicate missing values.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

# 'NA' is an allowed value. No further validations apply.
if stripped_value == 'NA':
return

# if the value is prefixed with '>' or '<' remove this prefix
# prior to evaluation of the numeric value
hasTruncSymbol = re.match("^[><]", stripped_value)
stripped_value = re.sub(r"^[><]\s*","", stripped_value)

try:
numeric_value = float(stripped_value)
except ValueError:
self.logger.error("Value cannot be interpreted as a floating point number and is not valid response value.",
self.logger.error("Value cannot be interpreted as a floating point number and is not valid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

if math.isnan(numeric_value):
self.logger.error("Value is NaN, therefore, not a valid response value.",
self.logger.error("Value is NaN, therefore, not a valid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

if math.isinf(numeric_value):
self.logger.error("Value is infinite and, therefore, not a valid response value.",
self.logger.error("Value is infinite and, therefore, not a valid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

if numeric_value % 1 == 0 and not hasTruncSymbol:
self.logger.warning("Value has no decimals and may represent an invalid response value.",
self.logger.warning("Value has no decimals and may represent an invalid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})

return

class GenericAssayCategoricalValidator(GenericAssayWiseFileValidator):

    """Validator for generic assay data files that hold categorical values.

    Rules applied to every sample cell:
      (1) any non-empty string is accepted as a category;
      (2) a value listed in NULL_VALUES is accepted and means the value
          was not tested on that sample.
    """

    def __init__(self, *args, **kwargs):
        """Forward all constructor arguments to the parent validator."""
        super(GenericAssayCategoricalValidator, self).__init__(*args, **kwargs)

    def checkValue(self, value, col_index):
        """Validate one sample cell; log an error when it is empty.

        `value` is the raw cell text and `col_index` its zero-based column
        position (reported one-based in the log record). Always returns None.
        """
        cell = value.strip()

        # Null markers are legal as-is; nothing more to verify.
        if cell in self.NULL_VALUES:
            return

        # Whitespace-only and empty cells both end up as '' after stripping.
        if not cell:
            location = {'line_number': self.line_number,
                        'column_number': col_index + 1,
                        'cause': value}
            self.logger.error("Cell is empty. A categorical value is expected. Use 'NA' to indicate missing values.",
                              extra=location)

        return

class GenericAssayBinaryValidator(GenericAssayWiseFileValidator):

""" Validator for files containing generic assay binary values.
"""
def __init__(self, *args, **kwargs):
"""Initialize the instance attributes of the data file validator."""
super(GenericAssayBinaryValidator, self).__init__(*args, **kwargs)

# (1) values defined in ALLOWED_VALUES
# (2) NA cell value is allowed; means value was not tested on a sample

ALLOWED_VALUES = ['yes', 'no', 'true', 'false'] + GenericAssayWiseFileValidator.NULL_VALUES
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a small question. Are the binary values fixed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, in the RFC, we planned to have a controlled vocabulary for binary type: https://docs.google.com/document/d/1-6O16_j5b5LeHA5SnChnlEKQTYhcwNh4AEwCxB8FwC8/edit?disco=AAAAHCuNjcs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. We can extend the list if we have to introduce other binary values.


def checkValue(self, value, col_index):
    """Validate one sample cell of a binary generic assay file.

    The cell is stripped of surrounding whitespace and accepted when it is
    a null marker (NULL_VALUES) or one of ALLOWED_VALUES; anything else is
    logged as an error. `col_index` is zero-based, the logged column number
    is one-based. Always returns None.
    """
    cell = value.strip()

    # Null markers are checked first, then the controlled vocabulary.
    if cell in self.NULL_VALUES or cell in self.ALLOWED_VALUES:
        return

    location = {'line_number': self.line_number,
                'column_number': col_index + 1,
                'cause': value}
    # Keep the lazy %-style argument so the logger formats the message.
    self.logger.error(
        'Invalid generic assay binary value: possible values are [%s]',
        ', '.join(self.ALLOWED_VALUES),
        extra=location)
    return

# ------------------------------------------------------------------------------
# Functions

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public void testImportGenericAssayData() throws Exception {

// import data and test all mutational signatures were added
ImportGenericAssayEntity.importData(file, GeneticAlterationType.GENERIC_ASSAY, "name,description", false);
assertEquals(61, getNumRecordsForGenericAssay());
assertEquals(60, getNumRecordsForGenericAssay());

// test whether a record can be retrieved via stable id
GenericAssayMeta genericAssayMeta1 = DaoGenericAssay.getGenericAssayMetaByStableId("mean_1");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -349,19 +349,19 @@ public void testLoadStudyEs0() throws Throwable {
assertEquals("this is an optional custom case list", customSampleList.getName());

// ===== check mutational signature
String testMutationalSignatureStableIds = "Nmut";
String testMutationalSignatureStableIds = "mean_1";
String testMutationalSignatureMolecularProfileIds = "study_es_0_mutational_signature";
assertNotNull(DaoGeneticEntity.getGeneticEntityByStableId(testMutationalSignatureStableIds));
// ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01
// TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01
// TCGA-BH-A0HP-01 TCGA-BH-A18P-01
// Nmut ... ... ... 18 3 32 13 3 4 1 7
// mean_1 ... ... ... 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
GenericAssayService genericAssayService = applicationContext.getBean(GenericAssayService.class);
List<GenericAssayData> mutationalSignatureData = genericAssayService.fetchGenericAssayData(Arrays.asList(testMutationalSignatureMolecularProfileIds),
Arrays.asList("TCGA-A1-A0SB-01", "TCGA-A1-A0SH-01"), Arrays.asList(testMutationalSignatureStableIds), PersistenceConstants.SUMMARY_PROJECTION);
assertEquals(2, mutationalSignatureData.size());
assertEquals("18", mutationalSignatureData.get(0).getValue());
assertEquals("13", mutationalSignatureData.get(1).getValue());
assertEquals("0.370266873", mutationalSignatureData.get(0).getValue());
assertEquals("0.022753384", mutationalSignatureData.get(1).getValue());

// ===== check GSVA data
// ...
Expand Down
1 change: 0 additions & 1 deletion core/src/test/resources/data_mutational_signature.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
Nmut Nmut number of mutations 18 3 32 13 3 4 1 7
mean_1 mean_1 mean_1 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_10 mean_10 mean_10 0.002709404 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_11 mean_11 mean_11 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421
Expand Down
1 change: 0 additions & 1 deletion core/src/test/resources/data_mutational_signature_new.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
Nmut Nmut number of mutations 18 3 32 13 3 4 1 7
mean_1 mean_1 new mean_1 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_10 mean_10 mean_10 0.002709404 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_11 mean_11 mean_11 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 category_1 category_2 category_2 category_2 category_1 category_1 category_3 category_4
mean_2 mean_2 mean_2 url_2 category_1 category_1 category_2 category_1 category_2 category_3 category_4
mean_3 mean_3 mean_3 url_3 category_2 category_1 category_2 category_2 category_1 category_2 NA category_3
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 true false false false true true yes no
mean_2 mean_2 mean_2 url_2 false true true false true false yes no
mean_3 mean_3 mean_3 url_3 false true false false true false NA yes
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 category_1 category_2 category_2 category_2 category_1 category_1 category_3 category_4
mean_2 mean_2 mean_2 url_2 category_2 category_1 category_1 category_2 category_1 category_2 category_3 category_4
mean_3 mean_3 mean_3 url_3 category_2 category_1 category_2 category_2 category_1 category_2 NA category_3
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 NA 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_2 mean_2 mean_2 url_2 0.002709404 0.009212318 0.002650657 <0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_3 mean_3 mean_3 url_3 0.006035782 >0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 NA 0.020463421
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 NA 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_2 mean_2 mean_2 url_2 NON_NUMERIC 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_3 mean_3 mean_3 url_3 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 true false false false true true yes no
mean_2 mean_2 mean_2 url_2 false NOT_DEFINED true false true false yes no
mean_3 mean_3 mean_3 url_3 false true false false true false NA yes
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 url_1 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_2 mean_2 url_2 0.002709404 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_3 mean_3 url_3 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421

This file was deleted.

This file was deleted.