Skip to content

Commit

Permalink
generic assay binary data and categorical data validator
Browse files Browse the repository at this point in the history
  • Loading branch information
dippindots authored and Gaofei Zhao committed Oct 26, 2020
1 parent e944d9c commit 38dc58b
Show file tree
Hide file tree
Showing 24 changed files with 270 additions and 87 deletions.
2 changes: 2 additions & 0 deletions core/src/main/scripts/importer/allowed_data_types.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,6 @@ PROTEIN_LEVEL Z-SCORE protein_quantification_zscores
GENESET_SCORE GSVA-SCORE gsva_scores
GENESET_SCORE P-VALUE gsva_pvalues
GENERIC_ASSAY LIMIT-VALUE *
GENERIC_ASSAY BINARY *
GENERIC_ASSAY CATEGORICAL *
STRUCTURAL_VARIANT SV structural_variants
38 changes: 34 additions & 4 deletions core/src/main/scripts/importer/cbioportal_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ class MetaFileTypes(object):
GENE_PANEL_MATRIX = 'meta_gene_panel_matrix'
GSVA_SCORES = 'meta_gsva_scores'
GSVA_PVALUES = 'meta_gsva_pvalues'
GENERIC_ASSAY = 'meta_generic_assay'
GENERIC_ASSAY_CONTINUOUS = 'meta_generic_assay_continuous'
GENERIC_ASSAY_BINARY = 'meta_generic_assay_binary'
GENERIC_ASSAY_CATEGORICAL = 'meta_generic_assay_categorical'
STRUCTURAL_VARIANT = 'meta_structural_variants'
SAMPLE_RESOURCES = 'meta_resource_sample'
PATIENT_RESOURCES = 'meta_resource_patient'
Expand Down Expand Up @@ -257,7 +259,7 @@ class MetaFileTypes(object):
'show_profile_in_analysis_tab': True,
'geneset_def_version': True
},
MetaFileTypes.GENERIC_ASSAY: {
MetaFileTypes.GENERIC_ASSAY_CONTINUOUS: {
'cancer_study_identifier': True,
'genetic_alteration_type': True,
'generic_assay_type': True,
Expand All @@ -271,6 +273,30 @@ class MetaFileTypes(object):
'pivot_threshold_value': False,
'value_sort_order': False
},
MetaFileTypes.GENERIC_ASSAY_BINARY: {
'cancer_study_identifier': True,
'genetic_alteration_type': True,
'generic_assay_type': True,
'datatype': True,
'stable_id': True,
'profile_name': True,
'profile_description': True,
'data_filename': True,
'show_profile_in_analysis_tab': True,
'generic_entity_meta_properties': False
},
MetaFileTypes.GENERIC_ASSAY_CATEGORICAL: {
'cancer_study_identifier': True,
'genetic_alteration_type': True,
'generic_assay_type': True,
'datatype': True,
'stable_id': True,
'profile_name': True,
'profile_description': True,
'data_filename': True,
'show_profile_in_analysis_tab': True,
'generic_entity_meta_properties': False
},
MetaFileTypes.STRUCTURAL_VARIANT: {
'cancer_study_identifier': True,
'genetic_alteration_type': True,
Expand Down Expand Up @@ -325,7 +351,9 @@ class MetaFileTypes(object):
MetaFileTypes.GENE_PANEL_MATRIX: "org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap",
MetaFileTypes.GSVA_SCORES: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.GSVA_PVALUES: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.GENERIC_ASSAY: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.GENERIC_ASSAY_CONTINUOUS: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.GENERIC_ASSAY_BINARY: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.GENERIC_ASSAY_CATEGORICAL: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.STRUCTURAL_VARIANT: "org.mskcc.cbio.portal.scripts.ImportProfileData",
MetaFileTypes.SAMPLE_RESOURCES: "org.mskcc.cbio.portal.scripts.ImportResourceData",
MetaFileTypes.PATIENT_RESOURCES: "org.mskcc.cbio.portal.scripts.ImportResourceData",
Expand Down Expand Up @@ -594,7 +622,9 @@ def get_meta_file_type(meta_dictionary, logger, filename):
("MUTSIG", "Q-VALUE"): MetaFileTypes.MUTATION_SIGNIFICANCE,
("GENESET_SCORE", "GSVA-SCORE"): MetaFileTypes.GSVA_SCORES,
("GENESET_SCORE", "P-VALUE"): MetaFileTypes.GSVA_PVALUES,
("GENERIC_ASSAY", "LIMIT-VALUE"): MetaFileTypes.GENERIC_ASSAY
("GENERIC_ASSAY", "LIMIT-VALUE"): MetaFileTypes.GENERIC_ASSAY_CONTINUOUS,
("GENERIC_ASSAY", "BINARY"): MetaFileTypes.GENERIC_ASSAY_BINARY,
("GENERIC_ASSAY", "CATEGORICAL"): MetaFileTypes.GENERIC_ASSAY_CATEGORICAL
}
result = None
if 'genetic_alteration_type' in meta_dictionary and 'datatype' in meta_dictionary:
Expand Down
112 changes: 83 additions & 29 deletions core/src/main/scripts/importer/validateData.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@
cbioportal_common.MetaFileTypes.GENE_PANEL_MATRIX:'GenePanelMatrixValidator',
cbioportal_common.MetaFileTypes.GSVA_SCORES:'GsvaScoreValidator',
cbioportal_common.MetaFileTypes.GSVA_PVALUES:'GsvaPvalueValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY:'GenericAssayValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY_CONTINUOUS:'GenericAssayContinuousValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY_BINARY:'GenericAssayBinaryValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY_CATEGORICAL:'GenericAssayCategoricalValidator',
cbioportal_common.MetaFileTypes.STRUCTURAL_VARIANT:'StructuralVariantValidator',
cbioportal_common.MetaFileTypes.SAMPLE_RESOURCES:'SampleResourceValidator',
cbioportal_common.MetaFileTypes.PATIENT_RESOURCES:'PatientResourceValidator',
Expand Down Expand Up @@ -4334,6 +4336,7 @@ def __init__(self, *args, **kwargs):
REQUIRED_HEADERS = ['ENTITY_STABLE_ID']
OPTIONAL_HEADERS = []
UNIQUE_COLUMNS = ['ENTITY_STABLE_ID']
NULL_VALUES = ["NA"]

def parseFeatureColumns(self, nonsample_col_vals):
"""Check the IDs in the first column."""
Expand All @@ -4343,21 +4346,15 @@ def parseFeatureColumns(self, nonsample_col_vals):
extra={'line_number': self.line_number,
'column_number': 1,
'cause': nonsample_col_vals[0]})
return None

def checkValue(self, value, col_index):
"""Check a value in a sample column."""
stripped_value = value.strip()
if stripped_value not in self.NULL_VALUES and not self.checkFloat(stripped_value):
self.logger.error("Value is neither a real number nor " + ', '.join(self.NULL_VALUES),
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return value

class GenericAssayValidator(GenericAssayWiseFileValidator):
class GenericAssayContinuousValidator(GenericAssayWiseFileValidator):

""" Validator for files containing generic assay values.
""" Validator for files containing generic assay limit continuous values.
"""
def __init__(self, *args, **kwargs):
"""Initialize the instance attributes of the data file validator."""
super(GenericAssayContinuousValidator, self).__init__(*args, **kwargs)

# (1) Natural positive number (not 0)
# (2) Number may be prefixed by ">" or "<"; f.i. ">n" means that the real value lies beyond value n.
Expand All @@ -4367,56 +4364,113 @@ class GenericAssayValidator(GenericAssayWiseFileValidator):
# (1) Cell contains a value without decimals and is not prefixed by ">"; value appears to be truncated but lacks ">" truncation indicator
def checkValue(self, value, col_index):
"""Check a value in a sample column."""
stripped_value = value.strip()
# if the value is prefixed with '>' or '<' remove this prefix
# prior to evaluation of the numeric value
hasTruncSymbol = re.match("^[><]", stripped_value)
stripped_value = re.sub(r"^[><]\s*","", stripped_value)

# do not check null values
# 'NA' is an allowed value. No further validations apply.
if stripped_value in self.NULL_VALUES:
return

# value is not defined (empty cell)
stripped_value = value.strip()
if stripped_value == "":
self.logger.error("Cell is empty. A response value value is expected. Use 'NA' to indicate missing values.",
if len(stripped_value) == 0:
self.logger.error("Cell is empty. A value is expected. Use 'NA' to indicate missing values.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

# 'NA' is an allowed value. No further validations apply.
if stripped_value == 'NA':
return

# if the value is prefixed with '>' or '<' remove this prefix
# prior to evaluation of the numeric value
hasTruncSymbol = re.match("^[><]", stripped_value)
stripped_value = re.sub(r"^[><]\s*","", stripped_value)

try:
numeric_value = float(stripped_value)
except ValueError:
self.logger.error("Value cannot be interpreted as a floating point number and is not valid response value.",
self.logger.error("Value cannot be interpreted as a floating point number and is not valid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

if math.isnan(numeric_value):
self.logger.error("Value is NaN, therefore, not a valid response value.",
self.logger.error("Value is NaN, therefore, not a valid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

if math.isinf(numeric_value):
self.logger.error("Value is infinite and, therefore, not a valid response value.",
self.logger.error("Value is infinite and, therefore, not a valid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

if numeric_value % 1 == 0 and not hasTruncSymbol:
self.logger.warning("Value has no decimals and may represent an invalid response value.",
self.logger.warning("Value has no decimals and may represent an invalid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})

return

class GenericAssayCategoricalValidator(GenericAssayWiseFileValidator):

    """Validator for data files holding categorical generic assay values.

    A sample cell is accepted when, after surrounding whitespace is
    stripped, it is either one of NULL_VALUES (meaning the value was not
    tested on that sample) or any non-empty string.
    """

    def __init__(self, *args, **kwargs):
        """Pass all arguments through unchanged to the parent validator."""
        super(GenericAssayCategoricalValidator, self).__init__(*args, **kwargs)

    def checkValue(self, value, col_index):
        """Log an error if a sample-column cell is blank.

        value is the raw cell text and col_index the zero-based column
        position; the logged column number is one-based. Nothing is
        returned.
        """
        cell = value.strip()
        # null markers are checked first and always pass; beyond that,
        # only a cell that is empty after stripping is rejected
        if cell not in self.NULL_VALUES and not cell:
            self.logger.error("Cell is empty. A categorical value is expected. Use 'NA' to indicate missing values.",
                              extra={'line_number': self.line_number,
                                     'column_number': col_index + 1,
                                     'cause': value})

class GenericAssayBinaryValidator(GenericAssayWiseFileValidator):

    """Validator for data files holding binary generic assay values.

    A sample cell is accepted when, after surrounding whitespace is
    stripped, it is one of NULL_VALUES (meaning the value was not tested
    on that sample) or one of the literals listed in ALLOWED_VALUES.
    The comparison is an exact, case-sensitive string match.
    """

    # every cell text this validator accepts: the binary literals plus
    # the null markers declared on the parent class
    ALLOWED_VALUES = ['yes', 'no', 'true', 'false'] + GenericAssayWiseFileValidator.NULL_VALUES

    def __init__(self, *args, **kwargs):
        """Pass all arguments through unchanged to the parent validator."""
        super(GenericAssayBinaryValidator, self).__init__(*args, **kwargs)

    def checkValue(self, value, col_index):
        """Log an error if a sample-column cell is not an allowed value.

        value is the raw cell text and col_index the zero-based column
        position; the logged column number is one-based. Nothing is
        returned.
        """
        cell = value.strip()
        # null markers are tested first, then the allowed literals;
        # either match means there is nothing to report
        if cell in self.NULL_VALUES or cell in self.ALLOWED_VALUES:
            return
        self.logger.error(
            'Invalid generic assay binary value: possible values are [%s]',
            ', '.join(self.ALLOWED_VALUES),
            extra={'line_number': self.line_number,
                   'column_number': col_index + 1,
                   'cause': value})

# ------------------------------------------------------------------------------
# Functions

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public void testImportGenericAssayData() throws Exception {

// import data and test all mutational signatures were added
ImportGenericAssayEntity.importData(file, GeneticAlterationType.GENERIC_ASSAY, "name,description", false);
assertEquals(61, getNumRecordsForGenericAssay());
assertEquals(60, getNumRecordsForGenericAssay());

// test whether a record can be retrieved via stable id
GenericAssayMeta genericAssayMeta1 = DaoGenericAssay.getGenericAssayMetaByStableId("mean_1");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -349,19 +349,19 @@ public void testLoadStudyEs0() throws Throwable {
assertEquals("this is an optional custom case list", customSampleList.getName());

// ===== check mutational signature
String testMutationalSignatureStableIds = "Nmut";
String testMutationalSignatureStableIds = "mean_1";
String testMutationalSignatureMolecularProfileIds = "study_es_0_mutational_signature";
assertNotNull(DaoGeneticEntity.getGeneticEntityByStableId(testMutationalSignatureStableIds));
// ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01
// TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01
// TCGA-BH-A0HP-01 TCGA-BH-A18P-01
// Nmut ... ... ... 18 3 32 13 3 4 1 7
// mean_1 ... ... ... 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
GenericAssayService genericAssayService = applicationContext.getBean(GenericAssayService.class);
List<GenericAssayData> mutationalSignatureData = genericAssayService.fetchGenericAssayData(Arrays.asList(testMutationalSignatureMolecularProfileIds),
Arrays.asList("TCGA-A1-A0SB-01", "TCGA-A1-A0SH-01"), Arrays.asList(testMutationalSignatureStableIds), PersistenceConstants.SUMMARY_PROJECTION);
assertEquals(2, mutationalSignatureData.size());
assertEquals("18", mutationalSignatureData.get(0).getValue());
assertEquals("13", mutationalSignatureData.get(1).getValue());
assertEquals("0.370266873", mutationalSignatureData.get(0).getValue());
assertEquals("0.022753384", mutationalSignatureData.get(1).getValue());

// ===== check GSVA data
// ...
Expand Down
1 change: 0 additions & 1 deletion core/src/test/resources/data_mutational_signature.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
Nmut Nmut number of mutations 18 3 32 13 3 4 1 7
mean_1 mean_1 mean_1 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_10 mean_10 mean_10 0.002709404 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_11 mean_11 mean_11 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421
Expand Down
1 change: 0 additions & 1 deletion core/src/test/resources/data_mutational_signature_new.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
Nmut Nmut number of mutations 18 3 32 13 3 4 1 7
mean_1 mean_1 new mean_1 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_10 mean_10 mean_10 0.002709404 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_11 mean_11 mean_11 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 category_1 category_2 category_2 category_2 category_1 category_1 category_3 category_4
mean_2 mean_2 mean_2 url_2 category_1 category_1 category_2 category_1 category_2 category_3 category_4
mean_3 mean_3 mean_3 url_3 category_2 category_1 category_2 category_2 category_1 category_2 NA category_3
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 true false false false true true yes no
mean_2 mean_2 mean_2 url_2 false true true false true false yes no
mean_3 mean_3 mean_3 url_3 false true false false true false NA yes
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 category_1 category_2 category_2 category_2 category_1 category_1 category_3 category_4
mean_2 mean_2 mean_2 url_2 category_2 category_1 category_1 category_2 category_1 category_2 category_3 category_4
mean_3 mean_3 mean_3 url_3 category_2 category_1 category_2 category_2 category_1 category_2 NA category_3
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 NA 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_2 mean_2 mean_2 url_2 0.002709404 0.009212318 0.002650657 <0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_3 mean_3 mean_3 url_3 0.006035782 >0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 NA 0.020463421
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 NA 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_2 mean_2 mean_2 url_2 NON_NUMERIC 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_3 mean_3 mean_3 url_3 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 true false false false true true yes no
mean_2 mean_2 mean_2 url_2 false NOT_DEFINED true false true false yes no
mean_3 mean_3 mean_3 url_3 false true false false true false NA yes
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 url_1 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_2 mean_2 url_2 0.002709404 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_3 mean_3 url_3 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421

This file was deleted.

This file was deleted.

0 comments on commit 38dc58b

Please sign in to comment.