Skip to content

Commit

Permalink
generic assay binary data and categorical data validator
Browse files Browse the repository at this point in the history
  • Loading branch information
dippindots committed Oct 16, 2020
1 parent 59729fd commit 669316a
Show file tree
Hide file tree
Showing 22 changed files with 229 additions and 77 deletions.
2 changes: 2 additions & 0 deletions core/src/main/scripts/importer/allowed_data_types.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,6 @@ PROTEIN_LEVEL Z-SCORE protein_quantification_zscores
GENESET_SCORE GSVA-SCORE gsva_scores
GENESET_SCORE P-VALUE gsva_pvalues
GENERIC_ASSAY LIMIT-VALUE *
GENERIC_ASSAY BINARY *
GENERIC_ASSAY CATEGORICAL *
STRUCTURAL_VARIANT SV structural_variants
6 changes: 5 additions & 1 deletion core/src/main/scripts/importer/cbioportal_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ class MetaFileTypes(object):
GSVA_SCORES = 'meta_gsva_scores'
GSVA_PVALUES = 'meta_gsva_pvalues'
GENERIC_ASSAY = 'meta_generic_assay'
GENERIC_ASSAY_BINARY = 'meta_generic_assay_binary'
GENERIC_ASSAY_CATEGORICAL = 'meta_generic_assay_categorical'
STRUCTURAL_VARIANT = 'meta_structural_variants'
SAMPLE_RESOURCES = 'meta_resource_sample'
PATIENT_RESOURCES = 'meta_resource_patient'
Expand Down Expand Up @@ -593,7 +595,9 @@ def get_meta_file_type(meta_dictionary, logger, filename):
("MUTSIG", "Q-VALUE"): MetaFileTypes.MUTATION_SIGNIFICANCE,
("GENESET_SCORE", "GSVA-SCORE"): MetaFileTypes.GSVA_SCORES,
("GENESET_SCORE", "P-VALUE"): MetaFileTypes.GSVA_PVALUES,
("GENERIC_ASSAY", "LIMIT-VALUE"): MetaFileTypes.GENERIC_ASSAY
("GENERIC_ASSAY", "LIMIT-VALUE"): MetaFileTypes.GENERIC_ASSAY,
("GENERIC_ASSAY", "BINARY"): MetaFileTypes.GENERIC_ASSAY_BINARY,
("GENERIC_ASSAY", "CATEGORICAL"): MetaFileTypes.GENERIC_ASSAY_CATEGORICAL
}
result = None
if 'genetic_alteration_type' in meta_dictionary and 'datatype' in meta_dictionary:
Expand Down
108 changes: 81 additions & 27 deletions core/src/main/scripts/importer/validateData.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@
cbioportal_common.MetaFileTypes.GSVA_SCORES:'GsvaScoreValidator',
cbioportal_common.MetaFileTypes.GSVA_PVALUES:'GsvaPvalueValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY:'GenericAssayValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY_BINARY:'GenericAssayBinaryValidator',
cbioportal_common.MetaFileTypes.GENERIC_ASSAY_CATEGORICAL:'GenericAssayCategoricalValidator',
cbioportal_common.MetaFileTypes.STRUCTURAL_VARIANT:'StructuralVariantValidator',
cbioportal_common.MetaFileTypes.SAMPLE_RESOURCES:'SampleResourceValidator',
cbioportal_common.MetaFileTypes.PATIENT_RESOURCES:'PatientResourceValidator',
Expand Down Expand Up @@ -4328,6 +4330,7 @@ def __init__(self, *args, **kwargs):
REQUIRED_HEADERS = ['ENTITY_STABLE_ID']
OPTIONAL_HEADERS = []
UNIQUE_COLUMNS = ['ENTITY_STABLE_ID']
NULL_VALUES = ["NA"]

def parseFeatureColumns(self, nonsample_col_vals):
"""Check the IDs in the first column."""
Expand All @@ -4337,21 +4340,15 @@ def parseFeatureColumns(self, nonsample_col_vals):
extra={'line_number': self.line_number,
'column_number': 1,
'cause': nonsample_col_vals[0]})
return None

def checkValue(self, value, col_index):
"""Check a value in a sample column."""
stripped_value = value.strip()
if stripped_value not in self.NULL_VALUES and not self.checkFloat(stripped_value):
self.logger.error("Value is neither a real number nor " + ', '.join(self.NULL_VALUES),
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return value

class GenericAssayValidator(GenericAssayWiseFileValidator):

""" Validator for files containing generic assay values.
""" Validator for files containing generic assay limit continuous values.
"""
def __init__(self, *args, **kwargs):
"""Initialize the instance attributes of the data file validator."""
super(GenericAssayValidator, self).__init__(*args, **kwargs)

# (1) Natural positive number (not 0)
# (2) Number may be prefixed by ">" or "<"; e.g. ">n" means that the real value lies beyond value n.
Expand All @@ -4361,56 +4358,113 @@ class GenericAssayValidator(GenericAssayWiseFileValidator):
# (1) Cell contains a value without decimals and is not prefixed by ">"; value appears to be truncated but lacks ">" truncation indicator
def checkValue(self, value, col_index):
"""Check a value in a sample column."""
stripped_value = value.strip()
# if the value is prefixed with '>' or '<' remove this prefix
# prior to evaluation of the numeric value
hasTruncSymbol = re.match("^[><]", stripped_value)
stripped_value = re.sub(r"^[><]\s*","", stripped_value)

# do not check null values
# 'NA' is an allowed value. No further validations apply.
if stripped_value in self.NULL_VALUES:
return

# value is not defined (empty cell)
stripped_value = value.strip()
if stripped_value == "":
self.logger.error("Cell is empty. A response value value is expected. Use 'NA' to indicate missing values.",
if len(stripped_value) == 0:
self.logger.error("Cell is empty. A value is expected. Use 'NA' to indicate missing values.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

# 'NA' is an allowed value. No further validations apply.
if stripped_value == 'NA':
return

# if the value is prefixed with '>' or '<' remove this prefix
# prior to evaluation of the numeric value
hasTruncSymbol = re.match("^[><]", stripped_value)
stripped_value = re.sub(r"^[><]\s*","", stripped_value)

try:
numeric_value = float(stripped_value)
except ValueError:
self.logger.error("Value cannot be interpreted as a floating point number and is not valid response value.",
self.logger.error("Value cannot be interpreted as a floating point number and is not valid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

if math.isnan(numeric_value):
self.logger.error("Value is NaN, therefore, not a valid response value.",
self.logger.error("Value is NaN, therefore, not a valid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

if math.isinf(numeric_value):
self.logger.error("Value is infinite and, therefore, not a valid response value.",
self.logger.error("Value is infinite and, therefore, not a valid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})
return

if numeric_value % 1 == 0 and not hasTruncSymbol:
self.logger.warning("Value has no decimals and may represent an invalid response value.",
self.logger.warning("Value has no decimals and may represent an invalid value.",
extra={'line_number': self.line_number,
'column_number': col_index + 1,
'cause': value})

return

class GenericAssayCategoricalValidator(GenericAssayWiseFileValidator):

    """ Validator for generic assay data files holding categorical values.

    A sample cell is accepted when it is either:
    (1) any non-empty string, or
    (2) one of the NULL_VALUES markers (the value was not tested on a sample).
    """

    def __init__(self, *args, **kwargs):
        """Set up the validator by delegating to the parent initializer."""
        super(GenericAssayCategoricalValidator, self).__init__(*args, **kwargs)

    def checkValue(self, value, col_index):
        """Validate one sample-column cell; log an error if it is blank.

        `value` is the raw cell text and `col_index` the zero-based column
        position (reported one-based in the log record).
        """
        cell = value.strip()

        # Null markers such as 'NA' are accepted as-is, and any other
        # non-blank text is a legal category; only a blank cell is an error.
        if cell not in self.NULL_VALUES and not cell:
            location = {'line_number': self.line_number,
                        'column_number': col_index + 1,
                        'cause': value}
            self.logger.error("Cell is empty. A categorical value is expected. Use 'NA' to indicate missing values.",
                              extra=location)

class GenericAssayBinaryValidator(GenericAssayWiseFileValidator):

    """ Validator for generic assay data files holding binary values.

    A sample cell is accepted when it is either:
    (1) one of the strings listed in ALLOWED_VALUES, or
    (2) one of the NULL_VALUES markers (the value was not tested on a sample).
    """

    # Accepted cell contents: the binary literals plus the parent's null markers.
    ALLOWED_VALUES = ['yes', 'no', 'true', 'false'] + GenericAssayWiseFileValidator.NULL_VALUES

    def __init__(self, *args, **kwargs):
        """Set up the validator by delegating to the parent initializer."""
        super(GenericAssayBinaryValidator, self).__init__(*args, **kwargs)

    def checkValue(self, value, col_index):
        """Validate one sample-column cell against the allowed binary values.

        `value` is the raw cell text and `col_index` the zero-based column
        position (reported one-based in the log record). The comparison is
        done on the whitespace-stripped text and is case-sensitive.
        """
        cell = value.strip()

        # Null markers and recognised binary literals both pass untouched.
        if cell in self.NULL_VALUES or cell in self.ALLOWED_VALUES:
            return

        location = {'line_number': self.line_number,
                    'column_number': col_index + 1,
                    'cause': value}
        self.logger.error(
            'Invalid generic assay binary value: possible values are [%s]',
            ', '.join(self.ALLOWED_VALUES),
            extra=location)

# ------------------------------------------------------------------------------
# Functions

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -349,19 +349,19 @@ public void testLoadStudyEs0() throws Throwable {
assertEquals("this is an optional custom case list", customSampleList.getName());

// ===== check mutational signature
String testMutationalSignatureStableIds = "Nmut";
String testMutationalSignatureStableIds = "mean_1";
String testMutationalSignatureMolecularProfileIds = "study_es_0_mutational_signature";
assertNotNull(DaoGeneticEntity.getGeneticEntityByStableId(testMutationalSignatureStableIds));
// ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01
// TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01
// TCGA-BH-A0HP-01 TCGA-BH-A18P-01
// Nmut ... ... ... 18 3 32 13 3 4 1 7
// mean_1 ... ... ... 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
GenericAssayService genericAssayService = applicationContext.getBean(GenericAssayService.class);
List<GenericAssayData> mutationalSignatureData = genericAssayService.fetchGenericAssayData(Arrays.asList(testMutationalSignatureMolecularProfileIds),
Arrays.asList("TCGA-A1-A0SB-01", "TCGA-A1-A0SH-01"), Arrays.asList(testMutationalSignatureStableIds), PersistenceConstants.SUMMARY_PROJECTION);
assertEquals(2, mutationalSignatureData.size());
assertEquals("18", mutationalSignatureData.get(0).getValue());
assertEquals("13", mutationalSignatureData.get(1).getValue());
assertEquals("0.370266873", mutationalSignatureData.get(0).getValue());
assertEquals("0.022753384", mutationalSignatureData.get(1).getValue());

// ===== check GSVA data
// ...
Expand Down
1 change: 0 additions & 1 deletion core/src/test/resources/data_mutational_signature.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
Nmut Nmut number of mutations 18 3 32 13 3 4 1 7
mean_1 mean_1 mean_1 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_10 mean_10 mean_10 0.002709404 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_11 mean_11 mean_11 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421
Expand Down
1 change: 0 additions & 1 deletion core/src/test/resources/data_mutational_signature_new.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
Nmut Nmut number of mutations 18 3 32 13 3 4 1 7
mean_1 mean_1 new mean_1 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_10 mean_10 mean_10 0.002709404 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_11 mean_11 mean_11 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 category_1 category_2 category_2 category_2 category_1 category_1 category_3 category_4
mean_2 mean_2 mean_2 url_2 category_1 category_1 category_2 category_1 category_2 category_3 category_4
mean_3 mean_3 mean_3 url_3 category_2 category_1 category_2 category_2 category_1 category_2 NA category_3
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 true false false false true true yes no
mean_2 mean_2 mean_2 url_2 false true true false true false yes no
mean_3 mean_3 mean_3 url_3 false true false false true false NA yes
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 category_1 category_2 category_2 category_2 category_1 category_1 category_3 category_4
mean_2 mean_2 mean_2 url_2 category_2 category_1 category_1 category_2 category_1 category_2 category_3 category_4
mean_3 mean_3 mean_3 url_3 category_2 category_1 category_2 category_2 category_1 category_2 NA category_3
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 NA 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_2 mean_2 mean_2 url_2 0.002709404 0.009212318 0.002650657 <0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_3 mean_3 mean_3 url_3 0.006035782 >0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 NA 0.020463421
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 NA 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_2 mean_2 mean_2 url_2 NON_NUMERIC 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_3 mean_3 mean_3 url_3 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ENTITY_STABLE_ID name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 mean_1 url_1 true false false false true true yes no
mean_2 mean_2 mean_2 url_2 false NOT_DEFINED true false true false yes no
mean_3 mean_3 mean_3 url_3 false true false false true false NA yes
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name description url TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
mean_1 mean_1 url_1 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_2 mean_2 url_2 0.002709404 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_3 mean_3 url_3 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ENTITY_STABLE_ID name description confidenceStatement TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 TCGA-BH-A0HP-01 TCGA-BH-A18P-01
Nmut Nmut number of mutations NA 18 3 32 13 3 4 1 7
mean_1 mean_1 mean_1 Signature 1, the aging signature, is detected in this case. 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723
mean_10 mean_10 mean_10 Signature 10, the POLE signature, is detected in this case. It is associated with functions to the exonucleus domain of the POLE gene and enormous mutational burden. Oftentimes MMR signatures 6, 14,16, 20,21 and 26 co-occur with the POLE signature. 0.002709404 0.009212318 0.002650657 0.005475484 0.074175715 0.033049207 0.027323826 0.008861145
mean_11 mean_11 mean_11 Signature 11, the Temozolomide (TMZ) signature, is detected in this case. 0.006035782 0.010095773 0.011926486 0.010637541 0.012168938 0.006641113 0.025730547 0.020463421
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1196,7 +1196,7 @@ <h4 class="panel-title">data_mutational_signature.txt</h4>
<td><span class="glyphicon glyphicon-ok" aria-hidden="true"></span><span class="sr-only">Info</span></td>
<td>&ndash;</td>
<td>&ndash;</td>
<td>Read 62 lines. Lines with warning: 0. Lines with error: 0</td>
<td>Read 61 lines. Lines with warning: 0. Lines with error: 0</td>
<td>&ndash;</td>
</tr>
</tbody>
Expand Down

0 comments on commit 669316a

Please sign in to comment.