Skip to content

Commit

Permalink
Viterbi segmentation and segment quality calculation in gcnvkernel
Browse files Browse the repository at this point in the history
saving log emission posteriors to disk
put __init__ files back in ...
Viterbi decoder w/ theano.scan
doc updates for Viterbi
skeletons for HMM segmentation quality calculation
theano-based HMM log constrained probability calculation
fixed a notorious bug due to theano fancy indexing ...
left and right end point quality calculation for a segment
exact quality
viterbi API update
SAM sequence dictionary parsing
interval ordering using SAM sequence dictionary
lazy initialization of denoising workspace variables to make it efficient and re-usable for Viterbi segmentation
exporting baseline copy number for each sample (for java post-processing)
some I/O refactorings
loading configs from JSON
Viterbi segmentation engine
scattered model/calls assembly
some refactoring of denoising model
Viterbi segmentation and quality calculation finished
fixed numerical instability issues with segment quality calculation
Viterbi segmentation engine complete
Viterbi segmentation python script
some gcnvkernel refactoring
removed SAM sequence dictionary code
fixed the bug causing travis failure
  • Loading branch information
mbabadi committed Feb 5, 2018
1 parent 2efffce commit e4a3c54
Show file tree
Hide file tree
Showing 25 changed files with 1,382 additions and 184 deletions.
Expand Up @@ -23,6 +23,9 @@
io_intervals_and_counts, io_metadata, io_adamax
from .utils import cli_commons

# post-processing
from .postprocess.viterbi_segmentation import ViterbiSegmentationEngine

# structs
from .structs.interval import Interval

Expand Down
@@ -1 +1 @@
__version__ = '0.5.2'
__version__ = '0.6.0'
Expand Up @@ -4,3 +4,7 @@
# if a normalized PMF violates total probability by the following threshold, it will
# be normalized and a warning will be emitted
prob_sum_tol = 1e-10

# floating point decimals in reporting phred-scaled qualities
phred_decimals = 2

Expand Up @@ -7,6 +7,7 @@
import os
import json
import re
import filecmp

from .._version import __version__ as gcnvkernel_version
from . import io_consts
Expand Down Expand Up @@ -57,6 +58,12 @@ def get_sample_name_from_txt_file(input_path: str) -> str:
return line.strip()


def write_sample_name_to_txt_file(output_path: str, sample_name: str):
    """Persist the sample name as a single line in the canonical sample-name
    text file under the given output directory."""
    target_file = os.path.join(output_path, io_consts.default_sample_name_txt_filename)
    with open(target_file, 'w') as out:
        out.write(sample_name + '\n')


def assert_output_path_writable(output_path: str,
try_creating_output_path: bool = True):
"""Assert an output path is either writable or can be created upon request.
Expand Down Expand Up @@ -509,3 +516,11 @@ def assert_mandatory_columns(mandatory_columns_set: Set[str],
not_found_set = mandatory_columns_set.difference(found_columns_set)
assert len(not_found_set) == 0, "The following mandatory columns could not be found in \"{0}\"; " \
"cannot continue: {1}".format(input_tsv_file, not_found_set)


def assert_files_are_identical(input_file_1: str, input_file_2: str):
    """Asserts that both files exist and have byte-identical contents.

    Raises:
        AssertionError: if either file is missing or their contents differ.
    """
    # both inputs must exist before a byte-level comparison is meaningful
    for candidate in (input_file_1, input_file_2):
        assert os.path.isfile(candidate), "Cannot find {0}.".format(candidate)
    # shallow=False forces a full content comparison rather than an os.stat check
    files_match = filecmp.cmp(input_file_1, input_file_2, shallow=False)
    assert files_match, \
        "The following two files are expected to be identical: {0}, {1}".format(input_file_1, input_file_2)
Expand Up @@ -21,18 +21,29 @@
ploidy_column_name = 'PLOIDY'
ploidy_gq_column_name = 'PLOIDY_GQ'

# column names for copy-number segments file
copy_number_call_column_name = 'COPY_NUMBER_CALL'
num_spanning_intervals_column_name = 'NUM_SPANNING_INTERVALS'
some_quality_column_name = 'SOME_QUALITY'
exact_quality_column_name = 'EXACT_QUALITY'
start_quality_column_name = 'START_QUALITY'
end_quality_column_name = 'END_QUALITY'

# regular expression for matching sample name from header comment line
sample_name_header_regexp = "^@RG.*SM:(.*)[\t]*.*$"

# prefix for adding sample name as a header comment line
sample_name_header_prefix = "RG\tID:GATKCopyNumber\tSM:"
sample_name_sam_header_prefix = "RG\tID:GATKCopyNumber\tSM:"

# default file names for loading and saving models, posteriors, and configurations
default_sample_read_depth_tsv_filename = 'global_read_depth.tsv'
default_sample_name_txt_filename = 'sample_name.txt'
default_sample_contig_ploidy_tsv_filename = 'contig_ploidy.tsv'
default_copy_number_log_posterior_tsv_filename = "log_q_c_tc.tsv"
default_copy_number_log_emission_tsv_filename = "log_c_emission_tc.tsv"
default_class_log_posterior_tsv_filename = "log_q_tau_tk.tsv"
default_baseline_copy_number_tsv_filename = "baseline_copy_number_t.tsv"
default_copy_number_segments_tsv_filename = "copy_number_segments.tsv"

default_denoising_config_json_filename = "denoising_config.json"
default_calling_config_json_filename = "calling_config.json"
Expand Down
Expand Up @@ -4,7 +4,6 @@
import pymc3 as pm
import pandas as pd
import os
import json
from typing import List, Optional

from .. import config
Expand Down Expand Up @@ -93,52 +92,68 @@ def __call__(self):
self.input_path, self.denoising_model_approx, self.denoising_model)


def get_sample_posterior_path(calls_path: str, sample_index: int):
    """Returns the per-sample posterior directory path under the given calls path."""
    sample_folder_name = io_consts.sample_folder_prefix + repr(sample_index)
    return os.path.join(calls_path, sample_folder_name)


class SampleDenoisingAndCallingPosteriorsExporter:
"""Exports sample-specific model parameters and associated workspace variables to disk."""
def __init__(self,
             denoising_config: DenoisingModelConfig,
             calling_config: CopyNumberCallingConfig,
             denoising_calling_workspace: DenoisingCallingWorkspace,
             denoising_model: DenoisingModel,
             denoising_model_approx: pm.MeanField,
             output_path: str):
    """Stores the model, configs, workspace and approximation needed for export.

    NOTE(review): assumes io_commons.assert_output_path_writable raises when the
    path cannot be used — confirm against io_commons.
    """
    # validate the destination before retaining any state
    io_commons.assert_output_path_writable(output_path)
    self.output_path = output_path
    self.denoising_model = denoising_model
    self.denoising_model_approx = denoising_model_approx
    self.denoising_config = denoising_config
    self.calling_config = calling_config
    self.denoising_calling_workspace = denoising_calling_workspace

@staticmethod
def _export_sample_copy_number_log_posterior(sample_posterior_path: str,
log_q_c_tc: np.ndarray,
delimiter='\t',
comment='@',
extra_comment_lines: Optional[List[str]] = None):
assert isinstance(log_q_c_tc, np.ndarray)
assert log_q_c_tc.ndim == 2
num_copy_number_states = log_q_c_tc.shape[1]
def export_ndarray_tc_with_copy_number_header(sample_posterior_path: str,
ndarray_tc: np.ndarray,
output_file_name: str,
delimiter='\t',
comment='@',
extra_comment_lines: Optional[List[str]] = None):
assert isinstance(ndarray_tc, np.ndarray)
assert ndarray_tc.ndim == 2
num_copy_number_states = ndarray_tc.shape[1]
copy_number_header_columns = [io_consts.copy_number_column_prefix + str(cn)
for cn in range(num_copy_number_states)]
with open(os.path.join(sample_posterior_path,
io_consts.default_copy_number_log_posterior_tsv_filename), 'w') as f:
with open(os.path.join(sample_posterior_path, output_file_name), 'w') as f:
if extra_comment_lines is not None:
for comment_line in extra_comment_lines:
f.write(comment + comment_line + '\n')
f.write(delimiter.join(copy_number_header_columns) + '\n')
for ti in range(log_q_c_tc.shape[0]):
f.write(delimiter.join([repr(x) for x in log_q_c_tc[ti, :]]) + '\n')

@staticmethod
def _export_sample_name(sample_posterior_path: str,
sample_name: str):
with open(os.path.join(sample_posterior_path, io_consts.default_sample_name_txt_filename), 'w') as f:
f.write(sample_name + '\n')
for ti in range(ndarray_tc.shape[0]):
f.write(delimiter.join([repr(x) for x in ndarray_tc[ti, :]]) + '\n')

def __call__(self):
# export gcnvkernel version
io_commons.export_gcnvkernel_version(self.output_path)

# export denoising config
io_commons.export_dict_to_json_file(
os.path.join(self.output_path, io_consts.default_denoising_config_json_filename),
self.denoising_config.__dict__, set())

# export calling config
io_commons.export_dict_to_json_file(
os.path.join(self.output_path, io_consts.default_calling_config_json_filename),
self.calling_config.__dict__, set())

# extract meanfield parameters
approx_var_set, approx_mu_map, approx_std_map = io_commons.extract_meanfield_posterior_parameters(
self.denoising_model_approx)

for si, sample_name in enumerate(self.denoising_calling_workspace.sample_names):
sample_name_comment_line = [io_consts.sample_name_header_prefix + sample_name]
sample_posterior_path = os.path.join(self.output_path, io_consts.sample_folder_prefix + repr(si))
sample_name_comment_line = [io_consts.sample_name_sam_header_prefix + sample_name]
sample_posterior_path = get_sample_posterior_path(self.output_path, si)
_logger.info("Saving posteriors for sample \"{0}\" in \"{1}\"...".format(
sample_name, sample_posterior_path))
io_commons.assert_output_path_writable(sample_posterior_path, try_creating_output_path=True)
Expand All @@ -149,14 +164,29 @@ def __call__(self):
self.denoising_model, sample_name_comment_line)

# export sample name
self._export_sample_name(sample_posterior_path, sample_name)
io_commons.write_sample_name_to_txt_file(sample_posterior_path, sample_name)

# export copy number posterior
self._export_sample_copy_number_log_posterior(
# export copy number log posterior
self.export_ndarray_tc_with_copy_number_header(
sample_posterior_path,
self.denoising_calling_workspace.log_q_c_stc.get_value(borrow=True)[si, ...],
io_consts.default_copy_number_log_posterior_tsv_filename,
extra_comment_lines=sample_name_comment_line)

# export copy number log emission
self.export_ndarray_tc_with_copy_number_header(
sample_posterior_path,
self.denoising_calling_workspace.log_copy_number_emission_stc.get_value(borrow=True)[si, ...],
io_consts.default_copy_number_log_emission_tsv_filename,
extra_comment_lines=sample_name_comment_line)

# export baseline copy numbers
baseline_copy_number_t = self.denoising_calling_workspace.baseline_copy_number_sj[
si, self.denoising_calling_workspace.t_to_j_map.get_value(borrow=True)]
io_commons.write_ndarray_to_tsv(
os.path.join(sample_posterior_path, io_consts.default_baseline_copy_number_tsv_filename),
baseline_copy_number_t)


class SampleDenoisingAndCallingPosteriorsImporter:
"""Imports sample-specific model parameters and associated workspace variables from disk."""
Expand All @@ -170,24 +200,49 @@ def __init__(self,
self.denoising_model_approx = denoising_model_approx
self.input_calls_path = input_calls_path

@staticmethod
def import_ndarray_tc_with_copy_number_header(sample_posterior_path: str,
                                              input_file_name: str,
                                              delimiter='\t',
                                              comment='@') -> np.ndarray:
    """Reads a 2d (interval x copy-number-state) array from a .tsv file and
    validates that its header columns follow the copy-number naming scheme.

    Raises:
        AssertionError: if the header columns are not exactly the expected
            copy-number column names, or the loaded array is not 2d.
    """
    tsv_file = os.path.join(sample_posterior_path, input_file_name)
    table_pd = pd.read_csv(tsv_file, delimiter=delimiter, comment=comment)
    actual_columns = [str(column_name) for column_name in table_pd.columns.values]
    # the number of columns determines the expected copy-number state count
    expected_columns = [io_consts.copy_number_column_prefix + str(cn)
                        for cn in range(len(actual_columns))]
    assert actual_columns == expected_columns
    loaded_ndarray_tc = table_pd.values
    assert loaded_ndarray_tc.ndim == 2
    return loaded_ndarray_tc

def _import_sample_copy_number_log_posterior(self,
sample_posterior_path: str,
delimiter='\t',
comment='@') -> np.ndarray:
expected_copy_number_header_columns = [
io_consts.copy_number_column_prefix + str(cn)
for cn in range(self.denoising_calling_workspace.calling_config.num_copy_number_states)]
log_q_c_tc_tsv_file = os.path.join(sample_posterior_path,
io_consts.default_copy_number_log_posterior_tsv_filename)
log_q_c_tc_pd = pd.read_csv(log_q_c_tc_tsv_file, delimiter=delimiter, comment=comment)
imported_columns = log_q_c_tc_pd.columns.values
assert all([column in imported_columns for column in expected_copy_number_header_columns])
imported_log_q_c_tc = log_q_c_tc_pd.values
assert imported_log_q_c_tc.ndim == 2
imported_log_q_c_tc = self.import_ndarray_tc_with_copy_number_header(
sample_posterior_path,
io_consts.default_copy_number_log_posterior_tsv_filename,
delimiter=delimiter,
comment=comment)
assert imported_log_q_c_tc.shape == (self.denoising_calling_workspace.num_intervals,
self.denoising_calling_workspace.calling_config.num_copy_number_states)
return imported_log_q_c_tc

def _import_sample_copy_number_log_emission(self,
                                            sample_posterior_path: str,
                                            delimiter='\t',
                                            comment='@') -> np.ndarray:
    """Loads the copy-number log emission array for one sample from disk and
    asserts it matches the workspace's (num_intervals, num_states) shape."""
    workspace = self.denoising_calling_workspace
    log_emission_tc = self.import_ndarray_tc_with_copy_number_header(
        sample_posterior_path,
        io_consts.default_copy_number_log_emission_tsv_filename,
        delimiter=delimiter,
        comment=comment)
    expected_shape = (workspace.num_intervals,
                      workspace.calling_config.num_copy_number_states)
    assert log_emission_tc.shape == expected_shape
    return log_emission_tc

def __call__(self):
# assert that the interval list is the same
interval_list_tsv_file = os.path.join(self.input_calls_path, io_consts.default_interval_list_filename)
Expand All @@ -196,25 +251,35 @@ def __call__(self):
assert imported_interval_list == self.denoising_calling_workspace.interval_list

for si in range(self.denoising_calling_workspace.num_samples):
sample_posterior_path = os.path.join(self.input_calls_path, io_consts.sample_folder_prefix + repr(si))
sample_posterior_path = get_sample_posterior_path(self.input_calls_path, si)
assert os.path.exists(sample_posterior_path)

# import sample-specific posteriors and update approximation
io_commons.import_meanfield_sample_specific_params(
sample_posterior_path, si, self.denoising_calling_workspace.sample_names[si],
self.denoising_model_approx, self.denoising_model)

# import copy number posterior and update log_q_c_stc in workspace
# import copy number posterior and emission and update workspace
log_q_c_tc = self._import_sample_copy_number_log_posterior(sample_posterior_path)
log_copy_number_emission_tc = self._import_sample_copy_number_log_emission(sample_posterior_path)

def update_log_q_c_stc_for_sample(log_q_c_stc):
log_q_c_stc[si, ...] = log_q_c_tc[...]
return log_q_c_stc

def update_log_copy_number_emission_stc_for_sample(log_copy_number_emission_stc):
log_copy_number_emission_stc[si, ...] = log_copy_number_emission_tc[...]
return log_copy_number_emission_stc

self.denoising_calling_workspace.log_q_c_stc.set_value(
update_log_q_c_stc_for_sample(
self.denoising_calling_workspace.log_q_c_stc.get_value(borrow=True)),
borrow=True)

self.denoising_calling_workspace.log_copy_number_emission_stc.set_value(
update_log_copy_number_emission_stc_for_sample(
self.denoising_calling_workspace.log_copy_number_emission_stc.get_value(borrow=True)),
borrow=True)

# update auxiliary workspace variables
self.denoising_calling_workspace.update_auxiliary_vars()
Expand Up @@ -68,6 +68,29 @@ def load_interval_list_tsv_file(interval_list_tsv_file: str,
return _convert_interval_list_pandas_to_gcnv_interval_list(interval_list_pd, interval_list_tsv_file)


def extract_sam_sequence_dictionary_from_file(input_file: str):
    """Extract SAM sequence dictionary from a file.
    Notes:
        Only the leading contiguous run of SAM header lines (starting with '@')
        is collected; blank lines are skipped, and parsing stops at the first
        non-blank line that does not start with '@'.
    Returns:
        a list of str
    """
    header_lines: List[str] = []
    with open(input_file, 'r') as f:
        for raw_line in f:
            entry = raw_line.strip()
            if not entry:
                # blank lines do not terminate the header block
                continue
            if not entry.startswith('@'):
                break
            header_lines.append(entry)
    return header_lines


def load_counts_in_the_modeling_zone(read_count_file_list: List[str],
modeling_interval_list: List[Interval]) -> Tuple[List[str], np.ndarray]:
"""Loads read counts for a given cohort corresponding to a provided list of intervals.
Expand Down Expand Up @@ -153,7 +176,8 @@ def _convert_interval_list_pandas_to_gcnv_interval_list(interval_list_pd: pd.Dat
return interval_list


def write_interval_list_to_tsv_file(output_file: str, interval_list: List[Interval]):
def write_interval_list_to_tsv_file(output_file: str, interval_list: List[Interval],
sam_header_lines: Optional[List[str]] = None):
"""Write a list of interval list to .tsv file.
Note:
Expand All @@ -164,6 +188,7 @@ def write_interval_list_to_tsv_file(output_file: str, interval_list: List[Interv
Args:
output_file: output .tsv file
interval_list: list of intervals to write to .tsv file
sam_header_lines: (optional) SAM header lines
Returns:
None
Expand All @@ -182,6 +207,9 @@ def write_interval_list_to_tsv_file(output_file: str, interval_list: List[Interv
"Cannot write this annotation to a .tsv file; "
"Neglecting \"{0}\" and proceeding...".format(key))
with open(output_file, 'w') as out:
if sam_header_lines is not None:
for sam_header_line in sam_header_lines:
out.write(sam_header_line + '\n')
header = '\t'.join([io_consts.contig_column_name,
io_consts.start_column_name,
io_consts.end_column_name]
Expand Down
Expand Up @@ -116,7 +116,7 @@ def _export_sample_read_depth(sample_posterior_path: str,

def __call__(self):
for si, sample_name in enumerate(self.ploidy_workspace.sample_names):
sample_name_comment_line = [io_consts.sample_name_header_prefix + sample_name]
sample_name_comment_line = [io_consts.sample_name_sam_header_prefix + sample_name]
sample_posterior_path = os.path.join(self.output_path, io_consts.sample_folder_prefix + repr(si))
io_commons.assert_output_path_writable(sample_posterior_path, try_creating_output_path=True)
_logger.info("Saving posteriors for sample \"{0}\" in \"{1}\"...".format(
Expand All @@ -132,7 +132,7 @@ def __call__(self):
# generate sample ploidy metadata
sample_ploidy_metadata = SamplePloidyMetadata(
sample_name, ploidy_j, ploidy_genotyping_quality_j,
self.ploidy_workspace.interval_list_metadata.contig_list)
self.ploidy_workspace.interval_list_metadata.ordered_contig_list)

# generate sample read depth metadata
sample_read_depth_metadata = SampleReadDepthMetadata.generate_sample_read_depth_metadata(
Expand Down

0 comments on commit e4a3c54

Please sign in to comment.