Skip to content

Commit

Permalink
Viterbi segmentation and segment quality calculation in gcnvkernel
Browse files Browse the repository at this point in the history
saving log emission posteriors to disk
put __init__ files back in ...
Viterbi decoder w/ theano.scan
doc updates for Viterbi
skeletons for HMM segmentation quality calculation
theano-based HMM log constrained probability calculation
fixed a notorious bug due to theano fancy indexing ...
left and right end point quality calculation for a segment
exact quality
viterbi API update
SAM sequence dictionary parsing
interval ordering using SAM sequence dictionary
lazy initialization of denoising workspace variables to make it efficient and re-usable for Viterbi segmentation
exporting baseline copy number for each sample (for java post-processing)
some I/O refactorings
loading configs from JSON
Viterbi segmentation engine
scattered model/calls assembly
some refactoring of denoising model
Viterbi segmentation and quality calculation finished
fixed numerical instability issues with segment quality calculation
Viterbi segmentation engine complete
Viterbi segmentation python script
some gcnvkernel refactoring
removed SAM sequence dictionary code
fixed the bug causing travis failure
  • Loading branch information
mbabadi committed Feb 5, 2018
1 parent 2efffce commit e4a3c54
Show file tree
Hide file tree
Showing 25 changed files with 1,382 additions and 184 deletions.
Expand Up @@ -23,6 +23,9 @@
io_intervals_and_counts, io_metadata, io_adamax
from .utils import cli_commons

# post-processing
from .postprocess.viterbi_segmentation import ViterbiSegmentationEngine

# structs
from .structs.interval import Interval

Expand Down
@@ -1 +1 @@
__version__ = '0.5.2'
__version__ = '0.6.0'
Expand Up @@ -4,3 +4,7 @@
# if a normalized PMF violates total probability by the following threshold, it will
# be normalized and a warning will be emitted
prob_sum_tol = 1e-10

# floating point decimals in reporting phred-scaled qualities
phred_decimals = 2

Expand Up @@ -7,6 +7,7 @@
import os
import json
import re
import filecmp

from .._version import __version__ as gcnvkernel_version
from . import io_consts
Expand Down Expand Up @@ -57,6 +58,12 @@ def get_sample_name_from_txt_file(input_path: str) -> str:
return line.strip()


def write_sample_name_to_txt_file(output_path: str, sample_name: str):
    """Persist the sample name as a single line in the canonical sample-name
    text file under the given output directory."""
    target_file = os.path.join(output_path, io_consts.default_sample_name_txt_filename)
    with open(target_file, 'w') as out:
        out.write(sample_name + '\n')


def assert_output_path_writable(output_path: str,
try_creating_output_path: bool = True):
"""Assert an output path is either writable or can be created upon request.
Expand Down Expand Up @@ -509,3 +516,11 @@ def assert_mandatory_columns(mandatory_columns_set: Set[str],
not_found_set = mandatory_columns_set.difference(found_columns_set)
assert len(not_found_set) == 0, "The following mandatory columns could not be found in \"{0}\"; " \
"cannot continue: {1}".format(input_tsv_file, not_found_set)


def assert_files_are_identical(input_file_1: str, input_file_2: str):
    """Asserts that both files exist and have byte-identical contents.

    Raises:
        AssertionError: if either file is missing or their contents differ.
    """
    # both inputs must exist before a byte-level comparison is meaningful
    for candidate in (input_file_1, input_file_2):
        assert os.path.isfile(candidate), "Cannot find {0}.".format(candidate)
    # shallow=False forces a full content comparison rather than an os.stat check
    files_match = filecmp.cmp(input_file_1, input_file_2, shallow=False)
    assert files_match, \
        "The following two files are expected to be identical: {0}, {1}".format(input_file_1, input_file_2)
Expand Up @@ -21,18 +21,29 @@
ploidy_column_name = 'PLOIDY'
ploidy_gq_column_name = 'PLOIDY_GQ'

# column names for copy-number segments file
copy_number_call_column_name = 'COPY_NUMBER_CALL'
num_spanning_intervals_column_name = 'NUM_SPANNING_INTERVALS'
some_quality_column_name = 'SOME_QUALITY'
exact_quality_column_name = 'EXACT_QUALITY'
start_quality_column_name = 'START_QUALITY'
end_quality_column_name = 'END_QUALITY'

# regular expression for matching sample name from header comment line
sample_name_header_regexp = "^@RG.*SM:(.*)[\t]*.*$"

# prefix for adding sample name as a header comment line
sample_name_header_prefix = "RG\tID:GATKCopyNumber\tSM:"
sample_name_sam_header_prefix = "RG\tID:GATKCopyNumber\tSM:"

# default file names for loading and saving models, posteriors, and configurations
default_sample_read_depth_tsv_filename = 'global_read_depth.tsv'
default_sample_name_txt_filename = 'sample_name.txt'
default_sample_contig_ploidy_tsv_filename = 'contig_ploidy.tsv'
default_copy_number_log_posterior_tsv_filename = "log_q_c_tc.tsv"
default_copy_number_log_emission_tsv_filename = "log_c_emission_tc.tsv"
default_class_log_posterior_tsv_filename = "log_q_tau_tk.tsv"
default_baseline_copy_number_tsv_filename = "baseline_copy_number_t.tsv"
default_copy_number_segments_tsv_filename = "copy_number_segments.tsv"

default_denoising_config_json_filename = "denoising_config.json"
default_calling_config_json_filename = "calling_config.json"
Expand Down
Expand Up @@ -4,7 +4,6 @@
import pymc3 as pm
import pandas as pd
import os
import json
from typing import List, Optional

from .. import config
Expand Down Expand Up @@ -93,52 +92,68 @@ def __call__(self):
self.input_path, self.denoising_model_approx, self.denoising_model)


def get_sample_posterior_path(calls_path: str, sample_index: int):
    """Returns the per-sample posterior directory path under the given calls path."""
    sample_folder_name = io_consts.sample_folder_prefix + repr(sample_index)
    return os.path.join(calls_path, sample_folder_name)


class SampleDenoisingAndCallingPosteriorsExporter:
"""Exports sample-specific model parameters and associated workspace variables to disk."""
def __init__(self,
             denoising_config: DenoisingModelConfig,
             calling_config: CopyNumberCallingConfig,
             denoising_calling_workspace: DenoisingCallingWorkspace,
             denoising_model: DenoisingModel,
             denoising_model_approx: pm.MeanField,
             output_path: str):
    """Stores the model, configs, workspace and approximation needed for export.

    NOTE(review): assumes io_commons.assert_output_path_writable raises when the
    path cannot be used — confirm against io_commons.
    """
    # validate the destination before retaining any state
    io_commons.assert_output_path_writable(output_path)
    self.output_path = output_path
    self.denoising_model = denoising_model
    self.denoising_model_approx = denoising_model_approx
    self.denoising_config = denoising_config
    self.calling_config = calling_config
    self.denoising_calling_workspace = denoising_calling_workspace

@staticmethod
def _export_sample_copy_number_log_posterior(sample_posterior_path: str,
log_q_c_tc: np.ndarray,
delimiter='\t',
comment='@',
extra_comment_lines: Optional[List[str]] = None):
assert isinstance(log_q_c_tc, np.ndarray)
assert log_q_c_tc.ndim == 2
num_copy_number_states = log_q_c_tc.shape[1]
def export_ndarray_tc_with_copy_number_header(sample_posterior_path: str,
ndarray_tc: np.ndarray,
output_file_name: str,
delimiter='\t',
comment='@',
extra_comment_lines: Optional[List[str]] = None):
assert isinstance(ndarray_tc, np.ndarray)
assert ndarray_tc.ndim == 2
num_copy_number_states = ndarray_tc.shape[1]
copy_number_header_columns = [io_consts.copy_number_column_prefix + str(cn)
for cn in range(num_copy_number_states)]
with open(os.path.join(sample_posterior_path,
io_consts.default_copy_number_log_posterior_tsv_filename), 'w') as f:
with open(os.path.join(sample_posterior_path, output_file_name), 'w') as f:
if extra_comment_lines is not None:
for comment_line in extra_comment_lines:
f.write(comment + comment_line + '\n')
f.write(delimiter.join(copy_number_header_columns) + '\n')
for ti in range(log_q_c_tc.shape[0]):
f.write(delimiter.join([repr(x) for x in log_q_c_tc[ti, :]]) + '\n')

@staticmethod
def _export_sample_name(sample_posterior_path: str,
sample_name: str):
with open(os.path.join(sample_posterior_path, io_consts.default_sample_name_txt_filename), 'w') as f:
f.write(sample_name + '\n')
for ti in range(ndarray_tc.shape[0]):
f.write(delimiter.join([repr(x) for x in ndarray_tc[ti, :]]) + '\n')

def __call__(self):
# export gcnvkernel version
io_commons.export_gcnvkernel_version(self.output_path)

# export denoising config
io_commons.export_dict_to_json_file(
os.path.join(self.output_path, io_consts.default_denoising_config_json_filename),
self.denoising_config.__dict__, set())

# export calling config
io_commons.export_dict_to_json_file(
os.path.join(self.output_path, io_consts.default_calling_config_json_filename),
self.calling_config.__dict__, set())

# extract meanfield parameters
approx_var_set, approx_mu_map, approx_std_map = io_commons.extract_meanfield_posterior_parameters(
self.denoising_model_approx)

for si, sample_name in enumerate(self.denoising_calling_workspace.sample_names):
sample_name_comment_line = [io_consts.sample_name_header_prefix + sample_name]
sample_posterior_path = os.path.join(self.output_path, io_consts.sample_folder_prefix + repr(si))
sample_name_comment_line = [io_consts.sample_name_sam_header_prefix + sample_name]
sample_posterior_path = get_sample_posterior_path(self.output_path, si)
_logger.info("Saving posteriors for sample \"{0}\" in \"{1}\"...".format(
sample_name, sample_posterior_path))
io_commons.assert_output_path_writable(sample_posterior_path, try_creating_output_path=True)
Expand All @@ -149,14 +164,29 @@ def __call__(self):
self.denoising_model, sample_name_comment_line)

# export sample name
self._export_sample_name(sample_posterior_path, sample_name)
io_commons.write_sample_name_to_txt_file(sample_posterior_path, sample_name)

# export copy number posterior
self._export_sample_copy_number_log_posterior(
# export copy number log posterior
self.export_ndarray_tc_with_copy_number_header(
sample_posterior_path,
self.denoising_calling_workspace.log_q_c_stc.get_value(borrow=True)[si, ...],
io_consts.default_copy_number_log_posterior_tsv_filename,
extra_comment_lines=sample_name_comment_line)

# export copy number log emission
self.export_ndarray_tc_with_copy_number_header(
sample_posterior_path,
self.denoising_calling_workspace.log_copy_number_emission_stc.get_value(borrow=True)[si, ...],
io_consts.default_copy_number_log_emission_tsv_filename,
extra_comment_lines=sample_name_comment_line)

# export baseline copy numbers
baseline_copy_number_t = self.denoising_calling_workspace.baseline_copy_number_sj[
si, self.denoising_calling_workspace.t_to_j_map.get_value(borrow=True)]
io_commons.write_ndarray_to_tsv(
os.path.join(sample_posterior_path, io_consts.default_baseline_copy_number_tsv_filename),
baseline_copy_number_t)


class SampleDenoisingAndCallingPosteriorsImporter:
"""Imports sample-specific model parameters and associated workspace variables from disk."""
Expand All @@ -170,24 +200,49 @@ def __init__(self,
self.denoising_model_approx = denoising_model_approx
self.input_calls_path = input_calls_path

@staticmethod
def import_ndarray_tc_with_copy_number_header(sample_posterior_path: str,
                                              input_file_name: str,
                                              delimiter='\t',
                                              comment='@') -> np.ndarray:
    """Reads a 2d (interval x copy-number-state) array from a .tsv file and
    validates that its header columns follow the copy-number naming scheme.

    Raises:
        AssertionError: if the header columns are not exactly the expected
            copy-number column names, or the loaded array is not 2d.
    """
    tsv_file = os.path.join(sample_posterior_path, input_file_name)
    table_pd = pd.read_csv(tsv_file, delimiter=delimiter, comment=comment)
    actual_columns = [str(column_name) for column_name in table_pd.columns.values]
    # the number of columns determines the expected copy-number state count
    expected_columns = [io_consts.copy_number_column_prefix + str(cn)
                        for cn in range(len(actual_columns))]
    assert actual_columns == expected_columns
    loaded_ndarray_tc = table_pd.values
    assert loaded_ndarray_tc.ndim == 2
    return loaded_ndarray_tc

def _import_sample_copy_number_log_posterior(self,
sample_posterior_path: str,
delimiter='\t',
comment='@') -> np.ndarray:
expected_copy_number_header_columns = [
io_consts.copy_number_column_prefix + str(cn)
for cn in range(self.denoising_calling_workspace.calling_config.num_copy_number_states)]
log_q_c_tc_tsv_file = os.path.join(sample_posterior_path,
io_consts.default_copy_number_log_posterior_tsv_filename)
log_q_c_tc_pd = pd.read_csv(log_q_c_tc_tsv_file, delimiter=delimiter, comment=comment)
imported_columns = log_q_c_tc_pd.columns.values
assert all([column in imported_columns for column in expected_copy_number_header_columns])
imported_log_q_c_tc = log_q_c_tc_pd.values
assert imported_log_q_c_tc.ndim == 2
imported_log_q_c_tc = self.import_ndarray_tc_with_copy_number_header(
sample_posterior_path,
io_consts.default_copy_number_log_posterior_tsv_filename,
delimiter=delimiter,
comment=comment)
assert imported_log_q_c_tc.shape == (self.denoising_calling_workspace.num_intervals,
self.denoising_calling_workspace.calling_config.num_copy_number_states)
return imported_log_q_c_tc

def _import_sample_copy_number_log_emission(self,
                                            sample_posterior_path: str,
                                            delimiter='\t',
                                            comment='@') -> np.ndarray:
    """Loads the copy-number log emission array for one sample from disk and
    asserts it matches the workspace's (num_intervals, num_states) shape."""
    workspace = self.denoising_calling_workspace
    log_emission_tc = self.import_ndarray_tc_with_copy_number_header(
        sample_posterior_path,
        io_consts.default_copy_number_log_emission_tsv_filename,
        delimiter=delimiter,
        comment=comment)
    expected_shape = (workspace.num_intervals,
                      workspace.calling_config.num_copy_number_states)
    assert log_emission_tc.shape == expected_shape
    return log_emission_tc

def __call__(self):
# assert that the interval list is the same
interval_list_tsv_file = os.path.join(self.input_calls_path, io_consts.default_interval_list_filename)
Expand All @@ -196,25 +251,35 @@ def __call__(self):
assert imported_interval_list == self.denoising_calling_workspace.interval_list

for si in range(self.denoising_calling_workspace.num_samples):
sample_posterior_path = os.path.join(self.input_calls_path, io_consts.sample_folder_prefix + repr(si))
sample_posterior_path = get_sample_posterior_path(self.input_calls_path, si)
assert os.path.exists(sample_posterior_path)

# import sample-specific posteriors and update approximation
io_commons.import_meanfield_sample_specific_params(
sample_posterior_path, si, self.denoising_calling_workspace.sample_names[si],
self.denoising_model_approx, self.denoising_model)

# import copy number posterior and update log_q_c_stc in workspace
# import copy number posterior and emission and update workspace
log_q_c_tc = self._import_sample_copy_number_log_posterior(sample_posterior_path)
log_copy_number_emission_tc = self._import_sample_copy_number_log_emission(sample_posterior_path)

def update_log_q_c_stc_for_sample(log_q_c_stc):
log_q_c_stc[si, ...] = log_q_c_tc[...]
return log_q_c_stc

def update_log_copy_number_emission_stc_for_sample(log_copy_number_emission_stc):
log_copy_number_emission_stc[si, ...] = log_copy_number_emission_tc[...]
return log_copy_number_emission_stc

self.denoising_calling_workspace.log_q_c_stc.set_value(
update_log_q_c_stc_for_sample(
self.denoising_calling_workspace.log_q_c_stc.get_value(borrow=True)),
borrow=True)

self.denoising_calling_workspace.log_copy_number_emission_stc.set_value(
update_log_copy_number_emission_stc_for_sample(
self.denoising_calling_workspace.log_copy_number_emission_stc.get_value(borrow=True)),
borrow=True)

# update auxiliary workspace variables
self.denoising_calling_workspace.update_auxiliary_vars()
Expand Up @@ -68,6 +68,29 @@ def load_interval_list_tsv_file(interval_list_tsv_file: str,
return _convert_interval_list_pandas_to_gcnv_interval_list(interval_list_pd, interval_list_tsv_file)


def extract_sam_sequence_dictionary_from_file(input_file: str):
    """Extract SAM sequence dictionary from a file.
    Notes:
        Only the leading contiguous run of SAM header lines (starting with '@')
        is collected; blank lines are skipped, and parsing stops at the first
        non-blank line that does not start with '@'.
    Returns:
        a list of str
    """
    header_lines: List[str] = []
    with open(input_file, 'r') as f:
        for raw_line in f:
            entry = raw_line.strip()
            if not entry:
                # blank lines do not terminate the header block
                continue
            if not entry.startswith('@'):
                break
            header_lines.append(entry)
    return header_lines


def load_counts_in_the_modeling_zone(read_count_file_list: List[str],
modeling_interval_list: List[Interval]) -> Tuple[List[str], np.ndarray]:
"""Loads read counts for a given cohort corresponding to a provided list of intervals.
Expand Down Expand Up @@ -153,7 +176,8 @@ def _convert_interval_list_pandas_to_gcnv_interval_list(interval_list_pd: pd.Dat
return interval_list


def write_interval_list_to_tsv_file(output_file: str, interval_list: List[Interval]):
def write_interval_list_to_tsv_file(output_file: str, interval_list: List[Interval],
sam_header_lines: Optional[List[str]] = None):
"""Write a list of interval list to .tsv file.
Note:
Expand All @@ -164,6 +188,7 @@ def write_interval_list_to_tsv_file(output_file: str, interval_list: List[Interv
Args:
output_file: output .tsv file
interval_list: list of intervals to write to .tsv file
sam_header_lines: (optional) SAM header lines
Returns:
None
Expand All @@ -182,6 +207,9 @@ def write_interval_list_to_tsv_file(output_file: str, interval_list: List[Interv
"Cannot write this annotation to a .tsv file; "
"Neglecting \"{0}\" and proceeding...".format(key))
with open(output_file, 'w') as out:
if sam_header_lines is not None:
for sam_header_line in sam_header_lines:
out.write(sam_header_line + '\n')
header = '\t'.join([io_consts.contig_column_name,
io_consts.start_column_name,
io_consts.end_column_name]
Expand Down
Expand Up @@ -116,7 +116,7 @@ def _export_sample_read_depth(sample_posterior_path: str,

def __call__(self):
for si, sample_name in enumerate(self.ploidy_workspace.sample_names):
sample_name_comment_line = [io_consts.sample_name_header_prefix + sample_name]
sample_name_comment_line = [io_consts.sample_name_sam_header_prefix + sample_name]
sample_posterior_path = os.path.join(self.output_path, io_consts.sample_folder_prefix + repr(si))
io_commons.assert_output_path_writable(sample_posterior_path, try_creating_output_path=True)
_logger.info("Saving posteriors for sample \"{0}\" in \"{1}\"...".format(
Expand All @@ -132,7 +132,7 @@ def __call__(self):
# generate sample ploidy metadata
sample_ploidy_metadata = SamplePloidyMetadata(
sample_name, ploidy_j, ploidy_genotyping_quality_j,
self.ploidy_workspace.interval_list_metadata.contig_list)
self.ploidy_workspace.interval_list_metadata.ordered_contig_list)

# generate sample read depth metadata
sample_read_depth_metadata = SampleReadDepthMetadata.generate_sample_read_depth_metadata(
Expand Down

0 comments on commit e4a3c54

Please sign in to comment.