From 60b86406fb4828a42bb50bcf944f516552fca13d Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 21 Jul 2020 11:56:25 -0400 Subject: [PATCH 01/61] Added constants, make_label_combos, generic_field_check, and make_filters_sanity_check_expr to vcf.py from gnomad_qc. Also added INFO_DICT, FORMAT_DICT, and make_vcf_filter_dict from ukb repo --- gnomad/utils/vcf.py | 286 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 286 insertions(+) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 43c4b6200..8138e0a07 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -6,6 +6,43 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) + +GROUPS = ["adj", "raw"] +""" +Group names used to generate labels for high quality genotypes and all raw genotypes. Used in VCF export. +""" + +HISTS = ["gq_hist_alt", "gq_hist_all", "dp_hist_alt", "dp_hist_all", "ab_hist_alt"] +""" +Quality histograms used in VCF export. +""" + +POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"] +""" +Continental population names used in VCF export. +""" + +NFE_SUBPOPS = ["onf", "bgr", "swe", "nwe", "seu", "est"] +""" +gnomAD subpopulations for NFE population. Used in VCF export. +""" + +EAS_SUBPOPS = ["kor", "oea", "jpn"] +""" +gnomAD subpopulations for EAS population. Used in VCF export. +""" + +FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"] +""" +Populations that are included in filtering allele frequency calculations. Used in VCF export. +""" + +SEXES = ["male", "female"] +""" +Sample sexes used in VCF export. +""" + + INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [ "AS_QUALapprox", "AS_VarDP", @@ -14,6 +51,148 @@ "AS_SB_TABLE", ] +INFO_DICT = { + "FS": { + "Description": "Phred-scaled p-value of Fisher's exact test for strand bias" + }, + "InbreedingCoeff": { + "Description": "Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes)/(the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium)" + }, + "MQ": { + "Description": "Root mean square of the mapping quality of reads across all samples" + }, + "MQRankSum": { + "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities" + }, + "QD": { + "Description": "Variant call confidence normalized by depth of sample reads supporting a variant" + }, + "ReadPosRankSum": { + "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference read position bias" + }, + "SOR": {"Description": "Strand bias estimated by the symmetric odds ratio test"}, + "VQSR_POSITIVE_TRAIN_SITE": { + "Description": "Variant was used to build the positive training set of high-quality variants for VQSR" + }, + "VQSR_NEGATIVE_TRAIN_SITE": { + "Description": "Variant was used to build the negative training set of low-quality variants for VQSR" + }, + "AS_BaseQRankSum": { + "Number": "A", + "Description": "Allele-specific Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities", + }, + "ClippingRankSum": { + "Number": "A", + "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference number of hard clipped bases", + }, + "VarDP": { + "Description": "Depth over variant genotypes (does not include depth of reference samples)" + }, + "AS_VarDP": { + "Number": "A", + "Description": "Allele-specific depth over variant genotypes (does not include depth of reference samples)", + }, + "AS_VQSLOD": { + "Number": "A", + "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model", + }, + "AS_VQSR_culprit": { + "Number": "A", + "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model", + }, + "lcr": {"Description": "Variant falls within a low complexity region"}, + "fail_interval_qc": { + "Description": f"Variant falls within a region where less than {INTERVAL_QC_PARAMETERS[0]}% of samples had a mean coverage of {INTERVAL_QC_PARAMETERS[1]}X" + }, + "nonpar": { + "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region" + }, + "rf_positive_label": { + "Description": "Variant was labelled as a positive example for training of random forest model" + }, + "rf_negative_label": { + "Description": "Variant was labelled as a negative example for training of random forest model" + }, + "rf_label": {"Description": "Random forest training label"}, + "rf_train": {"Description": "Variant was used in training random forest model"}, + "rf_tp_probability": { + "Description": "Probability of a called variant being a true variant as determined by random forest model" + }, + "transmitted_singleton": { + "Description": "Variant was a callset-wide doubleton that was transmitted within a family from a parent to a child (i.e., a singleton amongst unrelated samples in cohort)" + }, + "original_alleles": {"Description": "Alleles before splitting multiallelics"}, + "variant_type": { + "Description": "Variant type (snv, indel, multi-snv, multi-indel, or mixed)" + }, + "allele_type": { + "Number": "A", + "Description": "Allele type (snv, insertion, deletion, or mixed)", + }, + "n_alt_alleles": { + "Number": "A", + "Description": "Total number of alternate alleles observed at variant locus", + }, + "was_mixed": {"Description": "Variant type was mixed"}, + "has_star": { + "Description": "Variant locus coincides with a spanning deletion (represented by a star) observed elsewhere in the callset" + }, + "pab_max": { + "Number": "A", + "Description": "Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5", + }, +} +""" +Dictionary used during VCF export to export row (variant) annotations. +""" + +FORMAT_DICT = { + "GT": {"Description": "Genotype", "Number": "1", "Type": "String"}, + "AD": { + "Description": "Allelic depths for the ref and alt alleles in the order listed", + "Number": "R", + "Type": "Integer", + }, + "DP": { + "Description": "Approximate read depth (reads with MQ=255 or with bad mates are filtered)", + "Number": "1", + "Type": "Integer", + }, + "GQ": { + "Description": "Phred-scaled confidence that the genotype assignment is correct. Value is the difference between the second lowest PL and the lowest PL (always normalized to 0).", + "Number": "1", + "Type": "Integer", + }, + "MIN_DP": { + "Description": "Minimum DP observed within the GVCF block", + "Number": "1", + "Type": "Integer", + }, + "PGT": { + "Description": "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another", + "Number": "1", + "Type": "String", + }, + "PID": { + "Description": "Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group", + "Number": "1", + "Type": "String", + }, + "PL": { + "Description": "Normalized, phred-scaled likelihoods for genotypes as defined in the VCF specification", + "Number": "G", + "Type": "Integer", + }, + "SB": { + "Description": "Per-sample component statistics which comprise the Fisher's exact test to detect strand bias. Values are: depth of reference allele on forward strand, depth of reference allele on reverse strand, depth of alternate allele on forward strand, depth of alternate allele on reverse strand.", + "Number": "4", + "Type": "Integer", + }, +} +""" +Dictionary used during VCF export to export MatrixTable entries. +""" + def ht_to_vcf_mt( info_ht: hl.Table, @@ -92,3 +271,110 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi # Create an MT with no cols so that we acn export to VCF info_mt = info_ht.to_matrix_table_row_major(columns=["s"], entry_field_name="s") return info_mt.filter_cols(False) + + +def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]: + """ + Make combinations of all possible labels for a supplied dictionary of label groups + :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]) + :return: list of all possible combinations of values for the supplied label groupings + :rtype: list[str] + """ + copy_label_groups = copy.deepcopy(label_groups) + if len(copy_label_groups) == 1: + return [item for sublist in copy_label_groups.values() for item in sublist] + anchor_group = sorted(copy_label_groups.keys(), key=lambda x: SORT_ORDER.index(x))[ + 0 + ] + anchor_val = copy_label_groups.pop(anchor_group) + combos = [] + for x, y in itertools.product(anchor_val, make_label_combos(copy_label_groups)): + combos.append("{0}_{1}".format(x, y)) + return combos + + +def generic_field_check( + ht: hl.Table, cond_expr, check_description, display_fields, verbose +): + """ + Check a generic logical condition involving annotations in a Hail Table and print the results to terminal + :param Table ht: Table containing annotations to be checked + :param Expression cond_expr: logical expression referring to annotations in ht to be checked + :param str check_description: String describing the condition being checked; is displayed in terminal summary message + :param list of str display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); + these fields are also displayed if verbose is True + :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, + show only top values of annotations that fail checks + :rtype: None + """ + ht_orig = ht + ht = ht.filter(cond_expr) + n_fail = ht.count() + if n_fail > 0: + logger.info(f"Found {n_fail} sites that fail {check_description} check:") + ht = ht.flatten() + ht.select("locus", "alleles", *display_fields).show() + else: + logger.info(f"PASSED {check_description} check") + if verbose: + ht_orig = ht_orig.flatten() + ht_orig.select(*display_fields).show() + + +def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: + """ + Make Hail Expressions to measure % variants filtered under varying conditions of interest + :param Table ht: Hail Table containing 'filter' annotation to be examined + :return: Dictionary containing Hail aggregation expressions to measure filter flags + :rtype: Dict of str: Expression + """ + filters_dict = { + "n": hl.agg.count(), + "frac_any_filter": hl.agg.count_where(ht.is_filtered) / hl.agg.count(), + "frac_inbreed_coeff": hl.agg.count_where( + ht.filters.contains("inbreeding_coeff") + ) + / hl.agg.count(), + "frac_ac0": hl.agg.count_where(ht.filters.contains("AC0")) / hl.agg.count(), + "frac_rf": hl.agg.count_where(ht.filters.contains("rf")) / hl.agg.count(), + "frac_inbreed_coeff_only": hl.agg.count_where( + ht.filters.contains("inbreeding_coeff") & (ht.filters.length() == 1) + ) + / hl.agg.count(), + "frac_ac0_only": hl.agg.count_where( + ht.filters.contains("AC0") & (ht.filters.length() == 1) + ) + / hl.agg.count(), + "frac_rf_only": hl.agg.count_where( + ht.filters.contains("rf") & (ht.filters.length() == 1) + ) + / hl.agg.count(), + } + return filters_dict + + +def make_vcf_filter_dict( + ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float +) -> Dict[str, str]: + """ + Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations. + + :param Table ht: Table containing global annotations of the Random Forests SNP and indel cutoffs. + :param float snp_cutoff: Minimum SNP cutoff score from random forest model. + :param float indel_cutoff: Minimum indel cutoff score from random forest model. + :param float inbreeding_cutoff: Inbreeding coefficient hard cutoff. + :return: Dictionary keyed by VCF FILTER annotations, where values are Dictionaries of Number and Description attributes. + :rtype: Dict[str, str] + """ + filter_dict = { + "AC0": { + "Description": "Allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB < 0.2 for het calls)" + }, + "InbreedingCoeff": {"Description": f"InbreedingCoeff < {inbreeding_cutoff}"}, + "RF": { + "Description": f"Failed random forest filtering thresholds of {snp_cutoff}, {indel_cutoff} (probabilities of being a true positive variant) for SNPs, indels" + }, + "PASS": {"Description": "Passed all variant filters"}, + } + return filter_dict From 3decb5f119ee18124aab2d4954298b68b2be6203 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 21 Jul 2020 12:42:48 -0400 Subject: [PATCH 02/61] Added AS_FIELDS, SITE_FIELDS constants and added function add_as_info_dict to update INFO_DICT with AS fields and their values --- gnomad/utils/vcf.py | 71 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 8138e0a07..a19aa215a 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -42,6 +42,36 @@ Sample sexes used in VCF export. """ +AS_FIELDS = [ + "AS_BaseQRankSum", + "AS_FS", + "AS_MQ", + "AS_MQRankSum", + "AS_pab_max", + "AS_QD", + "AS_ReadPosRankSum", + "AS_SOR", + "AS_VarDP", +] +""" +Allele-specific variant annotations. +""" + +SITE_FIELDS = [ + "FS", + "InbreedingCoeff", + "MQ", + "MQRankSum", + "QD", + "ReadPosRankSum", + "sibling_singleton", + "SOR", + "transmitted_singleton", + "VarDP", +] +""" +Site level variant annotations. +""" INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [ "AS_QUALapprox", @@ -77,21 +107,12 @@ "VQSR_NEGATIVE_TRAIN_SITE": { "Description": "Variant was used to build the negative training set of low-quality variants for VQSR" }, - "AS_BaseQRankSum": { - "Number": "A", - "Description": "Allele-specific Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities", - }, - "ClippingRankSum": { - "Number": "A", - "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference number of hard clipped bases", + "BaseQRankSum": { + "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities", }, "VarDP": { "Description": "Depth over variant genotypes (does not include depth of reference samples)" }, - "AS_VarDP": { - "Number": "A", - "Description": "Allele-specific depth over variant genotypes (does not include depth of reference samples)", - }, "AS_VQSLOD": { "Number": "A", "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model", @@ -101,9 +122,6 @@ "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model", }, "lcr": {"Description": "Variant falls within a low complexity region"}, - "fail_interval_qc": { - "Description": f"Variant falls within a region where less than {INTERVAL_QC_PARAMETERS[0]}% of samples had a mean coverage of {INTERVAL_QC_PARAMETERS[1]}X" - }, "nonpar": { "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region" }, @@ -354,6 +372,31 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression return filters_dict +def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None: + """ + Updates info dictionary with allele-specific terms and their descriptions. + + Used in VCF export. + + :param INFO_DICT: Dictionary containing site-level annotations and their descriptions. + :return: None + """ + prefix_text = "Allele-specific" + AS_DICT = {} + + for field in AS_FIELDS: + # Strip AS_ from field name + site_field = field[3:] + + AS_DICT[field] = {} + AS_DICT[field]["Number"] = "A" + AS_DICT[field][ + "Description" + ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}" + + INFO_DICT.update(AS_DICT) + + def make_vcf_filter_dict( ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float ) -> Dict[str, str]: From c4ad6b254d1a51ff475b55aabe94eafd354c3fd0 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 21 Jul 2020 13:06:25 -0400 Subject: [PATCH 03/61] Updated make_as_info_dict and added more constants (RF_FIELDS, VQSR_FIELDS, REGION_TYPE_FIELDS, ALLELE_TYPE_FIELDS --- gnomad/utils/vcf.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index a19aa215a..d27e8ea00 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -73,6 +73,33 @@ Site level variant annotations. """ +ALLELE_TYPE_FIELDS = ["allele_type", "has_star", "n_alt_alleles", "original_alleles", "variant_type", "was_mixed"] +""" +Allele-type annotations. +""" + +REGION_TYPE_FIELDS = ["lcr", "nonpar"] +""" +Annotations about variant region type. +""" + +RF_FIELDS = [ + "fail_hard_filters", + "filters", + "rf_label", + "rf_train", + "rf_probability", + "tp", +] +""" +Annotations specific to the random forest model. +""" + +VQSR_FIELDS = ["AS_VQSLOD", "AS_culprit", "NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE"] +""" +Annotations specific to VQSR. +""" + INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [ "AS_QUALapprox", "AS_VarDP", @@ -372,14 +399,14 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression return filters_dict -def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None: +def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]: """ Updates info dictionary with allele-specific terms and their descriptions. Used in VCF export. :param INFO_DICT: Dictionary containing site-level annotations and their descriptions. - :return: None + :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field. """ prefix_text = "Allele-specific" AS_DICT = {} @@ -394,7 +421,7 @@ def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None: "Description" ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}" - INFO_DICT.update(AS_DICT) + return AS_DICT def make_vcf_filter_dict( From 6c9b95fbddfc98966e0720f41ae630eb941973e1 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 21 Jul 2020 13:27:26 -0400 Subject: [PATCH 04/61] Added constants for entries to select during export --- gnomad/utils/vcf.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index d27e8ea00..cbcbe8b28 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -73,7 +73,14 @@ Site level variant annotations. """ -ALLELE_TYPE_FIELDS = ["allele_type", "has_star", "n_alt_alleles", "original_alleles", "variant_type", "was_mixed"] +ALLELE_TYPE_FIELDS = [ + "allele_type", + "has_star", + "n_alt_alleles", + "original_alleles", + "variant_type", + "was_mixed", +] """ Allele-type annotations. """ @@ -191,6 +198,28 @@ Dictionary used during VCF export to export row (variant) annotations. """ +ENTRIES = ["GT", "GQ", "DP", "AD", "MIN_DP", "PGT", "PID", "PL", "SB"] +""" +Densified entries to be selected during VCF export. +""" + +SPARSE_ENTRIES = [ + "DP", + "GQ", + "LA", + "LAD", + "LGT", + "LPGT", + "LPL", + "MIN_DP", + "PID", + "RGQ", + "SB", +] +""" +Sparse entries to be selected and densified during VCF export. +""" + FORMAT_DICT = { "GT": {"Description": "Genotype", "Number": "1", "Type": "String"}, "AD": { From a9c0354b648fb5a9e37440bd5c7b1deed578d891 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 21 Jul 2020 17:00:18 -0400 Subject: [PATCH 05/61] Added make_info_dict to vcf.py --- gnomad/utils/vcf.py | 125 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index cbcbe8b28..afeff1b27 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -428,6 +428,131 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression return filters_dict +def make_info_dict( + prefix: str, + label_groups: Dict[str, str] = None, + bin_edges: Dict[str, str] = None, + faf: bool = False, + popmax: bool = False, + description_text: str = "", + age_hist_data: str = None, +) -> Dict[str, Dict[str, str]]: + """ + Generate dictionary of Number and Description attributes of VCF INFO fields. + + Used to populate the VCF header during export. + + :param str prefix: gnomAD or empty string. + :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :param dict bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. + :param bool faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations. + :param bool popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations. + :param bool description_text: Optinal text to append to the end of age and popmax descriptions. + :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`. + :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes. + :rtype: Dict[str, Dict[str, str]] + """ + if prefix != "": + prefix = f"{prefix}_" + + info_dict = dict() + + if age_hist_data: + age_hist_dict = { + f"{prefix}age_hist_het_bin_freq": { + "Number": "A", + "Description": f"Histogram of ages of heterozygous individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", + }, + f"{prefix}age_hist_het_n_smaller": { + "Number": "A", + "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_test}", + }, + f"{prefix}age_hist_het_n_larger": { + "Number": "A", + "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_test}", + }, + f"{prefix}age_hist_hom_bin_freq": { + "Number": "A", + "Description": f"Histogram of ages of homozygous alternate individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", + }, + f"{prefix}age_hist_hom_n_smaller": { + "Number": "A", + "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_test}", + }, + f"{prefix}age_hist_hom_n_larger": { + "Number": "A", + "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_test}", + }, + } + info_dict.update(age_hist_dict) + + if popmax: + popmax_dict = { + f"{prefix}popmax": { + "Number": "A", + "Description": f"Population with maximum AF{description_test}", + }, + f"{prefix}AC_popmax": { + "Number": "A", + "Description": f"Allele count in the population with the maximum AF{description_test}", + }, + f"{prefix}AN_popmax": { + "Number": "A", + "Description": f"Total number of alleles in the population with the maximum AF{description_test}", + }, + f"{prefix}AF_popmax": { + "Number": "A", + "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_test}", + }, + f"{prefix}nhomalt_popmax": { + "Number": "A", + "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_test}", + }, + } + info_dict.update(popmax_dict) + + else: + group_types = sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x)) + combos = make_label_combos(label_groups) + + for combo in combos: + combo_fields = combo.split("_") + + if not faf: + combo_dict = { + f"{prefix}AC_{combo}": { + "Number": "A", + "Description": f"Alternate allele count{make_combo_header_text('for', group_types, combo_fields, prefix)}", + }, + f"{prefix}AN_{combo}": { + "Number": "1", + "Description": f"Total number of alleles{make_combo_header_text('in', group_types, combo_fields, prefix)}", + }, + f"{prefix}AF_{combo}": { + "Number": "A", + "Description": f"Alternate allele frequency{make_combo_header_text('in', group_types, combo_fields, prefix)}", + }, + f"{prefix}nhomalt_{combo}": { + "Number": "A", + "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_types, combo_fields, prefix)}", + }, + } + else: + combo_dict = { + f"{prefix}faf95_{combo}": { + "Number": "A", + "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}", + }, + f"{prefix}faf99_{combo}": { + "Number": "A", + "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}", + }, + } + info_dict.update(combo_dict) + return info_dict + + def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]: """ Updates info dictionary with allele-specific terms and their descriptions. From 20f5d52e30fad160d8cab322ea6c51422edbd375 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 09:48:18 -0400 Subject: [PATCH 06/61] Removed unnecessary pop constants (can be imported from ancestry.py) --- gnomad/utils/vcf.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index afeff1b27..3022a63e5 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -17,21 +17,6 @@ Quality histograms used in VCF export. """ -POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"] -""" -Continental population names used in VCF export. -""" - -NFE_SUBPOPS = ["onf", "bgr", "swe", "nwe", "seu", "est"] -""" -gnomAD subpopulations for NFE population. Used in VCF export. -""" - -EAS_SUBPOPS = ["kor", "oea", "jpn"] -""" -gnomAD subpopulations for EAS population. Used in VCF export. -""" - FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"] """ Populations that are included in filtering allele frequency calculations. Used in VCF export. From 2117d82c7ca015c539d28bbbfaa5f1f5bf00facf Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 10:20:33 -0400 Subject: [PATCH 07/61] Added make hist bin edges expr function to vcf.py, also removed types from docstrings --- gnomad/utils/vcf.py | 78 +++++++++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 3022a63e5..081996506 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -355,17 +355,16 @@ def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]: def generic_field_check( ht: hl.Table, cond_expr, check_description, display_fields, verbose -): +) -> None: """ Check a generic logical condition involving annotations in a Hail Table and print the results to terminal - :param Table ht: Table containing annotations to be checked - :param Expression cond_expr: logical expression referring to annotations in ht to be checked - :param str check_description: String describing the condition being checked; is displayed in terminal summary message - :param list of str display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); + :param ht: Table containing annotations to be checked + :param cond_expr: logical expression referring to annotations in ht to be checked + :param check_description: String describing the condition being checked; is displayed in terminal summary message + :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); these fields are also displayed if verbose is True :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks - :rtype: None """ ht_orig = ht ht = ht.filter(cond_expr) @@ -384,9 +383,8 @@ def generic_field_check( def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: """ Make Hail Expressions to measure % variants filtered under varying conditions of interest - :param Table ht: Hail Table containing 'filter' annotation to be examined + :param ht: Hail Table containing 'filter' annotation to be examined :return: Dictionary containing Hail aggregation expressions to measure filter flags - :rtype: Dict of str: Expression """ filters_dict = { "n": hl.agg.count(), @@ -427,16 +425,15 @@ def make_info_dict( Used to populate the VCF header during export. - :param str prefix: gnomAD or empty string. - :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + :param prefix: gnomAD or empty string. + :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). - :param dict bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. - :param bool faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations. - :param bool popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations. - :param bool description_text: Optinal text to append to the end of age and popmax descriptions. + :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. + :param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations. + :param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations. + :param description_text: Optinal text to append to the end of age and popmax descriptions. :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`. :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes. - :rtype: Dict[str, Dict[str, str]] """ if prefix != "": prefix = f"{prefix}_" @@ -569,12 +566,11 @@ def make_vcf_filter_dict( """ Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations. - :param Table ht: Table containing global annotations of the Random Forests SNP and indel cutoffs. - :param float snp_cutoff: Minimum SNP cutoff score from random forest model. - :param float indel_cutoff: Minimum indel cutoff score from random forest model. - :param float inbreeding_cutoff: Inbreeding coefficient hard cutoff. + :param ht: Table containing global annotations of the Random Forests SNP and indel cutoffs. + :param snp_cutoff: Minimum SNP cutoff score from random forest model. + :param indel_cutoff: Minimum indel cutoff score from random forest model. + :param inbreeding_cutoff: Inbreeding coefficient hard cutoff. :return: Dictionary keyed by VCF FILTER annotations, where values are Dictionaries of Number and Description attributes. - :rtype: Dict[str, str] """ filter_dict = { "AC0": { @@ -587,3 +583,45 @@ def make_vcf_filter_dict( "PASS": {"Description": "Passed all variant filters"}, } return filter_dict + + +def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, str]: + """ + Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string + separated by pipe delimiters. + + :param ht: Table containing histogram variant annotations. + :param prefix: Prefix text for age histogram bin edges. Default is gnomad_. + :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values. + """ + edges_dict = { + "het": "|".join( + map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges) + ), + "hom": "|".join( + map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges) + ), + } + for hist in HISTS: + + # Parse hists calculated on both raw and adj-filtered data + for hist_type in ["adj_qual_hists", "qual_hists"]: + hist_name = hist + if "adj" in hist_type: + hist_name = f"{hist}_adj" + edges_dict[hist_name] = ( + "|".join( + map( + lambda x: f"{x:.2f}", + ht.head(1)[hist_type][hist].collect()[0].bin_edges, + ) + ) + if "ab" in hist + else "|".join( + map( + lambda x: str(int(x)), + ht.head(1)[hist_type][hist].collect()[0].bin_edges, + ) + ) + ) + return edges_dict From 78084a7422b5b9e532101abafa71fedd076624f7 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 10:29:54 -0400 Subject: [PATCH 08/61] Added make hist dict --- gnomad/utils/vcf.py | 69 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 081996506..85cacf397 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -334,11 +334,11 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]: """ - Make combinations of all possible labels for a supplied dictionary of label groups + Make combinations of all possible labels for a supplied dictionary of label groups. + :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, - e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]) - :return: list of all possible combinations of values for the supplied label groupings - :rtype: list[str] + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :return: list of all possible combinations of values for the supplied label groupings. """ copy_label_groups = copy.deepcopy(label_groups) if len(copy_label_groups) == 1: @@ -358,13 +358,14 @@ def generic_field_check( ) -> None: """ Check a generic logical condition involving annotations in a Hail Table and print the results to terminal - :param ht: Table containing annotations to be checked - :param cond_expr: logical expression referring to annotations in ht to be checked - :param check_description: String describing the condition being checked; is displayed in terminal summary message + + :param ht: Table containing annotations to be checked. + :param cond_expr: logical expression referring to annotations in ht to be checked. + :param check_description: String describing the condition being checked; is displayed in terminal summary message. :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); - these fields are also displayed if verbose is True + these fields are also displayed if verbose is True. :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, - show only top values of annotations that fail checks + show only top values of annotations that fail checks. """ ht_orig = ht ht = ht.filter(cond_expr) @@ -382,9 +383,9 @@ def generic_field_check( def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: """ - Make Hail Expressions to measure % variants filtered under varying conditions of interest - :param ht: Hail Table containing 'filter' annotation to be examined - :return: Dictionary containing Hail aggregation expressions to measure filter flags + Make Hail Expressions to measure % variants filtered under varying conditions of interest. + :param ht: Hail Table containing 'filter' annotation to be examined. + :return: Dictionary containing Hail aggregation expressions to measure filter flags. """ filters_dict = { "n": hl.agg.count(), @@ -625,3 +626,47 @@ def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, ) ) return edges_dict + + +def make_hist_dict(bin_edges: Dict[str, Dict[str, str]], adj: bool) -> Dict[str, str]: + """ + Generate dictionary of Number and Description attributes to be used in the VCF header, specifically for histogram annotations. + + :param bin_edges: Dictionary keyed by histogram annotation name, with corresponding string-reformatted bin edges for values. + :param adj: Whether to create a header dict for raw or adj quality histograms. + :return: Dictionary keyed by VCF INFO annotations, where values are Dictionaries of Number and Description attributes. + """ + header_hist_dict = {} + for hist in HISTS: + + # Get hists for both raw and adj data + if adj: + hist = f"{hist}_adj" + + edges = bin_edges[hist] + hist_fields = hist.split("_") + hist_text = hist_fields[0].upper() + + if hist_fields[2] == "alt": + hist_text = hist_text + " in heterozygous individuals" + if adj: + hist_text = hist_text + " calculated on high quality genotypes" + + hist_dict = { + f"{hist}_bin_freq": { + "Number": "A", + "Description": f"Histogram for {hist_text}; bin edges are: {edges}", + }, + f"{hist}_n_smaller": { + "Number": "A", + "Description": f"Count of {hist_fields[0].upper()} values falling below lowest histogram bin edge {hist_text}", + }, + f"{hist}_n_larger": { + "Number": "A", + "Description": f"Count of {hist_fields[0].upper()} values falling above highest histogram bin edge {hist_text}", + }, + } + + header_hist_dict.update(hist_dict) + + return header_hist_dict From 5c698b7fd3650a41c0818d7af953bbb6e92f4ce6 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 10:33:02 -0400 Subject: [PATCH 09/61] Updated changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4ab4b5f1..78388cbf2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * Fix for error in `compute_quantile_bin` that caused incorrect binning when a single score overlapped multiple bins. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) * Removed assumption of `snv` annotation from `compute_quantile_bin`. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) * Fixed `create_binned_ht` because it produced a "Cannot combine expressions from different source objects error". [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) +* Added constants and functions relevant to VCF export [(#241)](https://github.com/broadinstitute/gnomad_methods/pull/241) ## Version 0.4.0 - July 9th, 2020 From e30d67b7bb3465b637a1079d0a15dfe7ce4ff875 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 10:56:30 -0400 Subject: [PATCH 10/61] Added SORT_ORDER and sample_sum_check to vcf.py --- gnomad/utils/vcf.py | 98 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 86 insertions(+), 12 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 85cacf397..6bee6707e 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -1,11 +1,15 @@ import hail as hl -from typing import List +from typing import Dict, List import logging logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) +SORT_ORDER = ["popmax", "group", "pop", "subpop", "sex"] +""" +Order to sort subgroupings during VCF export. +""" GROUPS = ["adj", "raw"] """ @@ -445,27 +449,27 @@ def make_info_dict( age_hist_dict = { f"{prefix}age_hist_het_bin_freq": { "Number": "A", - "Description": f"Histogram of ages of heterozygous individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", + "Description": f"Histogram of ages of heterozygous individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", }, f"{prefix}age_hist_het_n_smaller": { "Number": "A", - "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_test}", + "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_text}", }, f"{prefix}age_hist_het_n_larger": { "Number": "A", - "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_test}", + "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_text}", }, f"{prefix}age_hist_hom_bin_freq": { "Number": "A", - "Description": f"Histogram of ages of homozygous alternate individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", + "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", }, f"{prefix}age_hist_hom_n_smaller": { "Number": "A", - "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_test}", + "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_text}", }, f"{prefix}age_hist_hom_n_larger": { "Number": "A", - "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_test}", + "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_text}", }, } info_dict.update(age_hist_dict) @@ -474,23 +478,23 @@ def make_info_dict( popmax_dict = { f"{prefix}popmax": { "Number": "A", - "Description": f"Population with maximum AF{description_test}", + "Description": f"Population with maximum AF{description_text}", }, f"{prefix}AC_popmax": { "Number": "A", - "Description": f"Allele count in the population with the maximum AF{description_test}", + "Description": f"Allele count in the population with the maximum AF{description_text}", }, f"{prefix}AN_popmax": { "Number": "A", - "Description": f"Total number of alleles in the population with the maximum AF{description_test}", + "Description": f"Total number of alleles in the population with the maximum AF{description_text}", }, f"{prefix}AF_popmax": { "Number": "A", - "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_test}", + "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_text}", }, f"{prefix}nhomalt_popmax": { "Number": "A", - "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_test}", + "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_text}", }, } info_dict.update(popmax_dict) @@ -670,3 +674,73 @@ def make_hist_dict(bin_edges: Dict[str, Dict[str, str]], adj: bool) -> Dict[str, header_hist_dict.update(hist_dict) return header_hist_dict + + +def sample_sum_check( + ht: hl.Table, + prefix: str, + label_groups: Dict[str, List[str]], + verbose: bool, + subpop: bool = None, +) -> None: + """ + Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version; + display results from checking the sum of the specified annotations in the terminal. + + :param Table ht: Table containing annotations to be summed. + :param str prefix: String indicating sample subset. + :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, + show only top values of annotations that fail checks. + :param str subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked. + :rtype: None + """ + combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)] + combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in make_label_combos(label_groups)] + combo_nhomalt = [ + ht.info[f"{prefix}nhomalt_{x}"] for x in make_label_combos(label_groups) + ] + + group = label_groups.pop("group")[0] + alt_groups = "_".join( + sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x)) + ) + + annot_dict = { + f"sum_AC_{group}_{alt_groups}": hl.sum(combo_AC), + f"sum_AN_{group}_{alt_groups}": hl.sum(combo_AN), + f"sum_nhomalt_{group}_{alt_groups}": hl.sum(combo_nhomalt), + } + + ht = ht.annotate(**annot_dict) + + for subfield in ["AC", "AN", "nhomalt"]: + if not subpop: + generic_field_check( + ht, + ( + ht.info[f"{prefix}{subfield}_{group}"] + != ht[f"sum_{subfield}_{group}_{alt_groups}"] + ), + f"{prefix}{subfield}_{group} = sum({subfield}_{group}_{alt_groups})", + [ + f"info.{prefix}{subfield}_{group}", + f"sum_{subfield}_{group}_{alt_groups}", + ], + verbose, + ) + else: + generic_field_check( + ht, + ( + ht.info[f"{prefix}{subfield}_{group}_{subpop}"] + != ht[f"sum_{subfield}_{group}_{alt_groups}"] + ), + f"{prefix}{subfield}_{group}_{subpop} = sum({subfield}_{group}_{alt_groups})", + [ + f"info.{prefix}{subfield}_{group}_{subpop}", + f"sum_{subfield}_{group}_{alt_groups}", + ], + verbose, + ) From 706864d1322c6d5916a7024b8d24fd1a11aed517 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 11:38:18 -0400 Subject: [PATCH 11/61] Moved make combo header text to vcf.py --- gnomad/utils/vcf.py | 58 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 6bee6707e..47f02659a 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -1,6 +1,9 @@ -import hail as hl -from typing import Dict, List import logging +from typing import Dict, List + +import hail as hl + +from gnomad.sample_qc.ancestry import POP_NAMES logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger(__name__) @@ -416,6 +419,53 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression return filters_dict +def make_combo_header_text( + preposition: str, + group_types: List[str], + combo_fields: List[str], + prefix: str, +) -> str: + """ + Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset. + + For example, if preposition is "for", group_types is ["group", "pop", "sex"], and combo_fields is ["adj", "afr", "female"], + this function will return the string " for female samples of African-American/African ancestry". + + :param preposition: Relevant preposition to precede automatically generated text. + :param group_types: List of grouping types, e.g. "sex" or "pop". + :param combo_fields: List of the specific values for each grouping type, for which the text is being generated. + :param prefix: Prefix string indicating sample subset. + :return: String with automatically generated description text for a given set of combo fields. + """ + combo_dict = dict(zip(group_types, combo_fields)) + header_text = " " + preposition + + if len(combo_dict.keys()) == 1: + if combo_dict["group"] == "adj": + return "" + + if "sex" in combo_dict.keys(): + header_text = header_text + " " + combo_dict["sex"] + + header_text = header_text + " samples" + + if "subpop" in combo_dict.keys(): + header_text = header_text + f" of {POP_NAMES[combo_dict['subpop']]} ancestry" + combo_dict.pop("pop") + + if "pop" in combo_dict.keys(): + header_text = header_text + f" of {POP_NAMES[combo_dict['pop']]} ancestry" + + if "gnomad" in prefix: + header_text = header_text + " in gnomAD" + + if "group" in group_types: + if combo_dict["group"] == "raw": + header_text = header_text + ", before removing low-confidence genotypes" + + return header_text + + def make_info_dict( prefix: str, label_groups: Dict[str, str] = None, @@ -529,11 +579,11 @@ def make_info_dict( combo_dict = { f"{prefix}faf95_{combo}": { "Number": "A", - "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}", + "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}", }, f"{prefix}faf99_{combo}": { "Number": "A", - "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}", + "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}", }, } info_dict.update(combo_dict) From d058743bc00d23acbfa764c2912c04e851da527b Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 11:42:12 -0400 Subject: [PATCH 12/61] Fixed imports and reformatted with black --- gnomad/utils/vcf.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 47f02659a..002b791df 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -1,3 +1,5 @@ +import copy +import itertools import logging from typing import Dict, List @@ -420,10 +422,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression def make_combo_header_text( - preposition: str, - group_types: List[str], - combo_fields: List[str], - prefix: str, + preposition: str, group_types: List[str], combo_fields: List[str], prefix: str, ) -> str: """ Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset. @@ -616,7 +615,7 @@ def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str def make_vcf_filter_dict( - ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float + snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float ) -> Dict[str, str]: """ Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations. @@ -650,10 +649,10 @@ def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values. """ edges_dict = { - "het": "|".join( + f"{prefix}het": "|".join( map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges) ), - "hom": "|".join( + f"{prefix}hom": "|".join( map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges) ), } From d1bbacd9bae46a62e9906ca28afd06c03a760593 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 11:47:25 -0400 Subject: [PATCH 13/61] Changed docstring for make_hist_bin_edges_expr --- gnomad/utils/vcf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 002b791df..f2c980ce4 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -639,13 +639,13 @@ def make_vcf_filter_dict( return filter_dict -def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, str]: +def make_hist_bin_edges_expr(ht: hl.Table, prefix: str) -> Dict[str, str]: """ Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string separated by pipe delimiters. :param ht: Table containing histogram variant annotations. - :param prefix: Prefix text for age histogram bin edges. Default is gnomad_. + :param prefix: Prefix text for age histogram bin edges. :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values. """ edges_dict = { From 424fcc18711be84726e4ec6e999aaa433845c4fb Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 13:02:24 -0400 Subject: [PATCH 14/61] Added set female metrics to NA --- gnomad/utils/vcf.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index f2c980ce4..46edfda14 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -1,7 +1,7 @@ import copy import itertools import logging -from typing import Dict, List +from typing import Dict, List, Union import hail as hl @@ -793,3 +793,35 @@ def sample_sum_check( ], verbose, ) + + +def set_female_y_metrics_to_na( + t: Union[hl.Table, hl.MatrixTable], faf: bool = True, +) -> Dict[str, hl.expr.Int32Expression]: + """ + Set AC, AN, and nhomalt Y variant annotations for females to NA (instead of 0). + + :param Table/MatrixTable t: Table/MatrixTable containing female variant annotations. + :return: Dictionary with reset annotations + :rtype: Dict[str, hl.expr.Int32Expression] + """ + metrics = list(t.row.info) + female_metrics = [x for x in metrics if "_female" in x] + female_metrics = [ + x for x in female_metrics if ("nhomalt" in x) or ("AC" in x) or ("AN" in x) + ] + + if faf: + female_metrics.extend([x for x in female_metrics if "faf" in x]) + + female_metrics_dict = {} + for metric in female_metrics: + female_metrics_dict.update( + { + f"{metric}": hl.or_missing( + (t.locus.contig.in_y_nonpar() | t.locus.contig.in_y_par()), + t.info[f"{metric}"], + ) + } + ) + return female_metrics_dict From d397b87054502ee7edcea7a365d4787235abe12f Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 13:06:08 -0400 Subject: [PATCH 15/61] Updated docstring in set female metrics to na --- gnomad/utils/vcf.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 46edfda14..010c9adba 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -799,11 +799,10 @@ def set_female_y_metrics_to_na( t: Union[hl.Table, hl.MatrixTable], faf: bool = True, ) -> Dict[str, hl.expr.Int32Expression]: """ - Set AC, AN, and nhomalt Y variant annotations for females to NA (instead of 0). + Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0). - :param Table/MatrixTable t: Table/MatrixTable containing female variant annotations. + :param t: Table/MatrixTable containing female variant annotations. :return: Dictionary with reset annotations - :rtype: Dict[str, hl.expr.Int32Expression] """ metrics = list(t.row.info) female_metrics = [x for x in metrics if "_female" in x] From ec0ddd6a2ad564df848c44be4de8216212b17a00 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 12:15:21 -0400 Subject: [PATCH 16/61] Removed transmitted singleton and sibling singleton from SITE_FIELDS --- gnomad/utils/vcf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 010c9adba..150125429 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -58,9 +58,7 @@ "MQRankSum", "QD", "ReadPosRankSum", - "sibling_singleton", "SOR", - "transmitted_singleton", "VarDP", ] """ From 647d8e6b88c5b49f00d62280d03bc0299f804758 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:19:26 -0400 Subject: [PATCH 17/61] Updated make_label_combos docstring and fixed values for some constants/their descriptions --- gnomad/utils/vcf.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 150125429..690d37a7a 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -14,6 +14,7 @@ SORT_ORDER = ["popmax", "group", "pop", "subpop", "sex"] """ Order to sort subgroupings during VCF export. +Ensures that INFO labels in VCF are in desired order (e.g., raw_AC_afr_female). """ GROUPS = ["adj", "raw"] @@ -28,12 +29,13 @@ FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"] """ -Populations that are included in filtering allele frequency calculations. Used in VCF export. +Global populations that are included in filtering allele frequency (faf) calculations. Used in VCF export. """ SEXES = ["male", "female"] """ Sample sexes used in VCF export. +Used to stratify frequency annotations (AC, AN, AF) for each sex. """ AS_FIELDS = [ @@ -77,9 +79,12 @@ Allele-type annotations. """ -REGION_TYPE_FIELDS = ["lcr", "nonpar"] +REGION_FLAG_FIELDS = ["decoy", "lcr", "nonpar", "segdup"] """ Annotations about variant region type. + +.. note:: + decoy and segdup resource files do not currently exist for GrCh38/hg38. """ RF_FIELDS = [ @@ -91,7 +96,7 @@ "tp", ] """ -Annotations specific to the random forest model. +Annotations specific to the variant QC using a random forest model. """ VQSR_FIELDS = ["AS_VQSLOD", "AS_culprit", "NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE"] @@ -147,10 +152,12 @@ "Number": "A", "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model", }, + "decoy": {"Description": "Variant falls within a reference decoy region"}, "lcr": {"Description": "Variant falls within a low complexity region"}, "nonpar": { "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region" }, + "segdup": {"Description": "Variant falls within a segmental duplication region"}, "rf_positive_label": { "Description": "Variant was labelled as a positive example for training of random forest model" }, @@ -339,18 +346,24 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi return info_mt.filter_cols(False) -def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]: +def make_label_combos( + label_groups: Dict[str, List[str]], sort_order: List[str] = SORT_ORDER, +) -> List[str]: """ Make combinations of all possible labels for a supplied dictionary of label groups. - :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + For example, if label_groups is `{"sex": ["male", "female"], "pop": ["afr", "nfe", "amr"]}`, + this function will return `["afr_male", "afr_female", "nfe_male", "nfe_female", "amr_male", "amr_female']` + + :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. :return: list of all possible combinations of values for the supplied label groupings. """ copy_label_groups = copy.deepcopy(label_groups) if len(copy_label_groups) == 1: return [item for sublist in copy_label_groups.values() for item in sublist] - anchor_group = sorted(copy_label_groups.keys(), key=lambda x: SORT_ORDER.index(x))[ + anchor_group = sorted(copy_label_groups.keys(), key=lambda x: sort_order.index(x))[ 0 ] anchor_val = copy_label_groups.pop(anchor_group) From 973c5ef1dd17a4fc803c5ad0d98ce83032f0bca1 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:21:43 -0400 Subject: [PATCH 18/61] Updated docstring for generic_field_check --- gnomad/utils/vcf.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 690d37a7a..8f8e099e0 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -374,17 +374,23 @@ def make_label_combos( def generic_field_check( - ht: hl.Table, cond_expr, check_description, display_fields, verbose + ht: hl.Table, + cond_expr: hl.expr.BooleanExpression, + check_description: str, + display_fields: List[str], + verbose: bool, ) -> None: """ - Check a generic logical condition involving annotations in a Hail Table and print the results to terminal + Check a generic logical condition involving annotations in a Hail Table and print the results to terminal. + + Displays the number of rows in the Table that return False for a condition. :param ht: Table containing annotations to be checked. - :param cond_expr: logical expression referring to annotations in ht to be checked. + :param cond_expr: Logical expression referring to annotations in ht to be checked. :param check_description: String describing the condition being checked; is displayed in terminal summary message. :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); these fields are also displayed if verbose is True. - :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, + :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. """ ht_orig = ht From ae3fcf4e93f4a4aaccd26ea7c305f68989cc6144 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:24:03 -0400 Subject: [PATCH 19/61] Updated docstring for make_filters_sanity_check_expr --- gnomad/utils/vcf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 8f8e099e0..ecbc12965 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -410,7 +410,8 @@ def generic_field_check( def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: """ Make Hail Expressions to measure % variants filtered under varying conditions of interest. - :param ht: Hail Table containing 'filter' annotation to be examined. + + :param ht: Table containing 'filter' annotation to be examined. :return: Dictionary containing Hail aggregation expressions to measure filter flags. """ filters_dict = { From d3967aadb88272d02d27e925ae0f5a7bf9645b1a Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:32:55 -0400 Subject: [PATCH 20/61] Updated make_filters_sanity_check_expr --- gnomad/utils/vcf.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index ecbc12965..1ffc677af 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -409,32 +409,37 @@ def generic_field_check( def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: """ - Make Hail Expressions to measure % variants filtered under varying conditions of interest. + Make Hail expressions to measure % variants filtered under varying conditions of interest. + + Checks for: + - Total number of variants + - Fraction of variants removed by any filter + - Fraction of variants removed because of InbreedingCoefficient filter in combination with any other filter + - Fraction of varinats removed because of AC0 filter in combination with any other filter + - Fraction of variants removed because of random forest filtering in combination with any other filter + - Fraction of variants removed only because of InbreedingCoefficient filter + - Fraction of variants removed only because of AC0 filter + - Fraction of variants removed only because of random forest filtering + :param ht: Table containing 'filter' annotation to be examined. :return: Dictionary containing Hail aggregation expressions to measure filter flags. """ filters_dict = { "n": hl.agg.count(), - "frac_any_filter": hl.agg.count_where(ht.is_filtered) / hl.agg.count(), - "frac_inbreed_coeff": hl.agg.count_where( - ht.filters.contains("inbreeding_coeff") - ) - / hl.agg.count(), - "frac_ac0": hl.agg.count_where(ht.filters.contains("AC0")) / hl.agg.count(), - "frac_rf": hl.agg.count_where(ht.filters.contains("rf")) / hl.agg.count(), - "frac_inbreed_coeff_only": hl.agg.count_where( + "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) == 0), + "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("inbreeding_coeff")), + "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")), + "frac_rf": hl.agg.fraction(ht.filters.contains("rf")), + "frac_inbreed_coeff_only": hl.agg.fraction( ht.filters.contains("inbreeding_coeff") & (ht.filters.length() == 1) - ) - / hl.agg.count(), - "frac_ac0_only": hl.agg.count_where( + ), + "frac_ac0_only": hl.agg.fraction( ht.filters.contains("AC0") & (ht.filters.length() == 1) - ) - / hl.agg.count(), - "frac_rf_only": hl.agg.count_where( + ), + "frac_rf_only": hl.agg.fraction( ht.filters.contains("rf") & (ht.filters.length() == 1) - ) - / hl.agg.count(), + ), } return filters_dict From 5313ea1dea9082248b0856a7950b235eb2d02de1 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:38:49 -0400 Subject: [PATCH 21/61] Updated docstring for make_combo_header_text --- gnomad/utils/vcf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 1ffc677af..df25494aa 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -454,7 +454,7 @@ def make_combo_header_text( this function will return the string " for female samples of African-American/African ancestry". :param preposition: Relevant preposition to precede automatically generated text. - :param group_types: List of grouping types, e.g. "sex" or "pop". + :param group_types: List of grouping types. Possible values in this list are: "group", "pop", "sex", "subpop". :param combo_fields: List of the specific values for each grouping type, for which the text is being generated. :param prefix: Prefix string indicating sample subset. :return: String with automatically generated description text for a given set of combo fields. @@ -462,11 +462,11 @@ def make_combo_header_text( combo_dict = dict(zip(group_types, combo_fields)) header_text = " " + preposition - if len(combo_dict.keys()) == 1: + if len(combo_dict) == 1: if combo_dict["group"] == "adj": return "" - if "sex" in combo_dict.keys(): + if "sex" in combo_dict: header_text = header_text + " " + combo_dict["sex"] header_text = header_text + " samples" From b556441e58771e7b0b0571d4ae3b6c0231767b61 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:42:42 -0400 Subject: [PATCH 22/61] Updated docstring for make_info_dict --- gnomad/utils/vcf.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index df25494aa..9be64db77 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -500,7 +500,13 @@ def make_info_dict( """ Generate dictionary of Number and Description attributes of VCF INFO fields. - Used to populate the VCF header during export. + Used to populate the INFO fields of the VCF header during export. + + Creates: + - INFO fields for age histograms (bin freq, n_smaller, and n_larger for heterozygous and homozygous variant carriers) + - INFO fields for popmax AC, AN, AF, nhomalt, and popmax population + - INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data + - INFO fields for filtering allele frequency (faf) annotations :param prefix: gnomAD or empty string. :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, From 13ac89e786e0ecef2ae30cd2ff62808e47689e3d Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:54:07 -0400 Subject: [PATCH 23/61] Updated make_combo_header_text to take dict as input --- gnomad/utils/vcf.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 9be64db77..d4fe986fa 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -445,7 +445,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression def make_combo_header_text( - preposition: str, group_types: List[str], combo_fields: List[str], prefix: str, + preposition: str, combo_dict: Dict[str, str], prefix: str, ) -> str: """ Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset. @@ -454,12 +454,12 @@ def make_combo_header_text( this function will return the string " for female samples of African-American/African ancestry". :param preposition: Relevant preposition to precede automatically generated text. - :param group_types: List of grouping types. Possible values in this list are: "group", "pop", "sex", "subpop". - :param combo_fields: List of the specific values for each grouping type, for which the text is being generated. + :param combo_dict: Dict with grouping types as keys and values for grouping type as values. This function generates text for these values. + Possible grouping types are: "group", "pop", "sex", and "subpop". + Example input: {"pop": "afr", "sex": "female"} :param prefix: Prefix string indicating sample subset. :return: String with automatically generated description text for a given set of combo fields. """ - combo_dict = dict(zip(group_types, combo_fields)) header_text = " " + preposition if len(combo_dict) == 1: @@ -471,17 +471,17 @@ def make_combo_header_text( header_text = header_text + " samples" - if "subpop" in combo_dict.keys(): + if "subpop" in combo_dict: header_text = header_text + f" of {POP_NAMES[combo_dict['subpop']]} ancestry" combo_dict.pop("pop") - if "pop" in combo_dict.keys(): + if "pop" in combo_dict: header_text = header_text + f" of {POP_NAMES[combo_dict['pop']]} ancestry" if "gnomad" in prefix: header_text = header_text + " in gnomAD" - if "group" in group_types: + if "group" in combo_dict: if combo_dict["group"] == "raw": header_text = header_text + ", before removing low-confidence genotypes" @@ -489,7 +489,7 @@ def make_combo_header_text( def make_info_dict( - prefix: str, + prefix: str = "", label_groups: Dict[str, str] = None, bin_edges: Dict[str, str] = None, faf: bool = False, @@ -508,7 +508,7 @@ def make_info_dict( - INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data - INFO fields for filtering allele frequency (faf) annotations - :param prefix: gnomAD or empty string. + :param prefix: Prefix string for data, e.g. "gnomAD". Default is empty string. :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. @@ -583,35 +583,36 @@ def make_info_dict( for combo in combos: combo_fields = combo.split("_") + group_dict = dict(zip(group_types, combo_fields)) if not faf: combo_dict = { f"{prefix}AC_{combo}": { "Number": "A", - "Description": f"Alternate allele count{make_combo_header_text('for', group_types, combo_fields, prefix)}", + "Description": f"Alternate allele count{make_combo_header_text('for', group_dict, prefix)}", }, f"{prefix}AN_{combo}": { "Number": "1", - "Description": f"Total number of alleles{make_combo_header_text('in', group_types, combo_fields, prefix)}", + "Description": f"Total number of alleles{make_combo_header_text('in', group_dict, prefix)}", }, f"{prefix}AF_{combo}": { "Number": "A", - "Description": f"Alternate allele frequency{make_combo_header_text('in', group_types, combo_fields, prefix)}", + "Description": f"Alternate allele frequency{make_combo_header_text('in', group_dict, prefix)}", }, f"{prefix}nhomalt_{combo}": { "Number": "A", - "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_types, combo_fields, prefix)}", + "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_dict, prefix)}", }, } else: combo_dict = { f"{prefix}faf95_{combo}": { "Number": "A", - "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}", + "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_dict, prefix)}", }, f"{prefix}faf99_{combo}": { "Number": "A", - "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}", + "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_dict, prefix)}", }, } info_dict.update(combo_dict) From 1202e2ebca096f25dec87b1d8a255cf052ec5f05 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:57:03 -0400 Subject: [PATCH 24/61] Updated make_info_dict to pass in sort order --- gnomad/utils/vcf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index d4fe986fa..e6401e831 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -496,6 +496,7 @@ def make_info_dict( popmax: bool = False, description_text: str = "", age_hist_data: str = None, + sort_order: List[str] = SORT_ORDER, ) -> Dict[str, Dict[str, str]]: """ Generate dictionary of Number and Description attributes of VCF INFO fields. @@ -514,8 +515,9 @@ def make_info_dict( :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. :param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations. :param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations. - :param description_text: Optinal text to append to the end of age and popmax descriptions. + :param description_text: Optional text to append to the end of age and popmax descriptions. Needs to start with a space if specified. :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`. + :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes. """ if prefix != "": @@ -578,7 +580,7 @@ def make_info_dict( info_dict.update(popmax_dict) else: - group_types = sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x)) + group_types = sorted(label_groups.keys(), key=lambda x: sort_order.index(x)) combos = make_label_combos(label_groups) for combo in combos: From 759cfac4806001a5c664ccee6fd7c1e061ac8c47 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 29 Jul 2020 09:54:46 -0400 Subject: [PATCH 25/61] Addressed rest of review comments --- gnomad/utils/vcf.py | 95 +++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index e6401e831..37165c08e 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -621,29 +621,41 @@ def make_info_dict( return info_dict -def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]: +def add_as_info_dict( + info_dict: Dict[str, Dict[str, str]] = INFO_DICT, as_fields: List[str] = AS_FIELDS +) -> Dict[str, Dict[str, str]]: """ Updates info dictionary with allele-specific terms and their descriptions. Used in VCF export. - :param INFO_DICT: Dictionary containing site-level annotations and their descriptions. + :param info_dict: Dictionary containing site-level annotations and their descriptions. Default is INFO_DICT. + :param as_fields: List containing allele-specific fields to be added to info_dict. Default is AS_FIELDS. :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field. """ prefix_text = "Allele-specific" - AS_DICT = {} + as_dict = {} + + for field in as_fields: + + try: + # Strip AS_ from field name + site_field = field[3:] - for field in AS_FIELDS: - # Strip AS_ from field name - site_field = field[3:] + # Get site description from info dictionary and make first letter lower case + first_letter = info_dict[site_field]["Description"][0].lower() + rest_of_description = info_dict[site_field]["Description"][1:] - AS_DICT[field] = {} - AS_DICT[field]["Number"] = "A" - AS_DICT[field][ - "Description" - ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}" + as_dict[field] = {} + as_dict[field]["Number"] = "A" + as_dict[field][ + "Description" + ] = f"{prefix_text} {first_letter}{rest_of_description}" - return AS_DICT + except KeyError: + raise ValueError(f"{field} is not present in input info dictionary!") + + return as_dict def make_vcf_filter_dict( @@ -652,7 +664,12 @@ def make_vcf_filter_dict( """ Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations. - :param ht: Table containing global annotations of the Random Forests SNP and indel cutoffs. + Generates descriptions for: + - AC0 filter + - InbreedingCoeff filter + - RF filter + - PASS (passed all variant filters) + :param snp_cutoff: Minimum SNP cutoff score from random forest model. :param indel_cutoff: Minimum indel cutoff score from random forest model. :param inbreeding_cutoff: Inbreeding coefficient hard cutoff. @@ -671,43 +688,45 @@ def make_vcf_filter_dict( return filter_dict -def make_hist_bin_edges_expr(ht: hl.Table, prefix: str) -> Dict[str, str]: +def make_hist_bin_edges_expr( + ht: hl.Table, hists: List[str] = HISTS, prefix: str = "" +) -> Dict[str, str]: """ Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string separated by pipe delimiters. :param ht: Table containing histogram variant annotations. - :param prefix: Prefix text for age histogram bin edges. + :param hists: List of variant histogram annotations. Default is HISTS. + :param prefix: Prefix text for age histogram bin edges. Default is empty string. :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values. """ + # Add underscore to prefix if it isn't empty + if prefix != "": + prefix += "_" + edges_dict = { - f"{prefix}het": "|".join( - map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges) - ), - f"{prefix}hom": "|".join( - map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges) - ), + f"{prefix}{call_type}": "|".join( + map( + lambda x: f"{x:.1f}", + ht.head(1)[f"age_hist_{call_type}"].collect()[0].bin_edges, + ) + ) + for call_type in ["het", "hom"] } - for hist in HISTS: + + for hist in hists: + + hist_name = hist # Parse hists calculated on both raw and adj-filtered data for hist_type in ["adj_qual_hists", "qual_hists"]: - hist_name = hist + if "adj" in hist_type: hist_name = f"{hist}_adj" - edges_dict[hist_name] = ( - "|".join( - map( - lambda x: f"{x:.2f}", - ht.head(1)[hist_type][hist].collect()[0].bin_edges, - ) - ) - if "ab" in hist - else "|".join( - map( - lambda x: str(int(x)), - ht.head(1)[hist_type][hist].collect()[0].bin_edges, - ) + edges_dict[hist_name] = "|".join( + map( + lambda x: f"{x:.2f}" if "ab" in hist else str(int(x)), + ht.head(1)[hist_type][hist].collect()[0].bin_edges, ) ) return edges_dict @@ -763,6 +782,7 @@ def sample_sum_check( label_groups: Dict[str, List[str]], verbose: bool, subpop: bool = None, + sort_order: List[str] = SORT_ORDER, ) -> None: """ Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version; @@ -775,6 +795,7 @@ def sample_sum_check( :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. :param str subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked. + :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. :rtype: None """ combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)] @@ -785,7 +806,7 @@ def sample_sum_check( group = label_groups.pop("group")[0] alt_groups = "_".join( - sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x)) + sorted(label_groups.keys(), key=lambda x: sort_order.index(x)) ) annot_dict = { From fb14af8527e433581553c8615246882abc0e1902 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Thu, 30 Jul 2020 13:50:58 -0400 Subject: [PATCH 26/61] Add BaseQRankSum to SITE_FIELDS const --- gnomad/utils/vcf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 37165c08e..dd868e355 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -54,6 +54,7 @@ """ SITE_FIELDS = [ + "BaseQRankSum", "FS", "InbreedingCoeff", "MQ", From 8d1f38c32110159bea11ce7ffcc9b8a090b0b2f6 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Thu, 30 Jul 2020 16:55:15 -0400 Subject: [PATCH 27/61] Addressed review comments --- gnomad/utils/vcf.py | 47 +++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index dd868e355..903a90040 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -85,7 +85,7 @@ Annotations about variant region type. .. note:: - decoy and segdup resource files do not currently exist for GrCh38/hg38. + decoy and segdup resource files do not currently exist for GRCh38/hg38. """ RF_FIELDS = [ @@ -145,13 +145,11 @@ "VarDP": { "Description": "Depth over variant genotypes (does not include depth of reference samples)" }, - "AS_VQSLOD": { - "Number": "A", - "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model", + "VQSLOD": { + "Description": "Log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model", }, - "AS_VQSR_culprit": { - "Number": "A", - "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model", + "VQSR_culprit": { + "Description": "Worst-performing annotation in the VQSR Gaussian mixture model", }, "decoy": {"Description": "Variant falls within a reference decoy region"}, "lcr": {"Description": "Variant falls within a low complexity region"}, @@ -177,19 +175,16 @@ "variant_type": { "Description": "Variant type (snv, indel, multi-snv, multi-indel, or mixed)" }, - "allele_type": { - "Number": "A", - "Description": "Allele type (snv, insertion, deletion, or mixed)", - }, + "allele_type": {"Description": "Allele type (snv, insertion, deletion, or mixed)",}, "n_alt_alleles": { - "Number": "A", + "Number": 1, "Description": "Total number of alternate alleles observed at variant locus", }, "was_mixed": {"Description": "Variant type was mixed"}, "has_star": { "Description": "Variant locus coincides with a spanning deletion (represented by a star) observed elsewhere in the callset" }, - "pab_max": { + "AS_pab_max": { "Number": "A", "Description": "Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5", }, @@ -384,7 +379,12 @@ def generic_field_check( """ Check a generic logical condition involving annotations in a Hail Table and print the results to terminal. - Displays the number of rows in the Table that return False for a condition. + Displays the number of rows in the Table that return False for a condition (`cond_expr`). + + .. note:: + This function checks for the number of rows that violate the `cond_expr` and prints the desired condition (`check_description`) to the terminal. + `cond_expr` and `check_description` are opposites and should never be the same. + :param ht: Table containing annotations to be checked. :param cond_expr: Logical expression referring to annotations in ht to be checked. @@ -428,7 +428,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression """ filters_dict = { "n": hl.agg.count(), - "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) == 0), + "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0), "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("inbreeding_coeff")), "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")), "frac_rf": hl.agg.fraction(ht.filters.contains("rf")), @@ -542,7 +542,7 @@ def make_info_dict( }, f"{prefix}age_hist_hom_bin_freq": { "Number": "A", - "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", + "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['hom']}; total number of individuals of any genotype bin: {age_hist_data}", }, f"{prefix}age_hist_hom_n_smaller": { "Number": "A", @@ -571,7 +571,7 @@ def make_info_dict( }, f"{prefix}AF_popmax": { "Number": "A", - "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_text}", + "Description": f"Maximum allele frequency across populations{description_text}", }, f"{prefix}nhomalt_popmax": { "Number": "A", @@ -634,7 +634,6 @@ def add_as_info_dict( :param as_fields: List containing allele-specific fields to be added to info_dict. Default is AS_FIELDS. :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field. """ - prefix_text = "Allele-specific" as_dict = {} for field in as_fields: @@ -651,7 +650,7 @@ def add_as_info_dict( as_dict[field]["Number"] = "A" as_dict[field][ "Description" - ] = f"{prefix_text} {first_letter}{rest_of_description}" + ] = f"Allele-specific {first_letter}{rest_of_description}" except KeyError: raise ValueError(f"{field} is not present in input info dictionary!") @@ -850,7 +849,7 @@ def sample_sum_check( def set_female_y_metrics_to_na( - t: Union[hl.Table, hl.MatrixTable], faf: bool = True, + t: Union[hl.Table, hl.MatrixTable], ) -> Dict[str, hl.expr.Int32Expression]: """ Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0). @@ -860,19 +859,13 @@ def set_female_y_metrics_to_na( """ metrics = list(t.row.info) female_metrics = [x for x in metrics if "_female" in x] - female_metrics = [ - x for x in female_metrics if ("nhomalt" in x) or ("AC" in x) or ("AN" in x) - ] - - if faf: - female_metrics.extend([x for x in female_metrics if "faf" in x]) female_metrics_dict = {} for metric in female_metrics: female_metrics_dict.update( { f"{metric}": hl.or_missing( - (t.locus.contig.in_y_nonpar() | t.locus.contig.in_y_par()), + (~t.locus.contig.in_y_nonpar() | ~t.locus.contig.in_y_par()), t.info[f"{metric}"], ) } From 92dbedfdb3f5194de05d96d5de94d33dbf286f29 Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Fri, 31 Jul 2020 10:19:41 -0400 Subject: [PATCH 28/61] Update cache and setup-python Actions (#244) * Update checkout and setup-python actions * Install wheel in CI workflow --- .github/workflows/ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 30dac2faa..df7e193c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,11 +12,11 @@ jobs: - name: Checkout uses: actions/checkout@v2 - name: Setup Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: 3.7 - name: Use pip cache - uses: actions/cache@v1 + uses: actions/cache@v2 with: path: ~/.cache/pip key: pip-${{ hashFiles('**/requirements*.txt') }} @@ -24,6 +24,7 @@ jobs: pip- - name: Install dependencies run: | + pip install wheel pip install -r requirements.txt pip install -r requirements-dev.txt - name: Run Pylint @@ -49,6 +50,7 @@ jobs: pip- - name: Install dependencies run: | + pip install wheel pip install -r requirements.txt pip install -r docs/requirements.txt - name: Build docs From d18e2bbd7fddc7bc995e83cc67c7aa270522bfcf Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 21 Jul 2020 11:56:25 -0400 Subject: [PATCH 29/61] Added constants, make_label_combos, generic_field_check, and make_filters_sanity_check_expr to vcf.py from gnomad_qc. Also added INFO_DICT, FORMAT_DICT, and make_vcf_filter_dict from ukb repo --- gnomad/utils/vcf.py | 286 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 286 insertions(+) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 43c4b6200..8138e0a07 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -6,6 +6,43 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) + +GROUPS = ["adj", "raw"] +""" +Group names used to generate labels for high quality genotypes and all raw genotypes. Used in VCF export. +""" + +HISTS = ["gq_hist_alt", "gq_hist_all", "dp_hist_alt", "dp_hist_all", "ab_hist_alt"] +""" +Quality histograms used in VCF export. +""" + +POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"] +""" +Continental population names used in VCF export. +""" + +NFE_SUBPOPS = ["onf", "bgr", "swe", "nwe", "seu", "est"] +""" +gnomAD subpopulations for NFE population. Used in VCF export. +""" + +EAS_SUBPOPS = ["kor", "oea", "jpn"] +""" +gnomAD subpopulations for EAS population. Used in VCF export. +""" + +FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"] +""" +Populations that are included in filtering allele frequency calculations. Used in VCF export. +""" + +SEXES = ["male", "female"] +""" +Sample sexes used in VCF export. +""" + + INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [ "AS_QUALapprox", "AS_VarDP", @@ -14,6 +51,148 @@ "AS_SB_TABLE", ] +INFO_DICT = { + "FS": { + "Description": "Phred-scaled p-value of Fisher's exact test for strand bias" + }, + "InbreedingCoeff": { + "Description": "Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes)/(the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium)" + }, + "MQ": { + "Description": "Root mean square of the mapping quality of reads across all samples" + }, + "MQRankSum": { + "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities" + }, + "QD": { + "Description": "Variant call confidence normalized by depth of sample reads supporting a variant" + }, + "ReadPosRankSum": { + "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference read position bias" + }, + "SOR": {"Description": "Strand bias estimated by the symmetric odds ratio test"}, + "VQSR_POSITIVE_TRAIN_SITE": { + "Description": "Variant was used to build the positive training set of high-quality variants for VQSR" + }, + "VQSR_NEGATIVE_TRAIN_SITE": { + "Description": "Variant was used to build the negative training set of low-quality variants for VQSR" + }, + "AS_BaseQRankSum": { + "Number": "A", + "Description": "Allele-specific Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities", + }, + "ClippingRankSum": { + "Number": "A", + "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference number of hard clipped bases", + }, + "VarDP": { + "Description": "Depth over variant genotypes (does not include depth of reference samples)" + }, + "AS_VarDP": { + "Number": "A", + "Description": "Allele-specific depth over variant genotypes (does not include depth of reference samples)", + }, + "AS_VQSLOD": { + "Number": "A", + "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model", + }, + "AS_VQSR_culprit": { + "Number": "A", + "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model", + }, + "lcr": {"Description": "Variant falls within a low complexity region"}, + "fail_interval_qc": { + "Description": f"Variant falls within a region where less than {INTERVAL_QC_PARAMETERS[0]}% of samples had a mean coverage of {INTERVAL_QC_PARAMETERS[1]}X" + }, + "nonpar": { + "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region" + }, + "rf_positive_label": { + "Description": "Variant was labelled as a positive example for training of random forest model" + }, + "rf_negative_label": { + "Description": "Variant was labelled as a negative example for training of random forest model" + }, + "rf_label": {"Description": "Random forest training label"}, + "rf_train": {"Description": "Variant was used in training random forest model"}, + "rf_tp_probability": { + "Description": "Probability of a called variant being a true variant as determined by random forest model" + }, + "transmitted_singleton": { + "Description": "Variant was a callset-wide doubleton that was transmitted within a family from a parent to a child (i.e., a singleton amongst unrelated samples in cohort)" + }, + "original_alleles": {"Description": "Alleles before splitting multiallelics"}, + "variant_type": { + "Description": "Variant type (snv, indel, multi-snv, multi-indel, or mixed)" + }, + "allele_type": { + "Number": "A", + "Description": "Allele type (snv, insertion, deletion, or mixed)", + }, + "n_alt_alleles": { + "Number": "A", + "Description": "Total number of alternate alleles observed at variant locus", + }, + "was_mixed": {"Description": "Variant type was mixed"}, + "has_star": { + "Description": "Variant locus coincides with a spanning deletion (represented by a star) observed elsewhere in the callset" + }, + "pab_max": { + "Number": "A", + "Description": "Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5", + }, +} +""" +Dictionary used during VCF export to export row (variant) annotations. +""" + +FORMAT_DICT = { + "GT": {"Description": "Genotype", "Number": "1", "Type": "String"}, + "AD": { + "Description": "Allelic depths for the ref and alt alleles in the order listed", + "Number": "R", + "Type": "Integer", + }, + "DP": { + "Description": "Approximate read depth (reads with MQ=255 or with bad mates are filtered)", + "Number": "1", + "Type": "Integer", + }, + "GQ": { + "Description": "Phred-scaled confidence that the genotype assignment is correct. Value is the difference between the second lowest PL and the lowest PL (always normalized to 0).", + "Number": "1", + "Type": "Integer", + }, + "MIN_DP": { + "Description": "Minimum DP observed within the GVCF block", + "Number": "1", + "Type": "Integer", + }, + "PGT": { + "Description": "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another", + "Number": "1", + "Type": "String", + }, + "PID": { + "Description": "Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group", + "Number": "1", + "Type": "String", + }, + "PL": { + "Description": "Normalized, phred-scaled likelihoods for genotypes as defined in the VCF specification", + "Number": "G", + "Type": "Integer", + }, + "SB": { + "Description": "Per-sample component statistics which comprise the Fisher's exact test to detect strand bias. Values are: depth of reference allele on forward strand, depth of reference allele on reverse strand, depth of alternate allele on forward strand, depth of alternate allele on reverse strand.", + "Number": "4", + "Type": "Integer", + }, +} +""" +Dictionary used during VCF export to export MatrixTable entries. +""" + def ht_to_vcf_mt( info_ht: hl.Table, @@ -92,3 +271,110 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi # Create an MT with no cols so that we acn export to VCF info_mt = info_ht.to_matrix_table_row_major(columns=["s"], entry_field_name="s") return info_mt.filter_cols(False) + + +def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]: + """ + Make combinations of all possible labels for a supplied dictionary of label groups + :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]) + :return: list of all possible combinations of values for the supplied label groupings + :rtype: list[str] + """ + copy_label_groups = copy.deepcopy(label_groups) + if len(copy_label_groups) == 1: + return [item for sublist in copy_label_groups.values() for item in sublist] + anchor_group = sorted(copy_label_groups.keys(), key=lambda x: SORT_ORDER.index(x))[ + 0 + ] + anchor_val = copy_label_groups.pop(anchor_group) + combos = [] + for x, y in itertools.product(anchor_val, make_label_combos(copy_label_groups)): + combos.append("{0}_{1}".format(x, y)) + return combos + + +def generic_field_check( + ht: hl.Table, cond_expr, check_description, display_fields, verbose +): + """ + Check a generic logical condition involving annotations in a Hail Table and print the results to terminal + :param Table ht: Table containing annotations to be checked + :param Expression cond_expr: logical expression referring to annotations in ht to be checked + :param str check_description: String describing the condition being checked; is displayed in terminal summary message + :param list of str display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); + these fields are also displayed if verbose is True + :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, + show only top values of annotations that fail checks + :rtype: None + """ + ht_orig = ht + ht = ht.filter(cond_expr) + n_fail = ht.count() + if n_fail > 0: + logger.info(f"Found {n_fail} sites that fail {check_description} check:") + ht = ht.flatten() + ht.select("locus", "alleles", *display_fields).show() + else: + logger.info(f"PASSED {check_description} check") + if verbose: + ht_orig = ht_orig.flatten() + ht_orig.select(*display_fields).show() + + +def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: + """ + Make Hail Expressions to measure % variants filtered under varying conditions of interest + :param Table ht: Hail Table containing 'filter' annotation to be examined + :return: Dictionary containing Hail aggregation expressions to measure filter flags + :rtype: Dict of str: Expression + """ + filters_dict = { + "n": hl.agg.count(), + "frac_any_filter": hl.agg.count_where(ht.is_filtered) / hl.agg.count(), + "frac_inbreed_coeff": hl.agg.count_where( + ht.filters.contains("inbreeding_coeff") + ) + / hl.agg.count(), + "frac_ac0": hl.agg.count_where(ht.filters.contains("AC0")) / hl.agg.count(), + "frac_rf": hl.agg.count_where(ht.filters.contains("rf")) / hl.agg.count(), + "frac_inbreed_coeff_only": hl.agg.count_where( + ht.filters.contains("inbreeding_coeff") & (ht.filters.length() == 1) + ) + / hl.agg.count(), + "frac_ac0_only": hl.agg.count_where( + ht.filters.contains("AC0") & (ht.filters.length() == 1) + ) + / hl.agg.count(), + "frac_rf_only": hl.agg.count_where( + ht.filters.contains("rf") & (ht.filters.length() == 1) + ) + / hl.agg.count(), + } + return filters_dict + + +def make_vcf_filter_dict( + ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float +) -> Dict[str, str]: + """ + Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations. + + :param Table ht: Table containing global annotations of the Random Forests SNP and indel cutoffs. + :param float snp_cutoff: Minimum SNP cutoff score from random forest model. + :param float indel_cutoff: Minimum indel cutoff score from random forest model. + :param float inbreeding_cutoff: Inbreeding coefficient hard cutoff. + :return: Dictionary keyed by VCF FILTER annotations, where values are Dictionaries of Number and Description attributes. + :rtype: Dict[str, str] + """ + filter_dict = { + "AC0": { + "Description": "Allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB < 0.2 for het calls)" + }, + "InbreedingCoeff": {"Description": f"InbreedingCoeff < {inbreeding_cutoff}"}, + "RF": { + "Description": f"Failed random forest filtering thresholds of {snp_cutoff}, {indel_cutoff} (probabilities of being a true positive variant) for SNPs, indels" + }, + "PASS": {"Description": "Passed all variant filters"}, + } + return filter_dict From 26956b969acf13748fda1510f679c7b67649d2ce Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 21 Jul 2020 12:42:48 -0400 Subject: [PATCH 30/61] Added AS_FIELDS, SITE_FIELDS constants and added function add_as_info_dict to update INFO_DICT with AS fields and their values --- gnomad/utils/vcf.py | 71 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 8138e0a07..a19aa215a 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -42,6 +42,36 @@ Sample sexes used in VCF export. """ +AS_FIELDS = [ + "AS_BaseQRankSum", + "AS_FS", + "AS_MQ", + "AS_MQRankSum", + "AS_pab_max", + "AS_QD", + "AS_ReadPosRankSum", + "AS_SOR", + "AS_VarDP", +] +""" +Allele-specific variant annotations. +""" + +SITE_FIELDS = [ + "FS", + "InbreedingCoeff", + "MQ", + "MQRankSum", + "QD", + "ReadPosRankSum", + "sibling_singleton", + "SOR", + "transmitted_singleton", + "VarDP", +] +""" +Site level variant annotations. +""" INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [ "AS_QUALapprox", @@ -77,21 +107,12 @@ "VQSR_NEGATIVE_TRAIN_SITE": { "Description": "Variant was used to build the negative training set of low-quality variants for VQSR" }, - "AS_BaseQRankSum": { - "Number": "A", - "Description": "Allele-specific Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities", - }, - "ClippingRankSum": { - "Number": "A", - "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference number of hard clipped bases", + "BaseQRankSum": { + "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities", }, "VarDP": { "Description": "Depth over variant genotypes (does not include depth of reference samples)" }, - "AS_VarDP": { - "Number": "A", - "Description": "Allele-specific depth over variant genotypes (does not include depth of reference samples)", - }, "AS_VQSLOD": { "Number": "A", "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model", @@ -101,9 +122,6 @@ "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model", }, "lcr": {"Description": "Variant falls within a low complexity region"}, - "fail_interval_qc": { - "Description": f"Variant falls within a region where less than {INTERVAL_QC_PARAMETERS[0]}% of samples had a mean coverage of {INTERVAL_QC_PARAMETERS[1]}X" - }, "nonpar": { "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region" }, @@ -354,6 +372,31 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression return filters_dict +def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None: + """ + Updates info dictionary with allele-specific terms and their descriptions. + + Used in VCF export. + + :param INFO_DICT: Dictionary containing site-level annotations and their descriptions. + :return: None + """ + prefix_text = "Allele-specific" + AS_DICT = {} + + for field in AS_FIELDS: + # Strip AS_ from field name + site_field = field[3:] + + AS_DICT[field] = {} + AS_DICT[field]["Number"] = "A" + AS_DICT[field][ + "Description" + ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}" + + INFO_DICT.update(AS_DICT) + + def make_vcf_filter_dict( ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float ) -> Dict[str, str]: From 02f4f5633ef6144b9f298c55bf0d0aad09c5f4b3 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 21 Jul 2020 13:06:25 -0400 Subject: [PATCH 31/61] Updated make_as_info_dict and added more constants (RF_FIELDS, VQSR_FIELDS, REGION_TYPE_FIELDS, ALLELE_TYPE_FIELDS --- gnomad/utils/vcf.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index a19aa215a..d27e8ea00 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -73,6 +73,33 @@ Site level variant annotations. """ +ALLELE_TYPE_FIELDS = ["allele_type", "has_star", "n_alt_alleles", "original_alleles", "variant_type", "was_mixed"] +""" +Allele-type annotations. +""" + +REGION_TYPE_FIELDS = ["lcr", "nonpar"] +""" +Annotations about variant region type. +""" + +RF_FIELDS = [ + "fail_hard_filters", + "filters", + "rf_label", + "rf_train", + "rf_probability", + "tp", +] +""" +Annotations specific to the random forest model. +""" + +VQSR_FIELDS = ["AS_VQSLOD", "AS_culprit", "NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE"] +""" +Annotations specific to VQSR. +""" + INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [ "AS_QUALapprox", "AS_VarDP", @@ -372,14 +399,14 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression return filters_dict -def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None: +def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]: """ Updates info dictionary with allele-specific terms and their descriptions. Used in VCF export. :param INFO_DICT: Dictionary containing site-level annotations and their descriptions. - :return: None + :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field. """ prefix_text = "Allele-specific" AS_DICT = {} @@ -394,7 +421,7 @@ def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None: "Description" ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}" - INFO_DICT.update(AS_DICT) + return AS_DICT def make_vcf_filter_dict( From e7edd87713a93f2a76226d5a1f8b7a609c7c2260 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 21 Jul 2020 13:27:26 -0400 Subject: [PATCH 32/61] Added constants for entries to select during export --- gnomad/utils/vcf.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index d27e8ea00..cbcbe8b28 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -73,7 +73,14 @@ Site level variant annotations. """ -ALLELE_TYPE_FIELDS = ["allele_type", "has_star", "n_alt_alleles", "original_alleles", "variant_type", "was_mixed"] +ALLELE_TYPE_FIELDS = [ + "allele_type", + "has_star", + "n_alt_alleles", + "original_alleles", + "variant_type", + "was_mixed", +] """ Allele-type annotations. """ @@ -191,6 +198,28 @@ Dictionary used during VCF export to export row (variant) annotations. """ +ENTRIES = ["GT", "GQ", "DP", "AD", "MIN_DP", "PGT", "PID", "PL", "SB"] +""" +Densified entries to be selected during VCF export. +""" + +SPARSE_ENTRIES = [ + "DP", + "GQ", + "LA", + "LAD", + "LGT", + "LPGT", + "LPL", + "MIN_DP", + "PID", + "RGQ", + "SB", +] +""" +Sparse entries to be selected and densified during VCF export. +""" + FORMAT_DICT = { "GT": {"Description": "Genotype", "Number": "1", "Type": "String"}, "AD": { From 922a67476694c4af6b73e7c46eb67355ad10540d Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 21 Jul 2020 17:00:18 -0400 Subject: [PATCH 33/61] Added make_info_dict to vcf.py --- gnomad/utils/vcf.py | 125 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index cbcbe8b28..afeff1b27 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -428,6 +428,131 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression return filters_dict +def make_info_dict( + prefix: str, + label_groups: Dict[str, str] = None, + bin_edges: Dict[str, str] = None, + faf: bool = False, + popmax: bool = False, + description_text: str = "", + age_hist_data: str = None, +) -> Dict[str, Dict[str, str]]: + """ + Generate dictionary of Number and Description attributes of VCF INFO fields. + + Used to populate the VCF header during export. + + :param str prefix: gnomAD or empty string. + :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :param dict bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. + :param bool faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations. + :param bool popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations. + :param bool description_text: Optinal text to append to the end of age and popmax descriptions. + :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`. + :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes. + :rtype: Dict[str, Dict[str, str]] + """ + if prefix != "": + prefix = f"{prefix}_" + + info_dict = dict() + + if age_hist_data: + age_hist_dict = { + f"{prefix}age_hist_het_bin_freq": { + "Number": "A", + "Description": f"Histogram of ages of heterozygous individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", + }, + f"{prefix}age_hist_het_n_smaller": { + "Number": "A", + "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_test}", + }, + f"{prefix}age_hist_het_n_larger": { + "Number": "A", + "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_test}", + }, + f"{prefix}age_hist_hom_bin_freq": { + "Number": "A", + "Description": f"Histogram of ages of homozygous alternate individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", + }, + f"{prefix}age_hist_hom_n_smaller": { + "Number": "A", + "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_test}", + }, + f"{prefix}age_hist_hom_n_larger": { + "Number": "A", + "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_test}", + }, + } + info_dict.update(age_hist_dict) + + if popmax: + popmax_dict = { + f"{prefix}popmax": { + "Number": "A", + "Description": f"Population with maximum AF{description_test}", + }, + f"{prefix}AC_popmax": { + "Number": "A", + "Description": f"Allele count in the population with the maximum AF{description_test}", + }, + f"{prefix}AN_popmax": { + "Number": "A", + "Description": f"Total number of alleles in the population with the maximum AF{description_test}", + }, + f"{prefix}AF_popmax": { + "Number": "A", + "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_test}", + }, + f"{prefix}nhomalt_popmax": { + "Number": "A", + "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_test}", + }, + } + info_dict.update(popmax_dict) + + else: + group_types = sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x)) + combos = make_label_combos(label_groups) + + for combo in combos: + combo_fields = combo.split("_") + + if not faf: + combo_dict = { + f"{prefix}AC_{combo}": { + "Number": "A", + "Description": f"Alternate allele count{make_combo_header_text('for', group_types, combo_fields, prefix)}", + }, + f"{prefix}AN_{combo}": { + "Number": "1", + "Description": f"Total number of alleles{make_combo_header_text('in', group_types, combo_fields, prefix)}", + }, + f"{prefix}AF_{combo}": { + "Number": "A", + "Description": f"Alternate allele frequency{make_combo_header_text('in', group_types, combo_fields, prefix)}", + }, + f"{prefix}nhomalt_{combo}": { + "Number": "A", + "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_types, combo_fields, prefix)}", + }, + } + else: + combo_dict = { + f"{prefix}faf95_{combo}": { + "Number": "A", + "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}", + }, + f"{prefix}faf99_{combo}": { + "Number": "A", + "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}", + }, + } + info_dict.update(combo_dict) + return info_dict + + def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]: """ Updates info dictionary with allele-specific terms and their descriptions. From 9ff11ca11c9c72cd5c15e00c0e0677b4ea14eb10 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 09:48:18 -0400 Subject: [PATCH 34/61] Removed unnecessary pop constants (can be imported from ancestry.py) --- gnomad/utils/vcf.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index afeff1b27..3022a63e5 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -17,21 +17,6 @@ Quality histograms used in VCF export. """ -POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"] -""" -Continental population names used in VCF export. -""" - -NFE_SUBPOPS = ["onf", "bgr", "swe", "nwe", "seu", "est"] -""" -gnomAD subpopulations for NFE population. Used in VCF export. -""" - -EAS_SUBPOPS = ["kor", "oea", "jpn"] -""" -gnomAD subpopulations for EAS population. Used in VCF export. -""" - FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"] """ Populations that are included in filtering allele frequency calculations. Used in VCF export. From 1bb7162e438f680bb5216babe99ab7d0cf352450 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 10:20:33 -0400 Subject: [PATCH 35/61] Added make hist bin edges expr function to vcf.py, also removed types from docstrings --- gnomad/utils/vcf.py | 78 +++++++++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 3022a63e5..081996506 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -355,17 +355,16 @@ def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]: def generic_field_check( ht: hl.Table, cond_expr, check_description, display_fields, verbose -): +) -> None: """ Check a generic logical condition involving annotations in a Hail Table and print the results to terminal - :param Table ht: Table containing annotations to be checked - :param Expression cond_expr: logical expression referring to annotations in ht to be checked - :param str check_description: String describing the condition being checked; is displayed in terminal summary message - :param list of str display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); + :param ht: Table containing annotations to be checked + :param cond_expr: logical expression referring to annotations in ht to be checked + :param check_description: String describing the condition being checked; is displayed in terminal summary message + :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); these fields are also displayed if verbose is True :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks - :rtype: None """ ht_orig = ht ht = ht.filter(cond_expr) @@ -384,9 +383,8 @@ def generic_field_check( def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: """ Make Hail Expressions to measure % variants filtered under varying conditions of interest - :param Table ht: Hail Table containing 'filter' annotation to be examined + :param ht: Hail Table containing 'filter' annotation to be examined :return: Dictionary containing Hail aggregation expressions to measure filter flags - :rtype: Dict of str: Expression """ filters_dict = { "n": hl.agg.count(), @@ -427,16 +425,15 @@ def make_info_dict( Used to populate the VCF header during export. - :param str prefix: gnomAD or empty string. - :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + :param prefix: gnomAD or empty string. + :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). - :param dict bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. - :param bool faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations. - :param bool popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations. - :param bool description_text: Optinal text to append to the end of age and popmax descriptions. + :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. + :param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations. + :param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations. + :param description_text: Optinal text to append to the end of age and popmax descriptions. :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`. :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes. - :rtype: Dict[str, Dict[str, str]] """ if prefix != "": prefix = f"{prefix}_" @@ -569,12 +566,11 @@ def make_vcf_filter_dict( """ Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations. - :param Table ht: Table containing global annotations of the Random Forests SNP and indel cutoffs. - :param float snp_cutoff: Minimum SNP cutoff score from random forest model. - :param float indel_cutoff: Minimum indel cutoff score from random forest model. - :param float inbreeding_cutoff: Inbreeding coefficient hard cutoff. + :param ht: Table containing global annotations of the Random Forests SNP and indel cutoffs. + :param snp_cutoff: Minimum SNP cutoff score from random forest model. + :param indel_cutoff: Minimum indel cutoff score from random forest model. + :param inbreeding_cutoff: Inbreeding coefficient hard cutoff. :return: Dictionary keyed by VCF FILTER annotations, where values are Dictionaries of Number and Description attributes. - :rtype: Dict[str, str] """ filter_dict = { "AC0": { @@ -587,3 +583,45 @@ def make_vcf_filter_dict( "PASS": {"Description": "Passed all variant filters"}, } return filter_dict + + +def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, str]: + """ + Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string + separated by pipe delimiters. + + :param ht: Table containing histogram variant annotations. + :param prefix: Prefix text for age histogram bin edges. Default is gnomad_. + :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values. + """ + edges_dict = { + "het": "|".join( + map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges) + ), + "hom": "|".join( + map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges) + ), + } + for hist in HISTS: + + # Parse hists calculated on both raw and adj-filtered data + for hist_type in ["adj_qual_hists", "qual_hists"]: + hist_name = hist + if "adj" in hist_type: + hist_name = f"{hist}_adj" + edges_dict[hist_name] = ( + "|".join( + map( + lambda x: f"{x:.2f}", + ht.head(1)[hist_type][hist].collect()[0].bin_edges, + ) + ) + if "ab" in hist + else "|".join( + map( + lambda x: str(int(x)), + ht.head(1)[hist_type][hist].collect()[0].bin_edges, + ) + ) + ) + return edges_dict From 48bf3bbec6afe8a01dcc2d96b0ad31fc460cc215 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 10:29:54 -0400 Subject: [PATCH 36/61] Added make hist dict --- gnomad/utils/vcf.py | 69 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 081996506..85cacf397 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -334,11 +334,11 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]: """ - Make combinations of all possible labels for a supplied dictionary of label groups + Make combinations of all possible labels for a supplied dictionary of label groups. + :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, - e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]) - :return: list of all possible combinations of values for the supplied label groupings - :rtype: list[str] + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :return: list of all possible combinations of values for the supplied label groupings. """ copy_label_groups = copy.deepcopy(label_groups) if len(copy_label_groups) == 1: @@ -358,13 +358,14 @@ def generic_field_check( ) -> None: """ Check a generic logical condition involving annotations in a Hail Table and print the results to terminal - :param ht: Table containing annotations to be checked - :param cond_expr: logical expression referring to annotations in ht to be checked - :param check_description: String describing the condition being checked; is displayed in terminal summary message + + :param ht: Table containing annotations to be checked. + :param cond_expr: logical expression referring to annotations in ht to be checked. + :param check_description: String describing the condition being checked; is displayed in terminal summary message. :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); - these fields are also displayed if verbose is True + these fields are also displayed if verbose is True. :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, - show only top values of annotations that fail checks + show only top values of annotations that fail checks. """ ht_orig = ht ht = ht.filter(cond_expr) @@ -382,9 +383,9 @@ def generic_field_check( def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: """ - Make Hail Expressions to measure % variants filtered under varying conditions of interest - :param ht: Hail Table containing 'filter' annotation to be examined - :return: Dictionary containing Hail aggregation expressions to measure filter flags + Make Hail Expressions to measure % variants filtered under varying conditions of interest. + :param ht: Hail Table containing 'filter' annotation to be examined. + :return: Dictionary containing Hail aggregation expressions to measure filter flags. """ filters_dict = { "n": hl.agg.count(), @@ -625,3 +626,47 @@ def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, ) ) return edges_dict + + +def make_hist_dict(bin_edges: Dict[str, Dict[str, str]], adj: bool) -> Dict[str, str]: + """ + Generate dictionary of Number and Description attributes to be used in the VCF header, specifically for histogram annotations. + + :param bin_edges: Dictionary keyed by histogram annotation name, with corresponding string-reformatted bin edges for values. + :param adj: Whether to create a header dict for raw or adj quality histograms. + :return: Dictionary keyed by VCF INFO annotations, where values are Dictionaries of Number and Description attributes. + """ + header_hist_dict = {} + for hist in HISTS: + + # Get hists for both raw and adj data + if adj: + hist = f"{hist}_adj" + + edges = bin_edges[hist] + hist_fields = hist.split("_") + hist_text = hist_fields[0].upper() + + if hist_fields[2] == "alt": + hist_text = hist_text + " in heterozygous individuals" + if adj: + hist_text = hist_text + " calculated on high quality genotypes" + + hist_dict = { + f"{hist}_bin_freq": { + "Number": "A", + "Description": f"Histogram for {hist_text}; bin edges are: {edges}", + }, + f"{hist}_n_smaller": { + "Number": "A", + "Description": f"Count of {hist_fields[0].upper()} values falling below lowest histogram bin edge {hist_text}", + }, + f"{hist}_n_larger": { + "Number": "A", + "Description": f"Count of {hist_fields[0].upper()} values falling above highest histogram bin edge {hist_text}", + }, + } + + header_hist_dict.update(hist_dict) + + return header_hist_dict From e736da01d58eaf0de3c15a852be4bdfc9fbb848a Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 10:33:02 -0400 Subject: [PATCH 37/61] Updated changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4ab4b5f1..78388cbf2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * Fix for error in `compute_quantile_bin` that caused incorrect binning when a single score overlapped multiple bins. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) * Removed assumption of `snv` annotation from `compute_quantile_bin`. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) * Fixed `create_binned_ht` because it produced a "Cannot combine expressions from different source objects error". [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238) +* Added constants and functions relevant to VCF export [(#241)](https://github.com/broadinstitute/gnomad_methods/pull/241) ## Version 0.4.0 - July 9th, 2020 From ffaa908a3cf895349e6816ae62b94f803b96f7ff Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 10:56:30 -0400 Subject: [PATCH 38/61] Added SORT_ORDER and sample_sum_check to vcf.py --- gnomad/utils/vcf.py | 98 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 86 insertions(+), 12 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 85cacf397..6bee6707e 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -1,11 +1,15 @@ import hail as hl -from typing import List +from typing import Dict, List import logging logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) +SORT_ORDER = ["popmax", "group", "pop", "subpop", "sex"] +""" +Order to sort subgroupings during VCF export. +""" GROUPS = ["adj", "raw"] """ @@ -445,27 +449,27 @@ def make_info_dict( age_hist_dict = { f"{prefix}age_hist_het_bin_freq": { "Number": "A", - "Description": f"Histogram of ages of heterozygous individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", + "Description": f"Histogram of ages of heterozygous individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", }, f"{prefix}age_hist_het_n_smaller": { "Number": "A", - "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_test}", + "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_text}", }, f"{prefix}age_hist_het_n_larger": { "Number": "A", - "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_test}", + "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_text}", }, f"{prefix}age_hist_hom_bin_freq": { "Number": "A", - "Description": f"Histogram of ages of homozygous alternate individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", + "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", }, f"{prefix}age_hist_hom_n_smaller": { "Number": "A", - "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_test}", + "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_text}", }, f"{prefix}age_hist_hom_n_larger": { "Number": "A", - "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_test}", + "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_text}", }, } info_dict.update(age_hist_dict) @@ -474,23 +478,23 @@ def make_info_dict( popmax_dict = { f"{prefix}popmax": { "Number": "A", - "Description": f"Population with maximum AF{description_test}", + "Description": f"Population with maximum AF{description_text}", }, f"{prefix}AC_popmax": { "Number": "A", - "Description": f"Allele count in the population with the maximum AF{description_test}", + "Description": f"Allele count in the population with the maximum AF{description_text}", }, f"{prefix}AN_popmax": { "Number": "A", - "Description": f"Total number of alleles in the population with the maximum AF{description_test}", + "Description": f"Total number of alleles in the population with the maximum AF{description_text}", }, f"{prefix}AF_popmax": { "Number": "A", - "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_test}", + "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_text}", }, f"{prefix}nhomalt_popmax": { "Number": "A", - "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_test}", + "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_text}", }, } info_dict.update(popmax_dict) @@ -670,3 +674,73 @@ def make_hist_dict(bin_edges: Dict[str, Dict[str, str]], adj: bool) -> Dict[str, header_hist_dict.update(hist_dict) return header_hist_dict + + +def sample_sum_check( + ht: hl.Table, + prefix: str, + label_groups: Dict[str, List[str]], + verbose: bool, + subpop: bool = None, +) -> None: + """ + Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version; + display results from checking the sum of the specified annotations in the terminal. + + :param Table ht: Table containing annotations to be summed. + :param str prefix: String indicating sample subset. + :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, + show only top values of annotations that fail checks. + :param str subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked. + :rtype: None + """ + combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)] + combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in make_label_combos(label_groups)] + combo_nhomalt = [ + ht.info[f"{prefix}nhomalt_{x}"] for x in make_label_combos(label_groups) + ] + + group = label_groups.pop("group")[0] + alt_groups = "_".join( + sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x)) + ) + + annot_dict = { + f"sum_AC_{group}_{alt_groups}": hl.sum(combo_AC), + f"sum_AN_{group}_{alt_groups}": hl.sum(combo_AN), + f"sum_nhomalt_{group}_{alt_groups}": hl.sum(combo_nhomalt), + } + + ht = ht.annotate(**annot_dict) + + for subfield in ["AC", "AN", "nhomalt"]: + if not subpop: + generic_field_check( + ht, + ( + ht.info[f"{prefix}{subfield}_{group}"] + != ht[f"sum_{subfield}_{group}_{alt_groups}"] + ), + f"{prefix}{subfield}_{group} = sum({subfield}_{group}_{alt_groups})", + [ + f"info.{prefix}{subfield}_{group}", + f"sum_{subfield}_{group}_{alt_groups}", + ], + verbose, + ) + else: + generic_field_check( + ht, + ( + ht.info[f"{prefix}{subfield}_{group}_{subpop}"] + != ht[f"sum_{subfield}_{group}_{alt_groups}"] + ), + f"{prefix}{subfield}_{group}_{subpop} = sum({subfield}_{group}_{alt_groups})", + [ + f"info.{prefix}{subfield}_{group}_{subpop}", + f"sum_{subfield}_{group}_{alt_groups}", + ], + verbose, + ) From 00b92ec5208de64a0492981fef61de94f3014df4 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 11:38:18 -0400 Subject: [PATCH 39/61] Moved make combo header text to vcf.py --- gnomad/utils/vcf.py | 58 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 6bee6707e..47f02659a 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -1,6 +1,9 @@ -import hail as hl -from typing import Dict, List import logging +from typing import Dict, List + +import hail as hl + +from gnomad.sample_qc.ancestry import POP_NAMES logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger(__name__) @@ -416,6 +419,53 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression return filters_dict +def make_combo_header_text( + preposition: str, + group_types: List[str], + combo_fields: List[str], + prefix: str, +) -> str: + """ + Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset. + + For example, if preposition is "for", group_types is ["group", "pop", "sex"], and combo_fields is ["adj", "afr", "female"], + this function will return the string " for female samples of African-American/African ancestry". + + :param preposition: Relevant preposition to precede automatically generated text. + :param group_types: List of grouping types, e.g. "sex" or "pop". + :param combo_fields: List of the specific values for each grouping type, for which the text is being generated. + :param prefix: Prefix string indicating sample subset. + :return: String with automatically generated description text for a given set of combo fields. + """ + combo_dict = dict(zip(group_types, combo_fields)) + header_text = " " + preposition + + if len(combo_dict.keys()) == 1: + if combo_dict["group"] == "adj": + return "" + + if "sex" in combo_dict.keys(): + header_text = header_text + " " + combo_dict["sex"] + + header_text = header_text + " samples" + + if "subpop" in combo_dict.keys(): + header_text = header_text + f" of {POP_NAMES[combo_dict['subpop']]} ancestry" + combo_dict.pop("pop") + + if "pop" in combo_dict.keys(): + header_text = header_text + f" of {POP_NAMES[combo_dict['pop']]} ancestry" + + if "gnomad" in prefix: + header_text = header_text + " in gnomAD" + + if "group" in group_types: + if combo_dict["group"] == "raw": + header_text = header_text + ", before removing low-confidence genotypes" + + return header_text + + def make_info_dict( prefix: str, label_groups: Dict[str, str] = None, @@ -529,11 +579,11 @@ def make_info_dict( combo_dict = { f"{prefix}faf95_{combo}": { "Number": "A", - "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}", + "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}", }, f"{prefix}faf99_{combo}": { "Number": "A", - "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}", + "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}", }, } info_dict.update(combo_dict) From 6ed92b0a5a71c8b105f5efb2661cc3f5d6b96543 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 11:42:12 -0400 Subject: [PATCH 40/61] Fixed imports and reformatted with black --- gnomad/utils/vcf.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 47f02659a..002b791df 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -1,3 +1,5 @@ +import copy +import itertools import logging from typing import Dict, List @@ -420,10 +422,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression def make_combo_header_text( - preposition: str, - group_types: List[str], - combo_fields: List[str], - prefix: str, + preposition: str, group_types: List[str], combo_fields: List[str], prefix: str, ) -> str: """ Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset. @@ -616,7 +615,7 @@ def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str def make_vcf_filter_dict( - ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float + snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float ) -> Dict[str, str]: """ Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations. @@ -650,10 +649,10 @@ def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values. """ edges_dict = { - "het": "|".join( + f"{prefix}het": "|".join( map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges) ), - "hom": "|".join( + f"{prefix}hom": "|".join( map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges) ), } From 191b073727cbfcb05c3515a9a139994dcae6b6cd Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 11:47:25 -0400 Subject: [PATCH 41/61] Changed docstring for make_hist_bin_edges_expr --- gnomad/utils/vcf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 002b791df..f2c980ce4 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -639,13 +639,13 @@ def make_vcf_filter_dict( return filter_dict -def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, str]: +def make_hist_bin_edges_expr(ht: hl.Table, prefix: str) -> Dict[str, str]: """ Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string separated by pipe delimiters. :param ht: Table containing histogram variant annotations. - :param prefix: Prefix text for age histogram bin edges. Default is gnomad_. + :param prefix: Prefix text for age histogram bin edges. :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values. """ edges_dict = { From d8a3e7a7db9450e85f060a15a84effeea538ef24 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 13:02:24 -0400 Subject: [PATCH 42/61] Added set female metrics to NA --- gnomad/utils/vcf.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index f2c980ce4..46edfda14 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -1,7 +1,7 @@ import copy import itertools import logging -from typing import Dict, List +from typing import Dict, List, Union import hail as hl @@ -793,3 +793,35 @@ def sample_sum_check( ], verbose, ) + + +def set_female_y_metrics_to_na( + t: Union[hl.Table, hl.MatrixTable], faf: bool = True, +) -> Dict[str, hl.expr.Int32Expression]: + """ + Set AC, AN, and nhomalt Y variant annotations for females to NA (instead of 0). + + :param Table/MatrixTable t: Table/MatrixTable containing female variant annotations. + :return: Dictionary with reset annotations + :rtype: Dict[str, hl.expr.Int32Expression] + """ + metrics = list(t.row.info) + female_metrics = [x for x in metrics if "_female" in x] + female_metrics = [ + x for x in female_metrics if ("nhomalt" in x) or ("AC" in x) or ("AN" in x) + ] + + if faf: + female_metrics.extend([x for x in female_metrics if "faf" in x]) + + female_metrics_dict = {} + for metric in female_metrics: + female_metrics_dict.update( + { + f"{metric}": hl.or_missing( + (t.locus.contig.in_y_nonpar() | t.locus.contig.in_y_par()), + t.info[f"{metric}"], + ) + } + ) + return female_metrics_dict From 51a5a73a47d2786e4f1b3e022dc9f084a5f97e34 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 22 Jul 2020 13:06:08 -0400 Subject: [PATCH 43/61] Updated docstring in set female metrics to na --- gnomad/utils/vcf.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 46edfda14..010c9adba 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -799,11 +799,10 @@ def set_female_y_metrics_to_na( t: Union[hl.Table, hl.MatrixTable], faf: bool = True, ) -> Dict[str, hl.expr.Int32Expression]: """ - Set AC, AN, and nhomalt Y variant annotations for females to NA (instead of 0). + Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0). - :param Table/MatrixTable t: Table/MatrixTable containing female variant annotations. + :param t: Table/MatrixTable containing female variant annotations. :return: Dictionary with reset annotations - :rtype: Dict[str, hl.expr.Int32Expression] """ metrics = list(t.row.info) female_metrics = [x for x in metrics if "_female" in x] From 6b1969dd943fc53445362cfc03976a0c39386039 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 12:15:21 -0400 Subject: [PATCH 44/61] Removed transmitted singleton and sibling singleton from SITE_FIELDS --- gnomad/utils/vcf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 010c9adba..150125429 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -58,9 +58,7 @@ "MQRankSum", "QD", "ReadPosRankSum", - "sibling_singleton", "SOR", - "transmitted_singleton", "VarDP", ] """ From 2af3587c0a9597b4a2aa0a80a168e0b86587599d Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:19:26 -0400 Subject: [PATCH 45/61] Updated make_label_combos docstring and fixed values for some constants/their descriptions --- gnomad/utils/vcf.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 150125429..690d37a7a 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -14,6 +14,7 @@ SORT_ORDER = ["popmax", "group", "pop", "subpop", "sex"] """ Order to sort subgroupings during VCF export. +Ensures that INFO labels in VCF are in desired order (e.g., raw_AC_afr_female). """ GROUPS = ["adj", "raw"] @@ -28,12 +29,13 @@ FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"] """ -Populations that are included in filtering allele frequency calculations. Used in VCF export. +Global populations that are included in filtering allele frequency (faf) calculations. Used in VCF export. """ SEXES = ["male", "female"] """ Sample sexes used in VCF export. +Used to stratify frequency annotations (AC, AN, AF) for each sex. """ AS_FIELDS = [ @@ -77,9 +79,12 @@ Allele-type annotations. """ -REGION_TYPE_FIELDS = ["lcr", "nonpar"] +REGION_FLAG_FIELDS = ["decoy", "lcr", "nonpar", "segdup"] """ Annotations about variant region type. + +.. note:: + decoy and segdup resource files do not currently exist for GrCh38/hg38. """ RF_FIELDS = [ @@ -91,7 +96,7 @@ "tp", ] """ -Annotations specific to the random forest model. +Annotations specific to the variant QC using a random forest model. """ VQSR_FIELDS = ["AS_VQSLOD", "AS_culprit", "NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE"] @@ -147,10 +152,12 @@ "Number": "A", "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model", }, + "decoy": {"Description": "Variant falls within a reference decoy region"}, "lcr": {"Description": "Variant falls within a low complexity region"}, "nonpar": { "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region" }, + "segdup": {"Description": "Variant falls within a segmental duplication region"}, "rf_positive_label": { "Description": "Variant was labelled as a positive example for training of random forest model" }, @@ -339,18 +346,24 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi return info_mt.filter_cols(False) -def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]: +def make_label_combos( + label_groups: Dict[str, List[str]], sort_order: List[str] = SORT_ORDER, +) -> List[str]: """ Make combinations of all possible labels for a supplied dictionary of label groups. - :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + For example, if label_groups is `{"sex": ["male", "female"], "pop": ["afr", "nfe", "amr"]}`, + this function will return `["afr_male", "afr_female", "nfe_male", "nfe_female", "amr_male", "amr_female']` + + :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. :return: list of all possible combinations of values for the supplied label groupings. """ copy_label_groups = copy.deepcopy(label_groups) if len(copy_label_groups) == 1: return [item for sublist in copy_label_groups.values() for item in sublist] - anchor_group = sorted(copy_label_groups.keys(), key=lambda x: SORT_ORDER.index(x))[ + anchor_group = sorted(copy_label_groups.keys(), key=lambda x: sort_order.index(x))[ 0 ] anchor_val = copy_label_groups.pop(anchor_group) From ca577082dc1aa9db67357e7b4a072eb9428e60b4 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:21:43 -0400 Subject: [PATCH 46/61] Updated docstring for generic_field_check --- gnomad/utils/vcf.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 690d37a7a..8f8e099e0 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -374,17 +374,23 @@ def make_label_combos( def generic_field_check( - ht: hl.Table, cond_expr, check_description, display_fields, verbose + ht: hl.Table, + cond_expr: hl.expr.BooleanExpression, + check_description: str, + display_fields: List[str], + verbose: bool, ) -> None: """ - Check a generic logical condition involving annotations in a Hail Table and print the results to terminal + Check a generic logical condition involving annotations in a Hail Table and print the results to terminal. + + Displays the number of rows in the Table that return False for a condition. :param ht: Table containing annotations to be checked. - :param cond_expr: logical expression referring to annotations in ht to be checked. + :param cond_expr: Logical expression referring to annotations in ht to be checked. :param check_description: String describing the condition being checked; is displayed in terminal summary message. :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); these fields are also displayed if verbose is True. - :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, + :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. """ ht_orig = ht From 4428b0ccd5e32d5f4eb6fc63153eff8d9a3c457b Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:24:03 -0400 Subject: [PATCH 47/61] Updated docstring for make_filters_sanity_check_expr --- gnomad/utils/vcf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 8f8e099e0..ecbc12965 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -410,7 +410,8 @@ def generic_field_check( def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: """ Make Hail Expressions to measure % variants filtered under varying conditions of interest. - :param ht: Hail Table containing 'filter' annotation to be examined. + + :param ht: Table containing 'filter' annotation to be examined. :return: Dictionary containing Hail aggregation expressions to measure filter flags. """ filters_dict = { From 75e997d5ebcd14cb7ff8b23a555d34e53be522ea Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:32:55 -0400 Subject: [PATCH 48/61] Updated make_filters_sanity_check_expr --- gnomad/utils/vcf.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index ecbc12965..1ffc677af 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -409,32 +409,37 @@ def generic_field_check( def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: """ - Make Hail Expressions to measure % variants filtered under varying conditions of interest. + Make Hail expressions to measure % variants filtered under varying conditions of interest. + + Checks for: + - Total number of variants + - Fraction of variants removed by any filter + - Fraction of variants removed because of InbreedingCoefficient filter in combination with any other filter + - Fraction of varinats removed because of AC0 filter in combination with any other filter + - Fraction of variants removed because of random forest filtering in combination with any other filter + - Fraction of variants removed only because of InbreedingCoefficient filter + - Fraction of variants removed only because of AC0 filter + - Fraction of variants removed only because of random forest filtering + :param ht: Table containing 'filter' annotation to be examined. :return: Dictionary containing Hail aggregation expressions to measure filter flags. """ filters_dict = { "n": hl.agg.count(), - "frac_any_filter": hl.agg.count_where(ht.is_filtered) / hl.agg.count(), - "frac_inbreed_coeff": hl.agg.count_where( - ht.filters.contains("inbreeding_coeff") - ) - / hl.agg.count(), - "frac_ac0": hl.agg.count_where(ht.filters.contains("AC0")) / hl.agg.count(), - "frac_rf": hl.agg.count_where(ht.filters.contains("rf")) / hl.agg.count(), - "frac_inbreed_coeff_only": hl.agg.count_where( + "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) == 0), + "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("inbreeding_coeff")), + "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")), + "frac_rf": hl.agg.fraction(ht.filters.contains("rf")), + "frac_inbreed_coeff_only": hl.agg.fraction( ht.filters.contains("inbreeding_coeff") & (ht.filters.length() == 1) - ) - / hl.agg.count(), - "frac_ac0_only": hl.agg.count_where( + ), + "frac_ac0_only": hl.agg.fraction( ht.filters.contains("AC0") & (ht.filters.length() == 1) - ) - / hl.agg.count(), - "frac_rf_only": hl.agg.count_where( + ), + "frac_rf_only": hl.agg.fraction( ht.filters.contains("rf") & (ht.filters.length() == 1) - ) - / hl.agg.count(), + ), } return filters_dict From fdfed1adb48685aefb22b5c720bd25c5de1233d6 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:38:49 -0400 Subject: [PATCH 49/61] Updated docstring for make_combo_header_text --- gnomad/utils/vcf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 1ffc677af..df25494aa 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -454,7 +454,7 @@ def make_combo_header_text( this function will return the string " for female samples of African-American/African ancestry". :param preposition: Relevant preposition to precede automatically generated text. - :param group_types: List of grouping types, e.g. "sex" or "pop". + :param group_types: List of grouping types. Possible values in this list are: "group", "pop", "sex", "subpop". :param combo_fields: List of the specific values for each grouping type, for which the text is being generated. :param prefix: Prefix string indicating sample subset. :return: String with automatically generated description text for a given set of combo fields. @@ -462,11 +462,11 @@ def make_combo_header_text( combo_dict = dict(zip(group_types, combo_fields)) header_text = " " + preposition - if len(combo_dict.keys()) == 1: + if len(combo_dict) == 1: if combo_dict["group"] == "adj": return "" - if "sex" in combo_dict.keys(): + if "sex" in combo_dict: header_text = header_text + " " + combo_dict["sex"] header_text = header_text + " samples" From 2c47c635faa68fffcd6f52970fc46b0d8c65b6fd Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:42:42 -0400 Subject: [PATCH 50/61] Updated docstring for make_info_dict --- gnomad/utils/vcf.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index df25494aa..9be64db77 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -500,7 +500,13 @@ def make_info_dict( """ Generate dictionary of Number and Description attributes of VCF INFO fields. - Used to populate the VCF header during export. + Used to populate the INFO fields of the VCF header during export. + + Creates: + - INFO fields for age histograms (bin freq, n_smaller, and n_larger for heterozygous and homozygous variant carriers) + - INFO fields for popmax AC, AN, AF, nhomalt, and popmax population + - INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data + - INFO fields for filtering allele frequency (faf) annotations :param prefix: gnomAD or empty string. :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, From 41fb66d4b2765669a57e706522f9e73875fbc3a3 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:54:07 -0400 Subject: [PATCH 51/61] Updated make_combo_header_text to take dict as input --- gnomad/utils/vcf.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 9be64db77..d4fe986fa 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -445,7 +445,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression def make_combo_header_text( - preposition: str, group_types: List[str], combo_fields: List[str], prefix: str, + preposition: str, combo_dict: Dict[str, str], prefix: str, ) -> str: """ Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset. @@ -454,12 +454,12 @@ def make_combo_header_text( this function will return the string " for female samples of African-American/African ancestry". :param preposition: Relevant preposition to precede automatically generated text. - :param group_types: List of grouping types. Possible values in this list are: "group", "pop", "sex", "subpop". - :param combo_fields: List of the specific values for each grouping type, for which the text is being generated. + :param combo_dict: Dict with grouping types as keys and values for grouping type as values. This function generates text for these values. + Possible grouping types are: "group", "pop", "sex", and "subpop". + Example input: {"pop": "afr", "sex": "female"} :param prefix: Prefix string indicating sample subset. :return: String with automatically generated description text for a given set of combo fields. """ - combo_dict = dict(zip(group_types, combo_fields)) header_text = " " + preposition if len(combo_dict) == 1: @@ -471,17 +471,17 @@ def make_combo_header_text( header_text = header_text + " samples" - if "subpop" in combo_dict.keys(): + if "subpop" in combo_dict: header_text = header_text + f" of {POP_NAMES[combo_dict['subpop']]} ancestry" combo_dict.pop("pop") - if "pop" in combo_dict.keys(): + if "pop" in combo_dict: header_text = header_text + f" of {POP_NAMES[combo_dict['pop']]} ancestry" if "gnomad" in prefix: header_text = header_text + " in gnomAD" - if "group" in group_types: + if "group" in combo_dict: if combo_dict["group"] == "raw": header_text = header_text + ", before removing low-confidence genotypes" @@ -489,7 +489,7 @@ def make_combo_header_text( def make_info_dict( - prefix: str, + prefix: str = "", label_groups: Dict[str, str] = None, bin_edges: Dict[str, str] = None, faf: bool = False, @@ -508,7 +508,7 @@ def make_info_dict( - INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data - INFO fields for filtering allele frequency (faf) annotations - :param prefix: gnomAD or empty string. + :param prefix: Prefix string for data, e.g. "gnomAD". Default is empty string. :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. @@ -583,35 +583,36 @@ def make_info_dict( for combo in combos: combo_fields = combo.split("_") + group_dict = dict(zip(group_types, combo_fields)) if not faf: combo_dict = { f"{prefix}AC_{combo}": { "Number": "A", - "Description": f"Alternate allele count{make_combo_header_text('for', group_types, combo_fields, prefix)}", + "Description": f"Alternate allele count{make_combo_header_text('for', group_dict, prefix)}", }, f"{prefix}AN_{combo}": { "Number": "1", - "Description": f"Total number of alleles{make_combo_header_text('in', group_types, combo_fields, prefix)}", + "Description": f"Total number of alleles{make_combo_header_text('in', group_dict, prefix)}", }, f"{prefix}AF_{combo}": { "Number": "A", - "Description": f"Alternate allele frequency{make_combo_header_text('in', group_types, combo_fields, prefix)}", + "Description": f"Alternate allele frequency{make_combo_header_text('in', group_dict, prefix)}", }, f"{prefix}nhomalt_{combo}": { "Number": "A", - "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_types, combo_fields, prefix)}", + "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_dict, prefix)}", }, } else: combo_dict = { f"{prefix}faf95_{combo}": { "Number": "A", - "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}", + "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_dict, prefix)}", }, f"{prefix}faf99_{combo}": { "Number": "A", - "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}", + "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_dict, prefix)}", }, } info_dict.update(combo_dict) From 2ddd8f17aa83eb750c12cd5044d4a4f5e6f61243 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 28 Jul 2020 16:57:03 -0400 Subject: [PATCH 52/61] Updated make_info_dict to pass in sort order --- gnomad/utils/vcf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index d4fe986fa..e6401e831 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -496,6 +496,7 @@ def make_info_dict( popmax: bool = False, description_text: str = "", age_hist_data: str = None, + sort_order: List[str] = SORT_ORDER, ) -> Dict[str, Dict[str, str]]: """ Generate dictionary of Number and Description attributes of VCF INFO fields. @@ -514,8 +515,9 @@ def make_info_dict( :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation. :param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations. :param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations. - :param description_text: Optinal text to append to the end of age and popmax descriptions. + :param description_text: Optional text to append to the end of age and popmax descriptions. Needs to start with a space if specified. :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`. + :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes. """ if prefix != "": @@ -578,7 +580,7 @@ def make_info_dict( info_dict.update(popmax_dict) else: - group_types = sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x)) + group_types = sorted(label_groups.keys(), key=lambda x: sort_order.index(x)) combos = make_label_combos(label_groups) for combo in combos: From 3a80a41383e86cee97292824b049ac8be6348e55 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Wed, 29 Jul 2020 09:54:46 -0400 Subject: [PATCH 53/61] Addressed rest of review comments --- gnomad/utils/vcf.py | 95 +++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index e6401e831..37165c08e 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -621,29 +621,41 @@ def make_info_dict( return info_dict -def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]: +def add_as_info_dict( + info_dict: Dict[str, Dict[str, str]] = INFO_DICT, as_fields: List[str] = AS_FIELDS +) -> Dict[str, Dict[str, str]]: """ Updates info dictionary with allele-specific terms and their descriptions. Used in VCF export. - :param INFO_DICT: Dictionary containing site-level annotations and their descriptions. + :param info_dict: Dictionary containing site-level annotations and their descriptions. Default is INFO_DICT. + :param as_fields: List containing allele-specific fields to be added to info_dict. Default is AS_FIELDS. :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field. """ prefix_text = "Allele-specific" - AS_DICT = {} + as_dict = {} + + for field in as_fields: + + try: + # Strip AS_ from field name + site_field = field[3:] - for field in AS_FIELDS: - # Strip AS_ from field name - site_field = field[3:] + # Get site description from info dictionary and make first letter lower case + first_letter = info_dict[site_field]["Description"][0].lower() + rest_of_description = info_dict[site_field]["Description"][1:] - AS_DICT[field] = {} - AS_DICT[field]["Number"] = "A" - AS_DICT[field][ - "Description" - ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}" + as_dict[field] = {} + as_dict[field]["Number"] = "A" + as_dict[field][ + "Description" + ] = f"{prefix_text} {first_letter}{rest_of_description}" - return AS_DICT + except KeyError: + raise ValueError(f"{field} is not present in input info dictionary!") + + return as_dict def make_vcf_filter_dict( @@ -652,7 +664,12 @@ def make_vcf_filter_dict( """ Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations. - :param ht: Table containing global annotations of the Random Forests SNP and indel cutoffs. + Generates descriptions for: + - AC0 filter + - InbreedingCoeff filter + - RF filter + - PASS (passed all variant filters) + :param snp_cutoff: Minimum SNP cutoff score from random forest model. :param indel_cutoff: Minimum indel cutoff score from random forest model. :param inbreeding_cutoff: Inbreeding coefficient hard cutoff. @@ -671,43 +688,45 @@ def make_vcf_filter_dict( return filter_dict -def make_hist_bin_edges_expr(ht: hl.Table, prefix: str) -> Dict[str, str]: +def make_hist_bin_edges_expr( + ht: hl.Table, hists: List[str] = HISTS, prefix: str = "" +) -> Dict[str, str]: """ Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string separated by pipe delimiters. :param ht: Table containing histogram variant annotations. - :param prefix: Prefix text for age histogram bin edges. + :param hists: List of variant histogram annotations. Default is HISTS. + :param prefix: Prefix text for age histogram bin edges. Default is empty string. :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values. """ + # Add underscore to prefix if it isn't empty + if prefix != "": + prefix += "_" + edges_dict = { - f"{prefix}het": "|".join( - map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges) - ), - f"{prefix}hom": "|".join( - map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges) - ), + f"{prefix}{call_type}": "|".join( + map( + lambda x: f"{x:.1f}", + ht.head(1)[f"age_hist_{call_type}"].collect()[0].bin_edges, + ) + ) + for call_type in ["het", "hom"] } - for hist in HISTS: + + for hist in hists: + + hist_name = hist # Parse hists calculated on both raw and adj-filtered data for hist_type in ["adj_qual_hists", "qual_hists"]: - hist_name = hist + if "adj" in hist_type: hist_name = f"{hist}_adj" - edges_dict[hist_name] = ( - "|".join( - map( - lambda x: f"{x:.2f}", - ht.head(1)[hist_type][hist].collect()[0].bin_edges, - ) - ) - if "ab" in hist - else "|".join( - map( - lambda x: str(int(x)), - ht.head(1)[hist_type][hist].collect()[0].bin_edges, - ) + edges_dict[hist_name] = "|".join( + map( + lambda x: f"{x:.2f}" if "ab" in hist else str(int(x)), + ht.head(1)[hist_type][hist].collect()[0].bin_edges, ) ) return edges_dict @@ -763,6 +782,7 @@ def sample_sum_check( label_groups: Dict[str, List[str]], verbose: bool, subpop: bool = None, + sort_order: List[str] = SORT_ORDER, ) -> None: """ Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version; @@ -775,6 +795,7 @@ def sample_sum_check( :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. :param str subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked. + :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. :rtype: None """ combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)] @@ -785,7 +806,7 @@ def sample_sum_check( group = label_groups.pop("group")[0] alt_groups = "_".join( - sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x)) + sorted(label_groups.keys(), key=lambda x: sort_order.index(x)) ) annot_dict = { From 1e48d1219afe581e70291e2163f27b62be547231 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Thu, 30 Jul 2020 13:50:58 -0400 Subject: [PATCH 54/61] Add BaseQRankSum to SITE_FIELDS const --- gnomad/utils/vcf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 37165c08e..dd868e355 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -54,6 +54,7 @@ """ SITE_FIELDS = [ + "BaseQRankSum", "FS", "InbreedingCoeff", "MQ", From ff61379f8ff0432a51eaebfd92d1f3891500cffe Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Thu, 30 Jul 2020 16:55:15 -0400 Subject: [PATCH 55/61] Addressed review comments --- gnomad/utils/vcf.py | 47 +++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index dd868e355..903a90040 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -85,7 +85,7 @@ Annotations about variant region type. .. note:: - decoy and segdup resource files do not currently exist for GrCh38/hg38. + decoy and segdup resource files do not currently exist for GRCh38/hg38. """ RF_FIELDS = [ @@ -145,13 +145,11 @@ "VarDP": { "Description": "Depth over variant genotypes (does not include depth of reference samples)" }, - "AS_VQSLOD": { - "Number": "A", - "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model", + "VQSLOD": { + "Description": "Log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model", }, - "AS_VQSR_culprit": { - "Number": "A", - "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model", + "VQSR_culprit": { + "Description": "Worst-performing annotation in the VQSR Gaussian mixture model", }, "decoy": {"Description": "Variant falls within a reference decoy region"}, "lcr": {"Description": "Variant falls within a low complexity region"}, @@ -177,19 +175,16 @@ "variant_type": { "Description": "Variant type (snv, indel, multi-snv, multi-indel, or mixed)" }, - "allele_type": { - "Number": "A", - "Description": "Allele type (snv, insertion, deletion, or mixed)", - }, + "allele_type": {"Description": "Allele type (snv, insertion, deletion, or mixed)",}, "n_alt_alleles": { - "Number": "A", + "Number": 1, "Description": "Total number of alternate alleles observed at variant locus", }, "was_mixed": {"Description": "Variant type was mixed"}, "has_star": { "Description": "Variant locus coincides with a spanning deletion (represented by a star) observed elsewhere in the callset" }, - "pab_max": { + "AS_pab_max": { "Number": "A", "Description": "Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5", }, @@ -384,7 +379,12 @@ def generic_field_check( """ Check a generic logical condition involving annotations in a Hail Table and print the results to terminal. - Displays the number of rows in the Table that return False for a condition. + Displays the number of rows in the Table that return False for a condition (`cond_expr`). + + .. note:: + This function checks for the number of rows that violate the `cond_expr` and prints the desired condition (`check_description`) to the terminal. + `cond_expr` and `check_description` are opposites and should never be the same. + :param ht: Table containing annotations to be checked. :param cond_expr: Logical expression referring to annotations in ht to be checked. @@ -428,7 +428,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression """ filters_dict = { "n": hl.agg.count(), - "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) == 0), + "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0), "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("inbreeding_coeff")), "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")), "frac_rf": hl.agg.fraction(ht.filters.contains("rf")), @@ -542,7 +542,7 @@ def make_info_dict( }, f"{prefix}age_hist_hom_bin_freq": { "Number": "A", - "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}", + "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['hom']}; total number of individuals of any genotype bin: {age_hist_data}", }, f"{prefix}age_hist_hom_n_smaller": { "Number": "A", @@ -571,7 +571,7 @@ def make_info_dict( }, f"{prefix}AF_popmax": { "Number": "A", - "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_text}", + "Description": f"Maximum allele frequency across populations{description_text}", }, f"{prefix}nhomalt_popmax": { "Number": "A", @@ -634,7 +634,6 @@ def add_as_info_dict( :param as_fields: List containing allele-specific fields to be added to info_dict. Default is AS_FIELDS. :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field. """ - prefix_text = "Allele-specific" as_dict = {} for field in as_fields: @@ -651,7 +650,7 @@ def add_as_info_dict( as_dict[field]["Number"] = "A" as_dict[field][ "Description" - ] = f"{prefix_text} {first_letter}{rest_of_description}" + ] = f"Allele-specific {first_letter}{rest_of_description}" except KeyError: raise ValueError(f"{field} is not present in input info dictionary!") @@ -850,7 +849,7 @@ def sample_sum_check( def set_female_y_metrics_to_na( - t: Union[hl.Table, hl.MatrixTable], faf: bool = True, + t: Union[hl.Table, hl.MatrixTable], ) -> Dict[str, hl.expr.Int32Expression]: """ Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0). @@ -860,19 +859,13 @@ def set_female_y_metrics_to_na( """ metrics = list(t.row.info) female_metrics = [x for x in metrics if "_female" in x] - female_metrics = [ - x for x in female_metrics if ("nhomalt" in x) or ("AC" in x) or ("AN" in x) - ] - - if faf: - female_metrics.extend([x for x in female_metrics if "faf" in x]) female_metrics_dict = {} for metric in female_metrics: female_metrics_dict.update( { f"{metric}": hl.or_missing( - (t.locus.contig.in_y_nonpar() | t.locus.contig.in_y_par()), + (~t.locus.contig.in_y_nonpar() | ~t.locus.contig.in_y_par()), t.info[f"{metric}"], ) } From ca49ac51d8abd72e42120ea26ee3f207e12ffad9 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Fri, 31 Jul 2020 10:38:35 -0400 Subject: [PATCH 56/61] Updated docstring for sample_sum_check --- gnomad/utils/vcf.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 903a90040..5b17cd31c 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -788,15 +788,15 @@ def sample_sum_check( Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version; display results from checking the sum of the specified annotations in the terminal. - :param Table ht: Table containing annotations to be summed. - :param str prefix: String indicating sample subset. - :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + :param ht: Table containing annotations to be summed. + :param prefix: String indicating sample subset. + :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). - :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False, + :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, show only top values of annotations that fail checks. - :param str subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked. + :param subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked. :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. - :rtype: None + :return: None """ combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)] combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in make_label_combos(label_groups)] From 261c38ddf6b6701a47305a2ae61bf3922577997d Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Mon, 3 Aug 2020 10:11:55 -0400 Subject: [PATCH 57/61] Updated some docstrings addressing review comments --- gnomad/utils/vcf.py | 73 +++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 5b17cd31c..3d42d5181 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -48,6 +48,7 @@ "AS_ReadPosRankSum", "AS_SOR", "AS_VarDP", + "InbreedingCoeff", ] """ Allele-specific variant annotations. @@ -56,7 +57,6 @@ SITE_FIELDS = [ "BaseQRankSum", "FS", - "InbreedingCoeff", "MQ", "MQRankSum", "QD", @@ -379,11 +379,13 @@ def generic_field_check( """ Check a generic logical condition involving annotations in a Hail Table and print the results to terminal. - Displays the number of rows in the Table that return False for a condition (`cond_expr`). + Displays the number of rows in the Table that match the `cond_expr` and fail to be the desired condition (`check_description`). .. note:: - This function checks for the number of rows that violate the `cond_expr` and prints the desired condition (`check_description`) to the terminal. - `cond_expr` and `check_description` are opposites and should never be the same. + `cond_expr` and `check_description` are opposites and should never be the same. + E.g., If `cond_expr` filters for instances where the raw AC is less than adj AC, + then it is checking sites that fail to be the desired condition (`check_description`) + of having a raw AC greater than or equal to the adj AC. :param ht: Table containing annotations to be checked. @@ -414,39 +416,41 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression Checks for: - Total number of variants - - Fraction of variants removed by any filter - - Fraction of variants removed because of InbreedingCoefficient filter in combination with any other filter - - Fraction of varinats removed because of AC0 filter in combination with any other filter - - Fraction of variants removed because of random forest filtering in combination with any other filter - - Fraction of variants removed only because of InbreedingCoefficient filter - - Fraction of variants removed only because of AC0 filter - - Fraction of variants removed only because of random forest filtering - + - Fraction of variants removed due to: + - Any filter + - Inbreeding coefficient filter in combination with any other filter + - AC0 filter in combination with any other filter + - Random forest filtering in combination with any other filter + - Only inbreeding coefficient filter + - Only AC0 filter + - Only random forest filtering :param ht: Table containing 'filter' annotation to be examined. - :return: Dictionary containing Hail aggregation expressions to measure filter flags. + :return: Dictionary containing Hail aggregation expressions to examine filter flags. """ - filters_dict = { + return { "n": hl.agg.count(), "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0), - "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("inbreeding_coeff")), + "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("InbreedingCoeff")), "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")), - "frac_rf": hl.agg.fraction(ht.filters.contains("rf")), + "frac_rf": hl.agg.fraction(ht.filters.contains("RF")), "frac_inbreed_coeff_only": hl.agg.fraction( - ht.filters.contains("inbreeding_coeff") & (ht.filters.length() == 1) + ht.filters.contains("InbreedingCoeff") & (ht.filters.length() == 1) ), "frac_ac0_only": hl.agg.fraction( ht.filters.contains("AC0") & (ht.filters.length() == 1) ), "frac_rf_only": hl.agg.fraction( - ht.filters.contains("rf") & (ht.filters.length() == 1) + ht.filters.contains("RF") & (ht.filters.length() == 1) ), } - return filters_dict def make_combo_header_text( - preposition: str, combo_dict: Dict[str, str], prefix: str, + preposition: str, + combo_dict: Dict[str, str], + prefix: str, + pop_names: Dict[str, str] = POP_NAMES, ) -> str: """ Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset. @@ -459,6 +463,7 @@ def make_combo_header_text( Possible grouping types are: "group", "pop", "sex", and "subpop". Example input: {"pop": "afr", "sex": "female"} :param prefix: Prefix string indicating sample subset. + :param pop_names: Dict with global population names (keys) and population descriptions (values). Default is POP_NAMES. :return: String with automatically generated description text for a given set of combo fields. """ header_text = " " + preposition @@ -473,11 +478,11 @@ def make_combo_header_text( header_text = header_text + " samples" if "subpop" in combo_dict: - header_text = header_text + f" of {POP_NAMES[combo_dict['subpop']]} ancestry" + header_text = header_text + f" of {pop_names[combo_dict['subpop']]} ancestry" combo_dict.pop("pop") if "pop" in combo_dict: - header_text = header_text + f" of {POP_NAMES[combo_dict['pop']]} ancestry" + header_text = header_text + f" of {pop_names[combo_dict['pop']]} ancestry" if "gnomad" in prefix: header_text = header_text + " in gnomAD" @@ -588,34 +593,37 @@ def make_info_dict( combo_fields = combo.split("_") group_dict = dict(zip(group_types, combo_fields)) + for_combo = make_combo_header_text("for", group_dict, prefix) + in_combo = make_combo_header_text("in", group_dict, prefix) + if not faf: combo_dict = { f"{prefix}AC_{combo}": { "Number": "A", - "Description": f"Alternate allele count{make_combo_header_text('for', group_dict, prefix)}", + "Description": f"Alternate allele count{for_combo}", }, f"{prefix}AN_{combo}": { "Number": "1", - "Description": f"Total number of alleles{make_combo_header_text('in', group_dict, prefix)}", + "Description": f"Total number of alleles{in_combo}", }, f"{prefix}AF_{combo}": { "Number": "A", - "Description": f"Alternate allele frequency{make_combo_header_text('in', group_dict, prefix)}", + "Description": f"Alternate allele frequency{in_combo}", }, f"{prefix}nhomalt_{combo}": { "Number": "A", - "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_dict, prefix)}", + "Description": f"Count of homozygous individuals{in_combo}", }, } else: combo_dict = { f"{prefix}faf95_{combo}": { "Number": "A", - "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_dict, prefix)}", + "Description": f"Filtering allele frequency (using Poisson 95% CI) {for_combo}", }, f"{prefix}faf99_{combo}": { "Number": "A", - "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_dict, prefix)}", + "Description": f"Filtering allele frequency (using Poisson 99% CI) {for_combo}", }, } info_dict.update(combo_dict) @@ -798,11 +806,10 @@ def sample_sum_check( :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. :return: None """ - combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)] - combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in make_label_combos(label_groups)] - combo_nhomalt = [ - ht.info[f"{prefix}nhomalt_{x}"] for x in make_label_combos(label_groups) - ] + label_combos = make_label_combos(label_groups) + combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in label_combos] + combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in label_combos] + combo_nhomalt = [ht.info[f"{prefix}nhomalt_{x}"] for x in label_combos] group = label_groups.pop("group")[0] alt_groups = "_".join( From d176f68aed2bfb81fcd2a343e55bac2c4f6ddf20 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Mon, 3 Aug 2020 13:24:45 -0400 Subject: [PATCH 58/61] Created assessment folder and moved sample_sum_check, generic_field_check, make_filters_sanity_check_expr to assessment --- gnomad/assessment/__init__.py | 1 + gnomad/assessment/sanity_checks.py | 159 +++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 gnomad/assessment/__init__.py create mode 100644 gnomad/assessment/sanity_checks.py diff --git a/gnomad/assessment/__init__.py b/gnomad/assessment/__init__.py new file mode 100644 index 000000000..2ae7a6039 --- /dev/null +++ b/gnomad/assessment/__init__.py @@ -0,0 +1 @@ +from gnomad.assessment import sanity_checks diff --git a/gnomad/assessment/sanity_checks.py b/gnomad/assessment/sanity_checks.py new file mode 100644 index 000000000..c6eb2d478 --- /dev/null +++ b/gnomad/assessment/sanity_checks.py @@ -0,0 +1,159 @@ +import logging +from typing import Dict, List + +import hail as hl + +from gnomad.utils.vcf import make_label_combos, SORT_ORDER + + +logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def generic_field_check( + ht: hl.Table, + cond_expr: hl.expr.BooleanExpression, + check_description: str, + display_fields: List[str], + verbose: bool, +) -> None: + """ + Check a generic logical condition involving annotations in a Hail Table and print the results to terminal. + + Displays the number of rows in the Table that match the `cond_expr` and fail to be the desired condition (`check_description`). + + .. note:: + `cond_expr` and `check_description` are opposites and should never be the same. + E.g., If `cond_expr` filters for instances where the raw AC is less than adj AC, + then it is checking sites that fail to be the desired condition (`check_description`) + of having a raw AC greater than or equal to the adj AC. + + + :param ht: Table containing annotations to be checked. + :param cond_expr: Logical expression referring to annotations in ht to be checked. + :param check_description: String describing the condition being checked; is displayed in terminal summary message. + :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); + these fields are also displayed if verbose is True. + :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, + show only top values of annotations that fail checks. + """ + ht_orig = ht + ht = ht.filter(cond_expr) + n_fail = ht.count() + if n_fail > 0: + logger.info(f"Found {n_fail} sites that fail {check_description} check:") + ht = ht.flatten() + ht.select("locus", "alleles", *display_fields).show() + else: + logger.info(f"PASSED {check_description} check") + if verbose: + ht_orig = ht_orig.flatten() + ht_orig.select(*display_fields).show() + + +def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: + """ + Make Hail expressions to measure % variants filtered under varying conditions of interest. + + Checks for: + - Total number of variants + - Fraction of variants removed due to: + - Any filter + - Inbreeding coefficient filter in combination with any other filter + - AC0 filter in combination with any other filter + - Random forest filtering in combination with any other filter + - Only inbreeding coefficient filter + - Only AC0 filter + - Only random forest filtering + + :param ht: Table containing 'filter' annotation to be examined. + :return: Dictionary containing Hail aggregation expressions to examine filter flags. + """ + return { + "n": hl.agg.count(), + "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0), + "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("InbreedingCoeff")), + "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")), + "frac_rf": hl.agg.fraction(ht.filters.contains("RF")), + "frac_inbreed_coeff_only": hl.agg.fraction( + ht.filters.contains("InbreedingCoeff") & (ht.filters.length() == 1) + ), + "frac_ac0_only": hl.agg.fraction( + ht.filters.contains("AC0") & (ht.filters.length() == 1) + ), + "frac_rf_only": hl.agg.fraction( + ht.filters.contains("RF") & (ht.filters.length() == 1) + ), + } + + +def sample_sum_check( + ht: hl.Table, + prefix: str, + label_groups: Dict[str, List[str]], + verbose: bool, + subpop: bool = None, + sort_order: List[str] = SORT_ORDER, +) -> None: + """ + Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version; + display results from checking the sum of the specified annotations in the terminal. + + :param ht: Table containing annotations to be summed. + :param prefix: String indicating sample subset. + :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, + e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). + :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, + show only top values of annotations that fail checks. + :param subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked. + :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. + :return: None + """ + label_combos = make_label_combos(label_groups) + combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in label_combos] + combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in label_combos] + combo_nhomalt = [ht.info[f"{prefix}nhomalt_{x}"] for x in label_combos] + + group = label_groups.pop("group")[0] + alt_groups = "_".join( + sorted(label_groups.keys(), key=lambda x: sort_order.index(x)) + ) + + annot_dict = { + f"sum_AC_{group}_{alt_groups}": hl.sum(combo_AC), + f"sum_AN_{group}_{alt_groups}": hl.sum(combo_AN), + f"sum_nhomalt_{group}_{alt_groups}": hl.sum(combo_nhomalt), + } + + ht = ht.annotate(**annot_dict) + + for subfield in ["AC", "AN", "nhomalt"]: + if not subpop: + generic_field_check( + ht, + ( + ht.info[f"{prefix}{subfield}_{group}"] + != ht[f"sum_{subfield}_{group}_{alt_groups}"] + ), + f"{prefix}{subfield}_{group} = sum({subfield}_{group}_{alt_groups})", + [ + f"info.{prefix}{subfield}_{group}", + f"sum_{subfield}_{group}_{alt_groups}", + ], + verbose, + ) + else: + generic_field_check( + ht, + ( + ht.info[f"{prefix}{subfield}_{group}_{subpop}"] + != ht[f"sum_{subfield}_{group}_{alt_groups}"] + ), + f"{prefix}{subfield}_{group}_{subpop} = sum({subfield}_{group}_{alt_groups})", + [ + f"info.{prefix}{subfield}_{group}_{subpop}", + f"sum_{subfield}_{group}_{alt_groups}", + ], + verbose, + ) From 1640b3f16ceeccadf72eb375ddc64ef0fb1fd174 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Mon, 3 Aug 2020 13:26:16 -0400 Subject: [PATCH 59/61] Forgot to commit changes in vcf.py when moving to assessment --- gnomad/utils/vcf.py | 148 -------------------------------------------- 1 file changed, 148 deletions(-) diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py index 3d42d5181..1147813eb 100644 --- a/gnomad/utils/vcf.py +++ b/gnomad/utils/vcf.py @@ -369,83 +369,6 @@ def make_label_combos( return combos -def generic_field_check( - ht: hl.Table, - cond_expr: hl.expr.BooleanExpression, - check_description: str, - display_fields: List[str], - verbose: bool, -) -> None: - """ - Check a generic logical condition involving annotations in a Hail Table and print the results to terminal. - - Displays the number of rows in the Table that match the `cond_expr` and fail to be the desired condition (`check_description`). - - .. note:: - `cond_expr` and `check_description` are opposites and should never be the same. - E.g., If `cond_expr` filters for instances where the raw AC is less than adj AC, - then it is checking sites that fail to be the desired condition (`check_description`) - of having a raw AC greater than or equal to the adj AC. - - - :param ht: Table containing annotations to be checked. - :param cond_expr: Logical expression referring to annotations in ht to be checked. - :param check_description: String describing the condition being checked; is displayed in terminal summary message. - :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes); - these fields are also displayed if verbose is True. - :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, - show only top values of annotations that fail checks. - """ - ht_orig = ht - ht = ht.filter(cond_expr) - n_fail = ht.count() - if n_fail > 0: - logger.info(f"Found {n_fail} sites that fail {check_description} check:") - ht = ht.flatten() - ht.select("locus", "alleles", *display_fields).show() - else: - logger.info(f"PASSED {check_description} check") - if verbose: - ht_orig = ht_orig.flatten() - ht_orig.select(*display_fields).show() - - -def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: - """ - Make Hail expressions to measure % variants filtered under varying conditions of interest. - - Checks for: - - Total number of variants - - Fraction of variants removed due to: - - Any filter - - Inbreeding coefficient filter in combination with any other filter - - AC0 filter in combination with any other filter - - Random forest filtering in combination with any other filter - - Only inbreeding coefficient filter - - Only AC0 filter - - Only random forest filtering - - :param ht: Table containing 'filter' annotation to be examined. - :return: Dictionary containing Hail aggregation expressions to examine filter flags. - """ - return { - "n": hl.agg.count(), - "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0), - "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("InbreedingCoeff")), - "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")), - "frac_rf": hl.agg.fraction(ht.filters.contains("RF")), - "frac_inbreed_coeff_only": hl.agg.fraction( - ht.filters.contains("InbreedingCoeff") & (ht.filters.length() == 1) - ), - "frac_ac0_only": hl.agg.fraction( - ht.filters.contains("AC0") & (ht.filters.length() == 1) - ), - "frac_rf_only": hl.agg.fraction( - ht.filters.contains("RF") & (ht.filters.length() == 1) - ), - } - - def make_combo_header_text( preposition: str, combo_dict: Dict[str, str], @@ -784,77 +707,6 @@ def make_hist_dict(bin_edges: Dict[str, Dict[str, str]], adj: bool) -> Dict[str, return header_hist_dict -def sample_sum_check( - ht: hl.Table, - prefix: str, - label_groups: Dict[str, List[str]], - verbose: bool, - subpop: bool = None, - sort_order: List[str] = SORT_ORDER, -) -> None: - """ - Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version; - display results from checking the sum of the specified annotations in the terminal. - - :param ht: Table containing annotations to be summed. - :param prefix: String indicating sample subset. - :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping, - e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]). - :param verbose: If True, show top values of annotations being checked, including checks that pass; if False, - show only top values of annotations that fail checks. - :param subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked. - :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER. - :return: None - """ - label_combos = make_label_combos(label_groups) - combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in label_combos] - combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in label_combos] - combo_nhomalt = [ht.info[f"{prefix}nhomalt_{x}"] for x in label_combos] - - group = label_groups.pop("group")[0] - alt_groups = "_".join( - sorted(label_groups.keys(), key=lambda x: sort_order.index(x)) - ) - - annot_dict = { - f"sum_AC_{group}_{alt_groups}": hl.sum(combo_AC), - f"sum_AN_{group}_{alt_groups}": hl.sum(combo_AN), - f"sum_nhomalt_{group}_{alt_groups}": hl.sum(combo_nhomalt), - } - - ht = ht.annotate(**annot_dict) - - for subfield in ["AC", "AN", "nhomalt"]: - if not subpop: - generic_field_check( - ht, - ( - ht.info[f"{prefix}{subfield}_{group}"] - != ht[f"sum_{subfield}_{group}_{alt_groups}"] - ), - f"{prefix}{subfield}_{group} = sum({subfield}_{group}_{alt_groups})", - [ - f"info.{prefix}{subfield}_{group}", - f"sum_{subfield}_{group}_{alt_groups}", - ], - verbose, - ) - else: - generic_field_check( - ht, - ( - ht.info[f"{prefix}{subfield}_{group}_{subpop}"] - != ht[f"sum_{subfield}_{group}_{alt_groups}"] - ), - f"{prefix}{subfield}_{group}_{subpop} = sum({subfield}_{group}_{alt_groups})", - [ - f"info.{prefix}{subfield}_{group}_{subpop}", - f"sum_{subfield}_{group}_{alt_groups}", - ], - verbose, - ) - - def set_female_y_metrics_to_na( t: Union[hl.Table, hl.MatrixTable], ) -> Dict[str, hl.expr.Int32Expression]: From 6e67800cfb00c9f8856dd224d3bb73a1b67ae508 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Mon, 3 Aug 2020 13:52:49 -0400 Subject: [PATCH 60/61] Updated generic field check docstring --- gnomad/assessment/sanity_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/assessment/sanity_checks.py b/gnomad/assessment/sanity_checks.py index c6eb2d478..6ffdbe7ab 100644 --- a/gnomad/assessment/sanity_checks.py +++ b/gnomad/assessment/sanity_checks.py @@ -22,6 +22,7 @@ def generic_field_check( Check a generic logical condition involving annotations in a Hail Table and print the results to terminal. Displays the number of rows in the Table that match the `cond_expr` and fail to be the desired condition (`check_description`). + If the number of rows that match the `cond_expr` is 0, then the Table passes that check; otherwise, it fails. .. note:: `cond_expr` and `check_description` are opposites and should never be the same. @@ -29,7 +30,6 @@ def generic_field_check( then it is checking sites that fail to be the desired condition (`check_description`) of having a raw AC greater than or equal to the adj AC. - :param ht: Table containing annotations to be checked. :param cond_expr: Logical expression referring to annotations in ht to be checked. :param check_description: String describing the condition being checked; is displayed in terminal summary message. From f021255ccf801406044374ddd978ab4ee6915690 Mon Sep 17 00:00:00 2001 From: Katherine Chao Date: Tue, 4 Aug 2020 09:40:11 -0400 Subject: [PATCH 61/61] Added option to check for additional filters to make_filters_sanity_check_expr --- gnomad/assessment/sanity_checks.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/gnomad/assessment/sanity_checks.py b/gnomad/assessment/sanity_checks.py index 6ffdbe7ab..492ac3a14 100644 --- a/gnomad/assessment/sanity_checks.py +++ b/gnomad/assessment/sanity_checks.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, List +from typing import Dict, List, Optional import hail as hl @@ -52,7 +52,9 @@ def generic_field_check( ht_orig.select(*display_fields).show() -def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]: +def make_filters_sanity_check_expr( + ht: hl.Table, extra_filter_checks: Optional[Dict[str, hl.expr.Expression]] = None +) -> Dict[str, hl.expr.Expression]: """ Make Hail expressions to measure % variants filtered under varying conditions of interest. @@ -68,9 +70,10 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression - Only random forest filtering :param ht: Table containing 'filter' annotation to be examined. + :param extra_filter_checks: Optional dictionary containing filter condition name (key) extra filter expressions (value) to be examined. :return: Dictionary containing Hail aggregation expressions to examine filter flags. """ - return { + filters_dict = { "n": hl.agg.count(), "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0), "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("InbreedingCoeff")), @@ -86,6 +89,10 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression ht.filters.contains("RF") & (ht.filters.length() == 1) ), } + if extra_filter_checks: + filters_dict.update(extra_filter_checks) + + return filters_dict def sample_sum_check(