From 60b86406fb4828a42bb50bcf944f516552fca13d Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 21 Jul 2020 11:56:25 -0400
Subject: [PATCH 01/61] Added constants, make_label_combos,
 generic_field_check, and make_filters_sanity_check_expr to vcf.py from
 gnomad_qc. Also added INFO_DICT, FORMAT_DICT, and make_vcf_filter_dict from
 ukb repo

---
 gnomad/utils/vcf.py | 286 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 286 insertions(+)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 43c4b6200..8138e0a07 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -6,6 +6,43 @@
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
+
+GROUPS = ["adj", "raw"]
+"""
+Group names used to generate labels for high quality genotypes and all raw genotypes. Used in VCF export.
+"""
+
+HISTS = ["gq_hist_alt", "gq_hist_all", "dp_hist_alt", "dp_hist_all", "ab_hist_alt"]
+"""
+Quality histograms used in VCF export.
+"""
+
+POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"]
+"""
+Continental population names used in VCF export.
+"""
+
+NFE_SUBPOPS = ["onf", "bgr", "swe", "nwe", "seu", "est"]
+"""
+gnomAD subpopulations for NFE population. Used in VCF export.
+"""
+
+EAS_SUBPOPS = ["kor", "oea", "jpn"]
+"""
+gnomAD subpopulations for EAS population. Used in VCF export.
+"""
+
+FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"]
+"""
+Populations that are included in filtering allele frequency calculations. Used in VCF export.
+"""
+
+SEXES = ["male", "female"]
+"""
+Sample sexes used in VCF export.
+"""
+
+
 INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [
     "AS_QUALapprox",
     "AS_VarDP",
@@ -14,6 +51,148 @@
     "AS_SB_TABLE",
 ]
 
+INFO_DICT = {
+    "FS": {
+        "Description": "Phred-scaled p-value of Fisher's exact test for strand bias"
+    },
+    "InbreedingCoeff": {
+        "Description": "Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes)/(the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium)"
+    },
+    "MQ": {
+        "Description": "Root mean square of the mapping quality of reads across all samples"
+    },
+    "MQRankSum": {
+        "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities"
+    },
+    "QD": {
+        "Description": "Variant call confidence normalized by depth of sample reads supporting a variant"
+    },
+    "ReadPosRankSum": {
+        "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference read position bias"
+    },
+    "SOR": {"Description": "Strand bias estimated by the symmetric odds ratio test"},
+    "VQSR_POSITIVE_TRAIN_SITE": {
+        "Description": "Variant was used to build the positive training set of high-quality variants for VQSR"
+    },
+    "VQSR_NEGATIVE_TRAIN_SITE": {
+        "Description": "Variant was used to build the negative training set of low-quality variants for VQSR"
+    },
+    "AS_BaseQRankSum": {
+        "Number": "A",
+        "Description": "Allele-specific Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities",
+    },
+    "ClippingRankSum": {
+        "Number": "A",
+        "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference number of hard clipped bases",
+    },
+    "VarDP": {
+        "Description": "Depth over variant genotypes (does not include depth of reference samples)"
+    },
+    "AS_VarDP": {
+        "Number": "A",
+        "Description": "Allele-specific depth over variant genotypes (does not include depth of reference samples)",
+    },
+    "AS_VQSLOD": {
+        "Number": "A",
+        "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model",
+    },
+    "AS_VQSR_culprit": {
+        "Number": "A",
+        "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model",
+    },
+    "lcr": {"Description": "Variant falls within a low complexity region"},
+    "fail_interval_qc": {
+        "Description": f"Variant falls within a region where less than {INTERVAL_QC_PARAMETERS[0]}% of samples had a mean coverage of {INTERVAL_QC_PARAMETERS[1]}X"
+    },
+    "nonpar": {
+        "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region"
+    },
+    "rf_positive_label": {
+        "Description": "Variant was labelled as a positive example for training of random forest model"
+    },
+    "rf_negative_label": {
+        "Description": "Variant was labelled as a negative example for training of random forest model"
+    },
+    "rf_label": {"Description": "Random forest training label"},
+    "rf_train": {"Description": "Variant was used in training random forest model"},
+    "rf_tp_probability": {
+        "Description": "Probability of a called variant being a true variant as determined by random forest model"
+    },
+    "transmitted_singleton": {
+        "Description": "Variant was a callset-wide doubleton that was transmitted within a family from a parent to a child (i.e., a singleton amongst unrelated samples in cohort)"
+    },
+    "original_alleles": {"Description": "Alleles before splitting multiallelics"},
+    "variant_type": {
+        "Description": "Variant type (snv, indel, multi-snv, multi-indel, or mixed)"
+    },
+    "allele_type": {
+        "Number": "A",
+        "Description": "Allele type (snv, insertion, deletion, or mixed)",
+    },
+    "n_alt_alleles": {
+        "Number": "A",
+        "Description": "Total number of alternate alleles observed at variant locus",
+    },
+    "was_mixed": {"Description": "Variant type was mixed"},
+    "has_star": {
+        "Description": "Variant locus coincides with a spanning deletion (represented by a star) observed elsewhere in the callset"
+    },
+    "pab_max": {
+        "Number": "A",
+        "Description": "Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5",
+    },
+}
+"""
+Dictionary used during VCF export to export row (variant) annotations.
+"""
+
+FORMAT_DICT = {
+    "GT": {"Description": "Genotype", "Number": "1", "Type": "String"},
+    "AD": {
+        "Description": "Allelic depths for the ref and alt alleles in the order listed",
+        "Number": "R",
+        "Type": "Integer",
+    },
+    "DP": {
+        "Description": "Approximate read depth (reads with MQ=255 or with bad mates are filtered)",
+        "Number": "1",
+        "Type": "Integer",
+    },
+    "GQ": {
+        "Description": "Phred-scaled confidence that the genotype assignment is correct. Value is the difference between the second lowest PL and the lowest PL (always normalized to 0).",
+        "Number": "1",
+        "Type": "Integer",
+    },
+    "MIN_DP": {
+        "Description": "Minimum DP observed within the GVCF block",
+        "Number": "1",
+        "Type": "Integer",
+    },
+    "PGT": {
+        "Description": "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another",
+        "Number": "1",
+        "Type": "String",
+    },
+    "PID": {
+        "Description": "Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group",
+        "Number": "1",
+        "Type": "String",
+    },
+    "PL": {
+        "Description": "Normalized, phred-scaled likelihoods for genotypes as defined in the VCF specification",
+        "Number": "G",
+        "Type": "Integer",
+    },
+    "SB": {
+        "Description": "Per-sample component statistics which comprise the Fisher's exact test to detect strand bias. Values are: depth of reference allele on forward strand, depth of reference allele on reverse strand, depth of alternate allele on forward strand, depth of alternate allele on reverse strand.",
+        "Number": "4",
+        "Type": "Integer",
+    },
+}
+"""
+Dictionary used during VCF export to export MatrixTable entries.
+"""
+
 
 def ht_to_vcf_mt(
     info_ht: hl.Table,
@@ -92,3 +271,110 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi
     # Create an MT with no cols so that we acn export to VCF
     info_mt = info_ht.to_matrix_table_row_major(columns=["s"], entry_field_name="s")
     return info_mt.filter_cols(False)
+
+
+def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]:
+    """
+    Make combinations of all possible labels for a supplied dictionary of label groups
+    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"])
+    :return: list of all possible combinations of values for the supplied label groupings
+    :rtype: list[str]
+    """
+    copy_label_groups = copy.deepcopy(label_groups)
+    if len(copy_label_groups) == 1:
+        return [item for sublist in copy_label_groups.values() for item in sublist]
+    anchor_group = sorted(copy_label_groups.keys(), key=lambda x: SORT_ORDER.index(x))[
+        0
+    ]
+    anchor_val = copy_label_groups.pop(anchor_group)
+    combos = []
+    for x, y in itertools.product(anchor_val, make_label_combos(copy_label_groups)):
+        combos.append("{0}_{1}".format(x, y))
+    return combos
+
+
+def generic_field_check(
+    ht: hl.Table, cond_expr, check_description, display_fields, verbose
+):
+    """
+    Check a generic logical condition involving annotations in a Hail Table and print the results to terminal
+    :param Table ht: Table containing annotations to be checked
+    :param Expression cond_expr: logical expression referring to annotations in ht to be checked
+    :param str check_description: String describing the condition being checked; is displayed in terminal summary message
+    :param list of str display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
+        these fields are also displayed if verbose is True
+    :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
+        show only top values of annotations that fail checks
+    :rtype: None
+    """
+    ht_orig = ht
+    ht = ht.filter(cond_expr)
+    n_fail = ht.count()
+    if n_fail > 0:
+        logger.info(f"Found {n_fail} sites that fail {check_description} check:")
+        ht = ht.flatten()
+        ht.select("locus", "alleles", *display_fields).show()
+    else:
+        logger.info(f"PASSED {check_description} check")
+        if verbose:
+            ht_orig = ht_orig.flatten()
+            ht_orig.select(*display_fields).show()
+
+
+def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
+    """
+    Make Hail Expressions to measure % variants filtered under varying conditions of interest
+    :param Table ht: Hail Table containing 'filter' annotation to be examined
+    :return: Dictionary containing Hail aggregation expressions to measure filter flags
+    :rtype: Dict of str: Expression
+    """
+    filters_dict = {
+        "n": hl.agg.count(),
+        "frac_any_filter": hl.agg.count_where(ht.is_filtered) / hl.agg.count(),
+        "frac_inbreed_coeff": hl.agg.count_where(
+            ht.filters.contains("inbreeding_coeff")
+        )
+        / hl.agg.count(),
+        "frac_ac0": hl.agg.count_where(ht.filters.contains("AC0")) / hl.agg.count(),
+        "frac_rf": hl.agg.count_where(ht.filters.contains("rf")) / hl.agg.count(),
+        "frac_inbreed_coeff_only": hl.agg.count_where(
+            ht.filters.contains("inbreeding_coeff") & (ht.filters.length() == 1)
+        )
+        / hl.agg.count(),
+        "frac_ac0_only": hl.agg.count_where(
+            ht.filters.contains("AC0") & (ht.filters.length() == 1)
+        )
+        / hl.agg.count(),
+        "frac_rf_only": hl.agg.count_where(
+            ht.filters.contains("rf") & (ht.filters.length() == 1)
+        )
+        / hl.agg.count(),
+    }
+    return filters_dict
+
+
+def make_vcf_filter_dict(
+    ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float
+) -> Dict[str, str]:
+    """
+    Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations.
+
+    :param Table ht: Table containing global annotations of the Random Forests SNP and indel cutoffs.
+    :param float snp_cutoff: Minimum SNP cutoff score from random forest model.
+    :param float indel_cutoff: Minimum indel cutoff score from random forest model.
+    :param float inbreeding_cutoff: Inbreeding coefficient hard cutoff.
+    :return: Dictionary keyed by VCF FILTER annotations, where values are Dictionaries of Number and Description attributes.
+    :rtype: Dict[str, str]
+    """
+    filter_dict = {
+        "AC0": {
+            "Description": "Allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB < 0.2 for het calls)"
+        },
+        "InbreedingCoeff": {"Description": f"InbreedingCoeff < {inbreeding_cutoff}"},
+        "RF": {
+            "Description": f"Failed random forest filtering thresholds of {snp_cutoff}, {indel_cutoff} (probabilities of being a true positive variant) for SNPs, indels"
+        },
+        "PASS": {"Description": "Passed all variant filters"},
+    }
+    return filter_dict

From 3decb5f119ee18124aab2d4954298b68b2be6203 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 21 Jul 2020 12:42:48 -0400
Subject: [PATCH 02/61] Added AS_FIELDS, SITE_FIELDS constants and added
 function add_as_info_dict to update INFO_DICT with AS fields and their values

---
 gnomad/utils/vcf.py | 71 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 14 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 8138e0a07..a19aa215a 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -42,6 +42,36 @@
 Sample sexes used in VCF export.
 """
 
+AS_FIELDS = [
+    "AS_BaseQRankSum",
+    "AS_FS",
+    "AS_MQ",
+    "AS_MQRankSum",
+    "AS_pab_max",
+    "AS_QD",
+    "AS_ReadPosRankSum",
+    "AS_SOR",
+    "AS_VarDP",
+]
+"""
+Allele-specific variant annotations.
+"""
+
+SITE_FIELDS = [
+    "FS",
+    "InbreedingCoeff",
+    "MQ",
+    "MQRankSum",
+    "QD",
+    "ReadPosRankSum",
+    "sibling_singleton",
+    "SOR",
+    "transmitted_singleton",
+    "VarDP",
+]
+"""
+Site level variant annotations.
+"""
 
 INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [
     "AS_QUALapprox",
@@ -77,21 +107,12 @@
     "VQSR_NEGATIVE_TRAIN_SITE": {
         "Description": "Variant was used to build the negative training set of low-quality variants for VQSR"
     },
-    "AS_BaseQRankSum": {
-        "Number": "A",
-        "Description": "Allele-specific Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities",
-    },
-    "ClippingRankSum": {
-        "Number": "A",
-        "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference number of hard clipped bases",
+    "BaseQRankSum": {
+        "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities",
     },
     "VarDP": {
         "Description": "Depth over variant genotypes (does not include depth of reference samples)"
     },
-    "AS_VarDP": {
-        "Number": "A",
-        "Description": "Allele-specific depth over variant genotypes (does not include depth of reference samples)",
-    },
     "AS_VQSLOD": {
         "Number": "A",
         "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model",
@@ -101,9 +122,6 @@
         "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model",
     },
     "lcr": {"Description": "Variant falls within a low complexity region"},
-    "fail_interval_qc": {
-        "Description": f"Variant falls within a region where less than {INTERVAL_QC_PARAMETERS[0]}% of samples had a mean coverage of {INTERVAL_QC_PARAMETERS[1]}X"
-    },
     "nonpar": {
         "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region"
     },
@@ -354,6 +372,31 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
     return filters_dict
 
 
+def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None:
+    """
+    Updates info dictionary with allele-specific terms and their descriptions.
+
+    Used in VCF export.
+
+    :param INFO_DICT: Dictionary containing site-level annotations and their descriptions.
+    :return: None
+    """
+    prefix_text = "Allele-specific"
+    AS_DICT = {}
+
+    for field in AS_FIELDS:
+        # Strip AS_ from field name
+        site_field = field[3:]
+
+        AS_DICT[field] = {}
+        AS_DICT[field]["Number"] = "A"
+        AS_DICT[field][
+            "Description"
+        ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}"
+
+    INFO_DICT.update(AS_DICT)
+
+
 def make_vcf_filter_dict(
     ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float
 ) -> Dict[str, str]:

From c4ad6b254d1a51ff475b55aabe94eafd354c3fd0 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 21 Jul 2020 13:06:25 -0400
Subject: [PATCH 03/61] Updated make_as_info_dict and added more constants
 (RF_FIELDS, VQSR_FIELDS, REGION_TYPE_FIELDS, ALLELE_TYPE_FIELDS

---
 gnomad/utils/vcf.py | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index a19aa215a..d27e8ea00 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -73,6 +73,33 @@
 Site level variant annotations.
 """
 
+ALLELE_TYPE_FIELDS = ["allele_type", "has_star", "n_alt_alleles", "original_alleles", "variant_type", "was_mixed"]
+"""
+Allele-type annotations.
+"""
+
+REGION_TYPE_FIELDS = ["lcr", "nonpar"]
+"""
+Annotations about variant region type.
+"""
+
+RF_FIELDS = [
+    "fail_hard_filters",
+    "filters",
+    "rf_label",
+    "rf_train",
+    "rf_probability",
+    "tp",
+]
+"""
+Annotations specific to the random forest model.
+"""
+
+VQSR_FIELDS = ["AS_VQSLOD", "AS_culprit", "NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE"]
+"""
+Annotations specific to VQSR.
+"""
+
 INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [
     "AS_QUALapprox",
     "AS_VarDP",
@@ -372,14 +399,14 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
     return filters_dict
 
 
-def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None:
+def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
     """
     Updates info dictionary with allele-specific terms and their descriptions.
 
     Used in VCF export.
 
     :param INFO_DICT: Dictionary containing site-level annotations and their descriptions.
-    :return: None
+    :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field.
     """
     prefix_text = "Allele-specific"
     AS_DICT = {}
@@ -394,7 +421,7 @@ def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None:
             "Description"
         ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}"
 
-    INFO_DICT.update(AS_DICT)
+    return AS_DICT
 
 
 def make_vcf_filter_dict(

From 6c9b95fbddfc98966e0720f41ae630eb941973e1 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 21 Jul 2020 13:27:26 -0400
Subject: [PATCH 04/61] Added constants for entries to select during export

---
 gnomad/utils/vcf.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index d27e8ea00..cbcbe8b28 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -73,7 +73,14 @@
 Site level variant annotations.
 """
 
-ALLELE_TYPE_FIELDS = ["allele_type", "has_star", "n_alt_alleles", "original_alleles", "variant_type", "was_mixed"]
+ALLELE_TYPE_FIELDS = [
+    "allele_type",
+    "has_star",
+    "n_alt_alleles",
+    "original_alleles",
+    "variant_type",
+    "was_mixed",
+]
 """
 Allele-type annotations.
 """
@@ -191,6 +198,28 @@
 Dictionary used during VCF export to export row (variant) annotations.
 """
 
+ENTRIES = ["GT", "GQ", "DP", "AD", "MIN_DP", "PGT", "PID", "PL", "SB"]
+"""
+Densified entries to be selected during VCF export.
+"""
+
+SPARSE_ENTRIES = [
+    "DP",
+    "GQ",
+    "LA",
+    "LAD",
+    "LGT",
+    "LPGT",
+    "LPL",
+    "MIN_DP",
+    "PID",
+    "RGQ",
+    "SB",
+]
+"""
+Sparse entries to be selected and densified during VCF export.
+"""
+
 FORMAT_DICT = {
     "GT": {"Description": "Genotype", "Number": "1", "Type": "String"},
     "AD": {

From a9c0354b648fb5a9e37440bd5c7b1deed578d891 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 21 Jul 2020 17:00:18 -0400
Subject: [PATCH 05/61] Added make_info_dict to vcf.py

---
 gnomad/utils/vcf.py | 125 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index cbcbe8b28..afeff1b27 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -428,6 +428,131 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
     return filters_dict
 
 
+def make_info_dict(
+    prefix: str,
+    label_groups: Dict[str, str] = None,
+    bin_edges: Dict[str, str] = None,
+    faf: bool = False,
+    popmax: bool = False,
+    description_text: str = "",
+    age_hist_data: str = None,
+) -> Dict[str, Dict[str, str]]:
+    """
+    Generate dictionary of Number and Description attributes of VCF INFO fields.
+
+    Used to populate the VCF header during export.
+    
+    :param str prefix: gnomAD or empty string.
+    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
+    :param dict bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
+    :param bool faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.
+    :param bool popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.
+    :param bool description_text: Optinal text to append to the end of age and popmax descriptions. 
+    :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`.
+    :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes.
+    :rtype: Dict[str, Dict[str, str]]
+    """
+    if prefix != "":
+        prefix = f"{prefix}_"
+
+    info_dict = dict()
+
+    if age_hist_data:
+        age_hist_dict = {
+            f"{prefix}age_hist_het_bin_freq": {
+                "Number": "A",
+                "Description": f"Histogram of ages of heterozygous individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
+            },
+            f"{prefix}age_hist_het_n_smaller": {
+                "Number": "A",
+                "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_test}",
+            },
+            f"{prefix}age_hist_het_n_larger": {
+                "Number": "A",
+                "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_test}",
+            },
+            f"{prefix}age_hist_hom_bin_freq": {
+                "Number": "A",
+                "Description": f"Histogram of ages of homozygous alternate individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
+            },
+            f"{prefix}age_hist_hom_n_smaller": {
+                "Number": "A",
+                "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_test}",
+            },
+            f"{prefix}age_hist_hom_n_larger": {
+                "Number": "A",
+                "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_test}",
+            },
+        }
+        info_dict.update(age_hist_dict)
+
+    if popmax:
+        popmax_dict = {
+            f"{prefix}popmax": {
+                "Number": "A",
+                "Description": f"Population with maximum AF{description_test}",
+            },
+            f"{prefix}AC_popmax": {
+                "Number": "A",
+                "Description": f"Allele count in the population with the maximum AF{description_test}",
+            },
+            f"{prefix}AN_popmax": {
+                "Number": "A",
+                "Description": f"Total number of alleles in the population with the maximum AF{description_test}",
+            },
+            f"{prefix}AF_popmax": {
+                "Number": "A",
+                "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_test}",
+            },
+            f"{prefix}nhomalt_popmax": {
+                "Number": "A",
+                "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_test}",
+            },
+        }
+        info_dict.update(popmax_dict)
+
+    else:
+        group_types = sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x))
+        combos = make_label_combos(label_groups)
+
+        for combo in combos:
+            combo_fields = combo.split("_")
+
+            if not faf:
+                combo_dict = {
+                    f"{prefix}AC_{combo}": {
+                        "Number": "A",
+                        "Description": f"Alternate allele count{make_combo_header_text('for', group_types, combo_fields, prefix)}",
+                    },
+                    f"{prefix}AN_{combo}": {
+                        "Number": "1",
+                        "Description": f"Total number of alleles{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                    },
+                    f"{prefix}AF_{combo}": {
+                        "Number": "A",
+                        "Description": f"Alternate allele frequency{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                    },
+                    f"{prefix}nhomalt_{combo}": {
+                        "Number": "A",
+                        "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                    },
+                }
+            else:
+                combo_dict = {
+                    f"{prefix}faf95_{combo}": {
+                        "Number": "A",
+                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}",
+                    },
+                    f"{prefix}faf99_{combo}": {
+                        "Number": "A",
+                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}",
+                    },
+                }
+            info_dict.update(combo_dict)
+    return info_dict
+
+
 def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
     """
     Updates info dictionary with allele-specific terms and their descriptions.

From 20f5d52e30fad160d8cab322ea6c51422edbd375 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 09:48:18 -0400
Subject: [PATCH 06/61] Removed unnecessary pop constants (can be imported from
 ancestry.py)

---
 gnomad/utils/vcf.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index afeff1b27..3022a63e5 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -17,21 +17,6 @@
 Quality histograms used in VCF export.
 """
 
-POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"]
-"""
-Continental population names used in VCF export.
-"""
-
-NFE_SUBPOPS = ["onf", "bgr", "swe", "nwe", "seu", "est"]
-"""
-gnomAD subpopulations for NFE population. Used in VCF export.
-"""
-
-EAS_SUBPOPS = ["kor", "oea", "jpn"]
-"""
-gnomAD subpopulations for EAS population. Used in VCF export.
-"""
-
 FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"]
 """
 Populations that are included in filtering allele frequency calculations. Used in VCF export.

From 2117d82c7ca015c539d28bbbfaa5f1f5bf00facf Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 10:20:33 -0400
Subject: [PATCH 07/61] Added make hist bin edges expr function to vcf.py, also
 removed types from docstrings

---
 gnomad/utils/vcf.py | 78 +++++++++++++++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 20 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 3022a63e5..081996506 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -355,17 +355,16 @@ def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]:
 
 def generic_field_check(
     ht: hl.Table, cond_expr, check_description, display_fields, verbose
-):
+) -> None:
     """
     Check a generic logical condition involving annotations in a Hail Table and print the results to terminal
-    :param Table ht: Table containing annotations to be checked
-    :param Expression cond_expr: logical expression referring to annotations in ht to be checked
-    :param str check_description: String describing the condition being checked; is displayed in terminal summary message
-    :param list of str display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
+    :param ht: Table containing annotations to be checked
+    :param cond_expr: logical expression referring to annotations in ht to be checked
+    :param check_description: String describing the condition being checked; is displayed in terminal summary message
+    :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
         these fields are also displayed if verbose is True
     :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
         show only top values of annotations that fail checks
-    :rtype: None
     """
     ht_orig = ht
     ht = ht.filter(cond_expr)
@@ -384,9 +383,8 @@ def generic_field_check(
 def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
     """
     Make Hail Expressions to measure % variants filtered under varying conditions of interest
-    :param Table ht: Hail Table containing 'filter' annotation to be examined
+    :param ht: Hail Table containing 'filter' annotation to be examined
     :return: Dictionary containing Hail aggregation expressions to measure filter flags
-    :rtype: Dict of str: Expression
     """
     filters_dict = {
         "n": hl.agg.count(),
@@ -427,16 +425,15 @@ def make_info_dict(
 
     Used to populate the VCF header during export.
     
-    :param str prefix: gnomAD or empty string.
-    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+    :param prefix: gnomAD or empty string.
+    :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
         e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
-    :param dict bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
-    :param bool faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.
-    :param bool popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.
-    :param bool description_text: Optinal text to append to the end of age and popmax descriptions. 
+    :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
+    :param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.
+    :param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.
+    :param description_text: Optinal text to append to the end of age and popmax descriptions. 
     :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`.
     :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes.
-    :rtype: Dict[str, Dict[str, str]]
     """
     if prefix != "":
         prefix = f"{prefix}_"
@@ -569,12 +566,11 @@ def make_vcf_filter_dict(
     """
     Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations.
 
-    :param Table ht: Table containing global annotations of the Random Forests SNP and indel cutoffs.
-    :param float snp_cutoff: Minimum SNP cutoff score from random forest model.
-    :param float indel_cutoff: Minimum indel cutoff score from random forest model.
-    :param float inbreeding_cutoff: Inbreeding coefficient hard cutoff.
+    :param ht: Table containing global annotations of the Random Forests SNP and indel cutoffs.
+    :param snp_cutoff: Minimum SNP cutoff score from random forest model.
+    :param indel_cutoff: Minimum indel cutoff score from random forest model.
+    :param inbreeding_cutoff: Inbreeding coefficient hard cutoff.
     :return: Dictionary keyed by VCF FILTER annotations, where values are Dictionaries of Number and Description attributes.
-    :rtype: Dict[str, str]
     """
     filter_dict = {
         "AC0": {
@@ -587,3 +583,45 @@ def make_vcf_filter_dict(
         "PASS": {"Description": "Passed all variant filters"},
     }
     return filter_dict
+
+
+def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, str]:
+    """
+    Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string
+    separated by pipe delimiters.
+
+    :param ht: Table containing histogram variant annotations.
+    :param prefix: Prefix text for age histogram bin edges. Default is gnomad_.
+    :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values.
+    """
+    edges_dict = {
+        "het": "|".join(
+            map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges)
+        ),
+        "hom": "|".join(
+            map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges)
+        ),
+    }
+    for hist in HISTS:
+
+        # Parse hists calculated on both raw and adj-filtered data
+        for hist_type in ["adj_qual_hists", "qual_hists"]:
+            hist_name = hist
+            if "adj" in hist_type:
+                hist_name = f"{hist}_adj"
+            edges_dict[hist_name] = (
+                "|".join(
+                    map(
+                        lambda x: f"{x:.2f}",
+                        ht.head(1)[hist_type][hist].collect()[0].bin_edges,
+                    )
+                )
+                if "ab" in hist
+                else "|".join(
+                    map(
+                        lambda x: str(int(x)),
+                        ht.head(1)[hist_type][hist].collect()[0].bin_edges,
+                    )
+                )
+            )
+    return edges_dict

From 78084a7422b5b9e532101abafa71fedd076624f7 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 10:29:54 -0400
Subject: [PATCH 08/61] Added make hist dict

---
 gnomad/utils/vcf.py | 69 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 57 insertions(+), 12 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 081996506..85cacf397 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -334,11 +334,11 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi
 
 def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]:
     """
-    Make combinations of all possible labels for a supplied dictionary of label groups
+    Make combinations of all possible labels for a supplied dictionary of label groups.
+
     :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
-        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"])
-    :return: list of all possible combinations of values for the supplied label groupings
-    :rtype: list[str]
+        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
+    :return: list of all possible combinations of values for the supplied label groupings.
     """
     copy_label_groups = copy.deepcopy(label_groups)
     if len(copy_label_groups) == 1:
@@ -358,13 +358,14 @@ def generic_field_check(
 ) -> None:
     """
     Check a generic logical condition involving annotations in a Hail Table and print the results to terminal
-    :param ht: Table containing annotations to be checked
-    :param cond_expr: logical expression referring to annotations in ht to be checked
-    :param check_description: String describing the condition being checked; is displayed in terminal summary message
+
+    :param ht: Table containing annotations to be checked.
+    :param cond_expr: logical expression referring to annotations in ht to be checked.
+    :param check_description: String describing the condition being checked; is displayed in terminal summary message.
     :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
-        these fields are also displayed if verbose is True
+        these fields are also displayed if verbose is True.
     :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
-        show only top values of annotations that fail checks
+        show only top values of annotations that fail checks.
     """
     ht_orig = ht
     ht = ht.filter(cond_expr)
@@ -382,9 +383,9 @@ def generic_field_check(
 
 def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
     """
-    Make Hail Expressions to measure % variants filtered under varying conditions of interest
-    :param ht: Hail Table containing 'filter' annotation to be examined
-    :return: Dictionary containing Hail aggregation expressions to measure filter flags
+    Make Hail Expressions to measure % variants filtered under varying conditions of interest.
+    :param ht: Hail Table containing 'filter' annotation to be examined.
+    :return: Dictionary containing Hail aggregation expressions to measure filter flags.
     """
     filters_dict = {
         "n": hl.agg.count(),
@@ -625,3 +626,47 @@ def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str,
                 )
             )
     return edges_dict
+
+
+def make_hist_dict(bin_edges: Dict[str, Dict[str, str]], adj: bool) -> Dict[str, str]:
+    """
+    Generate dictionary of Number and Description attributes to be used in the VCF header, specifically for histogram annotations.
+
+    :param bin_edges: Dictionary keyed by histogram annotation name, with corresponding string-reformatted bin edges for values.
+    :param adj: Whether to create a header dict for raw or adj quality histograms.
+    :return: Dictionary keyed by VCF INFO annotations, where values are Dictionaries of Number and Description attributes.
+    """
+    header_hist_dict = {}
+    for hist in HISTS:
+
+        # Get hists for both raw and adj data
+        if adj:
+            hist = f"{hist}_adj"
+
+        edges = bin_edges[hist]
+        hist_fields = hist.split("_")
+        hist_text = hist_fields[0].upper()
+
+        if hist_fields[2] == "alt":
+            hist_text = hist_text + " in heterozygous individuals"
+        if adj:
+            hist_text = hist_text + " calculated on high quality genotypes"
+
+        hist_dict = {
+            f"{hist}_bin_freq": {
+                "Number": "A",
+                "Description": f"Histogram for {hist_text}; bin edges are: {edges}",
+            },
+            f"{hist}_n_smaller": {
+                "Number": "A",
+                "Description": f"Count of {hist_fields[0].upper()} values falling below lowest histogram bin edge {hist_text}",
+            },
+            f"{hist}_n_larger": {
+                "Number": "A",
+                "Description": f"Count of {hist_fields[0].upper()} values falling above highest histogram bin edge {hist_text}",
+            },
+        }
+
+        header_hist_dict.update(hist_dict)
+
+    return header_hist_dict

From 5c698b7fd3650a41c0818d7af953bbb6e92f4ce6 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 10:33:02 -0400
Subject: [PATCH 09/61] Updated changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e4ab4b5f1..78388cbf2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
 * Fix for error in `compute_quantile_bin` that caused incorrect binning when a single score overlapped multiple bins. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
 * Removed assumption of `snv` annotation from `compute_quantile_bin`. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
 * Fixed `create_binned_ht` because it produced a "Cannot combine expressions from different source objects error". [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
+* Added constants and functions relevant to VCF export [(#241)](https://github.com/broadinstitute/gnomad_methods/pull/241)
 
 ## Version 0.4.0 - July 9th, 2020
 

From e30d67b7bb3465b637a1079d0a15dfe7ce4ff875 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 10:56:30 -0400
Subject: [PATCH 10/61] Added SORT_ORDER and sample_sum_check to vcf.py

---
 gnomad/utils/vcf.py | 98 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 86 insertions(+), 12 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 85cacf397..6bee6707e 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -1,11 +1,15 @@
 import hail as hl
-from typing import List
+from typing import Dict, List
 import logging
 
 logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
+SORT_ORDER = ["popmax", "group", "pop", "subpop", "sex"]
+"""
+Order to sort subgroupings during VCF export.
+"""
 
 GROUPS = ["adj", "raw"]
 """
@@ -445,27 +449,27 @@ def make_info_dict(
         age_hist_dict = {
             f"{prefix}age_hist_het_bin_freq": {
                 "Number": "A",
-                "Description": f"Histogram of ages of heterozygous individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
+                "Description": f"Histogram of ages of heterozygous individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
             },
             f"{prefix}age_hist_het_n_smaller": {
                 "Number": "A",
-                "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_test}",
+                "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_text}",
             },
             f"{prefix}age_hist_het_n_larger": {
                 "Number": "A",
-                "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_test}",
+                "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_text}",
             },
             f"{prefix}age_hist_hom_bin_freq": {
                 "Number": "A",
-                "Description": f"Histogram of ages of homozygous alternate individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
+                "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
             },
             f"{prefix}age_hist_hom_n_smaller": {
                 "Number": "A",
-                "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_test}",
+                "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_text}",
             },
             f"{prefix}age_hist_hom_n_larger": {
                 "Number": "A",
-                "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_test}",
+                "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_text}",
             },
         }
         info_dict.update(age_hist_dict)
@@ -474,23 +478,23 @@ def make_info_dict(
         popmax_dict = {
             f"{prefix}popmax": {
                 "Number": "A",
-                "Description": f"Population with maximum AF{description_test}",
+                "Description": f"Population with maximum AF{description_text}",
             },
             f"{prefix}AC_popmax": {
                 "Number": "A",
-                "Description": f"Allele count in the population with the maximum AF{description_test}",
+                "Description": f"Allele count in the population with the maximum AF{description_text}",
             },
             f"{prefix}AN_popmax": {
                 "Number": "A",
-                "Description": f"Total number of alleles in the population with the maximum AF{description_test}",
+                "Description": f"Total number of alleles in the population with the maximum AF{description_text}",
             },
             f"{prefix}AF_popmax": {
                 "Number": "A",
-                "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_test}",
+                "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_text}",
             },
             f"{prefix}nhomalt_popmax": {
                 "Number": "A",
-                "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_test}",
+                "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_text}",
             },
         }
         info_dict.update(popmax_dict)
@@ -670,3 +674,73 @@ def make_hist_dict(bin_edges: Dict[str, Dict[str, str]], adj: bool) -> Dict[str,
         header_hist_dict.update(hist_dict)
 
     return header_hist_dict
+
+
+def sample_sum_check(
+    ht: hl.Table,
+    prefix: str,
+    label_groups: Dict[str, List[str]],
+    verbose: bool,
+    subpop: bool = None,
+) -> None:
+    """
+    Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version;
+    display results from checking the sum of the specified annotations in the terminal.
+
+    :param Table ht: Table containing annotations to be summed.
+    :param str prefix: String indicating sample subset.
+    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
+    :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
+        show only top values of annotations that fail checks.
+    :param str subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked.
+    :rtype: None
+    """
+    combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)]
+    combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in make_label_combos(label_groups)]
+    combo_nhomalt = [
+        ht.info[f"{prefix}nhomalt_{x}"] for x in make_label_combos(label_groups)
+    ]
+
+    group = label_groups.pop("group")[0]
+    alt_groups = "_".join(
+        sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x))
+    )
+
+    annot_dict = {
+        f"sum_AC_{group}_{alt_groups}": hl.sum(combo_AC),
+        f"sum_AN_{group}_{alt_groups}": hl.sum(combo_AN),
+        f"sum_nhomalt_{group}_{alt_groups}": hl.sum(combo_nhomalt),
+    }
+
+    ht = ht.annotate(**annot_dict)
+
+    for subfield in ["AC", "AN", "nhomalt"]:
+        if not subpop:
+            generic_field_check(
+                ht,
+                (
+                    ht.info[f"{prefix}{subfield}_{group}"]
+                    != ht[f"sum_{subfield}_{group}_{alt_groups}"]
+                ),
+                f"{prefix}{subfield}_{group} = sum({subfield}_{group}_{alt_groups})",
+                [
+                    f"info.{prefix}{subfield}_{group}",
+                    f"sum_{subfield}_{group}_{alt_groups}",
+                ],
+                verbose,
+            )
+        else:
+            generic_field_check(
+                ht,
+                (
+                    ht.info[f"{prefix}{subfield}_{group}_{subpop}"]
+                    != ht[f"sum_{subfield}_{group}_{alt_groups}"]
+                ),
+                f"{prefix}{subfield}_{group}_{subpop} = sum({subfield}_{group}_{alt_groups})",
+                [
+                    f"info.{prefix}{subfield}_{group}_{subpop}",
+                    f"sum_{subfield}_{group}_{alt_groups}",
+                ],
+                verbose,
+            )

From 706864d1322c6d5916a7024b8d24fd1a11aed517 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 11:38:18 -0400
Subject: [PATCH 11/61] Moved make combo header text to vcf.py

---
 gnomad/utils/vcf.py | 58 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 6bee6707e..47f02659a 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -1,6 +1,9 @@
-import hail as hl
-from typing import Dict, List
 import logging
+from typing import Dict, List
+
+import hail as hl
+
+from gnomad.sample_qc.ancestry import POP_NAMES
 
 logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
 logger = logging.getLogger(__name__)
@@ -416,6 +419,53 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
     return filters_dict
 
 
+def make_combo_header_text(
+    preposition: str,
+    group_types: List[str],
+    combo_fields: List[str],
+    prefix: str,
+) -> str:
+    """
+    Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.
+
+    For example, if preposition is "for", group_types is ["group", "pop", "sex"], and combo_fields is ["adj", "afr", "female"],
+    this function will return the string " for female samples of African-American/African ancestry".
+
+    :param preposition: Relevant preposition to precede automatically generated text.
+    :param group_types: List of grouping types, e.g. "sex" or "pop".
+    :param combo_fields: List of the specific values for each grouping type, for which the text is being generated.
+    :param prefix: Prefix string indicating sample subset.
+    :return: String with automatically generated description text for a given set of combo fields.
+    """
+    combo_dict = dict(zip(group_types, combo_fields))
+    header_text = " " + preposition
+
+    if len(combo_dict.keys()) == 1:
+        if combo_dict["group"] == "adj":
+            return ""
+
+    if "sex" in combo_dict.keys():
+        header_text = header_text + " " + combo_dict["sex"]
+
+    header_text = header_text + " samples"
+
+    if "subpop" in combo_dict.keys():
+        header_text = header_text + f" of {POP_NAMES[combo_dict['subpop']]} ancestry"
+        combo_dict.pop("pop")
+
+    if "pop" in combo_dict.keys():
+        header_text = header_text + f" of {POP_NAMES[combo_dict['pop']]} ancestry"
+
+    if "gnomad" in prefix:
+        header_text = header_text + " in gnomAD"
+
+    if "group" in group_types:
+        if combo_dict["group"] == "raw":
+            header_text = header_text + ", before removing low-confidence genotypes"
+
+    return header_text
+
+
 def make_info_dict(
     prefix: str,
     label_groups: Dict[str, str] = None,
@@ -529,11 +579,11 @@ def make_info_dict(
                 combo_dict = {
                     f"{prefix}faf95_{combo}": {
                         "Number": "A",
-                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}",
+                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}",
                     },
                     f"{prefix}faf99_{combo}": {
                         "Number": "A",
-                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}",
+                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}",
                     },
                 }
             info_dict.update(combo_dict)

From d058743bc00d23acbfa764c2912c04e851da527b Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 11:42:12 -0400
Subject: [PATCH 12/61] Fixed imports and reformatted with black

---
 gnomad/utils/vcf.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 47f02659a..002b791df 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -1,3 +1,5 @@
+import copy
+import itertools
 import logging
 from typing import Dict, List
 
@@ -420,10 +422,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
 
 
 def make_combo_header_text(
-    preposition: str,
-    group_types: List[str],
-    combo_fields: List[str],
-    prefix: str,
+    preposition: str, group_types: List[str], combo_fields: List[str], prefix: str,
 ) -> str:
     """
     Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.
@@ -616,7 +615,7 @@ def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str
 
 
 def make_vcf_filter_dict(
-    ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float
+    snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float
 ) -> Dict[str, str]:
     """
     Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations.
@@ -650,10 +649,10 @@ def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str,
     :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values.
     """
     edges_dict = {
-        "het": "|".join(
+        f"{prefix}het": "|".join(
             map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges)
         ),
-        "hom": "|".join(
+        f"{prefix}hom": "|".join(
             map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges)
         ),
     }

From d1bbacd9bae46a62e9906ca28afd06c03a760593 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 11:47:25 -0400
Subject: [PATCH 13/61] Changed docstring for make_hist_bin_edges_expr

---
 gnomad/utils/vcf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 002b791df..f2c980ce4 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -639,13 +639,13 @@ def make_vcf_filter_dict(
     return filter_dict
 
 
-def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, str]:
+def make_hist_bin_edges_expr(ht: hl.Table, prefix: str) -> Dict[str, str]:
     """
     Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string
     separated by pipe delimiters.
 
     :param ht: Table containing histogram variant annotations.
-    :param prefix: Prefix text for age histogram bin edges. Default is gnomad_.
+    :param prefix: Prefix text for age histogram bin edges. 
     :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values.
     """
     edges_dict = {

From 424fcc18711be84726e4ec6e999aaa433845c4fb Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 13:02:24 -0400
Subject: [PATCH 14/61] Added set female metrics to NA

---
 gnomad/utils/vcf.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index f2c980ce4..46edfda14 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -1,7 +1,7 @@
 import copy
 import itertools
 import logging
-from typing import Dict, List
+from typing import Dict, List, Union
 
 import hail as hl
 
@@ -793,3 +793,35 @@ def sample_sum_check(
                 ],
                 verbose,
             )
+
+
+def set_female_y_metrics_to_na(
+    t: Union[hl.Table, hl.MatrixTable], faf: bool = True,
+) -> Dict[str, hl.expr.Int32Expression]:
+    """
+    Set AC, AN, and nhomalt Y variant annotations for females to NA (instead of 0).
+
+    :param Table/MatrixTable t: Table/MatrixTable containing female variant annotations.
+    :return: Dictionary with reset annotations
+    :rtype: Dict[str, hl.expr.Int32Expression]
+    """
+    metrics = list(t.row.info)
+    female_metrics = [x for x in metrics if "_female" in x]
+    female_metrics = [
+        x for x in female_metrics if ("nhomalt" in x) or ("AC" in x) or ("AN" in x)
+    ]
+
+    if faf:
+        female_metrics.extend([x for x in female_metrics if "faf" in x])
+
+    female_metrics_dict = {}
+    for metric in female_metrics:
+        female_metrics_dict.update(
+            {
+                f"{metric}": hl.or_missing(
+                    (t.locus.contig.in_y_nonpar() | t.locus.contig.in_y_par()),
+                    t.info[f"{metric}"],
+                )
+            }
+        )
+    return female_metrics_dict

From d397b87054502ee7edcea7a365d4787235abe12f Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 13:06:08 -0400
Subject: [PATCH 15/61] Updated docstring in set female metrics to na

---
 gnomad/utils/vcf.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 46edfda14..010c9adba 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -799,11 +799,10 @@ def set_female_y_metrics_to_na(
     t: Union[hl.Table, hl.MatrixTable], faf: bool = True,
 ) -> Dict[str, hl.expr.Int32Expression]:
     """
-    Set AC, AN, and nhomalt Y variant annotations for females to NA (instead of 0).
+    Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0).
 
-    :param Table/MatrixTable t: Table/MatrixTable containing female variant annotations.
+    :param t: Table/MatrixTable containing female variant annotations.
     :return: Dictionary with reset annotations
-    :rtype: Dict[str, hl.expr.Int32Expression]
     """
     metrics = list(t.row.info)
     female_metrics = [x for x in metrics if "_female" in x]

From ec0ddd6a2ad564df848c44be4de8216212b17a00 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 12:15:21 -0400
Subject: [PATCH 16/61] Removed transmitted singleton and sibling singleton
 from SITE_FIELDS

---
 gnomad/utils/vcf.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 010c9adba..150125429 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -58,9 +58,7 @@
     "MQRankSum",
     "QD",
     "ReadPosRankSum",
-    "sibling_singleton",
     "SOR",
-    "transmitted_singleton",
     "VarDP",
 ]
 """

From 647d8e6b88c5b49f00d62280d03bc0299f804758 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:19:26 -0400
Subject: [PATCH 17/61] Updated make_label_combos docstring and fixed values
 for some constants/their descriptions

---
 gnomad/utils/vcf.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 150125429..690d37a7a 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -14,6 +14,7 @@
 SORT_ORDER = ["popmax", "group", "pop", "subpop", "sex"]
 """
 Order to sort subgroupings during VCF export.
+Ensures that INFO labels in VCF are in desired order (e.g., raw_AC_afr_female).
 """
 
 GROUPS = ["adj", "raw"]
@@ -28,12 +29,13 @@
 
 FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"]
 """
-Populations that are included in filtering allele frequency calculations. Used in VCF export.
+Global populations that are included in filtering allele frequency (faf) calculations. Used in VCF export.
 """
 
 SEXES = ["male", "female"]
 """
 Sample sexes used in VCF export.
+Used to stratify frequency annotations (AC, AN, AF) for each sex.
 """
 
 AS_FIELDS = [
@@ -77,9 +79,12 @@
 Allele-type annotations.
 """
 
-REGION_TYPE_FIELDS = ["lcr", "nonpar"]
+REGION_FLAG_FIELDS = ["decoy", "lcr", "nonpar", "segdup"]
 """
 Annotations about variant region type.
+
+.. note::
+    decoy and segdup resource files do not currently exist for GrCh38/hg38.
 """
 
 RF_FIELDS = [
@@ -91,7 +96,7 @@
     "tp",
 ]
 """
-Annotations specific to the random forest model.
+Annotations specific to the variant QC using a random forest model.
 """
 
 VQSR_FIELDS = ["AS_VQSLOD", "AS_culprit", "NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE"]
@@ -147,10 +152,12 @@
         "Number": "A",
         "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model",
     },
+    "decoy": {"Description": "Variant falls within a reference decoy region"},
     "lcr": {"Description": "Variant falls within a low complexity region"},
     "nonpar": {
         "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region"
     },
+    "segdup": {"Description": "Variant falls within a segmental duplication region"},
     "rf_positive_label": {
         "Description": "Variant was labelled as a positive example for training of random forest model"
     },
@@ -339,18 +346,24 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi
     return info_mt.filter_cols(False)
 
 
-def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]:
+def make_label_combos(
+    label_groups: Dict[str, List[str]], sort_order: List[str] = SORT_ORDER,
+) -> List[str]:
     """
     Make combinations of all possible labels for a supplied dictionary of label groups.
 
-    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+    For example, if label_groups is `{"sex": ["male", "female"], "pop": ["afr", "nfe", "amr"]}`,
+    this function will return `["afr_male", "afr_female", "nfe_male", "nfe_female", "amr_male", "amr_female']`
+
+    :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
         e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
+    :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
     :return: list of all possible combinations of values for the supplied label groupings.
     """
     copy_label_groups = copy.deepcopy(label_groups)
     if len(copy_label_groups) == 1:
         return [item for sublist in copy_label_groups.values() for item in sublist]
-    anchor_group = sorted(copy_label_groups.keys(), key=lambda x: SORT_ORDER.index(x))[
+    anchor_group = sorted(copy_label_groups.keys(), key=lambda x: sort_order.index(x))[
         0
     ]
     anchor_val = copy_label_groups.pop(anchor_group)

From 973c5ef1dd17a4fc803c5ad0d98ce83032f0bca1 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:21:43 -0400
Subject: [PATCH 18/61] Updated docstring for generic_field_check

---
 gnomad/utils/vcf.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 690d37a7a..8f8e099e0 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -374,17 +374,23 @@ def make_label_combos(
 
 
 def generic_field_check(
-    ht: hl.Table, cond_expr, check_description, display_fields, verbose
+    ht: hl.Table,
+    cond_expr: hl.expr.BooleanExpression,
+    check_description: str,
+    display_fields: List[str],
+    verbose: bool,
 ) -> None:
     """
-    Check a generic logical condition involving annotations in a Hail Table and print the results to terminal
+    Check a generic logical condition involving annotations in a Hail Table and print the results to terminal.
+
+    Displays the number of rows in the Table that return False for a condition.
 
     :param ht: Table containing annotations to be checked.
-    :param cond_expr: logical expression referring to annotations in ht to be checked.
+    :param cond_expr: Logical expression referring to annotations in ht to be checked.
     :param check_description: String describing the condition being checked; is displayed in terminal summary message.
     :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
         these fields are also displayed if verbose is True.
-    :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
+    :param verbose: If True, show top values of annotations being checked, including checks that pass; if False,
         show only top values of annotations that fail checks.
     """
     ht_orig = ht

From ae3fcf4e93f4a4aaccd26ea7c305f68989cc6144 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:24:03 -0400
Subject: [PATCH 19/61] Updated docstring for make_filters_sanity_check_expr

---
 gnomad/utils/vcf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 8f8e099e0..ecbc12965 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -410,7 +410,8 @@ def generic_field_check(
 def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
     """
     Make Hail Expressions to measure % variants filtered under varying conditions of interest.
-    :param ht: Hail Table containing 'filter' annotation to be examined.
+
+    :param ht: Table containing 'filter' annotation to be examined.
     :return: Dictionary containing Hail aggregation expressions to measure filter flags.
     """
     filters_dict = {

From d3967aadb88272d02d27e925ae0f5a7bf9645b1a Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:32:55 -0400
Subject: [PATCH 20/61] Updated make_filters_sanity_check_expr

---
 gnomad/utils/vcf.py | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index ecbc12965..1ffc677af 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -409,32 +409,37 @@ def generic_field_check(
 
 def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
     """
-    Make Hail Expressions to measure % variants filtered under varying conditions of interest.
+    Make Hail expressions to measure % variants filtered under varying conditions of interest.
+
+    Checks for:
+        - Total number of variants
+        - Fraction of variants removed by any filter
+        - Fraction of variants removed because of InbreedingCoefficient filter in combination with any other filter
+        - Fraction of varinats removed because of AC0 filter in combination with any other filter
+        - Fraction of variants removed because of random forest filtering in combination with any other filter
+        - Fraction of variants removed only because of InbreedingCoefficient filter
+        - Fraction of variants removed only because of AC0 filter
+        - Fraction of variants removed only because of random forest filtering
+
 
     :param ht: Table containing 'filter' annotation to be examined.
     :return: Dictionary containing Hail aggregation expressions to measure filter flags.
     """
     filters_dict = {
         "n": hl.agg.count(),
-        "frac_any_filter": hl.agg.count_where(ht.is_filtered) / hl.agg.count(),
-        "frac_inbreed_coeff": hl.agg.count_where(
-            ht.filters.contains("inbreeding_coeff")
-        )
-        / hl.agg.count(),
-        "frac_ac0": hl.agg.count_where(ht.filters.contains("AC0")) / hl.agg.count(),
-        "frac_rf": hl.agg.count_where(ht.filters.contains("rf")) / hl.agg.count(),
-        "frac_inbreed_coeff_only": hl.agg.count_where(
+        "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) == 0),
+        "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("inbreeding_coeff")),
+        "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")),
+        "frac_rf": hl.agg.fraction(ht.filters.contains("rf")),
+        "frac_inbreed_coeff_only": hl.agg.fraction(
             ht.filters.contains("inbreeding_coeff") & (ht.filters.length() == 1)
-        )
-        / hl.agg.count(),
-        "frac_ac0_only": hl.agg.count_where(
+        ),
+        "frac_ac0_only": hl.agg.fraction(
             ht.filters.contains("AC0") & (ht.filters.length() == 1)
-        )
-        / hl.agg.count(),
-        "frac_rf_only": hl.agg.count_where(
+        ),
+        "frac_rf_only": hl.agg.fraction(
             ht.filters.contains("rf") & (ht.filters.length() == 1)
-        )
-        / hl.agg.count(),
+        ),
     }
     return filters_dict
 

From 5313ea1dea9082248b0856a7950b235eb2d02de1 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:38:49 -0400
Subject: [PATCH 21/61] Updated docstring for make_combo_header_text

---
 gnomad/utils/vcf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 1ffc677af..df25494aa 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -454,7 +454,7 @@ def make_combo_header_text(
     this function will return the string " for female samples of African-American/African ancestry".
 
     :param preposition: Relevant preposition to precede automatically generated text.
-    :param group_types: List of grouping types, e.g. "sex" or "pop".
+    :param group_types: List of grouping types. Possible values in this list are: "group", "pop", "sex", "subpop".
     :param combo_fields: List of the specific values for each grouping type, for which the text is being generated.
     :param prefix: Prefix string indicating sample subset.
     :return: String with automatically generated description text for a given set of combo fields.
@@ -462,11 +462,11 @@ def make_combo_header_text(
     combo_dict = dict(zip(group_types, combo_fields))
     header_text = " " + preposition
 
-    if len(combo_dict.keys()) == 1:
+    if len(combo_dict) == 1:
         if combo_dict["group"] == "adj":
             return ""
 
-    if "sex" in combo_dict.keys():
+    if "sex" in combo_dict:
         header_text = header_text + " " + combo_dict["sex"]
 
     header_text = header_text + " samples"

From b556441e58771e7b0b0571d4ae3b6c0231767b61 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:42:42 -0400
Subject: [PATCH 22/61] Updated docstring for make_info_dict

---
 gnomad/utils/vcf.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index df25494aa..9be64db77 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -500,7 +500,13 @@ def make_info_dict(
     """
     Generate dictionary of Number and Description attributes of VCF INFO fields.
 
-    Used to populate the VCF header during export.
+    Used to populate the INFO fields of the VCF header during export.
+
+    Creates:
+        - INFO fields for age histograms (bin freq, n_smaller, and n_larger for heterozygous and homozygous variant carriers)
+        - INFO fields for popmax AC, AN, AF, nhomalt, and popmax population
+        - INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data
+        - INFO fields for filtering allele frequency (faf) annotations 
     
     :param prefix: gnomAD or empty string.
     :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,

From 13ac89e786e0ecef2ae30cd2ff62808e47689e3d Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:54:07 -0400
Subject: [PATCH 23/61] Updated make_combo_header_text to take dict as input

---
 gnomad/utils/vcf.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 9be64db77..d4fe986fa 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -445,7 +445,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
 
 
 def make_combo_header_text(
-    preposition: str, group_types: List[str], combo_fields: List[str], prefix: str,
+    preposition: str, combo_dict: Dict[str, str], prefix: str,
 ) -> str:
     """
     Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.
@@ -454,12 +454,12 @@ def make_combo_header_text(
     this function will return the string " for female samples of African-American/African ancestry".
 
     :param preposition: Relevant preposition to precede automatically generated text.
-    :param group_types: List of grouping types. Possible values in this list are: "group", "pop", "sex", "subpop".
-    :param combo_fields: List of the specific values for each grouping type, for which the text is being generated.
+    :param combo_dict: Dict with grouping types as keys and values for grouping type as values. This function generates text for these values.
+        Possible grouping types are: "group", "pop", "sex", and "subpop". 
+        Example input: {"pop": "afr", "sex": "female"}
     :param prefix: Prefix string indicating sample subset.
     :return: String with automatically generated description text for a given set of combo fields.
     """
-    combo_dict = dict(zip(group_types, combo_fields))
     header_text = " " + preposition
 
     if len(combo_dict) == 1:
@@ -471,17 +471,17 @@ def make_combo_header_text(
 
     header_text = header_text + " samples"
 
-    if "subpop" in combo_dict.keys():
+    if "subpop" in combo_dict:
         header_text = header_text + f" of {POP_NAMES[combo_dict['subpop']]} ancestry"
         combo_dict.pop("pop")
 
-    if "pop" in combo_dict.keys():
+    if "pop" in combo_dict:
         header_text = header_text + f" of {POP_NAMES[combo_dict['pop']]} ancestry"
 
     if "gnomad" in prefix:
         header_text = header_text + " in gnomAD"
 
-    if "group" in group_types:
+    if "group" in combo_dict:
         if combo_dict["group"] == "raw":
             header_text = header_text + ", before removing low-confidence genotypes"
 
@@ -489,7 +489,7 @@ def make_combo_header_text(
 
 
 def make_info_dict(
-    prefix: str,
+    prefix: str = "",
     label_groups: Dict[str, str] = None,
     bin_edges: Dict[str, str] = None,
     faf: bool = False,
@@ -508,7 +508,7 @@ def make_info_dict(
         - INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data
         - INFO fields for filtering allele frequency (faf) annotations 
     
-    :param prefix: gnomAD or empty string.
+    :param prefix: Prefix string for data, e.g. "gnomAD". Default is empty string.
     :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
         e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
     :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
@@ -583,35 +583,36 @@ def make_info_dict(
 
         for combo in combos:
             combo_fields = combo.split("_")
+            group_dict = dict(zip(group_types, combo_fields))
 
             if not faf:
                 combo_dict = {
                     f"{prefix}AC_{combo}": {
                         "Number": "A",
-                        "Description": f"Alternate allele count{make_combo_header_text('for', group_types, combo_fields, prefix)}",
+                        "Description": f"Alternate allele count{make_combo_header_text('for', group_dict, prefix)}",
                     },
                     f"{prefix}AN_{combo}": {
                         "Number": "1",
-                        "Description": f"Total number of alleles{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                        "Description": f"Total number of alleles{make_combo_header_text('in', group_dict, prefix)}",
                     },
                     f"{prefix}AF_{combo}": {
                         "Number": "A",
-                        "Description": f"Alternate allele frequency{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                        "Description": f"Alternate allele frequency{make_combo_header_text('in', group_dict, prefix)}",
                     },
                     f"{prefix}nhomalt_{combo}": {
                         "Number": "A",
-                        "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                        "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_dict, prefix)}",
                     },
                 }
             else:
                 combo_dict = {
                     f"{prefix}faf95_{combo}": {
                         "Number": "A",
-                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}",
+                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_dict, prefix)}",
                     },
                     f"{prefix}faf99_{combo}": {
                         "Number": "A",
-                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}",
+                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_dict, prefix)}",
                     },
                 }
             info_dict.update(combo_dict)

From 1202e2ebca096f25dec87b1d8a255cf052ec5f05 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:57:03 -0400
Subject: [PATCH 24/61] Updated make_info_dict to pass in sort order

---
 gnomad/utils/vcf.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index d4fe986fa..e6401e831 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -496,6 +496,7 @@ def make_info_dict(
     popmax: bool = False,
     description_text: str = "",
     age_hist_data: str = None,
+    sort_order: List[str] = SORT_ORDER,
 ) -> Dict[str, Dict[str, str]]:
     """
     Generate dictionary of Number and Description attributes of VCF INFO fields.
@@ -514,8 +515,9 @@ def make_info_dict(
     :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
     :param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.
     :param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.
-    :param description_text: Optinal text to append to the end of age and popmax descriptions. 
+    :param description_text: Optional text to append to the end of age and popmax descriptions. Needs to start with a space if specified.
     :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`.
+    :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
     :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes.
     """
     if prefix != "":
@@ -578,7 +580,7 @@ def make_info_dict(
         info_dict.update(popmax_dict)
 
     else:
-        group_types = sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x))
+        group_types = sorted(label_groups.keys(), key=lambda x: sort_order.index(x))
         combos = make_label_combos(label_groups)
 
         for combo in combos:

From 759cfac4806001a5c664ccee6fd7c1e061ac8c47 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 29 Jul 2020 09:54:46 -0400
Subject: [PATCH 25/61] Addressed rest of review comments

---
 gnomad/utils/vcf.py | 95 +++++++++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 37 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index e6401e831..37165c08e 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -621,29 +621,41 @@ def make_info_dict(
     return info_dict
 
 
-def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+def add_as_info_dict(
+    info_dict: Dict[str, Dict[str, str]] = INFO_DICT, as_fields: List[str] = AS_FIELDS
+) -> Dict[str, Dict[str, str]]:
     """
     Updates info dictionary with allele-specific terms and their descriptions.
 
     Used in VCF export.
 
-    :param INFO_DICT: Dictionary containing site-level annotations and their descriptions.
+    :param info_dict: Dictionary containing site-level annotations and their descriptions. Default is INFO_DICT.
+    :param as_fields: List containing allele-specific fields to be added to info_dict. Default is AS_FIELDS.
     :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field.
     """
     prefix_text = "Allele-specific"
-    AS_DICT = {}
+    as_dict = {}
+
+    for field in as_fields:
+
+        try:
+            # Strip AS_ from field name
+            site_field = field[3:]
 
-    for field in AS_FIELDS:
-        # Strip AS_ from field name
-        site_field = field[3:]
+            # Get site description from info dictionary and make first letter lower case
+            first_letter = info_dict[site_field]["Description"][0].lower()
+            rest_of_description = info_dict[site_field]["Description"][1:]
 
-        AS_DICT[field] = {}
-        AS_DICT[field]["Number"] = "A"
-        AS_DICT[field][
-            "Description"
-        ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}"
+            as_dict[field] = {}
+            as_dict[field]["Number"] = "A"
+            as_dict[field][
+                "Description"
+            ] = f"{prefix_text} {first_letter}{rest_of_description}"
 
-    return AS_DICT
+        except KeyError:
+            raise ValueError(f"{field} is not present in input info dictionary!")
+
+    return as_dict
 
 
 def make_vcf_filter_dict(
@@ -652,7 +664,12 @@ def make_vcf_filter_dict(
     """
     Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations.
 
-    :param ht: Table containing global annotations of the Random Forests SNP and indel cutoffs.
+    Generates descriptions for:
+        - AC0 filter
+        - InbreedingCoeff filter
+        - RF filter
+        - PASS (passed all variant filters)
+
     :param snp_cutoff: Minimum SNP cutoff score from random forest model.
     :param indel_cutoff: Minimum indel cutoff score from random forest model.
     :param inbreeding_cutoff: Inbreeding coefficient hard cutoff.
@@ -671,43 +688,45 @@ def make_vcf_filter_dict(
     return filter_dict
 
 
-def make_hist_bin_edges_expr(ht: hl.Table, prefix: str) -> Dict[str, str]:
+def make_hist_bin_edges_expr(
+    ht: hl.Table, hists: List[str] = HISTS, prefix: str = ""
+) -> Dict[str, str]:
     """
     Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string
     separated by pipe delimiters.
 
     :param ht: Table containing histogram variant annotations.
-    :param prefix: Prefix text for age histogram bin edges. 
+    :param hists: List of variant histogram annotations. Default is HISTS.
+    :param prefix: Prefix text for age histogram bin edges.  Default is empty string.
     :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values.
     """
+    # Add underscore to prefix if it isn't empty
+    if prefix != "":
+        prefix += "_"
+
     edges_dict = {
-        f"{prefix}het": "|".join(
-            map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges)
-        ),
-        f"{prefix}hom": "|".join(
-            map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges)
-        ),
+        f"{prefix}{call_type}": "|".join(
+            map(
+                lambda x: f"{x:.1f}",
+                ht.head(1)[f"age_hist_{call_type}"].collect()[0].bin_edges,
+            )
+        )
+        for call_type in ["het", "hom"]
     }
-    for hist in HISTS:
+
+    for hist in hists:
+
+        hist_name = hist
 
         # Parse hists calculated on both raw and adj-filtered data
         for hist_type in ["adj_qual_hists", "qual_hists"]:
-            hist_name = hist
+
             if "adj" in hist_type:
                 hist_name = f"{hist}_adj"
-            edges_dict[hist_name] = (
-                "|".join(
-                    map(
-                        lambda x: f"{x:.2f}",
-                        ht.head(1)[hist_type][hist].collect()[0].bin_edges,
-                    )
-                )
-                if "ab" in hist
-                else "|".join(
-                    map(
-                        lambda x: str(int(x)),
-                        ht.head(1)[hist_type][hist].collect()[0].bin_edges,
-                    )
+            edges_dict[hist_name] = "|".join(
+                map(
+                    lambda x: f"{x:.2f}" if "ab" in hist else str(int(x)),
+                    ht.head(1)[hist_type][hist].collect()[0].bin_edges,
                 )
             )
     return edges_dict
@@ -763,6 +782,7 @@ def sample_sum_check(
     label_groups: Dict[str, List[str]],
     verbose: bool,
     subpop: bool = None,
+    sort_order: List[str] = SORT_ORDER,
 ) -> None:
     """
     Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version;
@@ -775,6 +795,7 @@ def sample_sum_check(
     :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
         show only top values of annotations that fail checks.
     :param str subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked.
+    :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
     :rtype: None
     """
     combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)]
@@ -785,7 +806,7 @@ def sample_sum_check(
 
     group = label_groups.pop("group")[0]
     alt_groups = "_".join(
-        sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x))
+        sorted(label_groups.keys(), key=lambda x: sort_order.index(x))
     )
 
     annot_dict = {

From fb14af8527e433581553c8615246882abc0e1902 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Thu, 30 Jul 2020 13:50:58 -0400
Subject: [PATCH 26/61] Add BaseQRankSum to SITE_FIELDS const

---
 gnomad/utils/vcf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 37165c08e..dd868e355 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -54,6 +54,7 @@
 """
 
 SITE_FIELDS = [
+    "BaseQRankSum",
     "FS",
     "InbreedingCoeff",
     "MQ",

From 8d1f38c32110159bea11ce7ffcc9b8a090b0b2f6 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Thu, 30 Jul 2020 16:55:15 -0400
Subject: [PATCH 27/61] Addressed review comments

---
 gnomad/utils/vcf.py | 47 +++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index dd868e355..903a90040 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -85,7 +85,7 @@
 Annotations about variant region type.
 
 .. note::
-    decoy and segdup resource files do not currently exist for GrCh38/hg38.
+    decoy and segdup resource files do not currently exist for GRCh38/hg38.
 """
 
 RF_FIELDS = [
@@ -145,13 +145,11 @@
     "VarDP": {
         "Description": "Depth over variant genotypes (does not include depth of reference samples)"
     },
-    "AS_VQSLOD": {
-        "Number": "A",
-        "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model",
+    "VQSLOD": {
+        "Description": "Log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model",
     },
-    "AS_VQSR_culprit": {
-        "Number": "A",
-        "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model",
+    "VQSR_culprit": {
+        "Description": "Worst-performing annotation in the VQSR Gaussian mixture model",
     },
     "decoy": {"Description": "Variant falls within a reference decoy region"},
     "lcr": {"Description": "Variant falls within a low complexity region"},
@@ -177,19 +175,16 @@
     "variant_type": {
         "Description": "Variant type (snv, indel, multi-snv, multi-indel, or mixed)"
     },
-    "allele_type": {
-        "Number": "A",
-        "Description": "Allele type (snv, insertion, deletion, or mixed)",
-    },
+    "allele_type": {"Description": "Allele type (snv, insertion, deletion, or mixed)",},
     "n_alt_alleles": {
-        "Number": "A",
+        "Number": 1,
         "Description": "Total number of alternate alleles observed at variant locus",
     },
     "was_mixed": {"Description": "Variant type was mixed"},
     "has_star": {
         "Description": "Variant locus coincides with a spanning deletion (represented by a star) observed elsewhere in the callset"
     },
-    "pab_max": {
+    "AS_pab_max": {
         "Number": "A",
         "Description": "Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5",
     },
@@ -384,7 +379,12 @@ def generic_field_check(
     """
     Check a generic logical condition involving annotations in a Hail Table and print the results to terminal.
 
-    Displays the number of rows in the Table that return False for a condition.
+    Displays the number of rows in the Table that return False for a condition (`cond_expr`). 
+
+    .. note::
+        This function checks for the number of rows that violate the `cond_expr` and prints the desired condition (`check_description`) to the terminal.
+        `cond_expr` and `check_description` are opposites and should never be the same.
+
 
     :param ht: Table containing annotations to be checked.
     :param cond_expr: Logical expression referring to annotations in ht to be checked.
@@ -428,7 +428,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
     """
     filters_dict = {
         "n": hl.agg.count(),
-        "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) == 0),
+        "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0),
         "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("inbreeding_coeff")),
         "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")),
         "frac_rf": hl.agg.fraction(ht.filters.contains("rf")),
@@ -542,7 +542,7 @@ def make_info_dict(
             },
             f"{prefix}age_hist_hom_bin_freq": {
                 "Number": "A",
-                "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
+                "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['hom']}; total number of individuals of any genotype bin: {age_hist_data}",
             },
             f"{prefix}age_hist_hom_n_smaller": {
                 "Number": "A",
@@ -571,7 +571,7 @@ def make_info_dict(
             },
             f"{prefix}AF_popmax": {
                 "Number": "A",
-                "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_text}",
+                "Description": f"Maximum allele frequency across populations{description_text}",
             },
             f"{prefix}nhomalt_popmax": {
                 "Number": "A",
@@ -634,7 +634,6 @@ def add_as_info_dict(
     :param as_fields: List containing allele-specific fields to be added to info_dict. Default is AS_FIELDS.
     :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field.
     """
-    prefix_text = "Allele-specific"
     as_dict = {}
 
     for field in as_fields:
@@ -651,7 +650,7 @@ def add_as_info_dict(
             as_dict[field]["Number"] = "A"
             as_dict[field][
                 "Description"
-            ] = f"{prefix_text} {first_letter}{rest_of_description}"
+            ] = f"Allele-specific {first_letter}{rest_of_description}"
 
         except KeyError:
             raise ValueError(f"{field} is not present in input info dictionary!")
@@ -850,7 +849,7 @@ def sample_sum_check(
 
 
 def set_female_y_metrics_to_na(
-    t: Union[hl.Table, hl.MatrixTable], faf: bool = True,
+    t: Union[hl.Table, hl.MatrixTable],
 ) -> Dict[str, hl.expr.Int32Expression]:
     """
     Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0).
@@ -860,19 +859,13 @@ def set_female_y_metrics_to_na(
     """
     metrics = list(t.row.info)
     female_metrics = [x for x in metrics if "_female" in x]
-    female_metrics = [
-        x for x in female_metrics if ("nhomalt" in x) or ("AC" in x) or ("AN" in x)
-    ]
-
-    if faf:
-        female_metrics.extend([x for x in female_metrics if "faf" in x])
 
     female_metrics_dict = {}
     for metric in female_metrics:
         female_metrics_dict.update(
             {
                 f"{metric}": hl.or_missing(
-                    (t.locus.contig.in_y_nonpar() | t.locus.contig.in_y_par()),
+                    (~t.locus.contig.in_y_nonpar() | ~t.locus.contig.in_y_par()),
                     t.info[f"{metric}"],
                 )
             }

From 92dbedfdb3f5194de05d96d5de94d33dbf286f29 Mon Sep 17 00:00:00 2001
From: Nick Watts <nwatts@broadinstitute.org>
Date: Fri, 31 Jul 2020 10:19:41 -0400
Subject: [PATCH 28/61] Update cache and setup-python Actions (#244)

* Update checkout and setup-python actions

* Install wheel in CI workflow
---
 .github/workflows/ci.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 30dac2faa..df7e193c8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,11 +12,11 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v2
       - name: Setup Python
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
         with:
           python-version: 3.7
       - name: Use pip cache
-        uses: actions/cache@v1
+        uses: actions/cache@v2
         with:
           path: ~/.cache/pip
           key: pip-${{ hashFiles('**/requirements*.txt') }}
@@ -24,6 +24,7 @@ jobs:
             pip-
       - name: Install dependencies
         run: |
+          pip install wheel
           pip install -r requirements.txt
           pip install -r requirements-dev.txt
       - name: Run Pylint
@@ -49,6 +50,7 @@ jobs:
             pip-
       - name: Install dependencies
         run: |
+          pip install wheel
           pip install -r requirements.txt
           pip install -r docs/requirements.txt
       - name: Build docs

From d18e2bbd7fddc7bc995e83cc67c7aa270522bfcf Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 21 Jul 2020 11:56:25 -0400
Subject: [PATCH 29/61] Added constants, make_label_combos,
 generic_field_check, and make_filters_sanity_check_expr to vcf.py from
 gnomad_qc. Also added INFO_DICT, FORMAT_DICT, and make_vcf_filter_dict from
 ukb repo

---
 gnomad/utils/vcf.py | 286 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 286 insertions(+)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 43c4b6200..8138e0a07 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -6,6 +6,43 @@
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
+
+GROUPS = ["adj", "raw"]
+"""
+Group names used to generate labels for high quality genotypes and all raw genotypes. Used in VCF export.
+"""
+
+HISTS = ["gq_hist_alt", "gq_hist_all", "dp_hist_alt", "dp_hist_all", "ab_hist_alt"]
+"""
+Quality histograms used in VCF export.
+"""
+
+POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"]
+"""
+Continental population names used in VCF export.
+"""
+
+NFE_SUBPOPS = ["onf", "bgr", "swe", "nwe", "seu", "est"]
+"""
+gnomAD subpopulations for NFE population. Used in VCF export.
+"""
+
+EAS_SUBPOPS = ["kor", "oea", "jpn"]
+"""
+gnomAD subpopulations for EAS population. Used in VCF export.
+"""
+
+FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"]
+"""
+Populations that are included in filtering allele frequency calculations. Used in VCF export.
+"""
+
+SEXES = ["male", "female"]
+"""
+Sample sexes used in VCF export.
+"""
+
+
 INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [
     "AS_QUALapprox",
     "AS_VarDP",
@@ -14,6 +51,148 @@
     "AS_SB_TABLE",
 ]
 
+INFO_DICT = {
+    "FS": {
+        "Description": "Phred-scaled p-value of Fisher's exact test for strand bias"
+    },
+    "InbreedingCoeff": {
+        "Description": "Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes)/(the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium)"
+    },
+    "MQ": {
+        "Description": "Root mean square of the mapping quality of reads across all samples"
+    },
+    "MQRankSum": {
+        "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities"
+    },
+    "QD": {
+        "Description": "Variant call confidence normalized by depth of sample reads supporting a variant"
+    },
+    "ReadPosRankSum": {
+        "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference read position bias"
+    },
+    "SOR": {"Description": "Strand bias estimated by the symmetric odds ratio test"},
+    "VQSR_POSITIVE_TRAIN_SITE": {
+        "Description": "Variant was used to build the positive training set of high-quality variants for VQSR"
+    },
+    "VQSR_NEGATIVE_TRAIN_SITE": {
+        "Description": "Variant was used to build the negative training set of low-quality variants for VQSR"
+    },
+    "AS_BaseQRankSum": {
+        "Number": "A",
+        "Description": "Allele-specific Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities",
+    },
+    "ClippingRankSum": {
+        "Number": "A",
+        "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference number of hard clipped bases",
+    },
+    "VarDP": {
+        "Description": "Depth over variant genotypes (does not include depth of reference samples)"
+    },
+    "AS_VarDP": {
+        "Number": "A",
+        "Description": "Allele-specific depth over variant genotypes (does not include depth of reference samples)",
+    },
+    "AS_VQSLOD": {
+        "Number": "A",
+        "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model",
+    },
+    "AS_VQSR_culprit": {
+        "Number": "A",
+        "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model",
+    },
+    "lcr": {"Description": "Variant falls within a low complexity region"},
+    "fail_interval_qc": {
+        "Description": f"Variant falls within a region where less than {INTERVAL_QC_PARAMETERS[0]}% of samples had a mean coverage of {INTERVAL_QC_PARAMETERS[1]}X"
+    },
+    "nonpar": {
+        "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region"
+    },
+    "rf_positive_label": {
+        "Description": "Variant was labelled as a positive example for training of random forest model"
+    },
+    "rf_negative_label": {
+        "Description": "Variant was labelled as a negative example for training of random forest model"
+    },
+    "rf_label": {"Description": "Random forest training label"},
+    "rf_train": {"Description": "Variant was used in training random forest model"},
+    "rf_tp_probability": {
+        "Description": "Probability of a called variant being a true variant as determined by random forest model"
+    },
+    "transmitted_singleton": {
+        "Description": "Variant was a callset-wide doubleton that was transmitted within a family from a parent to a child (i.e., a singleton amongst unrelated samples in cohort)"
+    },
+    "original_alleles": {"Description": "Alleles before splitting multiallelics"},
+    "variant_type": {
+        "Description": "Variant type (snv, indel, multi-snv, multi-indel, or mixed)"
+    },
+    "allele_type": {
+        "Number": "A",
+        "Description": "Allele type (snv, insertion, deletion, or mixed)",
+    },
+    "n_alt_alleles": {
+        "Number": "A",
+        "Description": "Total number of alternate alleles observed at variant locus",
+    },
+    "was_mixed": {"Description": "Variant type was mixed"},
+    "has_star": {
+        "Description": "Variant locus coincides with a spanning deletion (represented by a star) observed elsewhere in the callset"
+    },
+    "pab_max": {
+        "Number": "A",
+        "Description": "Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5",
+    },
+}
+"""
+Dictionary used during VCF export to export row (variant) annotations.
+"""
+
+FORMAT_DICT = {
+    "GT": {"Description": "Genotype", "Number": "1", "Type": "String"},
+    "AD": {
+        "Description": "Allelic depths for the ref and alt alleles in the order listed",
+        "Number": "R",
+        "Type": "Integer",
+    },
+    "DP": {
+        "Description": "Approximate read depth (reads with MQ=255 or with bad mates are filtered)",
+        "Number": "1",
+        "Type": "Integer",
+    },
+    "GQ": {
+        "Description": "Phred-scaled confidence that the genotype assignment is correct. Value is the difference between the second lowest PL and the lowest PL (always normalized to 0).",
+        "Number": "1",
+        "Type": "Integer",
+    },
+    "MIN_DP": {
+        "Description": "Minimum DP observed within the GVCF block",
+        "Number": "1",
+        "Type": "Integer",
+    },
+    "PGT": {
+        "Description": "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another",
+        "Number": "1",
+        "Type": "String",
+    },
+    "PID": {
+        "Description": "Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group",
+        "Number": "1",
+        "Type": "String",
+    },
+    "PL": {
+        "Description": "Normalized, phred-scaled likelihoods for genotypes as defined in the VCF specification",
+        "Number": "G",
+        "Type": "Integer",
+    },
+    "SB": {
+        "Description": "Per-sample component statistics which comprise the Fisher's exact test to detect strand bias. Values are: depth of reference allele on forward strand, depth of reference allele on reverse strand, depth of alternate allele on forward strand, depth of alternate allele on reverse strand.",
+        "Number": "4",
+        "Type": "Integer",
+    },
+}
+"""
+Dictionary used during VCF export to export MatrixTable entries.
+"""
+
 
 def ht_to_vcf_mt(
     info_ht: hl.Table,
@@ -92,3 +271,110 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi
     # Create an MT with no cols so that we acn export to VCF
     info_mt = info_ht.to_matrix_table_row_major(columns=["s"], entry_field_name="s")
     return info_mt.filter_cols(False)
+
+
+def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]:
+    """
+    Make combinations of all possible labels for a supplied dictionary of label groups
+    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"])
+    :return: list of all possible combinations of values for the supplied label groupings
+    :rtype: list[str]
+    """
+    copy_label_groups = copy.deepcopy(label_groups)
+    if len(copy_label_groups) == 1:
+        return [item for sublist in copy_label_groups.values() for item in sublist]
+    anchor_group = sorted(copy_label_groups.keys(), key=lambda x: SORT_ORDER.index(x))[
+        0
+    ]
+    anchor_val = copy_label_groups.pop(anchor_group)
+    combos = []
+    for x, y in itertools.product(anchor_val, make_label_combos(copy_label_groups)):
+        combos.append("{0}_{1}".format(x, y))
+    return combos
+
+
+def generic_field_check(
+    ht: hl.Table, cond_expr, check_description, display_fields, verbose
+):
+    """
+    Check a generic logical condition involving annotations in a Hail Table and print the results to terminal
+    :param Table ht: Table containing annotations to be checked
+    :param Expression cond_expr: logical expression referring to annotations in ht to be checked
+    :param str check_description: String describing the condition being checked; is displayed in terminal summary message
+    :param list of str display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
+        these fields are also displayed if verbose is True
+    :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
+        show only top values of annotations that fail checks
+    :rtype: None
+    """
+    ht_orig = ht
+    ht = ht.filter(cond_expr)
+    n_fail = ht.count()
+    if n_fail > 0:
+        logger.info(f"Found {n_fail} sites that fail {check_description} check:")
+        ht = ht.flatten()
+        ht.select("locus", "alleles", *display_fields).show()
+    else:
+        logger.info(f"PASSED {check_description} check")
+        if verbose:
+            ht_orig = ht_orig.flatten()
+            ht_orig.select(*display_fields).show()
+
+
+def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
+    """
+    Make Hail Expressions to measure % variants filtered under varying conditions of interest
+    :param Table ht: Hail Table containing 'filter' annotation to be examined
+    :return: Dictionary containing Hail aggregation expressions to measure filter flags
+    :rtype: Dict of str: Expression
+    """
+    filters_dict = {
+        "n": hl.agg.count(),
+        "frac_any_filter": hl.agg.count_where(ht.is_filtered) / hl.agg.count(),
+        "frac_inbreed_coeff": hl.agg.count_where(
+            ht.filters.contains("inbreeding_coeff")
+        )
+        / hl.agg.count(),
+        "frac_ac0": hl.agg.count_where(ht.filters.contains("AC0")) / hl.agg.count(),
+        "frac_rf": hl.agg.count_where(ht.filters.contains("rf")) / hl.agg.count(),
+        "frac_inbreed_coeff_only": hl.agg.count_where(
+            ht.filters.contains("inbreeding_coeff") & (ht.filters.length() == 1)
+        )
+        / hl.agg.count(),
+        "frac_ac0_only": hl.agg.count_where(
+            ht.filters.contains("AC0") & (ht.filters.length() == 1)
+        )
+        / hl.agg.count(),
+        "frac_rf_only": hl.agg.count_where(
+            ht.filters.contains("rf") & (ht.filters.length() == 1)
+        )
+        / hl.agg.count(),
+    }
+    return filters_dict
+
+
+def make_vcf_filter_dict(
+    ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float
+) -> Dict[str, str]:
+    """
+    Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations.
+
+    :param Table ht: Table containing global annotations of the Random Forests SNP and indel cutoffs.
+    :param float snp_cutoff: Minimum SNP cutoff score from random forest model.
+    :param float indel_cutoff: Minimum indel cutoff score from random forest model.
+    :param float inbreeding_cutoff: Inbreeding coefficient hard cutoff.
+    :return: Dictionary keyed by VCF FILTER annotations, where values are Dictionaries of Number and Description attributes.
+    :rtype: Dict[str, str]
+    """
+    filter_dict = {
+        "AC0": {
+            "Description": "Allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB < 0.2 for het calls)"
+        },
+        "InbreedingCoeff": {"Description": f"InbreedingCoeff < {inbreeding_cutoff}"},
+        "RF": {
+            "Description": f"Failed random forest filtering thresholds of {snp_cutoff}, {indel_cutoff} (probabilities of being a true positive variant) for SNPs, indels"
+        },
+        "PASS": {"Description": "Passed all variant filters"},
+    }
+    return filter_dict

From 26956b969acf13748fda1510f679c7b67649d2ce Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 21 Jul 2020 12:42:48 -0400
Subject: [PATCH 30/61] Added AS_FIELDS, SITE_FIELDS constants and added
 function add_as_info_dict to update INFO_DICT with AS fields and their values

---
 gnomad/utils/vcf.py | 71 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 14 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 8138e0a07..a19aa215a 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -42,6 +42,36 @@
 Sample sexes used in VCF export.
 """
 
+AS_FIELDS = [
+    "AS_BaseQRankSum",
+    "AS_FS",
+    "AS_MQ",
+    "AS_MQRankSum",
+    "AS_pab_max",
+    "AS_QD",
+    "AS_ReadPosRankSum",
+    "AS_SOR",
+    "AS_VarDP",
+]
+"""
+Allele-specific variant annotations.
+"""
+
+SITE_FIELDS = [
+    "FS",
+    "InbreedingCoeff",
+    "MQ",
+    "MQRankSum",
+    "QD",
+    "ReadPosRankSum",
+    "sibling_singleton",
+    "SOR",
+    "transmitted_singleton",
+    "VarDP",
+]
+"""
+Site level variant annotations.
+"""
 
 INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [
     "AS_QUALapprox",
@@ -77,21 +107,12 @@
     "VQSR_NEGATIVE_TRAIN_SITE": {
         "Description": "Variant was used to build the negative training set of low-quality variants for VQSR"
     },
-    "AS_BaseQRankSum": {
-        "Number": "A",
-        "Description": "Allele-specific Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities",
-    },
-    "ClippingRankSum": {
-        "Number": "A",
-        "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference number of hard clipped bases",
+    "BaseQRankSum": {
+        "Description": "Z-score from Wilcoxon rank sum test of alternate vs. reference base qualities",
     },
     "VarDP": {
         "Description": "Depth over variant genotypes (does not include depth of reference samples)"
     },
-    "AS_VarDP": {
-        "Number": "A",
-        "Description": "Allele-specific depth over variant genotypes (does not include depth of reference samples)",
-    },
     "AS_VQSLOD": {
         "Number": "A",
         "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model",
@@ -101,9 +122,6 @@
         "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model",
     },
     "lcr": {"Description": "Variant falls within a low complexity region"},
-    "fail_interval_qc": {
-        "Description": f"Variant falls within a region where less than {INTERVAL_QC_PARAMETERS[0]}% of samples had a mean coverage of {INTERVAL_QC_PARAMETERS[1]}X"
-    },
     "nonpar": {
         "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region"
     },
@@ -354,6 +372,31 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
     return filters_dict
 
 
+def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None:
+    """
+    Updates info dictionary with allele-specific terms and their descriptions.
+
+    Used in VCF export.
+
+    :param INFO_DICT: Dictionary containing site-level annotations and their descriptions.
+    :return: None
+    """
+    prefix_text = "Allele-specific"
+    AS_DICT = {}
+
+    for field in AS_FIELDS:
+        # Strip AS_ from field name
+        site_field = field[3:]
+
+        AS_DICT[field] = {}
+        AS_DICT[field]["Number"] = "A"
+        AS_DICT[field][
+            "Description"
+        ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}"
+
+    INFO_DICT.update(AS_DICT)
+
+
 def make_vcf_filter_dict(
     ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float
 ) -> Dict[str, str]:

From 02f4f5633ef6144b9f298c55bf0d0aad09c5f4b3 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 21 Jul 2020 13:06:25 -0400
Subject: [PATCH 31/61] Updated make_as_info_dict and added more constants
 (RF_FIELDS, VQSR_FIELDS, REGION_TYPE_FIELDS, ALLELE_TYPE_FIELDS

---
 gnomad/utils/vcf.py | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index a19aa215a..d27e8ea00 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -73,6 +73,33 @@
 Site level variant annotations.
 """
 
+ALLELE_TYPE_FIELDS = ["allele_type", "has_star", "n_alt_alleles", "original_alleles", "variant_type", "was_mixed"]
+"""
+Allele-type annotations.
+"""
+
+REGION_TYPE_FIELDS = ["lcr", "nonpar"]
+"""
+Annotations about variant region type.
+"""
+
+RF_FIELDS = [
+    "fail_hard_filters",
+    "filters",
+    "rf_label",
+    "rf_train",
+    "rf_probability",
+    "tp",
+]
+"""
+Annotations specific to the random forest model.
+"""
+
+VQSR_FIELDS = ["AS_VQSLOD", "AS_culprit", "NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE"]
+"""
+Annotations specific to VQSR.
+"""
+
 INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [
     "AS_QUALapprox",
     "AS_VarDP",
@@ -372,14 +399,14 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
     return filters_dict
 
 
-def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None:
+def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
     """
     Updates info dictionary with allele-specific terms and their descriptions.
 
     Used in VCF export.
 
     :param INFO_DICT: Dictionary containing site-level annotations and their descriptions.
-    :return: None
+    :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field.
     """
     prefix_text = "Allele-specific"
     AS_DICT = {}
@@ -394,7 +421,7 @@ def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> None:
             "Description"
         ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}"
 
-    INFO_DICT.update(AS_DICT)
+    return AS_DICT
 
 
 def make_vcf_filter_dict(

From e7edd87713a93f2a76226d5a1f8b7a609c7c2260 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 21 Jul 2020 13:27:26 -0400
Subject: [PATCH 32/61] Added constants for entries to select during export

---
 gnomad/utils/vcf.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index d27e8ea00..cbcbe8b28 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -73,7 +73,14 @@
 Site level variant annotations.
 """
 
-ALLELE_TYPE_FIELDS = ["allele_type", "has_star", "n_alt_alleles", "original_alleles", "variant_type", "was_mixed"]
+ALLELE_TYPE_FIELDS = [
+    "allele_type",
+    "has_star",
+    "n_alt_alleles",
+    "original_alleles",
+    "variant_type",
+    "was_mixed",
+]
 """
 Allele-type annotations.
 """
@@ -191,6 +198,28 @@
 Dictionary used during VCF export to export row (variant) annotations.
 """
 
+ENTRIES = ["GT", "GQ", "DP", "AD", "MIN_DP", "PGT", "PID", "PL", "SB"]
+"""
+Densified entries to be selected during VCF export.
+"""
+
+SPARSE_ENTRIES = [
+    "DP",
+    "GQ",
+    "LA",
+    "LAD",
+    "LGT",
+    "LPGT",
+    "LPL",
+    "MIN_DP",
+    "PID",
+    "RGQ",
+    "SB",
+]
+"""
+Sparse entries to be selected and densified during VCF export.
+"""
+
 FORMAT_DICT = {
     "GT": {"Description": "Genotype", "Number": "1", "Type": "String"},
     "AD": {

From 922a67476694c4af6b73e7c46eb67355ad10540d Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 21 Jul 2020 17:00:18 -0400
Subject: [PATCH 33/61] Added make_info_dict to vcf.py

---
 gnomad/utils/vcf.py | 125 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index cbcbe8b28..afeff1b27 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -428,6 +428,131 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
     return filters_dict
 
 
+def make_info_dict(
+    prefix: str,
+    label_groups: Dict[str, str] = None,
+    bin_edges: Dict[str, str] = None,
+    faf: bool = False,
+    popmax: bool = False,
+    description_text: str = "",
+    age_hist_data: str = None,
+) -> Dict[str, Dict[str, str]]:
+    """
+    Generate dictionary of Number and Description attributes of VCF INFO fields.
+
+    Used to populate the VCF header during export.
+    
+    :param str prefix: gnomAD or empty string.
+    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
+    :param dict bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
+    :param bool faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.
+    :param bool popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.
+    :param bool description_text: Optinal text to append to the end of age and popmax descriptions. 
+    :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`.
+    :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes.
+    :rtype: Dict[str, Dict[str, str]]
+    """
+    if prefix != "":
+        prefix = f"{prefix}_"
+
+    info_dict = dict()
+
+    if age_hist_data:
+        age_hist_dict = {
+            f"{prefix}age_hist_het_bin_freq": {
+                "Number": "A",
+                "Description": f"Histogram of ages of heterozygous individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
+            },
+            f"{prefix}age_hist_het_n_smaller": {
+                "Number": "A",
+                "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_test}",
+            },
+            f"{prefix}age_hist_het_n_larger": {
+                "Number": "A",
+                "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_test}",
+            },
+            f"{prefix}age_hist_hom_bin_freq": {
+                "Number": "A",
+                "Description": f"Histogram of ages of homozygous alternate individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
+            },
+            f"{prefix}age_hist_hom_n_smaller": {
+                "Number": "A",
+                "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_test}",
+            },
+            f"{prefix}age_hist_hom_n_larger": {
+                "Number": "A",
+                "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_test}",
+            },
+        }
+        info_dict.update(age_hist_dict)
+
+    if popmax:
+        popmax_dict = {
+            f"{prefix}popmax": {
+                "Number": "A",
+                "Description": f"Population with maximum AF{description_test}",
+            },
+            f"{prefix}AC_popmax": {
+                "Number": "A",
+                "Description": f"Allele count in the population with the maximum AF{description_test}",
+            },
+            f"{prefix}AN_popmax": {
+                "Number": "A",
+                "Description": f"Total number of alleles in the population with the maximum AF{description_test}",
+            },
+            f"{prefix}AF_popmax": {
+                "Number": "A",
+                "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_test}",
+            },
+            f"{prefix}nhomalt_popmax": {
+                "Number": "A",
+                "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_test}",
+            },
+        }
+        info_dict.update(popmax_dict)
+
+    else:
+        group_types = sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x))
+        combos = make_label_combos(label_groups)
+
+        for combo in combos:
+            combo_fields = combo.split("_")
+
+            if not faf:
+                combo_dict = {
+                    f"{prefix}AC_{combo}": {
+                        "Number": "A",
+                        "Description": f"Alternate allele count{make_combo_header_text('for', group_types, combo_fields, prefix)}",
+                    },
+                    f"{prefix}AN_{combo}": {
+                        "Number": "1",
+                        "Description": f"Total number of alleles{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                    },
+                    f"{prefix}AF_{combo}": {
+                        "Number": "A",
+                        "Description": f"Alternate allele frequency{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                    },
+                    f"{prefix}nhomalt_{combo}": {
+                        "Number": "A",
+                        "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                    },
+                }
+            else:
+                combo_dict = {
+                    f"{prefix}faf95_{combo}": {
+                        "Number": "A",
+                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}",
+                    },
+                    f"{prefix}faf99_{combo}": {
+                        "Number": "A",
+                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}",
+                    },
+                }
+            info_dict.update(combo_dict)
+    return info_dict
+
+
 def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
     """
     Updates info dictionary with allele-specific terms and their descriptions.

From 9ff11ca11c9c72cd5c15e00c0e0677b4ea14eb10 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 09:48:18 -0400
Subject: [PATCH 34/61] Removed unnecessary pop constants (can be imported from
 ancestry.py)

---
 gnomad/utils/vcf.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index afeff1b27..3022a63e5 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -17,21 +17,6 @@
 Quality histograms used in VCF export.
 """
 
-POPS = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"]
-"""
-Continental population names used in VCF export.
-"""
-
-NFE_SUBPOPS = ["onf", "bgr", "swe", "nwe", "seu", "est"]
-"""
-gnomAD subpopulations for NFE population. Used in VCF export.
-"""
-
-EAS_SUBPOPS = ["kor", "oea", "jpn"]
-"""
-gnomAD subpopulations for EAS population. Used in VCF export.
-"""
-
 FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"]
 """
 Populations that are included in filtering allele frequency calculations. Used in VCF export.

From 1bb7162e438f680bb5216babe99ab7d0cf352450 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 10:20:33 -0400
Subject: [PATCH 35/61] Added make hist bin edges expr function to vcf.py, also
 removed types from docstrings

---
 gnomad/utils/vcf.py | 78 +++++++++++++++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 20 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 3022a63e5..081996506 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -355,17 +355,16 @@ def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]:
 
 def generic_field_check(
     ht: hl.Table, cond_expr, check_description, display_fields, verbose
-):
+) -> None:
     """
     Check a generic logical condition involving annotations in a Hail Table and print the results to terminal
-    :param Table ht: Table containing annotations to be checked
-    :param Expression cond_expr: logical expression referring to annotations in ht to be checked
-    :param str check_description: String describing the condition being checked; is displayed in terminal summary message
-    :param list of str display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
+    :param ht: Table containing annotations to be checked
+    :param cond_expr: logical expression referring to annotations in ht to be checked
+    :param check_description: String describing the condition being checked; is displayed in terminal summary message
+    :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
         these fields are also displayed if verbose is True
     :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
         show only top values of annotations that fail checks
-    :rtype: None
     """
     ht_orig = ht
     ht = ht.filter(cond_expr)
@@ -384,9 +383,8 @@ def generic_field_check(
 def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
     """
     Make Hail Expressions to measure % variants filtered under varying conditions of interest
-    :param Table ht: Hail Table containing 'filter' annotation to be examined
+    :param ht: Hail Table containing 'filter' annotation to be examined
     :return: Dictionary containing Hail aggregation expressions to measure filter flags
-    :rtype: Dict of str: Expression
     """
     filters_dict = {
         "n": hl.agg.count(),
@@ -427,16 +425,15 @@ def make_info_dict(
 
     Used to populate the VCF header during export.
     
-    :param str prefix: gnomAD or empty string.
-    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+    :param prefix: gnomAD or empty string.
+    :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
         e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
-    :param dict bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
-    :param bool faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.
-    :param bool popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.
-    :param bool description_text: Optinal text to append to the end of age and popmax descriptions. 
+    :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
+    :param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.
+    :param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.
+    :param description_text: Optinal text to append to the end of age and popmax descriptions. 
     :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`.
     :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes.
-    :rtype: Dict[str, Dict[str, str]]
     """
     if prefix != "":
         prefix = f"{prefix}_"
@@ -569,12 +566,11 @@ def make_vcf_filter_dict(
     """
     Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations.
 
-    :param Table ht: Table containing global annotations of the Random Forests SNP and indel cutoffs.
-    :param float snp_cutoff: Minimum SNP cutoff score from random forest model.
-    :param float indel_cutoff: Minimum indel cutoff score from random forest model.
-    :param float inbreeding_cutoff: Inbreeding coefficient hard cutoff.
+    :param ht: Table containing global annotations of the Random Forests SNP and indel cutoffs.
+    :param snp_cutoff: Minimum SNP cutoff score from random forest model.
+    :param indel_cutoff: Minimum indel cutoff score from random forest model.
+    :param inbreeding_cutoff: Inbreeding coefficient hard cutoff.
     :return: Dictionary keyed by VCF FILTER annotations, where values are Dictionaries of Number and Description attributes.
-    :rtype: Dict[str, str]
     """
     filter_dict = {
         "AC0": {
@@ -587,3 +583,45 @@ def make_vcf_filter_dict(
         "PASS": {"Description": "Passed all variant filters"},
     }
     return filter_dict
+
+
+def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, str]:
+    """
+    Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string
+    separated by pipe delimiters.
+
+    :param ht: Table containing histogram variant annotations.
+    :param prefix: Prefix text for age histogram bin edges. Default is gnomad_.
+    :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values.
+    """
+    edges_dict = {
+        "het": "|".join(
+            map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges)
+        ),
+        "hom": "|".join(
+            map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges)
+        ),
+    }
+    for hist in HISTS:
+
+        # Parse hists calculated on both raw and adj-filtered data
+        for hist_type in ["adj_qual_hists", "qual_hists"]:
+            hist_name = hist
+            if "adj" in hist_type:
+                hist_name = f"{hist}_adj"
+            edges_dict[hist_name] = (
+                "|".join(
+                    map(
+                        lambda x: f"{x:.2f}",
+                        ht.head(1)[hist_type][hist].collect()[0].bin_edges,
+                    )
+                )
+                if "ab" in hist
+                else "|".join(
+                    map(
+                        lambda x: str(int(x)),
+                        ht.head(1)[hist_type][hist].collect()[0].bin_edges,
+                    )
+                )
+            )
+    return edges_dict

From 48bf3bbec6afe8a01dcc2d96b0ad31fc460cc215 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 10:29:54 -0400
Subject: [PATCH 36/61] Added make hist dict

---
 gnomad/utils/vcf.py | 69 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 57 insertions(+), 12 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 081996506..85cacf397 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -334,11 +334,11 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi
 
 def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]:
     """
-    Make combinations of all possible labels for a supplied dictionary of label groups
+    Make combinations of all possible labels for a supplied dictionary of label groups.
+
     :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
-        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"])
-    :return: list of all possible combinations of values for the supplied label groupings
-    :rtype: list[str]
+        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
+    :return: list of all possible combinations of values for the supplied label groupings.
     """
     copy_label_groups = copy.deepcopy(label_groups)
     if len(copy_label_groups) == 1:
@@ -358,13 +358,14 @@ def generic_field_check(
 ) -> None:
     """
     Check a generic logical condition involving annotations in a Hail Table and print the results to terminal
-    :param ht: Table containing annotations to be checked
-    :param cond_expr: logical expression referring to annotations in ht to be checked
-    :param check_description: String describing the condition being checked; is displayed in terminal summary message
+
+    :param ht: Table containing annotations to be checked.
+    :param cond_expr: logical expression referring to annotations in ht to be checked.
+    :param check_description: String describing the condition being checked; is displayed in terminal summary message.
     :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
-        these fields are also displayed if verbose is True
+        these fields are also displayed if verbose is True.
     :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
-        show only top values of annotations that fail checks
+        show only top values of annotations that fail checks.
     """
     ht_orig = ht
     ht = ht.filter(cond_expr)
@@ -382,9 +383,9 @@ def generic_field_check(
 
 def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
     """
-    Make Hail Expressions to measure % variants filtered under varying conditions of interest
-    :param ht: Hail Table containing 'filter' annotation to be examined
-    :return: Dictionary containing Hail aggregation expressions to measure filter flags
+    Make Hail Expressions to measure % variants filtered under varying conditions of interest.
+    :param ht: Hail Table containing 'filter' annotation to be examined.
+    :return: Dictionary containing Hail aggregation expressions to measure filter flags.
     """
     filters_dict = {
         "n": hl.agg.count(),
@@ -625,3 +626,47 @@ def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str,
                 )
             )
     return edges_dict
+
+
+def make_hist_dict(bin_edges: Dict[str, Dict[str, str]], adj: bool) -> Dict[str, str]:
+    """
+    Generate dictionary of Number and Description attributes to be used in the VCF header, specifically for histogram annotations.
+
+    :param bin_edges: Dictionary keyed by histogram annotation name, with corresponding string-reformatted bin edges for values.
+    :param adj: Whether to create a header dict for raw or adj quality histograms.
+    :return: Dictionary keyed by VCF INFO annotations, where values are Dictionaries of Number and Description attributes.
+    """
+    header_hist_dict = {}
+    for hist in HISTS:
+
+        # Get hists for both raw and adj data
+        if adj:
+            hist = f"{hist}_adj"
+
+        edges = bin_edges[hist]
+        hist_fields = hist.split("_")
+        hist_text = hist_fields[0].upper()
+
+        if hist_fields[2] == "alt":
+            hist_text = hist_text + " in heterozygous individuals"
+        if adj:
+            hist_text = hist_text + " calculated on high quality genotypes"
+
+        hist_dict = {
+            f"{hist}_bin_freq": {
+                "Number": "A",
+                "Description": f"Histogram for {hist_text}; bin edges are: {edges}",
+            },
+            f"{hist}_n_smaller": {
+                "Number": "A",
+                "Description": f"Count of {hist_fields[0].upper()} values falling below lowest histogram bin edge {hist_text}",
+            },
+            f"{hist}_n_larger": {
+                "Number": "A",
+                "Description": f"Count of {hist_fields[0].upper()} values falling above highest histogram bin edge {hist_text}",
+            },
+        }
+
+        header_hist_dict.update(hist_dict)
+
+    return header_hist_dict

From e736da01d58eaf0de3c15a852be4bdfc9fbb848a Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 10:33:02 -0400
Subject: [PATCH 37/61] Updated changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e4ab4b5f1..78388cbf2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
 * Fix for error in `compute_quantile_bin` that caused incorrect binning when a single score overlapped multiple bins. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
 * Removed assumption of `snv` annotation from `compute_quantile_bin`. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
 * Fixed `create_binned_ht` because it produced a "Cannot combine expressions from different source objects error". [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
+* Added constants and functions relevant to VCF export [(#241)](https://github.com/broadinstitute/gnomad_methods/pull/241)
 
 ## Version 0.4.0 - July 9th, 2020
 

From ffaa908a3cf895349e6816ae62b94f803b96f7ff Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 10:56:30 -0400
Subject: [PATCH 38/61] Added SORT_ORDER and sample_sum_check to vcf.py

---
 gnomad/utils/vcf.py | 98 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 86 insertions(+), 12 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 85cacf397..6bee6707e 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -1,11 +1,15 @@
 import hail as hl
-from typing import List
+from typing import Dict, List
 import logging
 
 logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
+SORT_ORDER = ["popmax", "group", "pop", "subpop", "sex"]
+"""
+Order to sort subgroupings during VCF export.
+"""
 
 GROUPS = ["adj", "raw"]
 """
@@ -445,27 +449,27 @@ def make_info_dict(
         age_hist_dict = {
             f"{prefix}age_hist_het_bin_freq": {
                 "Number": "A",
-                "Description": f"Histogram of ages of heterozygous individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
+                "Description": f"Histogram of ages of heterozygous individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
             },
             f"{prefix}age_hist_het_n_smaller": {
                 "Number": "A",
-                "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_test}",
+                "Description": f"Count of age values falling below lowest histogram bin edge for heterozygous individuals{description_text}",
             },
             f"{prefix}age_hist_het_n_larger": {
                 "Number": "A",
-                "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_test}",
+                "Description": f"Count of age values falling above highest histogram bin edge for heterozygous individuals{description_text}",
             },
             f"{prefix}age_hist_hom_bin_freq": {
                 "Number": "A",
-                "Description": f"Histogram of ages of homozygous alternate individuals{description_test}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
+                "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
             },
             f"{prefix}age_hist_hom_n_smaller": {
                 "Number": "A",
-                "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_test}",
+                "Description": f"Count of age values falling below lowest histogram bin edge for homozygous alternate individuals{description_text}",
             },
             f"{prefix}age_hist_hom_n_larger": {
                 "Number": "A",
-                "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_test}",
+                "Description": f"Count of age values falling above highest histogram bin edge for homozygous alternate individuals{description_text}",
             },
         }
         info_dict.update(age_hist_dict)
@@ -474,23 +478,23 @@ def make_info_dict(
         popmax_dict = {
             f"{prefix}popmax": {
                 "Number": "A",
-                "Description": f"Population with maximum AF{description_test}",
+                "Description": f"Population with maximum AF{description_text}",
             },
             f"{prefix}AC_popmax": {
                 "Number": "A",
-                "Description": f"Allele count in the population with the maximum AF{description_test}",
+                "Description": f"Allele count in the population with the maximum AF{description_text}",
             },
             f"{prefix}AN_popmax": {
                 "Number": "A",
-                "Description": f"Total number of alleles in the population with the maximum AF{description_test}",
+                "Description": f"Total number of alleles in the population with the maximum AF{description_text}",
             },
             f"{prefix}AF_popmax": {
                 "Number": "A",
-                "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_test}",
+                "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_text}",
             },
             f"{prefix}nhomalt_popmax": {
                 "Number": "A",
-                "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_test}",
+                "Description": f"Count of homozygous individuals in the population with the maximum allele frequency{description_text}",
             },
         }
         info_dict.update(popmax_dict)
@@ -670,3 +674,73 @@ def make_hist_dict(bin_edges: Dict[str, Dict[str, str]], adj: bool) -> Dict[str,
         header_hist_dict.update(hist_dict)
 
     return header_hist_dict
+
+
+def sample_sum_check(
+    ht: hl.Table,
+    prefix: str,
+    label_groups: Dict[str, List[str]],
+    verbose: bool,
+    subpop: bool = None,
+) -> None:
+    """
+    Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version;
+    display results from checking the sum of the specified annotations in the terminal.
+
+    :param Table ht: Table containing annotations to be summed.
+    :param str prefix: String indicating sample subset.
+    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
+    :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
+        show only top values of annotations that fail checks.
+    :param str subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked.
+    :rtype: None
+    """
+    combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)]
+    combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in make_label_combos(label_groups)]
+    combo_nhomalt = [
+        ht.info[f"{prefix}nhomalt_{x}"] for x in make_label_combos(label_groups)
+    ]
+
+    group = label_groups.pop("group")[0]
+    alt_groups = "_".join(
+        sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x))
+    )
+
+    annot_dict = {
+        f"sum_AC_{group}_{alt_groups}": hl.sum(combo_AC),
+        f"sum_AN_{group}_{alt_groups}": hl.sum(combo_AN),
+        f"sum_nhomalt_{group}_{alt_groups}": hl.sum(combo_nhomalt),
+    }
+
+    ht = ht.annotate(**annot_dict)
+
+    for subfield in ["AC", "AN", "nhomalt"]:
+        if not subpop:
+            generic_field_check(
+                ht,
+                (
+                    ht.info[f"{prefix}{subfield}_{group}"]
+                    != ht[f"sum_{subfield}_{group}_{alt_groups}"]
+                ),
+                f"{prefix}{subfield}_{group} = sum({subfield}_{group}_{alt_groups})",
+                [
+                    f"info.{prefix}{subfield}_{group}",
+                    f"sum_{subfield}_{group}_{alt_groups}",
+                ],
+                verbose,
+            )
+        else:
+            generic_field_check(
+                ht,
+                (
+                    ht.info[f"{prefix}{subfield}_{group}_{subpop}"]
+                    != ht[f"sum_{subfield}_{group}_{alt_groups}"]
+                ),
+                f"{prefix}{subfield}_{group}_{subpop} = sum({subfield}_{group}_{alt_groups})",
+                [
+                    f"info.{prefix}{subfield}_{group}_{subpop}",
+                    f"sum_{subfield}_{group}_{alt_groups}",
+                ],
+                verbose,
+            )

From 00b92ec5208de64a0492981fef61de94f3014df4 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 11:38:18 -0400
Subject: [PATCH 39/61] Moved make combo header text to vcf.py

---
 gnomad/utils/vcf.py | 58 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 6bee6707e..47f02659a 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -1,6 +1,9 @@
-import hail as hl
-from typing import Dict, List
 import logging
+from typing import Dict, List
+
+import hail as hl
+
+from gnomad.sample_qc.ancestry import POP_NAMES
 
 logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
 logger = logging.getLogger(__name__)
@@ -416,6 +419,53 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
     return filters_dict
 
 
+def make_combo_header_text(
+    preposition: str,
+    group_types: List[str],
+    combo_fields: List[str],
+    prefix: str,
+) -> str:
+    """
+    Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.
+
+    For example, if preposition is "for", group_types is ["group", "pop", "sex"], and combo_fields is ["adj", "afr", "female"],
+    this function will return the string " for female samples of African-American/African ancestry".
+
+    :param preposition: Relevant preposition to precede automatically generated text.
+    :param group_types: List of grouping types, e.g. "sex" or "pop".
+    :param combo_fields: List of the specific values for each grouping type, for which the text is being generated.
+    :param prefix: Prefix string indicating sample subset.
+    :return: String with automatically generated description text for a given set of combo fields.
+    """
+    combo_dict = dict(zip(group_types, combo_fields))
+    header_text = " " + preposition
+
+    if len(combo_dict.keys()) == 1:
+        if combo_dict["group"] == "adj":
+            return ""
+
+    if "sex" in combo_dict.keys():
+        header_text = header_text + " " + combo_dict["sex"]
+
+    header_text = header_text + " samples"
+
+    if "subpop" in combo_dict.keys():
+        header_text = header_text + f" of {POP_NAMES[combo_dict['subpop']]} ancestry"
+        combo_dict.pop("pop")
+
+    if "pop" in combo_dict.keys():
+        header_text = header_text + f" of {POP_NAMES[combo_dict['pop']]} ancestry"
+
+    if "gnomad" in prefix:
+        header_text = header_text + " in gnomAD"
+
+    if "group" in group_types:
+        if combo_dict["group"] == "raw":
+            header_text = header_text + ", before removing low-confidence genotypes"
+
+    return header_text
+
+
 def make_info_dict(
     prefix: str,
     label_groups: Dict[str, str] = None,
@@ -529,11 +579,11 @@ def make_info_dict(
                 combo_dict = {
                     f"{prefix}faf95_{combo}": {
                         "Number": "A",
-                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}",
+                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}",
                     },
                     f"{prefix}faf99_{combo}": {
                         "Number": "A",
-                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix, faf=True)}",
+                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}",
                     },
                 }
             info_dict.update(combo_dict)

From 6ed92b0a5a71c8b105f5efb2661cc3f5d6b96543 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 11:42:12 -0400
Subject: [PATCH 40/61] Fixed imports and reformatted with black

---
 gnomad/utils/vcf.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 47f02659a..002b791df 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -1,3 +1,5 @@
+import copy
+import itertools
 import logging
 from typing import Dict, List
 
@@ -420,10 +422,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
 
 
 def make_combo_header_text(
-    preposition: str,
-    group_types: List[str],
-    combo_fields: List[str],
-    prefix: str,
+    preposition: str, group_types: List[str], combo_fields: List[str], prefix: str,
 ) -> str:
     """
     Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.
@@ -616,7 +615,7 @@ def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str
 
 
 def make_vcf_filter_dict(
-    ht: hl.Table, snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float
+    snp_cutoff: float, indel_cutoff: float, inbreeding_cutoff: float
 ) -> Dict[str, str]:
     """
     Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations.
@@ -650,10 +649,10 @@ def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str,
     :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values.
     """
     edges_dict = {
-        "het": "|".join(
+        f"{prefix}het": "|".join(
             map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges)
         ),
-        "hom": "|".join(
+        f"{prefix}hom": "|".join(
             map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges)
         ),
     }

From 191b073727cbfcb05c3515a9a139994dcae6b6cd Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 11:47:25 -0400
Subject: [PATCH 41/61] Changed docstring for make_hist_bin_edges_expr

---
 gnomad/utils/vcf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 002b791df..f2c980ce4 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -639,13 +639,13 @@ def make_vcf_filter_dict(
     return filter_dict
 
 
-def make_hist_bin_edges_expr(ht: hl.Table, prefix: str = "gnomad_") -> Dict[str, str]:
+def make_hist_bin_edges_expr(ht: hl.Table, prefix: str) -> Dict[str, str]:
     """
     Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string
     separated by pipe delimiters.
 
     :param ht: Table containing histogram variant annotations.
-    :param prefix: Prefix text for age histogram bin edges. Default is gnomad_.
+    :param prefix: Prefix text for age histogram bin edges. 
     :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values.
     """
     edges_dict = {

From d8a3e7a7db9450e85f060a15a84effeea538ef24 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 13:02:24 -0400
Subject: [PATCH 42/61] Added set female metrics to NA

---
 gnomad/utils/vcf.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index f2c980ce4..46edfda14 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -1,7 +1,7 @@
 import copy
 import itertools
 import logging
-from typing import Dict, List
+from typing import Dict, List, Union
 
 import hail as hl
 
@@ -793,3 +793,35 @@ def sample_sum_check(
                 ],
                 verbose,
             )
+
+
+def set_female_y_metrics_to_na(
+    t: Union[hl.Table, hl.MatrixTable], faf: bool = True,
+) -> Dict[str, hl.expr.Int32Expression]:
+    """
+    Set AC, AN, and nhomalt Y variant annotations for females to NA (instead of 0).
+
+    :param Table/MatrixTable t: Table/MatrixTable containing female variant annotations.
+    :return: Dictionary with reset annotations
+    :rtype: Dict[str, hl.expr.Int32Expression]
+    """
+    metrics = list(t.row.info)
+    female_metrics = [x for x in metrics if "_female" in x]
+    female_metrics = [
+        x for x in female_metrics if ("nhomalt" in x) or ("AC" in x) or ("AN" in x)
+    ]
+
+    if faf:
+        female_metrics.extend([x for x in female_metrics if "faf" in x])
+
+    female_metrics_dict = {}
+    for metric in female_metrics:
+        female_metrics_dict.update(
+            {
+                f"{metric}": hl.or_missing(
+                    (t.locus.contig.in_y_nonpar() | t.locus.contig.in_y_par()),
+                    t.info[f"{metric}"],
+                )
+            }
+        )
+    return female_metrics_dict

From 51a5a73a47d2786e4f1b3e022dc9f084a5f97e34 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 22 Jul 2020 13:06:08 -0400
Subject: [PATCH 43/61] Updated docstring in set female metrics to na

---
 gnomad/utils/vcf.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 46edfda14..010c9adba 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -799,11 +799,10 @@ def set_female_y_metrics_to_na(
     t: Union[hl.Table, hl.MatrixTable], faf: bool = True,
 ) -> Dict[str, hl.expr.Int32Expression]:
     """
-    Set AC, AN, and nhomalt Y variant annotations for females to NA (instead of 0).
+    Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0).
 
-    :param Table/MatrixTable t: Table/MatrixTable containing female variant annotations.
+    :param t: Table/MatrixTable containing female variant annotations.
     :return: Dictionary with reset annotations
-    :rtype: Dict[str, hl.expr.Int32Expression]
     """
     metrics = list(t.row.info)
     female_metrics = [x for x in metrics if "_female" in x]

From 6b1969dd943fc53445362cfc03976a0c39386039 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 12:15:21 -0400
Subject: [PATCH 44/61] Removed transmitted singleton and sibling singleton
 from SITE_FIELDS

---
 gnomad/utils/vcf.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 010c9adba..150125429 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -58,9 +58,7 @@
     "MQRankSum",
     "QD",
     "ReadPosRankSum",
-    "sibling_singleton",
     "SOR",
-    "transmitted_singleton",
     "VarDP",
 ]
 """

From 2af3587c0a9597b4a2aa0a80a168e0b86587599d Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:19:26 -0400
Subject: [PATCH 45/61] Updated make_label_combos docstring and fixed values
 for some constants/their descriptions

---
 gnomad/utils/vcf.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 150125429..690d37a7a 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -14,6 +14,7 @@
 SORT_ORDER = ["popmax", "group", "pop", "subpop", "sex"]
 """
 Order to sort subgroupings during VCF export.
+Ensures that INFO labels in VCF are in desired order (e.g., raw_AC_afr_female).
 """
 
 GROUPS = ["adj", "raw"]
@@ -28,12 +29,13 @@
 
 FAF_POPS = ["afr", "amr", "eas", "nfe", "sas"]
 """
-Populations that are included in filtering allele frequency calculations. Used in VCF export.
+Global populations that are included in filtering allele frequency (faf) calculations. Used in VCF export.
 """
 
 SEXES = ["male", "female"]
 """
 Sample sexes used in VCF export.
+Used to stratify frequency annotations (AC, AN, AF) for each sex.
 """
 
 AS_FIELDS = [
@@ -77,9 +79,12 @@
 Allele-type annotations.
 """
 
-REGION_TYPE_FIELDS = ["lcr", "nonpar"]
+REGION_FLAG_FIELDS = ["decoy", "lcr", "nonpar", "segdup"]
 """
 Annotations about variant region type.
+
+.. note::
+    decoy and segdup resource files do not currently exist for GrCh38/hg38.
 """
 
 RF_FIELDS = [
@@ -91,7 +96,7 @@
     "tp",
 ]
 """
-Annotations specific to the random forest model.
+Annotations specific to the variant QC using a random forest model.
 """
 
 VQSR_FIELDS = ["AS_VQSLOD", "AS_culprit", "NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE"]
@@ -147,10 +152,12 @@
         "Number": "A",
         "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model",
     },
+    "decoy": {"Description": "Variant falls within a reference decoy region"},
     "lcr": {"Description": "Variant falls within a low complexity region"},
     "nonpar": {
         "Description": "Variant (on sex chromosome) falls outside a pseudoautosomal region"
     },
+    "segdup": {"Description": "Variant falls within a segmental duplication region"},
     "rf_positive_label": {
         "Description": "Variant was labelled as a positive example for training of random forest model"
     },
@@ -339,18 +346,24 @@ def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpressi
     return info_mt.filter_cols(False)
 
 
-def make_label_combos(label_groups: Dict[str, List[str]]) -> List[str]:
+def make_label_combos(
+    label_groups: Dict[str, List[str]], sort_order: List[str] = SORT_ORDER,
+) -> List[str]:
     """
     Make combinations of all possible labels for a supplied dictionary of label groups.
 
-    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+    For example, if label_groups is `{"sex": ["male", "female"], "pop": ["afr", "nfe", "amr"]}`,
+    this function will return `["afr_male", "afr_female", "nfe_male", "nfe_female", "amr_male", "amr_female']`
+
+    :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
         e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
+    :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
     :return: list of all possible combinations of values for the supplied label groupings.
     """
     copy_label_groups = copy.deepcopy(label_groups)
     if len(copy_label_groups) == 1:
         return [item for sublist in copy_label_groups.values() for item in sublist]
-    anchor_group = sorted(copy_label_groups.keys(), key=lambda x: SORT_ORDER.index(x))[
+    anchor_group = sorted(copy_label_groups.keys(), key=lambda x: sort_order.index(x))[
         0
     ]
     anchor_val = copy_label_groups.pop(anchor_group)

From ca577082dc1aa9db67357e7b4a072eb9428e60b4 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:21:43 -0400
Subject: [PATCH 46/61] Updated docstring for generic_field_check

---
 gnomad/utils/vcf.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 690d37a7a..8f8e099e0 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -374,17 +374,23 @@ def make_label_combos(
 
 
 def generic_field_check(
-    ht: hl.Table, cond_expr, check_description, display_fields, verbose
+    ht: hl.Table,
+    cond_expr: hl.expr.BooleanExpression,
+    check_description: str,
+    display_fields: List[str],
+    verbose: bool,
 ) -> None:
     """
-    Check a generic logical condition involving annotations in a Hail Table and print the results to terminal
+    Check a generic logical condition involving annotations in a Hail Table and print the results to terminal.
+
+    Displays the number of rows in the Table that return False for a condition.
 
     :param ht: Table containing annotations to be checked.
-    :param cond_expr: logical expression referring to annotations in ht to be checked.
+    :param cond_expr: Logical expression referring to annotations in ht to be checked.
     :param check_description: String describing the condition being checked; is displayed in terminal summary message.
     :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
         these fields are also displayed if verbose is True.
-    :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
+    :param verbose: If True, show top values of annotations being checked, including checks that pass; if False,
         show only top values of annotations that fail checks.
     """
     ht_orig = ht

From 4428b0ccd5e32d5f4eb6fc63153eff8d9a3c457b Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:24:03 -0400
Subject: [PATCH 47/61] Updated docstring for make_filters_sanity_check_expr

---
 gnomad/utils/vcf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 8f8e099e0..ecbc12965 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -410,7 +410,8 @@ def generic_field_check(
 def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
     """
     Make Hail Expressions to measure % variants filtered under varying conditions of interest.
-    :param ht: Hail Table containing 'filter' annotation to be examined.
+
+    :param ht: Table containing 'filter' annotation to be examined.
     :return: Dictionary containing Hail aggregation expressions to measure filter flags.
     """
     filters_dict = {

From 75e997d5ebcd14cb7ff8b23a555d34e53be522ea Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:32:55 -0400
Subject: [PATCH 48/61] Updated make_filters_sanity_check_expr

---
 gnomad/utils/vcf.py | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index ecbc12965..1ffc677af 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -409,32 +409,37 @@ def generic_field_check(
 
 def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
     """
-    Make Hail Expressions to measure % variants filtered under varying conditions of interest.
+    Make Hail expressions to measure % variants filtered under varying conditions of interest.
+
+    Checks for:
+        - Total number of variants
+        - Fraction of variants removed by any filter
+        - Fraction of variants removed because of InbreedingCoefficient filter in combination with any other filter
+        - Fraction of varinats removed because of AC0 filter in combination with any other filter
+        - Fraction of variants removed because of random forest filtering in combination with any other filter
+        - Fraction of variants removed only because of InbreedingCoefficient filter
+        - Fraction of variants removed only because of AC0 filter
+        - Fraction of variants removed only because of random forest filtering
+
 
     :param ht: Table containing 'filter' annotation to be examined.
     :return: Dictionary containing Hail aggregation expressions to measure filter flags.
     """
     filters_dict = {
         "n": hl.agg.count(),
-        "frac_any_filter": hl.agg.count_where(ht.is_filtered) / hl.agg.count(),
-        "frac_inbreed_coeff": hl.agg.count_where(
-            ht.filters.contains("inbreeding_coeff")
-        )
-        / hl.agg.count(),
-        "frac_ac0": hl.agg.count_where(ht.filters.contains("AC0")) / hl.agg.count(),
-        "frac_rf": hl.agg.count_where(ht.filters.contains("rf")) / hl.agg.count(),
-        "frac_inbreed_coeff_only": hl.agg.count_where(
+        "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) == 0),
+        "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("inbreeding_coeff")),
+        "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")),
+        "frac_rf": hl.agg.fraction(ht.filters.contains("rf")),
+        "frac_inbreed_coeff_only": hl.agg.fraction(
             ht.filters.contains("inbreeding_coeff") & (ht.filters.length() == 1)
-        )
-        / hl.agg.count(),
-        "frac_ac0_only": hl.agg.count_where(
+        ),
+        "frac_ac0_only": hl.agg.fraction(
             ht.filters.contains("AC0") & (ht.filters.length() == 1)
-        )
-        / hl.agg.count(),
-        "frac_rf_only": hl.agg.count_where(
+        ),
+        "frac_rf_only": hl.agg.fraction(
             ht.filters.contains("rf") & (ht.filters.length() == 1)
-        )
-        / hl.agg.count(),
+        ),
     }
     return filters_dict
 

From fdfed1adb48685aefb22b5c720bd25c5de1233d6 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:38:49 -0400
Subject: [PATCH 49/61] Updated docstring for make_combo_header_text

---
 gnomad/utils/vcf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 1ffc677af..df25494aa 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -454,7 +454,7 @@ def make_combo_header_text(
     this function will return the string " for female samples of African-American/African ancestry".
 
     :param preposition: Relevant preposition to precede automatically generated text.
-    :param group_types: List of grouping types, e.g. "sex" or "pop".
+    :param group_types: List of grouping types. Possible values in this list are: "group", "pop", "sex", "subpop".
     :param combo_fields: List of the specific values for each grouping type, for which the text is being generated.
     :param prefix: Prefix string indicating sample subset.
     :return: String with automatically generated description text for a given set of combo fields.
@@ -462,11 +462,11 @@ def make_combo_header_text(
     combo_dict = dict(zip(group_types, combo_fields))
     header_text = " " + preposition
 
-    if len(combo_dict.keys()) == 1:
+    if len(combo_dict) == 1:
         if combo_dict["group"] == "adj":
             return ""
 
-    if "sex" in combo_dict.keys():
+    if "sex" in combo_dict:
         header_text = header_text + " " + combo_dict["sex"]
 
     header_text = header_text + " samples"

From 2c47c635faa68fffcd6f52970fc46b0d8c65b6fd Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:42:42 -0400
Subject: [PATCH 50/61] Updated docstring for make_info_dict

---
 gnomad/utils/vcf.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index df25494aa..9be64db77 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -500,7 +500,13 @@ def make_info_dict(
     """
     Generate dictionary of Number and Description attributes of VCF INFO fields.
 
-    Used to populate the VCF header during export.
+    Used to populate the INFO fields of the VCF header during export.
+
+    Creates:
+        - INFO fields for age histograms (bin freq, n_smaller, and n_larger for heterozygous and homozygous variant carriers)
+        - INFO fields for popmax AC, AN, AF, nhomalt, and popmax population
+        - INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data
+        - INFO fields for filtering allele frequency (faf) annotations 
     
     :param prefix: gnomAD or empty string.
     :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,

From 41fb66d4b2765669a57e706522f9e73875fbc3a3 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:54:07 -0400
Subject: [PATCH 51/61] Updated make_combo_header_text to take dict as input

---
 gnomad/utils/vcf.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 9be64db77..d4fe986fa 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -445,7 +445,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
 
 
 def make_combo_header_text(
-    preposition: str, group_types: List[str], combo_fields: List[str], prefix: str,
+    preposition: str, combo_dict: Dict[str, str], prefix: str,
 ) -> str:
     """
     Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.
@@ -454,12 +454,12 @@ def make_combo_header_text(
     this function will return the string " for female samples of African-American/African ancestry".
 
     :param preposition: Relevant preposition to precede automatically generated text.
-    :param group_types: List of grouping types. Possible values in this list are: "group", "pop", "sex", "subpop".
-    :param combo_fields: List of the specific values for each grouping type, for which the text is being generated.
+    :param combo_dict: Dict with grouping types as keys and values for grouping type as values. This function generates text for these values.
+        Possible grouping types are: "group", "pop", "sex", and "subpop". 
+        Example input: {"pop": "afr", "sex": "female"}
     :param prefix: Prefix string indicating sample subset.
     :return: String with automatically generated description text for a given set of combo fields.
     """
-    combo_dict = dict(zip(group_types, combo_fields))
     header_text = " " + preposition
 
     if len(combo_dict) == 1:
@@ -471,17 +471,17 @@ def make_combo_header_text(
 
     header_text = header_text + " samples"
 
-    if "subpop" in combo_dict.keys():
+    if "subpop" in combo_dict:
         header_text = header_text + f" of {POP_NAMES[combo_dict['subpop']]} ancestry"
         combo_dict.pop("pop")
 
-    if "pop" in combo_dict.keys():
+    if "pop" in combo_dict:
         header_text = header_text + f" of {POP_NAMES[combo_dict['pop']]} ancestry"
 
     if "gnomad" in prefix:
         header_text = header_text + " in gnomAD"
 
-    if "group" in group_types:
+    if "group" in combo_dict:
         if combo_dict["group"] == "raw":
             header_text = header_text + ", before removing low-confidence genotypes"
 
@@ -489,7 +489,7 @@ def make_combo_header_text(
 
 
 def make_info_dict(
-    prefix: str,
+    prefix: str = "",
     label_groups: Dict[str, str] = None,
     bin_edges: Dict[str, str] = None,
     faf: bool = False,
@@ -508,7 +508,7 @@ def make_info_dict(
         - INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data
         - INFO fields for filtering allele frequency (faf) annotations 
     
-    :param prefix: gnomAD or empty string.
+    :param prefix: Prefix string for data, e.g. "gnomAD". Default is empty string.
     :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
         e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
     :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
@@ -583,35 +583,36 @@ def make_info_dict(
 
         for combo in combos:
             combo_fields = combo.split("_")
+            group_dict = dict(zip(group_types, combo_fields))
 
             if not faf:
                 combo_dict = {
                     f"{prefix}AC_{combo}": {
                         "Number": "A",
-                        "Description": f"Alternate allele count{make_combo_header_text('for', group_types, combo_fields, prefix)}",
+                        "Description": f"Alternate allele count{make_combo_header_text('for', group_dict, prefix)}",
                     },
                     f"{prefix}AN_{combo}": {
                         "Number": "1",
-                        "Description": f"Total number of alleles{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                        "Description": f"Total number of alleles{make_combo_header_text('in', group_dict, prefix)}",
                     },
                     f"{prefix}AF_{combo}": {
                         "Number": "A",
-                        "Description": f"Alternate allele frequency{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                        "Description": f"Alternate allele frequency{make_combo_header_text('in', group_dict, prefix)}",
                     },
                     f"{prefix}nhomalt_{combo}": {
                         "Number": "A",
-                        "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_types, combo_fields, prefix)}",
+                        "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_dict, prefix)}",
                     },
                 }
             else:
                 combo_dict = {
                     f"{prefix}faf95_{combo}": {
                         "Number": "A",
-                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}",
+                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_dict, prefix)}",
                     },
                     f"{prefix}faf99_{combo}": {
                         "Number": "A",
-                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_types, combo_fields, prefix)}",
+                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_dict, prefix)}",
                     },
                 }
             info_dict.update(combo_dict)

From 2ddd8f17aa83eb750c12cd5044d4a4f5e6f61243 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 28 Jul 2020 16:57:03 -0400
Subject: [PATCH 52/61] Updated make_info_dict to pass in sort order

---
 gnomad/utils/vcf.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index d4fe986fa..e6401e831 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -496,6 +496,7 @@ def make_info_dict(
     popmax: bool = False,
     description_text: str = "",
     age_hist_data: str = None,
+    sort_order: List[str] = SORT_ORDER,
 ) -> Dict[str, Dict[str, str]]:
     """
     Generate dictionary of Number and Description attributes of VCF INFO fields.
@@ -514,8 +515,9 @@ def make_info_dict(
     :param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
     :param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.
     :param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.
-    :param description_text: Optinal text to append to the end of age and popmax descriptions. 
+    :param description_text: Optional text to append to the end of age and popmax descriptions. Needs to start with a space if specified.
     :param str age_hist_data: Pipe-delimited string of age histograms, from `get_age_distributions`.
+    :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
     :return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes.
     """
     if prefix != "":
@@ -578,7 +580,7 @@ def make_info_dict(
         info_dict.update(popmax_dict)
 
     else:
-        group_types = sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x))
+        group_types = sorted(label_groups.keys(), key=lambda x: sort_order.index(x))
         combos = make_label_combos(label_groups)
 
         for combo in combos:

From 3a80a41383e86cee97292824b049ac8be6348e55 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Wed, 29 Jul 2020 09:54:46 -0400
Subject: [PATCH 53/61] Addressed rest of review comments

---
 gnomad/utils/vcf.py | 95 +++++++++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 37 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index e6401e831..37165c08e 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -621,29 +621,41 @@ def make_info_dict(
     return info_dict
 
 
-def add_as_info_dict(INFO_DICT: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+def add_as_info_dict(
+    info_dict: Dict[str, Dict[str, str]] = INFO_DICT, as_fields: List[str] = AS_FIELDS
+) -> Dict[str, Dict[str, str]]:
     """
     Updates info dictionary with allele-specific terms and their descriptions.
 
     Used in VCF export.
 
-    :param INFO_DICT: Dictionary containing site-level annotations and their descriptions.
+    :param info_dict: Dictionary containing site-level annotations and their descriptions. Default is INFO_DICT.
+    :param as_fields: List containing allele-specific fields to be added to info_dict. Default is AS_FIELDS.
     :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field.
     """
     prefix_text = "Allele-specific"
-    AS_DICT = {}
+    as_dict = {}
+
+    for field in as_fields:
+
+        try:
+            # Strip AS_ from field name
+            site_field = field[3:]
 
-    for field in AS_FIELDS:
-        # Strip AS_ from field name
-        site_field = field[3:]
+            # Get site description from info dictionary and make first letter lower case
+            first_letter = info_dict[site_field]["Description"][0].lower()
+            rest_of_description = info_dict[site_field]["Description"][1:]
 
-        AS_DICT[field] = {}
-        AS_DICT[field]["Number"] = "A"
-        AS_DICT[field][
-            "Description"
-        ] = f"{prefix_text} {INFO_DICT[site_field]['Description'][0].lower()}{INFO_DICT[site_field]['Description'][1:]}"
+            as_dict[field] = {}
+            as_dict[field]["Number"] = "A"
+            as_dict[field][
+                "Description"
+            ] = f"{prefix_text} {first_letter}{rest_of_description}"
 
-    return AS_DICT
+        except KeyError:
+            raise ValueError(f"{field} is not present in input info dictionary!")
+
+    return as_dict
 
 
 def make_vcf_filter_dict(
@@ -652,7 +664,12 @@ def make_vcf_filter_dict(
     """
     Generates dictionary of Number and Description attributes to be used in the VCF header, specifically for FILTER annotations.
 
-    :param ht: Table containing global annotations of the Random Forests SNP and indel cutoffs.
+    Generates descriptions for:
+        - AC0 filter
+        - InbreedingCoeff filter
+        - RF filter
+        - PASS (passed all variant filters)
+
     :param snp_cutoff: Minimum SNP cutoff score from random forest model.
     :param indel_cutoff: Minimum indel cutoff score from random forest model.
     :param inbreeding_cutoff: Inbreeding coefficient hard cutoff.
@@ -671,43 +688,45 @@ def make_vcf_filter_dict(
     return filter_dict
 
 
-def make_hist_bin_edges_expr(ht: hl.Table, prefix: str) -> Dict[str, str]:
+def make_hist_bin_edges_expr(
+    ht: hl.Table, hists: List[str] = HISTS, prefix: str = ""
+) -> Dict[str, str]:
     """
     Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string
     separated by pipe delimiters.
 
     :param ht: Table containing histogram variant annotations.
-    :param prefix: Prefix text for age histogram bin edges. 
+    :param hists: List of variant histogram annotations. Default is HISTS.
+    :param prefix: Prefix text for age histogram bin edges.  Default is empty string.
     :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values.
     """
+    # Add underscore to prefix if it isn't empty
+    if prefix != "":
+        prefix += "_"
+
     edges_dict = {
-        f"{prefix}het": "|".join(
-            map(lambda x: f"{x:.1f}", ht.head(1).age_hist_het.collect()[0].bin_edges)
-        ),
-        f"{prefix}hom": "|".join(
-            map(lambda x: f"{x:.1f}", ht.head(1).age_hist_hom.collect()[0].bin_edges)
-        ),
+        f"{prefix}{call_type}": "|".join(
+            map(
+                lambda x: f"{x:.1f}",
+                ht.head(1)[f"age_hist_{call_type}"].collect()[0].bin_edges,
+            )
+        )
+        for call_type in ["het", "hom"]
     }
-    for hist in HISTS:
+
+    for hist in hists:
+
+        hist_name = hist
 
         # Parse hists calculated on both raw and adj-filtered data
         for hist_type in ["adj_qual_hists", "qual_hists"]:
-            hist_name = hist
+
             if "adj" in hist_type:
                 hist_name = f"{hist}_adj"
-            edges_dict[hist_name] = (
-                "|".join(
-                    map(
-                        lambda x: f"{x:.2f}",
-                        ht.head(1)[hist_type][hist].collect()[0].bin_edges,
-                    )
-                )
-                if "ab" in hist
-                else "|".join(
-                    map(
-                        lambda x: str(int(x)),
-                        ht.head(1)[hist_type][hist].collect()[0].bin_edges,
-                    )
+            edges_dict[hist_name] = "|".join(
+                map(
+                    lambda x: f"{x:.2f}" if "ab" in hist else str(int(x)),
+                    ht.head(1)[hist_type][hist].collect()[0].bin_edges,
                 )
             )
     return edges_dict
@@ -763,6 +782,7 @@ def sample_sum_check(
     label_groups: Dict[str, List[str]],
     verbose: bool,
     subpop: bool = None,
+    sort_order: List[str] = SORT_ORDER,
 ) -> None:
     """
     Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version;
@@ -775,6 +795,7 @@ def sample_sum_check(
     :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
         show only top values of annotations that fail checks.
     :param str subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked.
+    :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
     :rtype: None
     """
     combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)]
@@ -785,7 +806,7 @@ def sample_sum_check(
 
     group = label_groups.pop("group")[0]
     alt_groups = "_".join(
-        sorted(label_groups.keys(), key=lambda x: SORT_ORDER.index(x))
+        sorted(label_groups.keys(), key=lambda x: sort_order.index(x))
     )
 
     annot_dict = {

From 1e48d1219afe581e70291e2163f27b62be547231 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Thu, 30 Jul 2020 13:50:58 -0400
Subject: [PATCH 54/61] Add BaseQRankSum to SITE_FIELDS const

---
 gnomad/utils/vcf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 37165c08e..dd868e355 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -54,6 +54,7 @@
 """
 
 SITE_FIELDS = [
+    "BaseQRankSum",
     "FS",
     "InbreedingCoeff",
     "MQ",

From ff61379f8ff0432a51eaebfd92d1f3891500cffe Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Thu, 30 Jul 2020 16:55:15 -0400
Subject: [PATCH 55/61] Addressed review comments

---
 gnomad/utils/vcf.py | 47 +++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index dd868e355..903a90040 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -85,7 +85,7 @@
 Annotations about variant region type.
 
 .. note::
-    decoy and segdup resource files do not currently exist for GrCh38/hg38.
+    decoy and segdup resource files do not currently exist for GRCh38/hg38.
 """
 
 RF_FIELDS = [
@@ -145,13 +145,11 @@
     "VarDP": {
         "Description": "Depth over variant genotypes (does not include depth of reference samples)"
     },
-    "AS_VQSLOD": {
-        "Number": "A",
-        "Description": "Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model",
+    "VQSLOD": {
+        "Description": "Log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model",
     },
-    "AS_VQSR_culprit": {
-        "Number": "A",
-        "Description": "Allele-specific worst-performing annotation in the VQSR Gaussian mixture model",
+    "VQSR_culprit": {
+        "Description": "Worst-performing annotation in the VQSR Gaussian mixture model",
     },
     "decoy": {"Description": "Variant falls within a reference decoy region"},
     "lcr": {"Description": "Variant falls within a low complexity region"},
@@ -177,19 +175,16 @@
     "variant_type": {
         "Description": "Variant type (snv, indel, multi-snv, multi-indel, or mixed)"
     },
-    "allele_type": {
-        "Number": "A",
-        "Description": "Allele type (snv, insertion, deletion, or mixed)",
-    },
+    "allele_type": {"Description": "Allele type (snv, insertion, deletion, or mixed)",},
     "n_alt_alleles": {
-        "Number": "A",
+        "Number": 1,
         "Description": "Total number of alternate alleles observed at variant locus",
     },
     "was_mixed": {"Description": "Variant type was mixed"},
     "has_star": {
         "Description": "Variant locus coincides with a spanning deletion (represented by a star) observed elsewhere in the callset"
     },
-    "pab_max": {
+    "AS_pab_max": {
         "Number": "A",
         "Description": "Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5",
     },
@@ -384,7 +379,12 @@ def generic_field_check(
     """
     Check a generic logical condition involving annotations in a Hail Table and print the results to terminal.
 
-    Displays the number of rows in the Table that return False for a condition.
+    Displays the number of rows in the Table that return False for a condition (`cond_expr`). 
+
+    .. note::
+        This function checks for the number of rows that violate the `cond_expr` and prints the desired condition (`check_description`) to the terminal.
+        `cond_expr` and `check_description` are opposites and should never be the same.
+
 
     :param ht: Table containing annotations to be checked.
     :param cond_expr: Logical expression referring to annotations in ht to be checked.
@@ -428,7 +428,7 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
     """
     filters_dict = {
         "n": hl.agg.count(),
-        "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) == 0),
+        "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0),
         "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("inbreeding_coeff")),
         "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")),
         "frac_rf": hl.agg.fraction(ht.filters.contains("rf")),
@@ -542,7 +542,7 @@ def make_info_dict(
             },
             f"{prefix}age_hist_hom_bin_freq": {
                 "Number": "A",
-                "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['het']}; total number of individuals of any genotype bin: {age_hist_data}",
+                "Description": f"Histogram of ages of homozygous alternate individuals{description_text}; bin edges are: {bin_edges['hom']}; total number of individuals of any genotype bin: {age_hist_data}",
             },
             f"{prefix}age_hist_hom_n_smaller": {
                 "Number": "A",
@@ -571,7 +571,7 @@ def make_info_dict(
             },
             f"{prefix}AF_popmax": {
                 "Number": "A",
-                "Description": f"Maximum allele frequency across populations (excluding samples of Ashkenazi, Finnish, and indeterminate ancestry){description_text}",
+                "Description": f"Maximum allele frequency across populations{description_text}",
             },
             f"{prefix}nhomalt_popmax": {
                 "Number": "A",
@@ -634,7 +634,6 @@ def add_as_info_dict(
     :param as_fields: List containing allele-specific fields to be added to info_dict. Default is AS_FIELDS.
     :return: Dictionary with allele specific annotations, their descriptions, and their VCF number field.
     """
-    prefix_text = "Allele-specific"
     as_dict = {}
 
     for field in as_fields:
@@ -651,7 +650,7 @@ def add_as_info_dict(
             as_dict[field]["Number"] = "A"
             as_dict[field][
                 "Description"
-            ] = f"{prefix_text} {first_letter}{rest_of_description}"
+            ] = f"Allele-specific {first_letter}{rest_of_description}"
 
         except KeyError:
             raise ValueError(f"{field} is not present in input info dictionary!")
@@ -850,7 +849,7 @@ def sample_sum_check(
 
 
 def set_female_y_metrics_to_na(
-    t: Union[hl.Table, hl.MatrixTable], faf: bool = True,
+    t: Union[hl.Table, hl.MatrixTable],
 ) -> Dict[str, hl.expr.Int32Expression]:
     """
     Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0).
@@ -860,19 +859,13 @@ def set_female_y_metrics_to_na(
     """
     metrics = list(t.row.info)
     female_metrics = [x for x in metrics if "_female" in x]
-    female_metrics = [
-        x for x in female_metrics if ("nhomalt" in x) or ("AC" in x) or ("AN" in x)
-    ]
-
-    if faf:
-        female_metrics.extend([x for x in female_metrics if "faf" in x])
 
     female_metrics_dict = {}
     for metric in female_metrics:
         female_metrics_dict.update(
             {
                 f"{metric}": hl.or_missing(
-                    (t.locus.contig.in_y_nonpar() | t.locus.contig.in_y_par()),
+                    (~t.locus.contig.in_y_nonpar() | ~t.locus.contig.in_y_par()),
                     t.info[f"{metric}"],
                 )
             }

From ca49ac51d8abd72e42120ea26ee3f207e12ffad9 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Fri, 31 Jul 2020 10:38:35 -0400
Subject: [PATCH 56/61] Updated docstring for sample_sum_check

---
 gnomad/utils/vcf.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 903a90040..5b17cd31c 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -788,15 +788,15 @@ def sample_sum_check(
     Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version;
     display results from checking the sum of the specified annotations in the terminal.
 
-    :param Table ht: Table containing annotations to be summed.
-    :param str prefix: String indicating sample subset.
-    :param dict label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+    :param ht: Table containing annotations to be summed.
+    :param prefix: String indicating sample subset.
+    :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
         e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
-    :param bool verbose: If True, show top values of annotations being checked, including checks that pass; if False,
+    :param verbose: If True, show top values of annotations being checked, including checks that pass; if False,
         show only top values of annotations that fail checks.
-    :param str subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked.
+    :param subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked.
     :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
-    :rtype: None
+    :return: None
     """
     combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)]
     combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in make_label_combos(label_groups)]

From 261c38ddf6b6701a47305a2ae61bf3922577997d Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Mon, 3 Aug 2020 10:11:55 -0400
Subject: [PATCH 57/61] Updated some docstrings addressing review comments

---
 gnomad/utils/vcf.py | 73 +++++++++++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 33 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 5b17cd31c..3d42d5181 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -48,6 +48,7 @@
     "AS_ReadPosRankSum",
     "AS_SOR",
     "AS_VarDP",
+    "InbreedingCoeff",
 ]
 """
 Allele-specific variant annotations.
@@ -56,7 +57,6 @@
 SITE_FIELDS = [
     "BaseQRankSum",
     "FS",
-    "InbreedingCoeff",
     "MQ",
     "MQRankSum",
     "QD",
@@ -379,11 +379,13 @@ def generic_field_check(
     """
     Check a generic logical condition involving annotations in a Hail Table and print the results to terminal.
 
-    Displays the number of rows in the Table that return False for a condition (`cond_expr`). 
+    Displays the number of rows in the Table that match the `cond_expr` and fail to be the desired condition (`check_description`). 
 
     .. note::
-        This function checks for the number of rows that violate the `cond_expr` and prints the desired condition (`check_description`) to the terminal.
-        `cond_expr` and `check_description` are opposites and should never be the same.
+        `cond_expr` and `check_description` are opposites and should never be the same. 
+        E.g., If `cond_expr` filters for instances where the raw AC is less than adj AC, 
+        then it is checking sites that fail to be the desired condition (`check_description`) 
+        of having a raw AC greater than or equal to the adj AC.
 
 
     :param ht: Table containing annotations to be checked.
@@ -414,39 +416,41 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
 
     Checks for:
         - Total number of variants
-        - Fraction of variants removed by any filter
-        - Fraction of variants removed because of InbreedingCoefficient filter in combination with any other filter
-        - Fraction of varinats removed because of AC0 filter in combination with any other filter
-        - Fraction of variants removed because of random forest filtering in combination with any other filter
-        - Fraction of variants removed only because of InbreedingCoefficient filter
-        - Fraction of variants removed only because of AC0 filter
-        - Fraction of variants removed only because of random forest filtering
-
+        - Fraction of variants removed due to:
+            - Any filter
+            - Inbreeding coefficient filter in combination with any other filter
+            - AC0 filter in combination with any other filter
+            - Random forest filtering in combination with any other filter
+            - Only inbreeding coefficient filter
+            - Only AC0 filter
+            - Only random forest filtering
 
     :param ht: Table containing 'filter' annotation to be examined.
-    :return: Dictionary containing Hail aggregation expressions to measure filter flags.
+    :return: Dictionary containing Hail aggregation expressions to examine filter flags.
     """
-    filters_dict = {
+    return {
         "n": hl.agg.count(),
         "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0),
-        "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("inbreeding_coeff")),
+        "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("InbreedingCoeff")),
         "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")),
-        "frac_rf": hl.agg.fraction(ht.filters.contains("rf")),
+        "frac_rf": hl.agg.fraction(ht.filters.contains("RF")),
         "frac_inbreed_coeff_only": hl.agg.fraction(
-            ht.filters.contains("inbreeding_coeff") & (ht.filters.length() == 1)
+            ht.filters.contains("InbreedingCoeff") & (ht.filters.length() == 1)
         ),
         "frac_ac0_only": hl.agg.fraction(
             ht.filters.contains("AC0") & (ht.filters.length() == 1)
         ),
         "frac_rf_only": hl.agg.fraction(
-            ht.filters.contains("rf") & (ht.filters.length() == 1)
+            ht.filters.contains("RF") & (ht.filters.length() == 1)
         ),
     }
-    return filters_dict
 
 
 def make_combo_header_text(
-    preposition: str, combo_dict: Dict[str, str], prefix: str,
+    preposition: str,
+    combo_dict: Dict[str, str],
+    prefix: str,
+    pop_names: Dict[str, str] = POP_NAMES,
 ) -> str:
     """
     Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.
@@ -459,6 +463,7 @@ def make_combo_header_text(
         Possible grouping types are: "group", "pop", "sex", and "subpop". 
         Example input: {"pop": "afr", "sex": "female"}
     :param prefix: Prefix string indicating sample subset.
+    :param pop_names: Dict with global population names (keys) and population descriptions (values). Default is POP_NAMES.
     :return: String with automatically generated description text for a given set of combo fields.
     """
     header_text = " " + preposition
@@ -473,11 +478,11 @@ def make_combo_header_text(
     header_text = header_text + " samples"
 
     if "subpop" in combo_dict:
-        header_text = header_text + f" of {POP_NAMES[combo_dict['subpop']]} ancestry"
+        header_text = header_text + f" of {pop_names[combo_dict['subpop']]} ancestry"
         combo_dict.pop("pop")
 
     if "pop" in combo_dict:
-        header_text = header_text + f" of {POP_NAMES[combo_dict['pop']]} ancestry"
+        header_text = header_text + f" of {pop_names[combo_dict['pop']]} ancestry"
 
     if "gnomad" in prefix:
         header_text = header_text + " in gnomAD"
@@ -588,34 +593,37 @@ def make_info_dict(
             combo_fields = combo.split("_")
             group_dict = dict(zip(group_types, combo_fields))
 
+            for_combo = make_combo_header_text("for", group_dict, prefix)
+            in_combo = make_combo_header_text("in", group_dict, prefix)
+
             if not faf:
                 combo_dict = {
                     f"{prefix}AC_{combo}": {
                         "Number": "A",
-                        "Description": f"Alternate allele count{make_combo_header_text('for', group_dict, prefix)}",
+                        "Description": f"Alternate allele count{for_combo}",
                     },
                     f"{prefix}AN_{combo}": {
                         "Number": "1",
-                        "Description": f"Total number of alleles{make_combo_header_text('in', group_dict, prefix)}",
+                        "Description": f"Total number of alleles{in_combo}",
                     },
                     f"{prefix}AF_{combo}": {
                         "Number": "A",
-                        "Description": f"Alternate allele frequency{make_combo_header_text('in', group_dict, prefix)}",
+                        "Description": f"Alternate allele frequency{in_combo}",
                     },
                     f"{prefix}nhomalt_{combo}": {
                         "Number": "A",
-                        "Description": f"Count of homozygous individuals{make_combo_header_text('in', group_dict, prefix)}",
+                        "Description": f"Count of homozygous individuals{in_combo}",
                     },
                 }
             else:
                 combo_dict = {
                     f"{prefix}faf95_{combo}": {
                         "Number": "A",
-                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {make_combo_header_text('for', group_dict, prefix)}",
+                        "Description": f"Filtering allele frequency (using Poisson 95% CI) {for_combo}",
                     },
                     f"{prefix}faf99_{combo}": {
                         "Number": "A",
-                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {make_combo_header_text('for', group_dict, prefix)}",
+                        "Description": f"Filtering allele frequency (using Poisson 99% CI) {for_combo}",
                     },
                 }
             info_dict.update(combo_dict)
@@ -798,11 +806,10 @@ def sample_sum_check(
     :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
     :return: None
     """
-    combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in make_label_combos(label_groups)]
-    combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in make_label_combos(label_groups)]
-    combo_nhomalt = [
-        ht.info[f"{prefix}nhomalt_{x}"] for x in make_label_combos(label_groups)
-    ]
+    label_combos = make_label_combos(label_groups)
+    combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in label_combos]
+    combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in label_combos]
+    combo_nhomalt = [ht.info[f"{prefix}nhomalt_{x}"] for x in label_combos]
 
     group = label_groups.pop("group")[0]
     alt_groups = "_".join(

From d176f68aed2bfb81fcd2a343e55bac2c4f6ddf20 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Mon, 3 Aug 2020 13:24:45 -0400
Subject: [PATCH 58/61] Created assessment folder and moved sample_sum_check,
 generic_field_check, make_filters_sanity_check_expr to assessment

---
 gnomad/assessment/__init__.py      |   1 +
 gnomad/assessment/sanity_checks.py | 159 +++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 gnomad/assessment/__init__.py
 create mode 100644 gnomad/assessment/sanity_checks.py

diff --git a/gnomad/assessment/__init__.py b/gnomad/assessment/__init__.py
new file mode 100644
index 000000000..2ae7a6039
--- /dev/null
+++ b/gnomad/assessment/__init__.py
@@ -0,0 +1 @@
+from gnomad.assessment import sanity_checks
diff --git a/gnomad/assessment/sanity_checks.py b/gnomad/assessment/sanity_checks.py
new file mode 100644
index 000000000..c6eb2d478
--- /dev/null
+++ b/gnomad/assessment/sanity_checks.py
@@ -0,0 +1,159 @@
+import logging
+from typing import Dict, List
+
+import hail as hl
+
+from gnomad.utils.vcf import make_label_combos, SORT_ORDER
+
+
+logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+def generic_field_check(
+    ht: hl.Table,
+    cond_expr: hl.expr.BooleanExpression,
+    check_description: str,
+    display_fields: List[str],
+    verbose: bool,
+) -> None:
+    """
+    Check a generic logical condition involving annotations in a Hail Table and print the results to terminal.
+
+    Displays the number of rows in the Table that match the `cond_expr` and fail to be the desired condition (`check_description`). 
+
+    .. note::
+        `cond_expr` and `check_description` are opposites and should never be the same. 
+        E.g., If `cond_expr` filters for instances where the raw AC is less than adj AC, 
+        then it is checking sites that fail to be the desired condition (`check_description`) 
+        of having a raw AC greater than or equal to the adj AC.
+
+
+    :param ht: Table containing annotations to be checked.
+    :param cond_expr: Logical expression referring to annotations in ht to be checked.
+    :param check_description: String describing the condition being checked; is displayed in terminal summary message.
+    :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
+        these fields are also displayed if verbose is True.
+    :param verbose: If True, show top values of annotations being checked, including checks that pass; if False,
+        show only top values of annotations that fail checks.
+    """
+    ht_orig = ht
+    ht = ht.filter(cond_expr)
+    n_fail = ht.count()
+    if n_fail > 0:
+        logger.info(f"Found {n_fail} sites that fail {check_description} check:")
+        ht = ht.flatten()
+        ht.select("locus", "alleles", *display_fields).show()
+    else:
+        logger.info(f"PASSED {check_description} check")
+        if verbose:
+            ht_orig = ht_orig.flatten()
+            ht_orig.select(*display_fields).show()
+
+
+def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
+    """
+    Make Hail expressions to measure % variants filtered under varying conditions of interest.
+
+    Checks for:
+        - Total number of variants
+        - Fraction of variants removed due to:
+            - Any filter
+            - Inbreeding coefficient filter in combination with any other filter
+            - AC0 filter in combination with any other filter
+            - Random forest filtering in combination with any other filter
+            - Only inbreeding coefficient filter
+            - Only AC0 filter
+            - Only random forest filtering
+
+    :param ht: Table containing 'filter' annotation to be examined.
+    :return: Dictionary containing Hail aggregation expressions to examine filter flags.
+    """
+    return {
+        "n": hl.agg.count(),
+        "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0),
+        "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("InbreedingCoeff")),
+        "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")),
+        "frac_rf": hl.agg.fraction(ht.filters.contains("RF")),
+        "frac_inbreed_coeff_only": hl.agg.fraction(
+            ht.filters.contains("InbreedingCoeff") & (ht.filters.length() == 1)
+        ),
+        "frac_ac0_only": hl.agg.fraction(
+            ht.filters.contains("AC0") & (ht.filters.length() == 1)
+        ),
+        "frac_rf_only": hl.agg.fraction(
+            ht.filters.contains("RF") & (ht.filters.length() == 1)
+        ),
+    }
+
+
+def sample_sum_check(
+    ht: hl.Table,
+    prefix: str,
+    label_groups: Dict[str, List[str]],
+    verbose: bool,
+    subpop: bool = None,
+    sort_order: List[str] = SORT_ORDER,
+) -> None:
+    """
+    Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version;
+    display results from checking the sum of the specified annotations in the terminal.
+
+    :param ht: Table containing annotations to be summed.
+    :param prefix: String indicating sample subset.
+    :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
+        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
+    :param verbose: If True, show top values of annotations being checked, including checks that pass; if False,
+        show only top values of annotations that fail checks.
+    :param subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked.
+    :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
+    :return: None
+    """
+    label_combos = make_label_combos(label_groups)
+    combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in label_combos]
+    combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in label_combos]
+    combo_nhomalt = [ht.info[f"{prefix}nhomalt_{x}"] for x in label_combos]
+
+    group = label_groups.pop("group")[0]
+    alt_groups = "_".join(
+        sorted(label_groups.keys(), key=lambda x: sort_order.index(x))
+    )
+
+    annot_dict = {
+        f"sum_AC_{group}_{alt_groups}": hl.sum(combo_AC),
+        f"sum_AN_{group}_{alt_groups}": hl.sum(combo_AN),
+        f"sum_nhomalt_{group}_{alt_groups}": hl.sum(combo_nhomalt),
+    }
+
+    ht = ht.annotate(**annot_dict)
+
+    for subfield in ["AC", "AN", "nhomalt"]:
+        if not subpop:
+            generic_field_check(
+                ht,
+                (
+                    ht.info[f"{prefix}{subfield}_{group}"]
+                    != ht[f"sum_{subfield}_{group}_{alt_groups}"]
+                ),
+                f"{prefix}{subfield}_{group} = sum({subfield}_{group}_{alt_groups})",
+                [
+                    f"info.{prefix}{subfield}_{group}",
+                    f"sum_{subfield}_{group}_{alt_groups}",
+                ],
+                verbose,
+            )
+        else:
+            generic_field_check(
+                ht,
+                (
+                    ht.info[f"{prefix}{subfield}_{group}_{subpop}"]
+                    != ht[f"sum_{subfield}_{group}_{alt_groups}"]
+                ),
+                f"{prefix}{subfield}_{group}_{subpop} = sum({subfield}_{group}_{alt_groups})",
+                [
+                    f"info.{prefix}{subfield}_{group}_{subpop}",
+                    f"sum_{subfield}_{group}_{alt_groups}",
+                ],
+                verbose,
+            )

From 1640b3f16ceeccadf72eb375ddc64ef0fb1fd174 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Mon, 3 Aug 2020 13:26:16 -0400
Subject: [PATCH 59/61] Forgot to commit changes in vcf.py when moving to
 assessment

---
 gnomad/utils/vcf.py | 148 --------------------------------------------
 1 file changed, 148 deletions(-)

diff --git a/gnomad/utils/vcf.py b/gnomad/utils/vcf.py
index 3d42d5181..1147813eb 100644
--- a/gnomad/utils/vcf.py
+++ b/gnomad/utils/vcf.py
@@ -369,83 +369,6 @@ def make_label_combos(
     return combos
 
 
-def generic_field_check(
-    ht: hl.Table,
-    cond_expr: hl.expr.BooleanExpression,
-    check_description: str,
-    display_fields: List[str],
-    verbose: bool,
-) -> None:
-    """
-    Check a generic logical condition involving annotations in a Hail Table and print the results to terminal.
-
-    Displays the number of rows in the Table that match the `cond_expr` and fail to be the desired condition (`check_description`). 
-
-    .. note::
-        `cond_expr` and `check_description` are opposites and should never be the same. 
-        E.g., If `cond_expr` filters for instances where the raw AC is less than adj AC, 
-        then it is checking sites that fail to be the desired condition (`check_description`) 
-        of having a raw AC greater than or equal to the adj AC.
-
-
-    :param ht: Table containing annotations to be checked.
-    :param cond_expr: Logical expression referring to annotations in ht to be checked.
-    :param check_description: String describing the condition being checked; is displayed in terminal summary message.
-    :param display_fields: List of names of ht annotations to be displayed in case of failure (for troubleshooting purposes);
-        these fields are also displayed if verbose is True.
-    :param verbose: If True, show top values of annotations being checked, including checks that pass; if False,
-        show only top values of annotations that fail checks.
-    """
-    ht_orig = ht
-    ht = ht.filter(cond_expr)
-    n_fail = ht.count()
-    if n_fail > 0:
-        logger.info(f"Found {n_fail} sites that fail {check_description} check:")
-        ht = ht.flatten()
-        ht.select("locus", "alleles", *display_fields).show()
-    else:
-        logger.info(f"PASSED {check_description} check")
-        if verbose:
-            ht_orig = ht_orig.flatten()
-            ht_orig.select(*display_fields).show()
-
-
-def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
-    """
-    Make Hail expressions to measure % variants filtered under varying conditions of interest.
-
-    Checks for:
-        - Total number of variants
-        - Fraction of variants removed due to:
-            - Any filter
-            - Inbreeding coefficient filter in combination with any other filter
-            - AC0 filter in combination with any other filter
-            - Random forest filtering in combination with any other filter
-            - Only inbreeding coefficient filter
-            - Only AC0 filter
-            - Only random forest filtering
-
-    :param ht: Table containing 'filter' annotation to be examined.
-    :return: Dictionary containing Hail aggregation expressions to examine filter flags.
-    """
-    return {
-        "n": hl.agg.count(),
-        "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0),
-        "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("InbreedingCoeff")),
-        "frac_ac0": hl.agg.fraction(ht.filters.contains("AC0")),
-        "frac_rf": hl.agg.fraction(ht.filters.contains("RF")),
-        "frac_inbreed_coeff_only": hl.agg.fraction(
-            ht.filters.contains("InbreedingCoeff") & (ht.filters.length() == 1)
-        ),
-        "frac_ac0_only": hl.agg.fraction(
-            ht.filters.contains("AC0") & (ht.filters.length() == 1)
-        ),
-        "frac_rf_only": hl.agg.fraction(
-            ht.filters.contains("RF") & (ht.filters.length() == 1)
-        ),
-    }
-
-
 def make_combo_header_text(
     preposition: str,
     combo_dict: Dict[str, str],
@@ -784,77 +707,6 @@ def make_hist_dict(bin_edges: Dict[str, Dict[str, str]], adj: bool) -> Dict[str,
     return header_hist_dict
 
 
-def sample_sum_check(
-    ht: hl.Table,
-    prefix: str,
-    label_groups: Dict[str, List[str]],
-    verbose: bool,
-    subpop: bool = None,
-    sort_order: List[str] = SORT_ORDER,
-) -> None:
-    """
-    Compute afresh the sum of annotations for a specified group of annotations, and compare to the annotated version;
-    display results from checking the sum of the specified annotations in the terminal.
-
-    :param ht: Table containing annotations to be summed.
-    :param prefix: String indicating sample subset.
-    :param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
-        e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
-    :param verbose: If True, show top values of annotations being checked, including checks that pass; if False,
-        show only top values of annotations that fail checks.
-    :param subpop: Subpop abbreviation, supplied only if subpopulations are included in the annotation groups being checked.
-    :param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
-    :return: None
-    """
-    label_combos = make_label_combos(label_groups)
-    combo_AC = [ht.info[f"{prefix}AC_{x}"] for x in label_combos]
-    combo_AN = [ht.info[f"{prefix}AN_{x}"] for x in label_combos]
-    combo_nhomalt = [ht.info[f"{prefix}nhomalt_{x}"] for x in label_combos]
-
-    group = label_groups.pop("group")[0]
-    alt_groups = "_".join(
-        sorted(label_groups.keys(), key=lambda x: sort_order.index(x))
-    )
-
-    annot_dict = {
-        f"sum_AC_{group}_{alt_groups}": hl.sum(combo_AC),
-        f"sum_AN_{group}_{alt_groups}": hl.sum(combo_AN),
-        f"sum_nhomalt_{group}_{alt_groups}": hl.sum(combo_nhomalt),
-    }
-
-    ht = ht.annotate(**annot_dict)
-
-    for subfield in ["AC", "AN", "nhomalt"]:
-        if not subpop:
-            generic_field_check(
-                ht,
-                (
-                    ht.info[f"{prefix}{subfield}_{group}"]
-                    != ht[f"sum_{subfield}_{group}_{alt_groups}"]
-                ),
-                f"{prefix}{subfield}_{group} = sum({subfield}_{group}_{alt_groups})",
-                [
-                    f"info.{prefix}{subfield}_{group}",
-                    f"sum_{subfield}_{group}_{alt_groups}",
-                ],
-                verbose,
-            )
-        else:
-            generic_field_check(
-                ht,
-                (
-                    ht.info[f"{prefix}{subfield}_{group}_{subpop}"]
-                    != ht[f"sum_{subfield}_{group}_{alt_groups}"]
-                ),
-                f"{prefix}{subfield}_{group}_{subpop} = sum({subfield}_{group}_{alt_groups})",
-                [
-                    f"info.{prefix}{subfield}_{group}_{subpop}",
-                    f"sum_{subfield}_{group}_{alt_groups}",
-                ],
-                verbose,
-            )
-
-
 def set_female_y_metrics_to_na(
     t: Union[hl.Table, hl.MatrixTable],
 ) -> Dict[str, hl.expr.Int32Expression]:

From 6e67800cfb00c9f8856dd224d3bb73a1b67ae508 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Mon, 3 Aug 2020 13:52:49 -0400
Subject: [PATCH 60/61] Updated generic field check docstring

---
 gnomad/assessment/sanity_checks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/assessment/sanity_checks.py b/gnomad/assessment/sanity_checks.py
index c6eb2d478..6ffdbe7ab 100644
--- a/gnomad/assessment/sanity_checks.py
+++ b/gnomad/assessment/sanity_checks.py
@@ -22,6 +22,7 @@ def generic_field_check(
     Check a generic logical condition involving annotations in a Hail Table and print the results to terminal.
 
     Displays the number of rows in the Table that match the `cond_expr` and fail to be the desired condition (`check_description`). 
+    If the number of rows that match the `cond_expr` is 0, then the Table passes that check; otherwise, it fails.
 
     .. note::
         `cond_expr` and `check_description` are opposites and should never be the same. 
@@ -29,7 +30,6 @@ def generic_field_check(
         then it is checking sites that fail to be the desired condition (`check_description`) 
         of having a raw AC greater than or equal to the adj AC.
 
-
     :param ht: Table containing annotations to be checked.
     :param cond_expr: Logical expression referring to annotations in ht to be checked.
     :param check_description: String describing the condition being checked; is displayed in terminal summary message.

From f021255ccf801406044374ddd978ab4ee6915690 Mon Sep 17 00:00:00 2001
From: Katherine Chao <kchao@broadinstitute.org>
Date: Tue, 4 Aug 2020 09:40:11 -0400
Subject: [PATCH 61/61] Added option to check for additional filters to
 make_filters_sanity_check_expr

---
 gnomad/assessment/sanity_checks.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/gnomad/assessment/sanity_checks.py b/gnomad/assessment/sanity_checks.py
index 6ffdbe7ab..492ac3a14 100644
--- a/gnomad/assessment/sanity_checks.py
+++ b/gnomad/assessment/sanity_checks.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import hail as hl
 
@@ -52,7 +52,9 @@ def generic_field_check(
             ht_orig.select(*display_fields).show()
 
 
-def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression]:
+def make_filters_sanity_check_expr(
+    ht: hl.Table, extra_filter_checks: Optional[Dict[str, hl.expr.Expression]] = None
+) -> Dict[str, hl.expr.Expression]:
     """
     Make Hail expressions to measure % variants filtered under varying conditions of interest.
 
@@ -68,9 +70,10 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
             - Only random forest filtering
 
     :param ht: Table containing 'filter' annotation to be examined.
+    :param extra_filter_checks: Optional dictionary containing filter condition name (key) extra filter expressions (value) to be examined.
     :return: Dictionary containing Hail aggregation expressions to examine filter flags.
     """
-    return {
+    filters_dict = {
         "n": hl.agg.count(),
         "frac_any_filter": hl.agg.fraction(hl.len(ht.filters) != 0),
         "frac_inbreed_coeff": hl.agg.fraction(ht.filters.contains("InbreedingCoeff")),
@@ -86,6 +89,10 @@ def make_filters_sanity_check_expr(ht: hl.Table) -> Dict[str, hl.expr.Expression
             ht.filters.contains("RF") & (ht.filters.length() == 1)
         ),
     }
+    if extra_filter_checks:
+        filters_dict.update(extra_filter_checks)
+
+    return filters_dict
 
 
 def sample_sum_check(