From 071d4de2b1aa6ecc8ad6803f40a308c4cc000eca Mon Sep 17 00:00:00 2001
From: KoalaQin
Date: Tue, 21 May 2024 22:01:35 -0400
Subject: [PATCH 1/2] Code to run logistic regression on v4 exomes and genomes
 with ancestry PCs

---
 .../v4/assessment/logistic_regression.py      | 179 ++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 gnomad_qc/v4/assessment/logistic_regression.py

diff --git a/gnomad_qc/v4/assessment/logistic_regression.py b/gnomad_qc/v4/assessment/logistic_regression.py
new file mode 100644
index 000000000..787fb89f8
--- /dev/null
+++ b/gnomad_qc/v4/assessment/logistic_regression.py
@@ -0,0 +1,179 @@
+"""Script to perform logistic regression on the unified MatrixTable of v4 exomes and genomes including ancestry PCs."""
+
+import argparse
+import logging
+
+import hail as hl
+
+from gnomad_qc.v4.resources.basics import get_gnomad_v4_genomes_vds, get_gnomad_v4_vds
+from gnomad_qc.v4.resources.release import release_sites
+from gnomad_qc.v4.resources.sample_qc import ancestry_pca_scores
+
+# Steps to perform logistic regression:
+# 1. Load the gnomAD v4 exomes and genomes VDS with multiallelics split
+# 2. Filter both VDS to the test intervals and densify to MatrixTables
+# 3. Union the exomes and genomes MatrixTables
+# 4. Run Firth logistic regression of exome/genome status on genotype and ancestry PCs
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+def get_test_intervals(ht: hl.Table) -> hl.Table:
+    """
+    Get the test lists for the exomes and genomes MatrixTables.
+
+    :param joint_ht: Joint HT of v4 exomes and genomes.
+    :return: Test Table
+    """
+    # Filter to chr22
+    ht = hl.filter_intervals(ht, [hl.parse_locus_interval("chr22")])
+
+    # Filter to variants with AF >= 0.1 and p-value < 10e-4
+    ht1 = ht.filter(
+        (ht.joint.freq[0].AF >= 0.1)
+        & (ht.freq_comparison_stats.stat_union.p_value < 10e-4)
+        & (
+            ht.freq_comparison_stats.stat_union.stat_test_name
+            == "cochran_mantel_haenszel_test"
+        )
+    )
+    # Filter to variants with AF < 0.1 and p-value < 10e-4
+    ht2 = ht.filter(
+        (ht.joint.freq[0].AF < 0.1)
+        & (ht.joint.freq[0].AC > 0)
+        & (ht.freq_comparison_stats.stat_union.p_value < 10e-4)
+        & (
+            ht.freq_comparison_stats.stat_union.stat_test_name
+            == "cochran_mantel_haenszel_test"
+        )
+    )
+    ht2 = ht2.naive_coalesce(50)
+    ht2 = ht2._filter_partitions(range(5))
+    # Filter to ~8000 variants with p-value >= 10e-4
+    ht3 = ht.filter(
+        (ht.freq_comparison_stats.stat_union.p_value >= 10e-4)
+        & (
+            ht.freq_comparison_stats.stat_union.stat_test_name
+            == "cochran_mantel_haenszel_test"
+        )
+    )
+    ht3 = ht3._filter_partitions(range(5))
+    ht = ht1.union(ht2).union(ht3)
+
+    # make this an interval table
+    ht = ht.annotate(
+        interval=hl.locus_interval(
+            ht.locus.contig, ht.locus.position, ht.locus.position + 1
+        )
+    )
+    ht = ht.key_by(ht.interval)
+    ht = ht.select()
+
+    return ht
+
+
+def densify_union_exomes_genomes(
+    exomes_vds: hl.vds.VariantDataset,
+    genomes_vds: hl.vds.VariantDataset,
+) -> hl.MatrixTable:
+    """
+    Union exomes and genomes MatrixTables.
+
+    :param exomes_vds: VDS of exomes
+    :param genomes_vds: VDS of genomes
+    :return: Unioned MatrixTable
+    """
+    logger.info("Densifying exomes...")
+    exomes_mt = hl.vds.to_dense_mt(exomes_vds)
+    exomes_mt = exomes_mt.annotate_cols(is_genome=False)
+    exomes_mt = exomes_mt.select_entries("GT").select_rows().select_cols("is_genome")
+
+    logger.info("Densifying genomes...")
+    genomes_mt = hl.vds.to_dense_mt(genomes_vds)
+    genomes_mt = genomes_mt.annotate_cols(is_genome=True)
+    genomes_mt = genomes_mt.select_entries("GT").select_rows().select_cols("is_genome")
+
+    logger.info("Unioning exomes and genomes...")
+    mt = exomes_mt.union_cols(genomes_mt)
+    return mt
+
+
+def main(args):
+    """Perform logistic regression on the unioned exomes and genomes MatrixTable."""
+    hl.init(
+        log="/logistic_regression.log",
+        default_reference="GRCh38",
+        tmp_dir="gs://gnomad-tmp-30day",
+    )
+
+    test = args.test
+    overwrite = args.overwrite
+
+    exomes_vds = get_gnomad_v4_vds(
+        release_only=True,
+        split=True,
+    )
+    genomes_vds = get_gnomad_v4_genomes_vds(
+        release_only=True,
+        split=True,
+    )
+
+    ht = release_sites(data_type="joint").ht()
+    # Maybe only filter to variants with a union p-value < 10e-4 later
+
+    if test:
+        logger.info("Filtering to test intervals...")
+        ht = get_test_intervals(ht)
+        ht = ht.checkpoint(hl.utils.new_temp_file("test_intervals", "ht"))
+        exomes_vds = hl.vds.filter_intervals(
+            exomes_vds, ht, split_reference_blocks=True
+        )
+        genomes_vds = hl.vds.filter_intervals(
+            genomes_vds, ht, split_reference_blocks=True
+        )
+
+    mt = densify_union_exomes_genomes(exomes_vds, genomes_vds)
+    mt = mt.checkpoint(
+        hl.utils.new_temp_file("union_exomes_genomes", "mt"),
+    )
+
+    pc_scores = ancestry_pca_scores().ht()
+    mt = mt.annotate_cols(pc=pc_scores[mt.col_key].scores)
+
+    # Perform logistic regression
+    ht = hl.logistic_regression_rows(
+        "firth",
+        y=mt.is_genome,
+        x=mt.GT.n_alt_alleles(),
+        covariates=[1] + [mt.pc[i] for i in range(10)],
+    )
+
+    ht.write(
+        f"/{'test' if test else ''}_logistic_regression_results.ht", overwrite=overwrite
+    )
+
+
+def get_script_argument_parser() -> argparse.ArgumentParser:
+    """Get script argument parser."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--test",
+        help="Filter to test intervals on chr22",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--overwrite",
+        help="Overwrite existing MatrixTable",
+        action="store_true",
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    parser = get_script_argument_parser()
+    main(parser.parse_args())

From ad96374760d26e76dc3c169320d94e9b51ea8a49 Mon Sep 17 00:00:00 2001
From: KoalaQin
Date: Tue, 28 May 2024 08:29:19 -0400
Subject: [PATCH 2/2] Address review suggestions

---
 .../v4/assessment/logistic_regression.py      | 249 +++++++++++-------
 1 file changed, 158 insertions(+), 91 deletions(-)

diff --git a/gnomad_qc/v4/assessment/logistic_regression.py b/gnomad_qc/v4/assessment/logistic_regression.py
index 787fb89f8..e36ee6a5b 100644
--- a/gnomad_qc/v4/assessment/logistic_regression.py
+++ b/gnomad_qc/v4/assessment/logistic_regression.py
@@ -4,8 +4,13 @@
 import logging
 
 import hail as hl
+from gnomad.utils.annotations import annotate_adj
 
-from gnomad_qc.v4.resources.basics import get_gnomad_v4_genomes_vds, get_gnomad_v4_vds
+from gnomad_qc.v4.resources.basics import (
+    get_gnomad_v4_genomes_vds,
+    get_gnomad_v4_vds,
+    get_logging_path,
+)
 from gnomad_qc.v4.resources.release import release_sites
 from gnomad_qc.v4.resources.sample_qc import ancestry_pca_scores
 
@@ -23,48 +28,56 @@
 logger.setLevel(logging.INFO)
 
 
-def get_test_intervals(ht: hl.Table) -> hl.Table:
+def get_test_intervals(ht: hl.Table, test_partitions: bool = False) -> hl.Table:
     """
     Get the test lists for the exomes and genomes MatrixTables.
 
-    :param joint_ht: Joint HT of v4 exomes and genomes.
-    :return: Test Table
+    :param ht: Joint Table of v4 exomes and genomes.
+    :param test_partitions: Whether to filter to test partitions.
+    :return: Table of test intervals
     """
     # Filter to chr22
     ht = hl.filter_intervals(ht, [hl.parse_locus_interval("chr22")])
 
-    # Filter to variants with AF >= 0.1 and p-value < 10e-4
-    ht1 = ht.filter(
-        (ht.joint.freq[0].AF >= 0.1)
-        & (ht.freq_comparison_stats.stat_union.p_value < 10e-4)
-        & (
-            ht.freq_comparison_stats.stat_union.stat_test_name
-            == "cochran_mantel_haenszel_test"
+    if test_partitions:
+        logger.info("Filtering to 2 partitions on chr22 for test...")
+        ht = ht._filter_partitions(range(2))
+    else:
+        logger.info(
+            "Filtering to sets of variants on chr22 that have questionable CMH"
+            " p-values..."
         )
-    )
-    # Filter to variants with AF < 0.1 and p-value < 10e-4
-    ht2 = ht.filter(
-        (ht.joint.freq[0].AF < 0.1)
-        & (ht.joint.freq[0].AC > 0)
-        & (ht.freq_comparison_stats.stat_union.p_value < 10e-4)
-        & (
-            ht.freq_comparison_stats.stat_union.stat_test_name
-            == "cochran_mantel_haenszel_test"
+        # Filter to variants with AF >= 0.1 and p-value < 10e-4
+        ht1 = ht.filter(
+            (ht.joint.freq[0].AF >= 0.1)
+            & (ht.freq_comparison_stats.stat_union.p_value < 10e-4)
+            & (
+                ht.freq_comparison_stats.stat_union.stat_test_name
+                == "cochran_mantel_haenszel_test"
+            )
         )
-    )
-    ht2 = ht2.naive_coalesce(50)
-    ht2 = ht2._filter_partitions(range(5))
-    # Filter to ~8000 variants with p-value >= 10e-4
-    ht3 = ht.filter(
-        (ht.freq_comparison_stats.stat_union.p_value >= 10e-4)
-        & (
-            ht.freq_comparison_stats.stat_union.stat_test_name
-            == "cochran_mantel_haenszel_test"
+        # Filter to variants with AF < 0.1 and p-value < 10e-4
+        ht2 = ht.filter(
+            (ht.joint.freq[0].AF < 0.1)
+            & (ht.joint.freq[0].AC > 0)
+            & (ht.freq_comparison_stats.stat_union.p_value < 10e-4)
+            & (
+                ht.freq_comparison_stats.stat_union.stat_test_name
+                == "cochran_mantel_haenszel_test"
+            )
         )
-    )
-    ht3 = ht3._filter_partitions(range(5))
-    ht = ht1.union(ht2).union(ht3)
-
+        ht2 = ht2.naive_coalesce(50)
+        ht2 = ht2._filter_partitions(range(5))
+        # Filter to ~8000 variants with p-value >= 10e-4
+        ht3 = ht.filter(
+            (ht.freq_comparison_stats.stat_union.p_value >= 10e-4)
+            & (
+                ht.freq_comparison_stats.stat_union.stat_test_name
+                == "cochran_mantel_haenszel_test"
+            )
+        )
+        ht3 = ht3._filter_partitions(range(5))
+        ht = ht1.union(ht2).union(ht3)
     # make this an interval table
     ht = ht.annotate(
         interval=hl.locus_interval(
@@ -77,29 +90,38 @@ def get_test_intervals(ht: hl.Table) -> hl.Table:
     return ht
 
 
-def densify_union_exomes_genomes(
-    exomes_vds: hl.vds.VariantDataset,
-    genomes_vds: hl.vds.VariantDataset,
+def filter_and_densify_vds(
+    vds: hl.vds.VariantDataset,
+    filtered_intervals: hl.Table,
+    data_type: str = "exomes",
 ) -> hl.MatrixTable:
     """
-    Union exomes and genomes MatrixTables.
+    Filter and densify the VDS.
 
-    :param exomes_vds: VDS of exomes
-    :param genomes_vds: VDS of genomes
-    :return: Unioned MatrixTable
+    :param vds: VDS to filter and densify.
+    :param filtered_intervals: Table of filtered variants.
+    :param data_type: exomes or genomes.
+    :return: Densified MatrixTable
     """
-    logger.info("Densifying exomes...")
-    exomes_mt = hl.vds.to_dense_mt(exomes_vds)
-    exomes_mt = exomes_mt.annotate_cols(is_genome=False)
-    exomes_mt = exomes_mt.select_entries("GT").select_rows().select_cols("is_genome")
-
-    logger.info("Densifying genomes...")
-    genomes_mt = hl.vds.to_dense_mt(genomes_vds)
-    genomes_mt = genomes_mt.annotate_cols(is_genome=True)
-    genomes_mt = genomes_mt.select_entries("GT").select_rows().select_cols("is_genome")
-
-    logger.info("Unioning exomes and genomes...")
-    mt = exomes_mt.union_cols(genomes_mt)
+    logger.info(f"Filtering and densifying {data_type} VDS...")
+    vds = hl.vds.filter_intervals(vds, filtered_intervals, split_reference_blocks=True)
+    rmt = vds.reference_data
+    vmt = vds.variant_data
+    # TODO: Check where 'LA', 'LAD', and 'LGT' went and why they were used in
+    # generate_freq. Was the VDS regenerated?
+    vmt = vmt.select_entries("AD", "DP", "GT", "GQ")
+    vmt = vmt.annotate_cols(
+        is_genome=True if data_type == "genomes" else False,
+    )
+    vds = hl.vds.VariantDataset(rmt, vmt)
+    vds = vds.checkpoint(
+        hl.utils.new_temp_file(f"temp_{data_type}_vds_filtered", "vds")
+    )
+    mt = hl.vds.to_dense_mt(vds)
+    mt = annotate_adj(mt)
+    # TODO: What do we do about the high-AB het calls?
+    # TODO: For testing, do not adjust by sex karyotype yet
+    mt = mt.select_entries("GT", "adj").select_cols("is_genome").select_rows()
     return mt
 
 
@@ -112,50 +134,70 @@ def main(args):
     )
 
     test = args.test
+    test_partitions = args.test_partitions
     overwrite = args.overwrite
+    try:
+        ht = release_sites(data_type="joint").ht()
+        # Maybe only filter to variants with a union p-value < 10e-4 later
 
-    exomes_vds = get_gnomad_v4_vds(
-        release_only=True,
-        split=True,
-    )
-    genomes_vds = get_gnomad_v4_genomes_vds(
-        release_only=True,
-        split=True,
-    )
-
-    ht = release_sites(data_type="joint").ht()
-    # Maybe only filter to variants with a union p-value < 10e-4 later
+        if test:
+            ht = get_test_intervals(ht, test_partitions=test_partitions)
+            ht = ht.checkpoint(hl.utils.new_temp_file("test_intervals", "ht"))
 
-    if test:
-        logger.info("Filtering to test intervals...")
-        ht = get_test_intervals(ht)
-        ht = ht.checkpoint(hl.utils.new_temp_file("test_intervals", "ht"))
-        exomes_vds = hl.vds.filter_intervals(
-            exomes_vds, ht, split_reference_blocks=True
+        exomes_vds = get_gnomad_v4_vds(
+            release_only=True,
+            split=True,
         )
-        genomes_vds = hl.vds.filter_intervals(
-            genomes_vds, ht, split_reference_blocks=True
+        genomes_vds = get_gnomad_v4_genomes_vds(
+            release_only=True,
+            split=True,
         )
-
-    mt = densify_union_exomes_genomes(exomes_vds, genomes_vds)
-    mt = mt.checkpoint(
-        hl.utils.new_temp_file("union_exomes_genomes", "mt"),
-    )
-
-    pc_scores = ancestry_pca_scores().ht()
-    mt = mt.annotate_cols(pc=pc_scores[mt.col_key].scores)
-
-    # Perform logistic regression
-    ht = hl.logistic_regression_rows(
-        "firth",
-        y=mt.is_genome,
-        x=mt.GT.n_alt_alleles(),
-        covariates=[1] + [mt.pc[i] for i in range(10)],
-    )
-
-    ht.write(
-        f"/{'test' if test else ''}_logistic_regression_results.ht", overwrite=overwrite
-    )
+        if args.filter_densify_exomes:
+            exomes_mt = filter_and_densify_vds(exomes_vds, ht, data_type="exomes")
+            exomes_mt.checkpoint(
+                "gs://gnomad-tmp-30day/qin/mt/filtered_exomes_dense.mt"
+            )
+        if args.filter_densify_genomes:
+            genomes_mt = filter_and_densify_vds(genomes_vds, ht, data_type="genomes")
+            genomes_mt.checkpoint(
+                "gs://gnomad-tmp-30day/qin/mt/filtered_genomes_dense.mt"
+            )
+        if args.union_dense_mts:
+            exomes_mt = hl.read_matrix_table(
"gs://gnomad-tmp-30day/qin/mt/filtered_exomes_dense.mt" + ) + genomes_mt = hl.read_matrix_table( + "gs://gnomad-tmp-30day/qin/mt/filtered_genomes_dense.mt" + ) + mt = hl.MatrixTable.union_rows(exomes_mt, genomes_mt) + mt.checkpoint( + "gs://gnomad-tmp-30day/qin/mt/unioned_exomes_genomes_dense.mt" + ) + if args.logistic_regression: + mt = hl.read_matrix_table( + "gs://gnomad-tmp-30day/qin/mt/unioned_exomes_genomes_dense.mt" + ) + # Annotate with ancestry PCs + pc_scores = ancestry_pca_scores().ht() + mt = mt.annotate_cols(pc=pc_scores[mt.col_key].scores) + # Perform logistic regression + # Checked that number of PCs used to assign ancestry on the joint exomes and genomes + # was 20 + ht = hl.logistic_regression_rows( + "firth", + y=mt.is_genome, + x=mt.GT.n_alt_alleles(), + covariates=[1] + [mt.pc[i] for i in range(21)], + ) + + ht.write( + f"/{'test' if test else ''}_logistic_regression_results.ht", + overwrite=overwrite, + ) + + finally: + logger.info("Copying log to logging bucket...") + hl.copy_log(get_logging_path("freq_logistic_regression")) def get_script_argument_parser() -> argparse.ArgumentParser: @@ -166,11 +208,36 @@ def get_script_argument_parser() -> argparse.ArgumentParser: help="Import VCFs and write MatrixTable", action="store_true", ) + parser.add_argument( + "--test-partitions", + help="Filter to test partitions", + action="store_true", + ) parser.add_argument( "--overwrite", help="Overwrite existing MatrixTable", action="store_true", ) + parser.add_argument( + "--filter-densify-exomes", + help="Filter and densify exomes VDS", + action="store_true", + ) + parser.add_argument( + "--filter-densify-genomes", + help="Filter and densify genomes VDS", + action="store_true", + ) + parser.add_argument( + "--union-dense-mts", + help="Union dense exomes and genomes MatrixTables", + action="store_true", + ) + parser.add_argument( + "--logistic-regression", + help="Perform logistic regression on unioned exomes and genomes MatrixTable", + action="store_true", + ) return parser