### This notebook focuses on generating a SurvIndel2 matrix table from the SurvIndel2 DUP VCF


In [1]:
%%configure -f
{"driverMemory": "6000M"}

In [2]:
# Hail is the only library this notebook needs.
import hail as hl
# Initialise Hail on the existing Spark context.
# NOTE(review): `sc` is not defined in this notebook; presumably it is injected by the Spark kernel — confirm.
hl.init(sc)

Starting Spark application


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.1.2-amzn-0
SparkUI available at
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.80-4ccfae1ff293
LOGGING: writing to

In [3]:
## Input resources used in this notebook
# SurvIndel2 duplication call set; imported below as a block-gzipped VCF.
survindel2_uri="DUP.for_hail.vcf.gz"
# Alternative whitelist BED, left disabled (its file name lacks the "autosome_only" part):
#whiltelist_region_bed_uri = "resources_broad_hg38_v0_wgs_calling_regions.hg38.merged.minus_excl_regions.bed"
# Whitelist BED used to restrict variants by locus in a later cell.
# NOTE: the variable name is misspelled ("whiltelist"); it is kept because a later cell refers to it.
whiltelist_region_bed_uri = "resources_broad_hg38_v0_wgs_calling_regions.hg38.merged.autosome_only-minus_excl_regions.bed"

# Sample list with a header column `s`; used to subset the VCF samples.
# NOTE(review): the variable name suggests release 1.4 but the file name says Release-1.3 — confirm.
release14_sample_txt_uri = "SG10K-SV-Release-1.3.samples.txt"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Load the SurvIndel2 VCF as a matrix table on GRCh38.
# force_bgz=True makes Hail read the .gz file as block-gzipped.
mt = hl.import_vcf(
    survindel2_uri,
    force_bgz=True,
    reference_genome="GRCh38",
)
mt.describe()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        SVTYPE: str, 
        END: int32, 
        N_SAMPLES: int32, 
        N_SVS: int32, 
        GCF: float64, 
        SVSIZE: int32
    }
----------------------------------------
Entry fields:
    'GT': call
    'FT': str
    'AR': int32
    'SR': int32
    'RR': int32
    'RS': int32
    'RE': int32
    'CN': int32
    'IL': int32
    'WR': int32
    'DFL': int32
    'DDL': int32
    'DDR': int32
    'DFR': int32
    'CID': int32
    'OW': int32
    'DHFC': float64
    'DHBFC': float64
    'DHFFC': float64
    'DHSP': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------

In [5]:
## Restrict the call set to the samples listed in the release sample file
sample_ht = hl.import_table(release14_sample_txt_uri)
sample_ht = sample_ht.key_by('s')

print(sample_ht.count()) ## 5487
print(mt.count()) ##

# Keep only the columns whose key `s` appears in the sample table.
mt = mt.semi_join_cols(sample_ht)

print(mt.count()) ##


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

5487
(43622, 5856)
(43622, 5487)
2024-04-11 09:16:44 Hail: INFO: Reading table without type imputation
  Loading field 's' as type str (not specified)
2024-04-11 09:16:58 Hail: INFO: Coerced sorted dataset
2024-04-11 09:17:14 Hail: INFO: Coerced sorted dataset

In [6]:
# Count variants per INFO/SVTYPE value as a sanity check on the imported call set.
svtype_counter = hl.agg.counter(mt.info.SVTYPE)
svtype_stats = mt.aggregate_rows(hl.struct(svtype_stat=svtype_counter))
print(svtype_stats.svtype_stat)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

frozendict({'DUP': 43622})
2024-04-11 09:17:30 Hail: INFO: Coerced sorted dataset

In [7]:
# Preview the first few variant rows (row fields only, no genotypes).
mt.rows().show(n=5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+---------------+-------------+-----------+----------+
| locus         | alleles       | rsid        |      qual | filters  |
+---------------+---------------+-------------+-----------+----------+
| locus<GRCh38> | array<str>    | str         |   float64 | set<str> |
+---------------+---------------+-------------+-----------+----------+
| chr1:10321    | ["C","<DUP>"] | "CLUSTER_0" | -1.00e+01 | NA       |
| chr1:54720    | ["C","<DUP>"] | "CLUSTER_1" | -1.00e+01 | NA       |
| chr1:66534    | ["T","<DUP>"] | "CLUSTER_2" | -1.00e+01 | NA       |
| chr1:83963    | ["A","<DUP>"] | "CLUSTER_3" | -1.00e+01 | NA       |
| chr1:99061    | ["C","<DUP>"] | "CLUSTER_4" | -1.00e+01 | NA       |
+---------------+---------------+-------------+-----------+----------+

+-------------+----------+----------------+------------+----------+
| info.SVTYPE | info.END | info.N_SAMPLES | info.N_SVS | info.GCF |
+-------------+----------+----------------+------------+----------+
| str         

In [8]:
# Per-genotype QC status, stored in the entry field FS ("PASS" / "FAIL").
#   - hom-ref genotypes pass when FORMAT/FT is "PASS"
#   - het and hom-var genotypes additionally need DHBFC > 1.3
# A missing predicate (e.g. a missing GT with FT == "PASS") leaves FS missing rather than "FAIL".
ft_pass = mt.FT == "PASS"
dup_depth_ok = mt.DHBFC > 1.3

genotype_status = (
    hl.case()
    .when(mt.GT.is_hom_ref() & ft_pass, "PASS")
    .when(mt.GT.is_het() & ft_pass & dup_depth_ok, "PASS")
    .when(mt.GT.is_hom_var() & ft_pass & dup_depth_ok, "PASS")
    .default("FAIL")
)

mt = mt.annotate_entries(FS=genotype_status)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Site-level QC: a variant site passes when at least one sample has FS == "PASS".
# Sites where no sample passes get site_is_pass == False.
entry_passes = mt.FS == "PASS"
mt = mt.annotate_rows(site_is_pass=hl.agg.any(entry_passes))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Define site-level filters.
#
# Each filter is a {'key': <FILTER label>, 'expr': <boolean row expression>} pair;
# the label is added to the row's `filters` set wherever the expression is True.
#
# The expressions must be built from fields of the rows table. Comparing the
# Table object itself to a Python bool (`mt.rows().select(...) == True`) is a
# plain Python comparison that is always False, so no label would ever be added.
#
# Only failure labels are listed: Hail represents FILTER=PASS as an empty
# `filters` set, and the next step keeps rows whose `filters` set is empty.
ht = mt.rows()

filters = [
    {'key': 'FAIL', 'expr': ~ht.site_is_pass},
]

# Compute, per row, the set of filter labels whose expression is True
ht_filters = ht.annotate(
    filters=hl.set(hl.filter(
        lambda x: hl.is_defined(x),
        [hl.or_missing(
            d['expr'],
            d['key']
        ) for d in filters]
    ))
)

# Merge the computed labels into the matrix table's FILTER set.
# A missing `filters` value is treated as "no filters applied" so the union is always defined.
mt = mt.annotate_rows(
    filters = hl.coalesce(mt.filters, hl.empty_set(hl.tstr)).union(ht_filters[mt.row_key].filters)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
## Later steps keep only sites with at least one hom-ref call and drop monomorphic sites,
## so every genotype that failed the per-genotype QC is set to missing (FORMAT/GT = `./.`).
## FS was computed earlier with the same FT / DHBFC rules: GT is kept where FS == "PASS"
## and becomes missing where FS is "FAIL" or missing.
genotype_passed = mt.FS == "PASS"
mt = mt.annotate_entries(GT=hl.or_missing(genotype_passed, mt.GT))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Preview the variant rows again to inspect the updated `filters` field.
mt.rows().show(n=10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+---------------+-------------+-----------+----------+
| locus         | alleles       | rsid        |      qual | filters  |
+---------------+---------------+-------------+-----------+----------+
| locus<GRCh38> | array<str>    | str         |   float64 | set<str> |
+---------------+---------------+-------------+-----------+----------+
| chr1:10321    | ["C","<DUP>"] | "CLUSTER_0" | -1.00e+01 | {}       |
| chr1:54720    | ["C","<DUP>"] | "CLUSTER_1" | -1.00e+01 | {}       |
| chr1:66534    | ["T","<DUP>"] | "CLUSTER_2" | -1.00e+01 | {}       |
| chr1:83963    | ["A","<DUP>"] | "CLUSTER_3" | -1.00e+01 | {}       |
| chr1:99061    | ["C","<DUP>"] | "CLUSTER_4" | -1.00e+01 | {}       |
| chr1:600342   | ["T","<DUP>"] | "CLUSTER_5" | -1.00e+01 | {}       |
| chr1:600743   | ["T","<DUP>"] | "CLUSTER_6" | -1.00e+01 | {}       |
| chr1:609338   | ["C","<DUP>"] | "CLUSTER_7" | -1.00e+01 | {}       |
| chr1:712566   | ["A","<DUP>"] | "CLUSTER_8" | -1.00e+01 | {}       |
| chr1

In [13]:
## Keep only the relevant variants:
## carry forward rows whose FILTER set is empty (no filter label applied to the site).
has_no_filter_label = hl.len(mt.filters) == 0
mt = mt.filter_rows(has_no_filter_label)
print(mt.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(43622, 5487)
2024-04-11 09:18:12 Hail: INFO: Coerced sorted dataset
2024-04-11 09:18:19 Hail: INFO: Coerced sorted dataset

In [14]:
# Remove monomorphic variants

# Run variant QC; this adds the `variant_qc` row struct (n_called is taken from it below)
mt = hl.variant_qc(mt)

# Per-site genotype counts over the called (non-missing) genotypes
mt = mt.annotate_rows(
    n_called = mt.variant_qc.n_called,
    n_hom_ref = hl.agg.count_where(mt.GT.is_hom_ref()),
    n_het = hl.agg.count_where(mt.GT.is_het()),
    n_hom_alt = hl.agg.count_where(mt.GT.is_hom_var())
)

mt = mt.annotate_rows(
   n_non_ref = mt.n_hom_alt + mt.n_het
)

# A site is monomorphic when every called genotype falls in the same class.
# This also drops sites with no called genotype at all (n_called == n_hom_ref == 0).
mt = mt.filter_rows(
    (mt.n_called == mt.n_hom_ref) |
    (mt.n_called == mt.n_het) |
    (mt.n_called == mt.n_hom_alt),
    keep = False
)

# One count() call returns (rows, cols). No entries are filtered in this notebook,
# so the entry count is rows * cols and entries() does not need to be materialised.
n_variants, n_samples = mt.count()
print("Samples: %d; Variants: %d; Entries: %d" % (n_samples, n_variants, n_variants * n_samples))


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Samples: 5487; Variants: 34316; Entries: 188291892
2024-04-11 09:18:38 Hail: INFO: Coerced sorted dataset
2024-04-11 09:18:44 Hail: INFO: Coerced sorted dataset
2024-04-11 09:18:51 Hail: INFO: Coerced sorted dataset
2024-04-11 09:19:11 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'
2024-04-11 09:19:17 Hail: INFO: Coerced sorted dataset
2024-04-11 09:19:23 Hail: INFO: Coerced sorted dataset

In [15]:
# FILTER: drop sites with no hom-ref call (every called genotype is het or hom-alt)
mt = mt.filter_rows(mt.n_called == mt.n_non_ref, keep = False)

# One count() call returns (rows, cols). No entries are filtered in this notebook,
# so the entry count is rows * cols and entries() does not need to be materialised.
n_variants, n_samples = mt.count()
print("Samples: %d; Variants: %d; Entries: %d" % (n_samples, n_variants, n_variants * n_samples))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Samples: 5487; Variants: 34304; Entries: 188226048
2024-04-11 09:19:51 Hail: INFO: Coerced sorted dataset
2024-04-11 09:19:57 Hail: INFO: Coerced sorted dataset
2024-04-11 09:20:03 Hail: INFO: Coerced sorted dataset
2024-04-11 09:20:30 Hail: INFO: Coerced sorted dataset
2024-04-11 09:20:36 Hail: INFO: Coerced sorted dataset

In [16]:
##
## Final variant filters
##  - size: keep variants whose INFO/SVSIZE is undefined, or between 50 bp and 10,000,000 bp inclusive
##  - region: keep variants whose locus falls inside the predefined whitelist regions
##

print(mt.count()) ##

## Load the predefined whitelist regions
whitelist_region = hl.import_bed(whiltelist_region_bed_uri, reference_genome='GRCh38')

MIN_SV_SIZE_BP = 50
MAX_SV_SIZE_BP = 10000000

## Both bounds must hold together. Joining them with `|` would accept every defined
## size (any number is >= 50 or <= 10,000,000), which makes the size filter a no-op.
## NOTE(review): this assumes SVSIZE is stored as a non-negative length — confirm.
size_is_undefined = ~hl.is_defined(mt.info.SVSIZE)
size_in_range = (mt.info.SVSIZE >= MIN_SV_SIZE_BP) & (mt.info.SVSIZE <= MAX_SV_SIZE_BP)

## Only the variant's locus (its start position) is looked up in the whitelist intervals.
in_whitelist = hl.is_defined(whitelist_region[mt.locus])

mt = mt.filter_rows((size_is_undefined | size_in_range) & in_whitelist, keep=True)

print(mt.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(34304, 5487)
(32465, 5487)
2024-04-11 09:21:02 Hail: INFO: Coerced sorted dataset
2024-04-11 09:21:08 Hail: INFO: Coerced sorted dataset
2024-04-11 09:21:28 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2024-04-11 09:21:35 Hail: INFO: Coerced sorted dataset
2024-04-11 09:21:40 Hail: INFO: Coerced sorted dataset
2024-04-11 09:21:41 Hail: INFO: Coerced sorted dataset

In [17]:

## Write the resulting matrix table to disk.
## NOTE: the "n5487" / "m32465" parts of the name are hardcoded and match the (rows, cols)
## count printed by the previous cell; update them if the upstream filtering changes.
release14_surv_mt_uri =   "SG10K_SV_SURVINDEL2.n5487.m32465.discovery.mt"

# overwrite=True replaces any existing matrix table at this path.
mt.write(release14_surv_mt_uri, overwrite=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2024-04-11 09:30:24 Hail: INFO: Coerced sorted dataset
2024-04-11 09:30:33 Hail: INFO: Coerced sorted dataset
2024-04-11 09:30:34 Hail: INFO: Coerced sorted dataset
2024-04-11 09:32:00 Hail: INFO: wrote matrix table with 32465 rows and 5487 columns in 22 partitions to SG10K_SV_SURVINDEL2.n5487.m32465.discovery.mt
    Total size: 2.47 GiB
    * Rows/entries: 2.47 GiB
    * Columns: 21.49 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  1646 rows (128.85 MiB)