### This notebook focuses on generating a MELT matrix table from MELT VCFs


In [1]:
%%configure -f
{"driverMemory": "6000M"}

In [2]:
import hail as hl
# Initialise Hail on top of the already-running Spark context `sc`.
# NOTE(review): `sc` is not defined in this notebook; presumably it is injected
# by the Spark kernel (the cell output reports a SparkSession as 'spark') — confirm.
hl.init(sc)

Starting Spark application


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.1.2-amzn-0
SparkUI available at
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.80-4ccfae1ff293
LOGGING: writing to 

In [3]:
## List of all input resources used in this notebook

## MELT VCFs, one per mobile-element family (SVA, LINE1, ALU — per the file names)
release_10_melt_sva_vcf_uri = "SVA.final_comp.vcf.gz"
release_10_melt_line_vcf_uri = "LINE1.final_comp.vcf.gz"
release_10_melt_alu_vcf_uri = "ALU.final_comp.vcf.gz"

## Whitelist-region BED used to restrict variants.
## The commented-out line is the alternative BED that, judging by its name, is not
## restricted to autosomes; the active one is the "autosome_only" variant.
## NOTE(review): the variable name is misspelled ("whiltelist"); it is kept as is
## because later cells reference it by this exact name.
#whiltelist_region_bed_uri = "resources_broad_hg38_v0_wgs_calling_regions.hg38.merged.minus_excl_regions.bed"
whiltelist_region_bed_uri = "resources_broad_hg38_v0_wgs_calling_regions.hg38.merged.autosome_only-minus_excl_regions.bed"

## Text table of the samples to keep (read later with a column named 's').
## NOTE(review): the variable says "release14" while the file says "Release-1.3" — confirm
## which release this sample list belongs to.
release14_sample_txt_uri = "SG10K-SV-Release-1.3.samples.txt"


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Import the three MELT VCFs (SVA, LINE1, ALU) as Hail matrix tables.
# All three are read against GRCh38; force_bgz=True makes Hail treat the
# .vcf.gz inputs as block-gzipped files.
sva_mt = hl.import_vcf(
    release_10_melt_sva_vcf_uri,
    reference_genome="GRCh38",
    force_bgz=True,
)
line_mt = hl.import_vcf(
    release_10_melt_line_vcf_uri,
    reference_genome="GRCh38",
    force_bgz=True,
)
alu_mt = hl.import_vcf(
    release_10_melt_alu_vcf_uri,
    reference_genome="GRCh38",
    force_bgz=True,
)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
##
## Merge the MELT MTs after restricting them to the relevant samples
##     remove ".bqsr" from VCF sample names
##     remove INFO/ISTP
##

## filter for relevant samples
def clean_melt_sample(sample_ht, mt):
    """Normalise sample IDs of a MELT matrix table and keep only listed samples.

    The column field ``s`` is rewritten with every literal ``.bqsr`` substring
    removed, the matrix table is re-keyed by the cleaned ``s``, and columns
    whose key has no matching row in ``sample_ht`` are dropped.

    Parameters
    ----------
    sample_ht
        Table that is indexed with the matrix table's column key; columns
        without a match in it are filtered out.
    mt
        Matrix table with a string column field ``s``.

    Returns
    -------
    The matrix table keyed by the cleaned ``s`` and restricted to the samples
    found in ``sample_ht``.
    """
    # Drop the column key first so that the field `s` can be overwritten.
    mt = mt.key_cols_by()
    # Hail's StringExpression.replace() interprets its first argument as a
    # (Java) regular expression, so the dot must be escaped: an unescaped
    # ".bqsr" would also strip e.g. "Xbqsr" from a sample name.
    mt2 = mt.annotate_cols(s = mt.s.replace(r"\.bqsr", ""))
    mt2 = mt2.key_cols_by('s')
    # Keep only the columns whose (cleaned) key is present in sample_ht.
    mt2 = mt2.filter_cols(hl.is_defined(sample_ht[mt2.col_key]))
    return mt2

## Sample list, keyed by the sample ID field 's' so it can be joined on the column key
sample_ht = hl.import_table(release14_sample_txt_uri)
sample_ht = sample_ht.key_by('s')

## Clean each matrix table with an explicit re-assignment
## (author's note: a list comprehension over the MTs did not work in this context)
line_mt = clean_melt_sample(sample_ht, line_mt)
alu_mt = clean_melt_sample(sample_ht, alu_mt)
sva_mt = clean_melt_sample(sample_ht, sva_mt)

## Drop INFO/ISTP from the LINE1 table only (the other two tables are left untouched)
line_mt = line_mt.annotate_rows(
    info = line_mt.info.drop('ISTP')
)

## Stack the rows of the three tables into a single matrix table
melt_mts = [line_mt, alu_mt, sva_mt]
mt = melt_mts[0].union_rows(*melt_mts[1:])

## Show the merged schema, then its (rows, columns) dimensions
mt.describe()
mt.count()



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        ASSESS: int32, 
        TSD: str, 
        INTERNAL: array<str>, 
        SVTYPE: str, 
        SVLEN: int32, 
        MEINFO: array<str>, 
        DIFF: array<str>, 
        LP: int32, 
        RP: int32, 
        RA: float64, 
        PRIOR: str, 
        SR: int32
    }
----------------------------------------
Entry fields:
    'GT': call
    'GL': array<float64>
    'DP': int32
    'AD': int32
    'PL': array<int32>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
(168965, 5487)
2024-04-11 09:01:34 Hail: INFO: Reading table without type imputation
  Loading field 's' as type str 

In [6]:
##
## Filter relevant variants
##
## 2- FILTER holds "PASS" or a failure reason: only FILTER=PASS records are carried forward
## 5- monomorphic records are present: only records with at least one hom-ref call
##    and more than one distinct called genotype are carried forward
## 7- some SVs fall outside the predefined whitelist regions (i.e. low-complexity,
##    telomere, centromere, ...): only loci inside the whitelist are carried forward

print(mt.count()) ## (168965, 5487)

## Load the predefined whitelist regions (i.e. not low-complexity, telomere, centromere, ...)
whitelist_region = hl.import_bed(whiltelist_region_bed_uri, reference_genome='GRCh38')

## Build each row-level criterion separately, then combine them.

## FILTER=PASS: the `filters` set is empty
row_is_pass = mt.filters.length() == 0

## at least one sample is hom-ref
row_has_hom_ref = hl.agg.any(mt.GT.is_hom_ref())

## polymorphic: count the distinct GT values seen in the row; when some calls are
## missing, the missing value takes one slot in the counter, so the threshold is
## raised by one (NA + at least 2 of 0/0, 0/1, 1/1; otherwise at least 2 of them)
row_has_missing_gt = hl.agg.any(hl.is_missing(mt.GT))
n_distinct_gt = hl.agg.counter(mt.GT).size()
row_is_polymorphic = n_distinct_gt > hl.if_else(row_has_missing_gt, 2, 1)

## locus lies inside a whitelist interval
row_in_whitelist = hl.is_defined(whitelist_region[mt.locus])

mt = mt.filter_rows(
    row_is_pass & row_has_hom_ref & row_is_polymorphic & row_in_whitelist,
    keep=True,
)

print(mt.count()) ## (24674, 5487)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(168965, 5487)
(24674, 5487)
2024-04-11 09:03:32 Hail: INFO: Ordering unsorted dataset with shuffle
2024-04-11 09:03:40 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-04-11 09:03:48 Hail: INFO: Ordering unsorted dataset with shuffle
2024-04-11 09:03:56 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-04-11 09:03:59 Hail: INFO: Ordering unsorted dataset with shuffle
2024-04-11 09:04:01 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-04-11 09:04:12 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2024-04-11 09:04:21 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-04-11 09:04:29 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-04-11 09:04:32 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-04-11 09:04:32 Hail: INFO: Coerced sorted dataset

In [8]:
## Write the resulting (merged and filtered) matrix table.
## NOTE(review): the "n5487" / "m24674" parts of the file name are hard-coded and match
## the (rows, columns) = (24674, 5487) printed after filtering; rename the output if the
## inputs or filters change.
release14_melt_mt_uri =   "SG10K_SV_MELT.n5487.m24674.discovery.mt"

## overwrite=True is passed, so an existing table at this path is replaced
mt.write(release14_melt_mt_uri, overwrite=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2024-04-11 09:05:46 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-04-11 09:05:54 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-04-11 09:05:56 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-04-11 09:05:56 Hail: INFO: Coerced sorted dataset
2024-04-11 09:07:50 Hail: INFO: wrote matrix table with 24674 rows and 5487 columns in 16 partitions to SG10K_SV_MELT.n5487.m24674.discovery.mt
    Total size: 413.81 MiB
    * Rows/entries: 413.79 MiB
    * Columns: 20.33 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  2355 rows (38.41 MiB)