## This notebook focuses on generating MELT matrix tables from the MELT VCFs for the validation datasets
- The MELT VCFs contain samples from both the 15x and the 30x validation sets
- So the samples are first split by coverage, and each subset is then processed separately

In [1]:
%%configure -f
{"driverMemory": "6000M"}

In [2]:
import hail as hl
# Initialise Hail on an existing Spark context.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is the
# SparkContext injected by the Spark kernel -- confirm before running elsewhere.
hl.init(sc)

Starting Spark application


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.1.2-amzn-0
SparkUI available at
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.80-4ccfae1ff293
LOGGING: writing to 

In [3]:
## All external resources used in this notebook are declared here.

## Sample lists: one text file per validation coverage (15x / 30x)
validation15x_sample_txt_uri = "SG10K-SV-Release-1.3_15xValidation.samples.txt"
validation30x_sample_txt_uri = "SG10K-SV-Release-1.3_30xValidation.samples.txt"

## Whitelist regions (BED). NOTE: the variable name is misspelled ("whiltelist"),
## but it is kept as-is because later cells reference it under this exact name.
whiltelist_region_bed_uri = "resources_broad_hg38_v0_wgs_calling_regions.hg38.merged.autosome_only-minus_excl_regions.bed"

## MELT VCFs: ALU, LINE1 and SVA
release_10_melt_alu_vcf_uri = "ALU.final_comp.sorted.vcf.gz"
release_10_melt_line_vcf_uri = "LINE1.final_comp.sorted.vcf.gz"
release_10_melt_sva_vcf_uri = "SVA.final_comp.sorted.vcf.gz"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Load the three MELT VCFs (SVA, LINE1, ALU) as matrix tables on GRCh38.
# force_bgz=True tells Hail to read the .gz files as block-gzipped input.
sva_mt = hl.import_vcf(
    release_10_melt_sva_vcf_uri,
    force_bgz=True,
    reference_genome="GRCh38",
)
line_mt = hl.import_vcf(
    release_10_melt_line_vcf_uri,
    force_bgz=True,
    reference_genome="GRCh38",
)
alu_mt = hl.import_vcf(
    release_10_melt_alu_vcf_uri,
    force_bgz=True,
    reference_genome="GRCh38",
)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## 15x Validation

In [5]:
## filter for relevant samples 15x
def clean_melt_sample(sample_ht, mt):
    """Keep only the columns (samples) of ``mt`` that are listed in ``sample_ht``.

    The literal ``.bqsr`` tag is removed from every sample ID before matching,
    and the returned matrix table is keyed by the cleaned ID.

    Parameters
    ----------
    sample_ht
        Hail table that is indexed with the column key of the cleaned matrix
        table (the callers in this notebook key it by ``s``).
    mt
        Hail matrix table with a string column field ``s``.

    Returns
    -------
    The matrix table with ``s`` cleaned, columns keyed by ``s``, and restricted
    to the samples that have a row in ``sample_ht``.
    """
    # Un-key the columns first so that the key field `s` can be overwritten.
    unkeyed_mt = mt.key_cols_by()
    # Hail's StringExpression.replace() interprets its first argument as a Java regex,
    # so the dot is escaped: an unescaped ".bqsr" would match ANY character followed
    # by "bqsr" and could strip a legitimate character from a sample ID.
    renamed_mt = unkeyed_mt.annotate_cols(s=unkeyed_mt.s.replace(r"\.bqsr", ""))
    rekeyed_mt = renamed_mt.key_cols_by('s')
    # Indexing sample_ht by the column key is missing for samples that are not listed.
    return rekeyed_mt.filter_cols(hl.is_defined(sample_ht[rekeyed_mt.col_key]))

## remove INFO/ISTP from the LINE1 table.
## line_mt is overwritten in place, so the drop is guarded: without the guard, re-running
## this cell fails because the field has already been removed.
## NOTE(review): presumably ISTP exists only in the LINE1 VCF and would otherwise make the
## row schemas differ for union_rows below -- confirm.
if 'ISTP' in line_mt.info.dtype.fields:
    line_mt = line_mt.annotate_rows(info = line_mt.info.drop('ISTP'))

## 15x validation sample list, keyed by the sample ID column 's'
sample_ht = hl.import_table(validation15x_sample_txt_uri).key_by('s')

## NOTE: a list comprehension such as
##   [clean_melt_sample(sample_ht, mt) for mt in melt_mts]
## did not work in this context, hence the three explicit calls.
line_15x_mt = clean_melt_sample(sample_ht, line_mt)
alu_15x_mt = clean_melt_sample(sample_ht, alu_mt)
sva_15x_mt = clean_melt_sample(sample_ht, sva_mt)


## merge the three MELT tables row-wise (variants) into a single 15x table
melt_15x_mts = [line_15x_mt, alu_15x_mt, sva_15x_mt]
mt15 = hl.MatrixTable.union_rows(*melt_15x_mts)
mt15.describe()
mt15.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        ASSESS: int32, 
        TSD: str, 
        INTERNAL: array<str>, 
        SVTYPE: str, 
        SVLEN: int32, 
        MEINFO: array<str>, 
        DIFF: array<str>, 
        LP: int32, 
        RP: int32, 
        RA: float64, 
        PRIOR: str, 
        SR: int32
    }
----------------------------------------
Entry fields:
    'GT': call
    'GL': array<float64>
    'DP': int32
    'AD': int32
    'PL': array<int32>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
(168965, 1523)
2024-05-06 07:42:21 Hail: INFO: Reading table without type imputation
  Loading field 's' as type str 

In [6]:
## (rows, columns) of the merged 15x table -- the same call that ends the previous cell
mt15.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(168965, 1523)
2024-05-06 07:45:04 Hail: INFO: Ordering unsorted dataset with shuffle
2024-05-06 07:45:09 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:45:19 Hail: INFO: Ordering unsorted dataset with shuffle
2024-05-06 07:45:28 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:45:30 Hail: INFO: Ordering unsorted dataset with shuffle
2024-05-06 07:45:31 Hail: INFO: Ordering unsorted dataset with network shuffle

In [7]:
##
## filter relevant variants
##
## 2- the table contains "PASS" and "{fail}" FILTER entries: only "FILTER=PASS" entries are carried forward
## 5- the table contains monomorphic entries: only entries with at least one hom-ref are carried forward, and monomorphic entries are dropped
## 7- the table contains SVs outside of our predefined whitelist regions (i.e. not low-complexity, telomere, centromere, ...)
print(mt15.count()) ## dimensions before filtering


## load the predefined whitelist regions (i.e. not low-complexity, telomere, centromere, ...)
whitelist_region = hl.import_bed(whiltelist_region_bed_uri, reference_genome='GRCh38')

## row-level predicates, named individually so each criterion can be read on its own
is_pass_15 = mt15.filters.length() == 0                     ## only "FILTER=PASS" entries
has_hom_ref_15 = hl.agg.any(mt15.GT.is_hom_ref())           ## at least one hom-ref genotype
has_missing_gt_15 = hl.agg.any(hl.is_missing(mt15.GT))
n_gt_classes_15 = hl.agg.counter(mt15.GT).size()            ## number of distinct GT values in the row
## polymorphic entries only:
##   when GT contains NA: NA + at least 2 of 0/0, 0/1, 1/1
##   otherwise:           at least 2 of 0/0, 0/1, 1/1
is_polymorphic_15 = hl.if_else(has_missing_gt_15,
                               n_gt_classes_15 > 2,
                               n_gt_classes_15 > 1)
in_whitelist_15 = hl.is_defined(whitelist_region[mt15.locus])   ## SV locus lies in a whitelist region

## keep the rows that satisfy every predicate
mt15 = mt15.filter_rows(is_pass_15 & has_hom_ref_15 & is_polymorphic_15 & in_whitelist_15)

print(mt15.count()) ## dimensions after filtering

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(168965, 1523)
(12556, 1523)
2024-05-06 07:47:32 Hail: INFO: Ordering unsorted dataset with shuffle
2024-05-06 07:47:38 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:47:57 Hail: INFO: Ordering unsorted dataset with shuffle
2024-05-06 07:48:08 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:48:09 Hail: INFO: Ordering unsorted dataset with shuffle
2024-05-06 07:48:11 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:48:25 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2024-05-06 07:48:34 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:48:43 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:48:45 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:48:45 Hail: INFO: Coerced sorted dataset

In [8]:
## write the resulting mt
## The n/m numbers in the file name are hard-coded; they mirror the (rows, columns) count
## printed after filtering (m = 12556 variants, n = 1523 samples) and must be updated by
## hand if the inputs or filters change.
release14_15xv_melt_mt_uri =   "SG10K_SV_MELT_validation15x.n1523.m12556.15xvalidation.mt"

## overwrite=False: presumably the write fails instead of replacing an existing table at
## this path, so a re-run needs the old output removed first -- confirm against the Hail docs.
mt15.write(release14_15xv_melt_mt_uri, overwrite=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2024-05-06 07:51:53 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:52:13 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:52:14 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 07:52:15 Hail: INFO: Coerced sorted dataset
2024-05-06 07:53:51 Hail: INFO: wrote matrix table with 12556 rows and 1523 columns in 13 partitions to SG10K_SV_MELT_validation15x.n1523.m12556.15xvalidation.mt
    Total size: 77.04 MiB
    * Rows/entries: 77.04 MiB
    * Columns: 5.97 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  1519 rows (9.10 MiB)

## 30x validation

In [9]:
## filter for relevant samples 30x
## clean_melt_sample() is defined once, in the 15x section above, and is reused here
## instead of being re-defined verbatim (a second identical `def` only shadows the first).
## This cell also relies on the 15x section for line_mt, which is used as left there,
## i.e. with INFO/ISTP already removed.

## 30x validation sample list, keyed by the sample ID column 's'
sample_30_ht = hl.import_table(validation30x_sample_txt_uri).key_by('s')

## NOTE: a list comprehension such as
##   [clean_melt_sample(sample_30_ht, mt) for mt in melt_mts]
## did not work in this context, hence the three explicit calls.
line_30x_mt = clean_melt_sample(sample_30_ht, line_mt)
alu_30x_mt = clean_melt_sample(sample_30_ht, alu_mt)
sva_30x_mt = clean_melt_sample(sample_30_ht, sva_mt)


## merge the three MELT tables row-wise (variants) into a single 30x table
melt_30x_mts = [line_30x_mt, alu_30x_mt, sva_30x_mt]
mt30 = hl.MatrixTable.union_rows(*melt_30x_mts)
mt30.describe()
mt30.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        ASSESS: int32, 
        TSD: str, 
        INTERNAL: array<str>, 
        SVTYPE: str, 
        SVLEN: int32, 
        MEINFO: array<str>, 
        DIFF: array<str>, 
        LP: int32, 
        RP: int32, 
        RA: float64, 
        PRIOR: str, 
        SR: int32
    }
----------------------------------------
Entry fields:
    'GT': call
    'GL': array<float64>
    'DP': int32
    'AD': int32
    'PL': array<int32>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
(168965, 1922)
2024-05-06 07:57:01 Hail: INFO: Reading table without type imputation
  Loading field 's' as type str 

In [10]:
##
## filter relevant variants
##
## 2- the table contains "PASS" and "{fail}" FILTER entries: only "FILTER=PASS" entries are carried forward
## 5- the table contains monomorphic entries: only entries with at least one hom-ref are carried forward, and monomorphic entries are dropped
## 7- the table contains SVs outside of our predefined whitelist regions (i.e. not low-complexity, telomere, centromere, ...)

print(mt30.count()) ## dimensions before filtering


## load the predefined whitelist regions (i.e. not low-complexity, telomere, centromere, ...)
whitelist_region = hl.import_bed(whiltelist_region_bed_uri, reference_genome='GRCh38')

## row-level predicates, named individually so each criterion can be read on its own
is_pass_30 = mt30.filters.length() == 0                     ## only "FILTER=PASS" entries
has_hom_ref_30 = hl.agg.any(mt30.GT.is_hom_ref())           ## at least one hom-ref genotype
has_missing_gt_30 = hl.agg.any(hl.is_missing(mt30.GT))
n_gt_classes_30 = hl.agg.counter(mt30.GT).size()            ## number of distinct GT values in the row
## polymorphic entries only:
##   when GT contains NA: NA + at least 2 of 0/0, 0/1, 1/1
##   otherwise:           at least 2 of 0/0, 0/1, 1/1
is_polymorphic_30 = hl.if_else(has_missing_gt_30,
                               n_gt_classes_30 > 2,
                               n_gt_classes_30 > 1)
in_whitelist_30 = hl.is_defined(whitelist_region[mt30.locus])   ## SV locus lies in a whitelist region

## keep the rows that satisfy every predicate
mt30 = mt30.filter_rows(is_pass_30 & has_hom_ref_30 & is_polymorphic_30 & in_whitelist_30)

print(mt30.count()) ## dimensions after filtering

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(168965, 1922)
(14025, 1922)
2024-05-06 08:00:05 Hail: INFO: Ordering unsorted dataset with shuffle
2024-05-06 08:00:12 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 08:00:31 Hail: INFO: Ordering unsorted dataset with shuffle
2024-05-06 08:00:41 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 08:00:43 Hail: INFO: Ordering unsorted dataset with shuffle
2024-05-06 08:00:45 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 08:00:58 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2024-05-06 08:01:04 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 08:01:13 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 08:01:15 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 08:01:15 Hail: INFO: Coerced sorted dataset

In [11]:
## write the resulting mt
## The n/m numbers in the file name are hard-coded; they mirror the (rows, columns) count
## printed after filtering (m = 14025 variants, n = 1922 samples) and must be updated by
## hand if the inputs or filters change.
release14_30xv_melt_mt_uri =   "SG10K_SV_MELT_validation30x.n1922.m14025.30xvalidation.mt"

## overwrite=False: presumably the write fails instead of replacing an existing table at
## this path, so a re-run needs the old output removed first -- confirm against the Hail docs.
mt30.write(release14_30xv_melt_mt_uri, overwrite=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2024-05-06 08:24:40 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 08:24:59 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 08:25:00 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-05-06 08:25:01 Hail: INFO: Coerced sorted dataset
2024-05-06 08:26:42 Hail: INFO: wrote matrix table with 14025 rows and 1922 columns in 13 partitions to SG10K_SV_MELT_validation30x.n1922.m14025.30xvalidation.mt
    Total size: 109.98 MiB
    * Rows/entries: 109.97 MiB
    * Columns: 7.52 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  1743 rows (13.35 MiB)