# Merge variants

In this notebook, we practice merging a small number of AoU and UKB variants.

# Setup 

In [None]:
from datetime import datetime
import hail as hl
import os
import time

## Define constants

<div class="alert alert-block alert-info">
<b>Note:</b> The AoU matrix table for the alpha1 release was created via notebook 'Hail Demo' and then moved to a better place within the workspace bucket. It contains all samples and variants for the alpha1 release.
</div>

In [None]:
# Path of the AoU alpha1 cohort matrix table inside the workspace bucket.
AOU_MT = '{bucket}/data/aou/alpha1/cohort.mt'.format(bucket=os.getenv("WORKSPACE_BUCKET"))

<div class="alert alert-block alert-info">
<b>Note:</b> The UKB matrix table was created via notebook 'create_matrix_tables'. It contains data for all samples within region <kbd>chr21:10M-chr21:20M</kbd>.
</div>

In [None]:
# Path of the UKB exomes chr21 matrix table inside the workspace bucket.
UKB_MT = '{bucket}/data/ukb/exomes/cohort_chr21.mt'.format(bucket=os.getenv("WORKSPACE_BUCKET"))

In [None]:
# Path of the BED file listing the UKB exome capture regions.
EXOME_REGIONS = '{bucket}/data/ukb/exomes/xgen_plus_spikein.GRCh38.bed'.format(
    bucket=os.getenv("WORKSPACE_BUCKET"))

In [None]:
# Read the clock once so DATESTAMP and TIMESTAMP always describe the same
# instant, even if this cell happens to run across midnight.
_RUN_STARTED = time.localtime()

RESULT_BUCKET = os.getenv("WORKSPACE_BUCKET")
DATESTAMP = time.strftime('%Y%m%d', _RUN_STARTED)
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S', _RUN_STARTED)
# Current working directory, kept as a one-element list so that existing
# `WORK_DIR[0]` lookups keep working.
WORK_DIR = [os.getcwd()]

# Outputs: all merge results go under a per-day folder in the result bucket.
MERGED_MT = f'{RESULT_BUCKET}/data/merged/{DATESTAMP}/merged.mt'
AOU_ONLY_MT = f'{RESULT_BUCKET}/data/merged/{DATESTAMP}/aou_only.mt'
UKB_ONLY_MT = f'{RESULT_BUCKET}/data/merged/{DATESTAMP}/ukb_only.mt'
# The Hail log is written next to the notebook, one file per run.
HAIL_LOG = f'{WORK_DIR[0]}/hail-merge-variants-{TIMESTAMP}.log'

## Check access

In [None]:
# List the AoU matrix table path with gsutil to confirm it is readable from this environment.
!gsutil ls {AOU_MT}

In [None]:
# List the UKB matrix table path with gsutil to confirm it is readable from this environment.
!gsutil ls {UKB_MT}

## Start Hail 

In [None]:
# The driver and the executors are given the same extra JVM option.
_JVM_EXTRA_OPTIONS = '-Xss4M'

# Extra Spark settings passed to hl.init(). Built from ordered pairs so the
# key order matches the order in which the settings are listed here.
EXTRA_SPARK_CONFIG = dict([
    ('spark.driver.extraJavaOptions', _JVM_EXTRA_OPTIONS),
    ('spark.driver.maxResultSize', '50G'),
    ('spark.driver.memory', '90G'),
    ('spark.executor.extraJavaOptions', _JVM_EXTRA_OPTIONS),
    ('spark.kryoserializer.buffer.max', '1G'),
    ('spark.memory.fraction', '0.33'),
    ('spark.network.timeout', '300'),
    ('spark.speculation', 'true'),
    ('spark.speculation.quantile', '0.95'),
    ('spark.task.maxFailures', '20'),
])

In [None]:
# Start Hail with the extra Spark settings defined above, GRCh38 as the
# default reference genome, and the per-run log file HAIL_LOG.
hl.init(spark_conf=EXTRA_SPARK_CONFIG,
        min_block_size=50,
        default_reference='GRCh38',
        log=HAIL_LOG)

# Load exome capture regions

In [None]:
# Import the exome capture regions from the BED file at EXOME_REGIONS.
ukb_exome_capture_regions = hl.import_bed(EXOME_REGIONS)

In [None]:
# Display the schema of the imported capture regions.
ukb_exome_capture_regions.describe()

In [None]:
# Preview the first 5 capture regions.
ukb_exome_capture_regions.show(5)

## Temporary - limit to a small subset of chr21

In [None]:
# Small chr21-only window used to keep this practice run cheap. Both endpoints
# must be on chr21: an end point on chr22 would make the interval cover the
# rest of chr21 plus the first 11 Mb of chr22.
chr21_interval = hl.parse_locus_interval("chr21:10M-chr21:11M")

In [None]:
# Keep only the capture regions that overlap the temporary interval.
region_overlaps_interval = chr21_interval.overlaps(ukb_exome_capture_regions.interval)
ukb_exome_capture_regions = ukb_exome_capture_regions.filter(region_overlaps_interval)

In [None]:
# Preview the first 5 capture regions that remain after the interval filter.
ukb_exome_capture_regions.show(5)

# Read AoU matrix table

In [None]:
# Open the AoU matrix table stored at AOU_MT.
aou_wgs = hl.read_matrix_table(AOU_MT)

In [None]:
# Display the schema of the AoU matrix table.
aou_wgs.describe()

In [None]:
# Keep only the AoU rows whose locus has a match in the capture regions table.
aou_locus_in_capture_region = hl.is_defined(ukb_exome_capture_regions[aou_wgs.locus])
aou_wgs = aou_wgs.filter_rows(aou_locus_in_capture_region)

In [None]:
# Size of the AoU matrix table after the capture-region filter.
aou_wgs.count()

In [None]:
# Summarize the AoU variants that remain after the capture-region filter.
hl.summarize_variants(aou_wgs)

# Read UKB exomes matrix table

In [None]:
# Open the UKB exomes matrix table stored at UKB_MT.
ukb_exomes = hl.read_matrix_table(UKB_MT)

In [None]:
# Display the schema of the UKB matrix table.
ukb_exomes.describe()

In [None]:
# Keep only the UKB rows whose locus has a match in the capture regions table.
ukb_locus_in_capture_region = hl.is_defined(ukb_exome_capture_regions[ukb_exomes.locus])
ukb_exomes = ukb_exomes.filter_rows(ukb_locus_in_capture_region)

In [None]:
# Size of the UKB matrix table after the capture-region filter.
ukb_exomes.count()

In [None]:
# Summarize the UKB variants that remain after the capture-region filter.
hl.summarize_variants(ukb_exomes)

# Examine the data

## Are the row keys unique?

We expect them to be unique in these two joint-called datasets!

**Answer**: yes

In [None]:
# Count how many AoU rows share each row key.
aou_rows = aou_wgs.rows()
aou_rows_by_key = aou_rows.group_by(*aou_wgs.row_key)
aou_row_key_counts = aou_rows_by_key.aggregate(n=hl.agg.count())

In [None]:
# Number of AoU row keys that occur more than once; 0 means the row keys are unique.
aou_row_key_counts.filter(aou_row_key_counts.n > 1).count()

In [None]:
# Count how many UKB rows share each row key.
ukb_rows = ukb_exomes.rows()
ukb_rows_by_key = ukb_rows.group_by(*ukb_exomes.row_key)
ukb_row_key_counts = ukb_rows_by_key.aggregate(n=hl.agg.count())

In [None]:
# Number of UKB row keys that occur more than once; 0 means the row keys are unique.
ukb_row_key_counts.filter(ukb_row_key_counts.n > 1).count()

## Are the loci unique?

**Answer**: yes for AoU, no for UKB

In [None]:
# Number of AoU rows at each locus, returned to Python as a mapping.
aou_rows_per_locus = hl.agg.group_by(aou_wgs.locus, hl.agg.count())
aou_loci_counts = aou_wgs.aggregate_rows(aou_rows_per_locus)

In [None]:
# Keep only the loci that appear on more than one AoU row.
aou_duplicate_loci = {
    locus: n_rows
    for locus, n_rows in aou_loci_counts.items()
    if n_rows > 1
}
aou_duplicate_loci

In [None]:
# Number of UKB rows at each locus, returned to Python as a mapping.
ukb_rows_per_locus = hl.agg.group_by(ukb_exomes.locus, hl.agg.count())
ukb_loci_counts = ukb_exomes.aggregate_rows(ukb_rows_per_locus)

In [None]:
# Keep only the loci that appear on more than one UKB row.
ukb_duplicate_loci = {
    locus: n_rows
    for locus, n_rows in ukb_loci_counts.items()
    if n_rows > 1
}
ukb_duplicate_loci

In [None]:
# Show the UKB rows at up to the first three duplicated loci. Slicing (rather
# than indexing [0], [1], [2] in separate cells) keeps this from raising
# IndexError when fewer than three loci are duplicated.
for duplicate_locus in list(ukb_duplicate_loci.keys())[:3]:
    ukb_exomes.filter_rows(ukb_exomes.locus == duplicate_locus).show()

## What fraction of sites are multi-allelic?

**Answer**: about 3% for AoU and 9% for UKB

In [None]:
# AoU rows whose `alleles` array has at most two entries.
aou_bi = aou_wgs.filter_rows(hl.len(aou_wgs.alleles) <= 2)

In [None]:
# Number of AoU rows with at most two alleles.
aou_bi_count = aou_bi.count_rows()
aou_bi_count

In [None]:
# AoU rows whose `alleles` array has more than two entries.
aou_multi = aou_wgs.filter_rows(hl.len(aou_wgs.alleles) > 2)

In [None]:
# Number of AoU rows with more than two alleles.
aou_multi_count = aou_multi.count_rows()
aou_multi_count

In [None]:
# Fraction of all AoU rows that have more than two alleles.
aou_multi_count / (aou_multi_count + aou_bi_count)

In [None]:
# UKB rows whose `alleles` array has at most two entries.
ukb_bi = ukb_exomes.filter_rows(hl.len(ukb_exomes.alleles) <= 2)

In [None]:
# Number of UKB rows with at most two alleles.
ukb_bi_count = ukb_bi.count_rows()
ukb_bi_count

In [None]:
# UKB rows whose `alleles` array has more than two entries.
ukb_multi = ukb_exomes.filter_rows(hl.len(ukb_exomes.alleles) > 2)

In [None]:
# Number of UKB rows with more than two alleles.
ukb_multi_count = ukb_multi.count_rows()
ukb_multi_count

In [None]:
# Fraction of all UKB rows that have more than two alleles.
ukb_multi_count / (ukb_multi_count + ukb_bi_count)

In [None]:
# Preview the row fields of the first 20 AoU rows with more than two alleles.
aou_multi.rows().show(20)

In [None]:
# Preview the row fields of the first 20 UKB rows with more than two alleles.
ukb_multi.rows().show(20)

## Is the highest frequency allele always first?

**Answer:** no for AoU, yes for UKB

In [None]:
# Count the AoU rows where the largest info.AF value is not the first one;
# a result of 0 means the largest value always comes first.
aou_max_af_not_first = hl.max(aou_wgs.info.AF) != aou_wgs.info.AF[0]
aou_wgs.aggregate_rows(hl.agg.sum(aou_max_af_not_first))

In [None]:
# Count the UKB rows where the largest info.AF value is not the first one;
# a result of 0 means the largest value always comes first.
ukb_max_af_not_first = hl.max(ukb_exomes.info.AF) != ukb_exomes.info.AF[0]
ukb_exomes.aggregate_rows(hl.agg.sum(ukb_max_af_not_first))

# Omit samples that fail QC thresholds

TODO

# Omit variants that fail QC thresholds

TODO

# Perform the merge

## Split the multi-allelic sites

See also https://hail.is/docs/0.2/methods/genetics.html#hail.methods.split_multi_hts

In [None]:
# For efficiency, do not pass the biallelic variants to the split method; just add
# the a_index / was_split annotations to them directly.
aou_bi = (
    aou_wgs
    .filter_rows(hl.len(aou_wgs.alleles) == 2)
    .annotate_rows(a_index=1, was_split=False)
)

# Split the multi-allelic sites into biallelic sites.
# NOTE(review): the UKB check earlier in this notebook found repeated loci; confirm
# that splitting cannot create a row key that already exists among the biallelic rows.
aou_multi = aou_wgs.filter_rows(hl.len(aou_wgs.alleles) > 2)
aou_split = hl.split_multi_hts(
    aou_multi,
    keep_star=False,
    left_aligned=False,
    vep_root='vep',
    permit_shuffle=False,
)

# Union the two collections, tag every sample with its cohort, and keep only the
# row and entry fields that are needed. Row fields get an `aou_` prefix.
aou_union = aou_split.union_rows(aou_bi).annotate_cols(cohort='aou')
aou_genotypes = aou_union.select_entries(aou_union.GT)
aou_prepared = aou_genotypes.select_rows(
    aou_qual=aou_genotypes.qual,
    aou_filters=aou_genotypes.filters,
    aou_info=aou_genotypes.info,
    aou_a_index=aou_genotypes.a_index,
    aou_was_split=aou_genotypes.was_split,
)

aou_prepared.describe()

In [None]:
# Number of AoU rows after splitting and re-combining.
aou_prepared.count_rows()

In [None]:
# For efficiency, do not pass the biallelic variants to the split method; just add
# the a_index / was_split annotations to them directly.
ukb_bi = (
    ukb_exomes
    .filter_rows(hl.len(ukb_exomes.alleles) == 2)
    .annotate_rows(a_index=1, was_split=False)
)

# Split the multi-allelic sites into biallelic sites.
# NOTE(review): the earlier check in this notebook found repeated loci in UKB; confirm
# that splitting cannot create a row key that already exists among the biallelic rows.
ukb_multi = ukb_exomes.filter_rows(hl.len(ukb_exomes.alleles) > 2)
ukb_split = hl.split_multi_hts(
    ukb_multi,
    keep_star=False,
    left_aligned=False,
    vep_root='vep',
    permit_shuffle=False,
)

# Union the two collections, tag every sample with its cohort, and keep only the
# row and entry fields that are needed. Row fields get a `ukb_` prefix.
ukb_union = ukb_split.union_rows(ukb_bi).annotate_cols(cohort='ukb')
ukb_genotypes = ukb_union.select_entries(ukb_union.GT)
ukb_prepared = ukb_genotypes.select_rows(
    ukb_qual=ukb_genotypes.qual,
    ukb_filters=ukb_genotypes.filters,
    ukb_info=ukb_genotypes.info,
    ukb_a_index=ukb_genotypes.a_index,
    ukb_was_split=ukb_genotypes.was_split,
)

ukb_prepared.describe()

In [None]:
# Number of UKB rows after splitting and re-combining.
ukb_prepared.count_rows()

## Compute the intersection

In [None]:
# Record and print the wall-clock time at which the merge section starts.
start = datetime.now()
print(start)

In [None]:
# Combine the samples (columns) of the two prepared matrix tables.
# NOTE(review): per the Hail docs, union_cols joins rows with an inner join by default,
# which matches the name `intersection`. The docs also say the result keeps the row
# fields of the first dataset, so the ukb_* row fields may be missing here — confirm
# against the installed Hail version.
intersection = aou_prepared.union_cols(ukb_prepared)

In [None]:
# Write the merged matrix table to MERGED_MT, replacing any earlier output at that path.
intersection.write(MERGED_MT, overwrite=True)

In [None]:
# Preview the row fields of the first 10 merged rows.
intersection.rows().show(10)

## Compute AoU - UKB

TODO(deflaux) retain all row fields for this extract.

In [None]:
# Rows of the prepared AoU data whose key has no match among the prepared UKB rows.
# NOTE(review): this is built from .rows(), so it holds row fields only (no genotypes),
# yet it is later written to AOU_ONLY_MT, whose path ends in '.mt' — confirm that a
# rows-only table is the intended output.
aou_only = aou_prepared.rows().anti_join(ukb_prepared.rows())

In [None]:
# Write the AoU-only rows to AOU_ONLY_MT, replacing any earlier output at that path.
aou_only.write(AOU_ONLY_MT, overwrite=True)

## Compute UKB - AoU

In [None]:
# Rows of the prepared UKB data whose key has no match among the prepared AoU rows.
# NOTE(review): this is built from .rows(), so it holds row fields only (no genotypes),
# yet it is later written to UKB_ONLY_MT, whose path ends in '.mt' — confirm that a
# rows-only table is the intended output.
ukb_only = ukb_prepared.rows().anti_join(aou_prepared.rows())

In [None]:
# Write the UKB-only rows to UKB_ONLY_MT, replacing any earlier output at that path.
ukb_only.write(UKB_ONLY_MT, overwrite=True)

## TODO: in a separate notebook, verify the merge results

We should see that the intersection has retained common variants and dropped rare variants.

Also check the variants that gnomAD reports as very common in the same region; they should be in the intersection.

In [None]:
# Print the finish time and the time elapsed since `start` was recorded.
end = datetime.now()
print(end)
print(end - start)

# Appendix

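Extra cells that are useful for timing ad hoc work: copy the first one before the code to be timed and the second one after it.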

In [None]:
# Timing helper: record and print the current time as `start`.
start = datetime.now()
print(start)

In [None]:
# Timing helper: print the current time and the time elapsed since `start`.
end = datetime.now()
print(end)
print(end - start)

# Provenance

In [None]:
# Record when this notebook run reached the provenance section.
print(datetime.now())

In [None]:
!pip3 freeze