The purpose of this script is to create a table comparing the HGDP+1kGP (hgdp_tgp) subset of gnomAD v3 with the following datasets:
 - Auton et al, 2015 (Phase 3 1000 Genomes: GRCh38 (4-8X))
 - Karczewski et al (gnomAD v3.0: GRCh38)
 - Bergstrom et al, 2020 (HGDP (Bergstrom): GRCh38 (30X+))
 - Byrska-Bishop et al, 2021 (NYGC 1000 Genomes: GRCh38 (30X+))

In [1]:
import hail as hl
# tmp_dir='gs://hgdp-1kg/temporary-files/'
hl.init()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SPARKMONITOR_LISTENER: Started SparkListener for Jupyter Notebook
SPARKMONITOR_LISTENER: Port obtained from environment: 52255
SPARKMONITOR_LISTENER: Application Started: application_1706631390028_0001 ...Start Time: 1706631981697


Running on Apache Spark version 3.3.0
SparkUI available at http://znk-m.c.diverse-pop-seq-ref.internal:34881
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.126-ee77707f4fab
LOGGING: writing to /home/hail/hail-20240130-1626-0.2.126-ee77707f4fab.log


## Calculation/Data Formatting Functions

In [2]:
def _maf_bin(maf, label_missing=False):
    """Return a Hail string expression naming the MAF bin that `maf` falls into.

    The bins are decided by upper bound only: everything below 1e-3 is labelled
    "0.01%-0.1%", below 1e-2 "0.1%-1%", below 1e-1 "1.0%-10%", and anything else
    "10-50%".  When `label_missing` is true, a missing `maf` is labelled
    "missing" before the numeric bins are tried.
    """
    bins = hl.case()
    if label_missing:
        bins = bins.when(hl.is_missing(maf), "missing")
    return (
        bins
        .when(maf < 1e-3, "0.01%-0.1%")
        .when(maf < 1e-2, "0.1%-1%")
        .when(maf < 1e-1, "1.0%-10%")
        .default("10-50%")
    )


def group_hist(hgdp_1kg, comparison, gnomad_bool):
    """Count HGDP+1kGP variants per MAF bin and how many are shared with `comparison`.

    Parameters
    ----------
    hgdp_1kg
        Hail table keyed by (locus, alleles).  When `gnomad_bool` is false it must
        carry `maf_hgdp_1kg`; when true it must carry `gnomad_only_maf` and
        `hdgp_1kg_only_maf`.
    comparison
        Hail table keyed by (locus, alleles) holding the variants of the dataset
        being compared against.
    gnomad_bool
        Selects the gnomAD-specific logic (see below).

    Returns
    -------
    A Hail table keyed by `cat` (the MAF bin label) with
      * `n_var`         - number of `hgdp_1kg` variants in the bin
      * `n_var_in_both` - how many of those are also counted as present in the
                          comparison dataset
    plus one extra row `cat == "0%"` whose `n_var` is the number of `comparison`
    variants that have no matching row in `hgdp_1kg` (`n_var_in_both` is 0).

    Behaviour per mode
    ------------------
    * `gnomad_bool` false: rows with a missing `maf_hgdp_1kg` are dropped first,
      and a variant is "in comparison" when its key exists in `comparison`.
    * `gnomad_bool` true: a variant is "in comparison" when `gnomad_only_maf > 0`,
      bins are taken from `hdgp_1kg_only_maf` (missing values get their own
      "missing" bin), and three diagnostic counters are printed.
    """
    if gnomad_bool:
        # Presence in gnomAD is decided by the pre-computed gnomAD-only MAF, not by a key lookup
        hgdp_1kg = hgdp_1kg.annotate(in_comparison=hgdp_1kg.gnomad_only_maf > 0)
        maf_bin = _maf_bin(hgdp_1kg.hdgp_1kg_only_maf, label_missing=True)
    else:
        # Removing missing values from the dataset
        hgdp_1kg = hgdp_1kg.filter(hl.is_missing(hgdp_1kg.maf_hgdp_1kg), keep=False)
        # Annotating a field with bool of whether each var is in comparison dataset
        hgdp_1kg = hgdp_1kg.annotate(
            in_comparison=hl.is_defined(comparison[hgdp_1kg.locus, hgdp_1kg.alleles]))
        maf_bin = _maf_bin(hgdp_1kg.maf_hgdp_1kg)

    # Summary will be returned, grouped table with maf bins
    summary = hgdp_1kg.group_by(cat=maf_bin).aggregate(
        # number of hgdp+1kgp variants in the bin
        n_var=hl.agg.count(),
        # number of variants in hgdp+1kgp also in comparison
        n_var_in_both=hl.agg.count_where(hgdp_1kg.in_comparison),
    )

    # Variants of the comparison dataset that have no row in hgdp+1kgp
    comparison = comparison.annotate(
        not_in_hgdp_1kg=hl.is_missing(hgdp_1kg[comparison.locus, comparison.alleles]))

    if gnomad_bool:
        presence = comparison.aggregate(hl.agg.counter(comparison.not_in_hgdp_1kg))
        print(presence)
        # .get() instead of [True]: the key is absent when every comparison variant is matched
        num = presence.get(True, 0)

        # Diagnostics kept from the original notebook (each one triggers its own aggregation)
        print(hgdp_1kg.aggregate(hl.agg.counter(hl.is_defined(hgdp_1kg.gnomad_only_maf))))
        print(hgdp_1kg.aggregate(hl.agg.counter(hgdp_1kg.gnomad_only_maf > 0)))
    else:
        num = comparison.aggregate(hl.agg.count_where(comparison.not_in_hgdp_1kg))

    # Append the "0%" row: variants seen only in the comparison dataset
    summary = summary.union(
        hl.utils.range_table(1).key_by(cat="0%").select(n_var=num, n_var_in_both=0), unify=True)

    return summary

In [3]:
def get_maf_ht(mt, fieldname):
    """Build a table with one `maf` value per variant of a Hail matrix table.

    `fieldname` names the row struct of `mt` that holds the `AF` array (for
    example 'variant_qc' or 'info').  The returned table is the rows table of
    `mt` reduced to its row key plus a `maf` field set to the smallest entry of
    that array.

    NOTE(review): the minimum is only a true minor-allele frequency when the
    array holds one frequency per allele, reference included - confirm for
    inputs whose `AF` lists alternate alleles only.
    """
    allele_freqs = mt[fieldname].AF
    maf_mt = mt.select_rows(maf=hl.min(allele_freqs))
    return maf_mt.rows()

In [4]:
def run_qc(mt):
    """Apply the QC pipeline to the HGDP+1kGP matrix table and return the result.

    Steps, in order:
      1. keep samples that passed gnomAD's sample QC hard filters
      2. keep PASS variants only (empty `filters` field)
      3. drop two contaminated samples identified using CHARR
         (https://pubmed.ncbi.nlm.nih.gov/37425834/)
      4. drop 'CHMI_CHMI3_WGS2', the sample gnomAD added for QC purposes
      5. drop variants with no non-reference genotype left in the remaining samples
      6. attach the metadata with updated population labels as the column
         field `meta_updated`
      7. apply gnomAD's genotype QC via `filter_to_adj`

    NOTE(review): `metadata_path` and `filter_to_adj` are read from the
    notebook's global scope but neither is assigned/imported in the visible
    cells - confirm they are defined before this function is called.
    """
    ## Sample QC
    # Keep only the samples that were not hard-filtered by gnomAD's sample QC
    # (the original author noted that this removed 31 samples)
    passed_hard_filters = ~mt.gnomad_sample_filters.hard_filtered
    mt = mt.filter_cols(passed_hard_filters)

    ## Variant QC
    # A PASS variant has an EMPTY `filters` field, so every row that carries
    # at least one filter entry is removed
    has_filter_entries = hl.len(mt.filters) != 0
    mt = mt.filter_rows(has_filter_entries, keep=False)

    # The two contaminated samples identified by CHARR
    contaminated = hl.literal({'HGDP01371', 'LP6005441-DNA_A09'})
    mt = mt.filter_cols(~contaminated.contains(mt.s))

    # CHMI_CHMI3_WGS2 is a sample added by gnomAD for QC purposes and has no metadata info
    mt = mt.filter_cols(mt.s == 'CHMI_CHMI3_WGS2', keep=False)

    # Only keep the variants which are still observed in the samples that are left
    mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

    # Add the metadata with the updated population labels as a column field
    updated_metadata = hl.import_table(metadata_path, impute=True, key='s')
    mt = mt.annotate_cols(meta_updated=updated_metadata[mt.s])

    ## Genotype QC
    # Done with a function imported from gnomAD; this is the last step of the QC process
    return filter_to_adj(mt)

## Reading in Datasets

In [5]:
# Input locations, kept as variables so a moved file only needs to be updated in one place.

# gnomAD v3.1.2 releases: sites table and the (pre-QC) HGDP+1kGP subset matrix table
gnomadv3_path = "gs://gcp-public-data--gnomad/release/3.1.2/ht/genomes/gnomad.genomes.v3.1.2.sites.ht"
hgdp_1kg_preQC_path = "gs://gcp-public-data--gnomad/release/3.1.2/mt/genomes/gnomad.genomes.v3.1.2.hgdp_1kg_subset_dense.mt"

# Comparison call sets; each pattern matches a set of per-chromosome VCF files
phase3_1kg_path = "gs://hgdp-1kg/hgdp_tgp/comparison_data/ALL.chr*.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz"
nygc_1kg_path = "gs://hgdp-1kg/hgdp_tgp/comparison_data/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr{[1-9],[1-9][0-9]}.recalibrated_variants.vcf.gz"
berg_path = "gs://hgdp-1kg/hgdp_tgp/comparison_data/hgdp_wgs.20190516.full.chr{[1-9],[1-9][0-9]}.vcf.gz"

# List of PCA outlier samples
pca_outlier_path = "gs://hgdp-1kg/tutorial_datasets/pca/pca_outliers.txt"

In [6]:
# Load every dataset used in the comparison.

# The three VCF call sets are imported with the same options:
# treat the .vcf.gz files as block-gzipped and read them against GRCh38
vcf_import_options = dict(force_bgz=True, reference_genome='GRCh38')
phase3_1kg_mt = hl.import_vcf(phase3_1kg_path, **vcf_import_options)
nygc_1kg_mt = hl.import_vcf(nygc_1kg_path, **vcf_import_options)
berg_mt = hl.import_vcf(berg_path, **vcf_import_options)

# gnomAD v3 sites table and the pre-QC HGDP+1kGP matrix table are already in Hail format
gnomadv3_ht = hl.read_table(gnomadv3_path)
hgdp_1kg_preQC_mt = hl.read_matrix_table(hgdp_1kg_preQC_path)



In [None]:
# Run QC on the HGDP+1kGP dataset (see run_qc for the individual filtering steps)
# NOTE(review): this cell has no recorded execution count ("In [None]") - confirm it was run
hgdp_1kg_mt = run_qc(hgdp_1kg_preQC_mt)

In [6]:
# Remove PCA outliers from the dataset

# Use hl.utils.hadoop_open to read the PCA outliers file (one sample ID per line) from Google Cloud Storage.
# The path variable defined with the other input paths is `pca_outlier_path`;
# the name previously used here (`outliers_path`) is never assigned and raised a NameError.
with hl.utils.hadoop_open(pca_outlier_path) as file:
    outliers = [line.rstrip('\n') for line in file]

# Use hl.literal to convert the outliers list from a python object to a Hail expression so that it can be used to filter out samples
outliers_list = hl.literal(outliers)

# Keep the samples which are not contained in the pca outlier list
hgdp_1kg_mt = hgdp_1kg_mt.filter_cols(~outliers_list.contains(hgdp_1kg_mt['s']))

## Splitting Multi-allelics
Multi-allelic sites need to be split for the NYGC 1kGP and Bergstrom (HGDP) datasets.

In [7]:
# Split the multi-allelic sites of the NYGC 1kGP dataset with hl.split_multi_hts
# (the result is stored under a new name; the original matrix table is left untouched here)
nygc_split_mt = hl.split_multi_hts(nygc_1kg_mt)

In [14]:
# Variant and sample counts for NYGC after splitting multi-allelics
# (the recorded output is ordered as (variants, samples))
nygc_split_mt.count()

2022-09-13 16:43:07 Hail: INFO: Coerced sorted dataset
2022-09-13 17:24:22 Hail: INFO: Coerced sorted dataset


(128575470, 3202)

In [8]:
# Split the multi-allelic sites of the Bergstrom dataset
# permit_shuffle=True is passed here but not for NYGC - presumably the split output
# of this dataset can be out of order and needs re-sorting; confirm against the Hail docs
berg_split_mt = hl.split_multi_hts(berg_mt, permit_shuffle=True)

In [9]:
# Variant and sample counts for Bergstrom after splitting multi-allelics
# (the recorded output is ordered as (variants, samples))
berg_split_mt.count()

2022-12-15 21:31:43 Hail: INFO: Coerced prefix-sorted dataset
2022-12-15 21:36:38 Hail: INFO: Coerced prefix-sorted dataset


(80857392, 929)

In [9]:
# Point the names used by the downstream cells at the split versions of the two datasets
nygc_1kg_mt, berg_mt = nygc_split_mt, berg_split_mt

In [12]:
# Reduce every dataset to at most 5000 partitions to speed up the downstream steps.
# The downstream cells read `nygc_1kg_mt` and `phase3_1kg_mt`, so the coalesced
# versions are assigned back to those names; `nygc_mt` / `phase3_mt` (the only
# names assigned before) are kept as aliases so that nothing relying on them breaks.
nygc_1kg_mt = nygc_mt = nygc_1kg_mt.naive_coalesce(5000)
berg_mt = berg_mt.naive_coalesce(5000)
phase3_1kg_mt = phase3_mt = phase3_1kg_mt.naive_coalesce(5000)
hgdp_1kg_mt = hgdp_1kg_mt.naive_coalesce(5000)

## Getting Preliminary counts for all datasets
#### Get the initial counts for your datasets before any changes are made so you have a starting var/sample number to compare to once you make changes

In [18]:
# Size of the HGDP+1kGP dataset; the recorded output is ordered as (variants, samples)
hgdp_1kg_mt.count()

(159795273, 4096)

#### HGDP_1kGP Merged dataset
- Samples: 4096
- Variants: 159,795,273

In [17]:
# Size of the phase 3 1kGP dataset; the recorded output is ordered as (variants, samples)
phase3_1kg_mt.count()

2022-11-15 15:49:36 Hail: INFO: Coerced sorted dataset


(73257633, 2548)

#### Phase 3 1kGP
- Samples: 2548
- Variants: 73,257,633

In [45]:
# Size of the Bergstrom dataset; the recorded output is ordered as (variants, samples)
berg_mt.count()

2022-12-16 17:07:54 Hail: INFO: Coerced prefix-sorted dataset
2022-12-16 17:12:35 Hail: INFO: Coerced prefix-sorted dataset


(80857392, 929)

#### Bergstrom 
- Samples: 929
- Variants: 80,857,392

In [17]:
# Size of the NYGC 1kGP dataset; the recorded output is ordered as (variants, samples)
# NOTE(review): the recorded variant count differs from the count recorded right after
# splitting multi-allelics - the two outputs appear to come from different runs; confirm
nygc_1kg_mt.count()

2022-11-17 19:30:51 Hail: INFO: Coerced sorted dataset


(119895186, 3202)

#### NYGC 1kGP 
- Samples: 3202
- Variants: 119,895,186

In [18]:
# Number of rows (variants) in the gnomAD v3 sites table
gnomadv3_ht.count()

759302267

#### gnomAD 
- Variants: 759,302,267

## Dataset Comparison

#### Overall steps for comparison:
I will be creating one matrix table per comparison as well as one base matrix table of HGDP_1kGP with a bool field annotated with T/F for each of the comparison datasets

For each dataset, I need to obtain -
- number of novel variants in hgdp_tgp v3.1 compared to the comparison dataset
- number of novel singletons in hgdp_tgp v3.1 compared to the comparison dataset

Creating a table of just the MAF for each comparison dataset so that I can compare with the hgdp_1kgp dataset

In [11]:
# Running hl.variant_qc() on hgdp+1kgp dataset to get the AF
# (the allele frequencies are read downstream from the `variant_qc` row field)
hgdp_1kg_mt_var = hl.variant_qc(hgdp_1kg_mt)

In [12]:
# Build one MAF table per dataset.
# get_maf_ht takes the minimum of the allele-frequency array found in the named row
# field; its generic `maf` column is then renamed so every table has a distinct field name.
hgdp_1kg_maf = get_maf_ht(hgdp_1kg_mt_var, 'variant_qc').rename({'maf': 'maf_hgdp_1kg'})
nygc_maf = get_maf_ht(nygc_1kg_mt, 'info').rename({'maf': 'nygc_maf'})
berg_maf = get_maf_ht(berg_mt, 'info').rename({'maf': 'berg_maf'})
phase3_maf = get_maf_ht(phase3_1kg_mt, 'info').rename({'maf': 'phase3_maf'})

# # Getting the gnomADv3 maf table
# # Cannot use the get_maf_ht function since it is already a table and the format is a bit different
# gnomadv3_ht = gnomadv3_ht.annotate(maf = hl.float64(gnomadv3_ht.freq[0].AF))
# gnomadv3_ht = gnomadv3_ht.rename({'maf': 'gnomadv3_maf'})
# gnomadv3_maf = gnomadv3_ht.select('gnomadv3_maf')

### Get a separate matrix table with only the gnomAD AF metrics needed in order to write out the gnomAD comparison histogram

In [13]:
# Adding a column with a bool that is True for samples which are NOT flagged `gnomad_release`
# Per the original note, the number of True values in this column equals the number of
# samples which are in HGDP+1kGP but not in the gnomAD dataset
# NOTE(review): `hgdp_1kg_mt_anno` is not assigned in any visible cell - confirm which
# matrix table is meant before re-running
mt = hgdp_1kg_mt_anno.annotate_cols(not_in_gnomad = ~hgdp_1kg_mt_anno.gnomad_release)

In [14]:
# Per-variant call statistics for three sample sets:
#   hgdp_tgp_stats      - every sample in the matrix table
#   not_in_gnomad_stats - only the samples flagged `not_in_gnomad`
#   pca_outlier_stats   - only the samples flagged `is_pca_outlier`
# NOTE(review): `is_pca_outlier` is not created in any visible cell - confirm it exists on `mt`
mt = mt.annotate_rows(
    hgdp_tgp_stats=hl.agg.call_stats(mt.GT, mt.alleles),
    not_in_gnomad_stats=hl.agg.filter(
        mt.not_in_gnomad,
        hl.agg.call_stats(mt.GT, mt.alleles),
    ),
    pca_outlier_stats=hl.agg.filter(
        mt.is_pca_outlier,
        hl.agg.call_stats(mt.GT, mt.alleles),
    ),
)

In [18]:
# mt.rows().write('gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/gnomad_agg_stats_v1.ht')

2022-12-20 15:40:09 Hail: INFO: wrote table with 159795273 rows in 50000 partitions to gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/gnomad_agg_stats_v1.ht


In [15]:
# Load the per-variant aggregate stats table from its checkpoint
# (the same path as the commented-out write of mt.rows() above)
ht = hl.read_table('gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/gnomad_agg_stats_v1.ht')

In [16]:
# Allele counts for the samples which are only in gnomAD (not in 1kGP or HGDP):
# take the AC of the first `gnomad_freq` entry and subtract the AC contributed by the
# HGDP+1kGP samples that are part of gnomAD (all HGDP+1kGP samples minus those not in gnomAD)
hgdp_tgp_in_gnomad_AC = ht.hgdp_tgp_stats.AC - ht.not_in_gnomad_stats.AC
ht = ht.annotate(gnomad_only_AC=ht.gnomad_freq[0].AC - hgdp_tgp_in_gnomad_AC)

In [17]:
# Allele number for the gnomAD-only samples, derived the same way as the allele counts:
# AN of the first `gnomad_freq` entry minus the AN of the HGDP+1kGP samples that are part of gnomAD
hgdp_tgp_in_gnomad_AN = ht.hgdp_tgp_stats.AN - ht.not_in_gnomad_stats.AN
ht = ht.annotate(gnomad_only_AN=ht.gnomad_freq[0].AN - hgdp_tgp_in_gnomad_AN)

In [18]:
# MAF among the gnomAD-only samples.
# The frequency is taken from the second entry of the gnomAD-only AC array and folded
# around 0.5 so that the result never exceeds 50%.
gnomad_only_af = ht.gnomad_only_AC[1] / ht.gnomad_only_AN
ht = ht.annotate(gnomad_only_maf=0.5 - hl.abs(0.5 - gnomad_only_af))

In [19]:
# MAF among the samples that are in HGDP+1kGP only (not in gnomAD), PCA outliers excluded:
# subtract the outliers' allele counts / allele number, then take the smallest per-allele frequency.
# The field keeps its existing spelling (`hdgp`) because group_hist reads it under that name.
hgdp_1kg_only_AC = ht.not_in_gnomad_stats.AC - ht.pca_outlier_stats.AC
hgdp_1kg_only_AN = ht.not_in_gnomad_stats.AN - ht.pca_outlier_stats.AN
ht = ht.annotate(hdgp_1kg_only_maf=hl.min(hgdp_1kg_only_AC / hgdp_1kg_only_AN))

In [20]:
# Selecting no global fields drops all of them from the table
ht = ht.select_globals()

In [21]:
# Keep only the row fields needed for the gnomAD comparison (the key fields are retained)
gnomad_comparison_fields = ['gnomad_only_maf', 'hdgp_1kg_only_maf', 'gnomad_only_AC']
hgdp_1kg_gnomad_ht = ht.select(*gnomad_comparison_fields)

In [88]:
# hgdp_1kg_gnomad_ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'gnomad_only_maf': float64 
    'hdgp_1kg_only_maf': float64 
    'gnomad_only_AC': array<int32> 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


# New method for comparison plots

In [22]:
# Reduce each of the tables to at most 5000 partitions to speed up the downstream steps
nygc_maf, berg_maf, phase3_maf, hgdp_1kg_maf, gnomadv3_ht = (
    table.naive_coalesce(5000)
    for table in (nygc_maf, berg_maf, phase3_maf, hgdp_1kg_maf, gnomadv3_ht)
)

In [23]:
# Creating a table with the aggregated values for gnomAD v3
# (gnomad_bool=True selects the gnomAD-specific logic of group_hist)
gnomAD_hist = group_hist(hgdp_1kg_gnomad_ht, gnomadv3_ht, gnomad_bool=True).persist()

frozendict({False: 155484090, True: 603818177})
frozendict({False: 4311183, True: 155484090})
frozendict({False: 35064853, True: 120419237, None: 4311183})


2022-12-20 21:17:11 Hail: INFO: Coerced sorted dataset
2022-12-20 21:17:11 Hail: INFO: Coerced dataset with out-of-order partitions.
2022-12-20 21:17:11 Hail: INFO: Coerced sorted dataset


In [24]:
# gnomAD_hist.show()

cat,n_var,n_var_in_both
str,int64,int64
"""0%""",603818177,0
"""0.01%-0.1%""",113052143,75383600
"""0.1%-1%""",27485532,25792458
"""1.0%-10%""",11730447,11719115
"""10-50%""",7527151,7524064


In [25]:
# Write the gnomAD comparison summary as a TSV
# NOTE(review): the recorded log below names gnomAD_new_hist.tsv, not this path - the output is from an earlier run
gnomAD_hist.export('gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/gnomAD_hist.tsv')

2022-12-20 21:18:31 Hail: INFO: merging 6 files totalling 145...
2022-12-20 21:18:32 Hail: INFO: while writing:
    gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/gnomAD_new_hist.tsv
  merge time: 601.119ms


In [71]:
# Creating a table with the aggregated values for Bergstrom (the first dataset this was tested on)
bergstrom_hist = group_hist(hgdp_1kg_maf, berg_maf, gnomad_bool=False).persist()

2022-12-16 19:16:18 Hail: INFO: Coerced prefix-sorted dataset
2022-12-16 19:18:45 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-12-16 19:21:06 Hail: INFO: Coerced prefix-sorted dataset
2022-12-16 19:23:30 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-12-16 19:43:36 Hail: INFO: Coerced almost-sorted dataset
2022-12-16 19:43:36 Hail: INFO: Coerced dataset with out-of-order partitions.
2022-12-16 19:46:05 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-12-16 19:48:25 Hail: INFO: Coerced almost-sorted dataset
2022-12-16 19:48:25 Hail: INFO: Coerced dataset with out-of-order partitions.
2022-12-16 19:50:51 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-12-16 20:07:00 Hail: INFO: Coerced sorted dataset
2022-12-16 20:07:00 Hail: INFO: Coerced dataset with out-of-order partitions.
2022-12-16 20:07:00 Hail: INFO: Coerced sorted dataset


In [72]:
# bergstrom_hist.show()

cat,n_var,n_var_in_both
str,int64,int64
"""0%""",8084110,0
"""0.01%-0.1%""",120211765,38839090
"""0.1%-1%""",21980145,17369958
"""1.0%-10%""",10369257,9602871
"""10-50%""",7231815,6961356


In [49]:
# Write the Bergstrom comparison summary as a TSV
# NOTE(review): the recorded log below names berg_new_hist.tsv, not this path - the output is from an earlier run
bergstrom_hist.export('gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/berg_hist.tsv')

2022-12-16 17:57:16 Hail: INFO: merging 6 files totalling 142...
2022-12-16 17:57:17 Hail: INFO: while writing:
    gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/berg_new_hist.tsv
  merge time: 615.260ms


In [33]:
# Creating a table with the aggregated values for NYGC 1kGP
nygc_hist = group_hist(hgdp_1kg_maf, nygc_maf, gnomad_bool=False).persist()

2022-12-15 19:04:11 Hail: INFO: Coerced sorted dataset
2022-12-15 19:06:42 Hail: INFO: Coerced sorted dataset
2022-12-15 19:44:10 Hail: INFO: Coerced sorted dataset
2022-12-15 19:44:10 Hail: INFO: Coerced dataset with out-of-order partitions.
2022-12-15 19:46:46 Hail: INFO: Coerced sorted dataset
2022-12-15 19:46:46 Hail: INFO: Coerced dataset with out-of-order partitions.
2022-12-15 19:57:56 Hail: INFO: Coerced sorted dataset
2022-12-15 19:57:56 Hail: INFO: Coerced dataset with out-of-order partitions.
2022-12-15 19:57:56 Hail: INFO: Coerced sorted dataset


In [34]:
# nygc_hist.show()

cat,n_var,n_var_in_both
str,int64,int64
"""0%""",12566553,0
"""0.01%-0.1%""",120211765,80874034
"""0.1%-1%""",21980145,19042508
"""1.0%-10%""",10369257,9352622
"""10-50%""",7231815,6739753


In [37]:
# Write the NYGC comparison summary as a TSV
# NOTE(review): the recorded log below names nygc_new_hist_v1.tsv, not this path - the output is from an earlier run
nygc_hist.export('gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/nygc_hist.tsv')

2022-12-15 20:06:44 Hail: INFO: merging 6 files totalling 143...
2022-12-15 20:06:44 Hail: INFO: while writing:
    gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/nygc_new_hist_v1.tsv
  merge time: 210.899ms


In [28]:
# Creating a table with the aggregated values for phase 3 1kGP
phase3_hist = group_hist(hgdp_1kg_maf, phase3_maf, gnomad_bool=False).persist()

2022-12-15 17:06:28 Hail: INFO: Coerced sorted dataset
2022-12-15 17:51:17 Hail: INFO: Coerced sorted dataset
2022-12-15 17:51:17 Hail: INFO: Coerced dataset with out-of-order partitions.
2022-12-15 18:08:11 Hail: INFO: Coerced sorted dataset
2022-12-15 18:08:11 Hail: INFO: Coerced dataset with out-of-order partitions.
2022-12-15 18:08:11 Hail: INFO: Coerced sorted dataset


In [29]:
# phase3_hist.show()

cat,n_var,n_var_in_both
str,int64,int64
"""0%""",2129515,0
"""0.01%-0.1%""",120211765,45207677
"""0.1%-1%""",21980145,14199060
"""1.0%-10%""",10369257,6611461
"""10-50%""",7231815,5109920


In [30]:
# Write the phase 3 1kGP comparison summary as a TSV
# NOTE(review): the recorded log below names phase3_new_hist.tsv, not this path - the output is from an earlier run
phase3_hist.export('gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/phase3_hist.tsv')

2022-12-15 18:21:57 Hail: INFO: merging 6 files totalling 142...
2022-12-15 18:21:57 Hail: INFO: while writing:
    gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/comparison_hists/phase3_new_hist.tsv
  merge time: 339.141ms
