In [2]:
import hail as hl

In [3]:
# Requester-pays configuration used throughout the tutorial.
# With mode 'CUSTOM', the GCS connector applies requester-pays billing only to the buckets
# listed in 'requester.pays.buckets'; reads from those buckets are billed to the project below.
GCP_PROJECT_NAME = "diverse-pop-seq-ref"
hl.init(spark_conf={
    'spark.hadoop.fs.gs.requester.pays.mode': 'CUSTOM',
    'spark.hadoop.fs.gs.requester.pays.buckets': 'hgdp_tgp,gcp-public-data--gnomad',
    # reuse the constant so the billing project is defined in exactly one place
    'spark.hadoop.fs.gs.requester.pays.project.id': GCP_PROJECT_NAME
})

Running on Apache Spark version 3.1.2
SparkUI available at http://znk-m.c.diverse-pop-seq-ref.internal:44669
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.95-513139587f57
LOGGING: writing to /home/hail/hail-20220614-1752-0.2.95-513139587f57.log


# Data Reading Function
This function reads in the dataset at the different stages used throughout the tutorial; its flags let the user specify which filters should be run on the dataset. It reduces the number of times data needs to be written out, which decreases the overall computational and monetary cost of running the tutorials.

In [5]:
def read_qc(
        default: bool = False,
        sample_qc: bool = False,
        variant_qc: bool = False,
        duplicate: bool = False,
        outlier_removal: bool = False,
        ld_pruning: bool = False,
        rel_unrel: str = 'default') -> hl.MatrixTable:
    """
    Wrapper function to get HGDP+1kGP data as a MatrixTable at different stages of QC/filtering.

    The pre-QC MatrixTable (sample and variant metadata annotated, nothing filtered) is always
    built first. The flags are then applied in the order listed below. The filter flags
    (sample_qc, variant_qc, duplicate, outlier_removal) are cumulative. ld_pruning and the
    'all'/'related'/'unrelated' values of rel_unrel instead *replace* whatever has been built
    so far with a MatrixTable that was written out previously.

    :param bool default: if True, return the pre-QC MatrixTable immediately (all other flags are ignored)
    :param bool sample_qc: if True, remove samples that are flagged by gnomAD's sample QC hard filters
    :param bool variant_qc: if True, remove variants whose gnomAD `filters` field is non-empty
    :param bool duplicate: if True, remove columns with a duplicate column key
    :param bool outlier_removal: if True, remove the samples listed in the PCA outlier file
        (duplicates are only removed when `duplicate` is also True)
    :param bool ld_pruning: if True, discard the filters above and read the prewritten
        filtered + LD pruned MatrixTable. According to the notes in this function that dataset has
        gone through sample QC, variant QC and duplicate removal (PCA outliers not removed) -
        TODO confirm against how the file was written
    :param str rel_unrel: which prewritten relatedness subset to read, if any
        if 'default', nothing is replaced and the MatrixTable built from the flags above is returned
        if 'all', read the filtered + LD pruned MatrixTable (related & unrelated samples)
        if 'related', read the MatrixTable with only related samples
        if 'unrelated', read the MatrixTable with only unrelated samples
        any other value is ignored (treated like 'default')
    :return: the MatrixTable for the requested stage
    """
    # Reading in all the tables and matrix tables needed to generate the pre_qc matrix table
    sample_meta = hl.import_table('gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/gnomad_meta_v1.tsv')
    sample_qc_meta = hl.read_table('gs://hgdp_tgp/output/gnomad_v3.1_sample_qc_metadata_hgdp_tgp_subset.ht')
    var_meta = hl.read_table(
        'gs://gcp-public-data--gnomad/release/3.1.1/ht/genomes/gnomad.genomes.v3.1.1.sites.ht', n_partitions=5000)
    dense_mt = hl.read_matrix_table('gs://hgdp_tgp/output/tgp_hgdp.mt')

    def dict_to_struct(d):
        """Convert a (possibly nested) dict of expressions into a (nested) hail struct."""
        fields = {}
        for k, v in d.items():
            if isinstance(v, dict):
                v = dict_to_struct(v)
            fields[k] = v
        return hl.struct(**fields)

    def insert_nested(tree, path, value):
        """Store `value` in `tree` under the nested keys in `path`, creating intermediate dicts."""
        node = tree
        for part in path[:-1]:
            child = node.get(part)
            if child is None:
                child = {}
                node[part] = child
            else:
                # a name cannot be both a leaf field and a parent struct
                assert isinstance(child, dict), child
            node = child
        node[path[-1]] = value

    # un-flattening a hail table with nested structure:
    # the imported TSV has flat field names such as `project_meta.sample_id`; splitting each name
    # on '.' gives the path of the field inside the nested struct that is rebuilt below
    row = sample_meta.row_value
    nested_fields = {}
    for name in row:
        insert_nested(nested_fields, name.split('.'), row[name])

    # using the dict created from flattened struct, creating new structs now un-flattened
    sample_meta = sample_meta.select(**dict_to_struct(nested_fields))
    sample_meta = sample_meta.key_by('s')

    # grabbing the columns needed from Alicia's metadata
    new_meta = sample_meta.select(sample_meta.hgdp_tgp_meta, sample_meta.bergstrom)

    # creating a table with gnomAD sample metadata and HGDP metadata
    ht = sample_qc_meta.annotate(**new_meta[sample_qc_meta.s])

    # stripping 'v3.1::' from the names to match with the densified MT
    ht = ht.key_by(s=ht.s.replace("v3.1::", ""))

    # Using hl.annotate_cols() method to annotate the gnomAD sample QC metadata onto the matrix table
    mt = dense_mt.annotate_cols(**ht[dense_mt.s])

    # annotating preQC dataset with variant metadata
    mt = mt.annotate_rows(**var_meta[mt.locus, mt.alleles])

    print(f"sample_qc: {sample_qc}\nvariant_qc: {variant_qc}\nduplicate: {duplicate}\noutlier_removal: { outlier_removal}\nld_pruning: {ld_pruning}\nrel_unrel: {rel_unrel}")

    if default:
        print("Returning default preQC matrix table")
        # returns preQC dataset
        return mt

    if sample_qc:
        print("Running sample QC")
        # run data through sample QC
        # this keeps only the samples that passed gnomAD's sample QC hard filters
        # Filtering columns leaves the row fields untouched, so the variant metadata annotated
        # above is still in place and does not need to be joined a second time.
        mt = mt.filter_cols(~mt.sample_filters.hard_filtered)

    if variant_qc:
        print("Running variant QC")
        # run data through variant QC
        # Subsetting the variants in the dataset to only PASS variants (those which passed gnomAD's variant QC)
        # PASS variants are the ones with an EMPTY filters field: every row whose filters
        # collection has at least one entry is removed here (keep=False).
        # This is the last step in the QC process
        mt = mt.filter_rows(hl.len(mt.filters) != 0, keep=False)

    if duplicate:
        print("Removing duplicate sample")
        # Removing any duplicates in the dataset using hl.distinct_by_col() which removes
        # columns with a duplicate column key. It keeps one column for each unique key.
        mt = mt.distinct_by_col()

    if outlier_removal:
        print("Removing PCA outliers")
        # reading in the PCA outlier list (one sample ID per line)
        # using hl.hadoop_open here which allows one to read in files into hail from Google cloud storage
        pca_outlier_path = 'gs://hgdp-1kg/hgdp_tgp/pca_outliers_v2.txt'
        with hl.utils.hadoop_open(pca_outlier_path) as file:
            outliers = [line.rstrip('\n') for line in file]

        # Using hl.literal here to convert the list from a python object to a hail expression so that it can be used
        # to filter out samples
        outliers_list = hl.literal(outliers)

        # ~ negates the membership test, so only samples which are NOT in the PCA outlier list are kept
        mt = mt.filter_cols(~outliers_list.contains(mt['s']))

    if ld_pruning:
        print("Returning ld pruned post QC matrix table pre PCA outlier removal ")
        # read in dataset which has additional variant filtering and ld pruning run
        # NOTE: this replaces the matrix table built above, any filter applied so far is discarded
        # data has gone through:
        #   - sample QC
        #   - variant QC
        #   - duplicate removal
        mt = hl.read_matrix_table('gs://hgdp-1kg/hgdp_tgp/intermediate_files/filtered_n_pruned_output_updated.mt')

    # Each recognised rel_unrel value below also replaces the matrix table built so far.
    # 'default' (or any unrecognised value) leaves it unchanged.
    if rel_unrel == "all":
        print("Returning post QC matrix table pre PCA outlier removal with related & unrelated individuals")
        # TODO: need to check what steps this dataset has gone through, this is something to discuss with Mary
        # data has gone through:
        #   - sample QC
        #   - variant QC
        #   - duplicate removal
        #   - LD pruning
        mt = hl.read_matrix_table('gs://hgdp-1kg/hgdp_tgp/intermediate_files/filtered_n_pruned_output_updated.mt')

    elif rel_unrel == 'related':
        print("Returning post QC matrix table pre PCA outlier removal with only related individuals")
        # data has gone through:
        #   - sample QC
        #   - variant QC
        #   - duplicate removal
        #   - LD pruning
        #   - pc_relate - filter to only related individuals
        mt = hl.read_matrix_table('gs://hgdp-1kg/hgdp_tgp/rel_updated.mt')

    elif rel_unrel == 'unrelated':
        print("Returning post QC matrix table with only unrelated individuals")
        # data has gone through:
        #   - sample QC
        #   - variant QC
        #   - duplicate removal
        #   - LD pruning
        #   - pc_relate - filter to only unrelated individuals
        mt = hl.read_matrix_table('gs://hgdp-1kg/hgdp_tgp/unrel_updated.mt')

    return mt

In [6]:
# default=True returns the pre-QC MatrixTable: sample and variant metadata are annotated,
# but no filter has been applied yet.
pre_qc = read_qc(default=True)

2022-06-14 17:52:34 Hail: INFO: Loading <StructExpression of type struct{s: str, `project_meta.sample_id`: str, `project_meta.research_project_key`: str, `project_meta.seq_project`: str, `project_meta.ccdg_alternate_sample_id`: str, `project_meta.ccdg_gender`: str, `project_meta.ccdg_center`: str, `project_meta.ccdg_study`: str, `project_meta.cram_path`: str, `project_meta.project_id`: str, `project_meta.v2_age`: str, `project_meta.v2_sex`: str, `project_meta.v2_hard_filters`: str, `project_meta.v2_perm_filters`: str, `project_meta.v2_pop_platform_filters`: str, `project_meta.v2_related`: str, `project_meta.v2_data_type`: str, `project_meta.v2_product`: str, `project_meta.v2_product_simplified`: str, `project_meta.v2_qc_platform`: str, `project_meta.v2_project_id`: str, `project_meta.v2_project_description`: str, `project_meta.v2_internal`: str, `project_meta.v2_investigator`: str, `project_meta.v2_known_pop`: str, `project_meta.v2_known_subpop`: str, `project_meta.v2_pop`: str, `proje

sample_qc: False
variant_qc: False
duplicate: False
outlier_removal: False
ld_pruning: False
rel_unrel: default
Returning default preQC matrix table


In [6]:
# count() returns (number of rows, number of columns), i.e. (variants, samples)
pre_qc.count()

(211358784, 4151)

In [6]:
# sample_qc=True removes the samples flagged by gnomAD's sample QC hard filters;
# no variant (row) filter is applied at this stage.
post_sample = read_qc(sample_qc=True)

2022-05-18 18:48:49 Hail: INFO: Loading 184 fields. Counts by type:
  str: 184


sample_qc: True
variant_qc: False
duplicate: False
outlier_removal: False
Running sample QC


In [7]:
# This takes a negligible amount of time to run
# Only samples (columns) were filtered, so the number of variants (rows) should match pre_qc.
post_sample.count()

(211358784, 4120)

In [7]:
# The filter flags are cumulative: sample QC, then variant QC (drop variants with a non-empty
# `filters` field), then removal of columns with a duplicate column key.
post_qc = read_qc(sample_qc=True, variant_qc=True, duplicate=True)

2022-05-23 17:07:15 Hail: INFO: Loading <StructExpression of type struct{s: str, `project_meta.sample_id`: str, `project_meta.research_project_key`: str, `project_meta.seq_project`: str, `project_meta.ccdg_alternate_sample_id`: str, `project_meta.ccdg_gender`: str, `project_meta.ccdg_center`: str, `project_meta.ccdg_study`: str, `project_meta.cram_path`: str, `project_meta.project_id`: str, `project_meta.v2_age`: str, `project_meta.v2_sex`: str, `project_meta.v2_hard_filters`: str, `project_meta.v2_perm_filters`: str, `project_meta.v2_pop_platform_filters`: str, `project_meta.v2_related`: str, `project_meta.v2_data_type`: str, `project_meta.v2_product`: str, `project_meta.v2_product_simplified`: str, `project_meta.v2_qc_platform`: str, `project_meta.v2_project_id`: str, `project_meta.v2_project_description`: str, `project_meta.v2_internal`: str, `project_meta.v2_investigator`: str, `project_meta.v2_known_pop`: str, `project_meta.v2_known_subpop`: str, `project_meta.v2_pop`: str, `proje

sample_qc: True
variant_qc: True
duplicate: True
outlier_removal: False
ld_pruning: False
rel_unrel: default
Running sample QC
Running variant QC
Removing duplicate sample


In [None]:
# NOTE: with variant QC filtering, this count takes over an hour to run
# (the cell was left unexecuted in this notebook, so no output is recorded).
post_qc.count()

In [5]:
# testing out the time for running the related steps
# This is taking over an hour to run
# NOTE(review): the output recorded below (pc_relate / PCA log lines, ended by a
# KeyboardInterrupt) does not match read_qc as defined in this notebook, where
# rel_unrel='related' only reads the prewritten rel_updated.mt. The timing above was presumably
# measured with an earlier version of the function - re-run to confirm.
related = read_qc(rel_unrel='related')

2022-05-19 18:11:53 Hail: INFO: Loading 184 fields. Counts by type:
  str: 184


sample_qc: False
variant_qc: False
duplicate: False
outlier_removal: False
Returning post QC matrix table with only related individuals


2022-05-19 18:13:39 Hail: INFO: hwe_normalize: found 248634 variants after filtering out monomorphic sites.
2022-05-19 18:15:40 Hail: INFO: pca: running PCA with 20 components...
2022-05-19 18:29:25 Hail: INFO: Wrote all 122 blocks of 248634 x 4119 matrix with block size 4096.
2022-05-19 18:29:32 Hail: INFO: wrote matrix with 21 rows and 248634 columns as 61 blocks of size 4096 to /tmp/pcrelate-write-read-d2ooPbkImBuM0P1pUN2L4I.bm
2022-05-19 18:29:51 Hail: INFO: wrote matrix with 248634 rows and 4119 columns as 122 blocks of size 4096 to /tmp/pcrelate-write-read-1TFgKH2XsifznqTgZeRKbX.bm


KeyboardInterrupt: 

In [5]:
# This is reading in the ld pruned version of the dataset - takes negligible amount of time to run
# ld_pruning=True replaces the MatrixTable built from the other flags with the prewritten
# filtered_n_pruned_output_updated.mt.
ld_pruned = read_qc(ld_pruning=True)

2022-05-24 15:54:39 Hail: INFO: Loading <StructExpression of type struct{s: str, `project_meta.sample_id`: str, `project_meta.research_project_key`: str, `project_meta.seq_project`: str, `project_meta.ccdg_alternate_sample_id`: str, `project_meta.ccdg_gender`: str, `project_meta.ccdg_center`: str, `project_meta.ccdg_study`: str, `project_meta.cram_path`: str, `project_meta.project_id`: str, `project_meta.v2_age`: str, `project_meta.v2_sex`: str, `project_meta.v2_hard_filters`: str, `project_meta.v2_perm_filters`: str, `project_meta.v2_pop_platform_filters`: str, `project_meta.v2_related`: str, `project_meta.v2_data_type`: str, `project_meta.v2_product`: str, `project_meta.v2_product_simplified`: str, `project_meta.v2_qc_platform`: str, `project_meta.v2_project_id`: str, `project_meta.v2_project_description`: str, `project_meta.v2_internal`: str, `project_meta.v2_investigator`: str, `project_meta.v2_known_pop`: str, `project_meta.v2_known_subpop`: str, `project_meta.v2_pop`: str, `proje

sample_qc: False
variant_qc: False
duplicate: False
outlier_removal: False
ld_pruning: True
rel_unrel: default


In [7]:
# Method to cut down run time: downsampling the dataset before running through filters
# When I tried downsampling the same filters were still taking over an hour
# the downsampled dataset will likely need to be written out?
# sample_cols only thins the samples (columns): every variant (row) is still present, so
# row-wise filters still have to go through the same number of variants.
# A fixed seed makes the sampled subset reproducible when the notebook is re-run.
ds_cols_mt = pre_qc.sample_cols(0.01, seed=0)

In [8]:
# Check the effect of the downsampling: only the number of columns (samples) shrinks,
# the number of rows (variants) stays the same as in pre_qc.
ds_cols_mt.count()

(211358784, 50)

In [1]:
# Sizes of all the datasets:
#    *gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/pre_qc_final.mt	3.33 TiB
#    *gs://hgdp-1kg/post_qc_final.mt	3.1 TiB
#    gs://hgdp-1kg/hgdp_tgp/unrel_updated.mt	13.15 GiB
#    gs://hgdp-1kg/hgdp_tgp/rel_updated.mt	4.11 GiB
#    gs://hgdp-1kg/hgdp_tgp/intermediate_files/pre_running_varqc.mt	3.11 TiB
# * These two can be cut down: they are far too large to be used within the tutorials,
#   and no user can be expected to store them.