In [1]:
import hail as hl

# Functions from gnomAD library to apply genotype filters   
from gnomad.utils.filtering import filter_to_adj

In [2]:
# Initialize Hail with its default configuration for this session
hl.init()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SPARKMONITOR_LISTENER: Started SparkListener for Jupyter Notebook
SPARKMONITOR_LISTENER: Port obtained from environment: 39083
SPARKMONITOR_LISTENER: Application Started: application_1708446776077_0002 ...Start Time: 1708447432344


Running on Apache Spark version 3.3.2
SparkUI available at http://znk-postqc-m.c.diverse-pop-seq-ref.internal:34547
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.127-bb535cd096c5
LOGGING: writing to /home/hail/hail-20240220-1643-0.2.127-bb535cd096c5.log


In [3]:
# HGDP+1kGP dense MatrixTable from the gnomAD v3.1.2 release, before gnomAD QC filters are applied
pre_qc_path = 'gs://gcp-public-data--gnomad/release/3.1.2/mt/genomes/gnomad.genomes.v3.1.2.hgdp_1kg_subset_dense.mt'

# Text file of PCA outlier samples; it is read line by line, one sample ID per line
outliers_path = 'gs://gcp-public-data--gnomad/release/3.1/secondary_analyses/hgdp_1kg/pca/pca_outliers.txt'

# gnomAD's HGDP+1kGP sample metadata with updated population labels (imported keyed by sample ID `s`)
metadata_path = 'gs://hgdp-1kg/tutorial_datasets/metadata_and_qc/gnomad_meta_updated.tsv'

# Structural variant (SV) counts per genome
# NOTE(review): not referenced by any cell visible here - confirm it is still needed
sv_counts_path = 'gs://hgdp-1kg/tutorial_datasets/metadata_and_qc/sv_counts_per_genome.tsv'

# Related and unrelated MatrixTables (without outliers) written out in Notebook 2: PCA and Ancestry Analyses
# NOTE(review): neither path is referenced by any cell visible here - confirm they are still needed
unrelateds_path = 'gs://hgdp-1kg/tutorial_datasets/pca_results/unrelateds_without_outliers.mt'
relateds_path = 'gs://hgdp-1kg/tutorial_datasets/pca_results/relateds_without_outliers.mt'

# Destination of the final per-sample table, exported as TSV
final_table_path = 'gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/fig1_plot_data.tsv'

In [4]:
# Set up function to:
# apply gnomAD's sample, variant and genotype QC filters
# remove two contaminated samples identified using CHARR - https://pubmed.ncbi.nlm.nih.gov/37425834/
# remove the gnomAD sample that's added for QC purposes
# add gnomAD's HGDP+1kGP metadata with the updated population labels as a column field 

def run_qc(mt, metadata_path):
    """Apply gnomAD sample, variant and genotype QC to a MatrixTable.

    The steps run in this order:
      1. Drop samples flagged by gnomAD's sample QC hard filters.
      2. Keep only variants whose `filters` set is empty (PASS variants).
      3. Drop the two contaminated samples identified with CHARR
         (https://pubmed.ncbi.nlm.nih.gov/37425834/).
      4. Drop 'CHMI_CHMI3_WGS2', a sample gnomAD added for QC purposes.
      5. Drop variants with no non-reference call among the remaining samples.
      6. Attach the updated metadata as the column field `meta_updated`.
      7. Apply gnomAD's genotype filter via `filter_to_adj`.

    Args:
        mt: MatrixTable providing the fields read here:
            `gnomad_sample_filters.hard_filtered`, `filters`, `s` and `GT`.
        metadata_path: Path of the metadata text table; it is imported with
            type imputation and keyed by its `s` column.

    Returns:
        The filtered MatrixTable with the `meta_updated` column field added.
    """
    # 1. Sample QC: keep samples that were NOT hard-filtered
    #    (this removed 31 samples in the original run).
    passed_hard_filters = ~mt.gnomad_sample_filters.hard_filtered
    mt = mt.filter_cols(passed_hard_filters)

    # 2. Variant QC: an empty `filters` set means the variant passed every
    #    filter; any entry in the set names a filter the variant failed.
    mt = mt.filter_rows(hl.len(mt.filters) == 0)

    # 3. Contaminated samples flagged by CHARR.
    contaminated_ids = hl.literal({'HGDP01371', 'LP6005441-DNA_A09'})
    mt = mt.filter_cols(contaminated_ids.contains(mt.s), keep=False)

    # 4. gnomAD's QC-only sample, which has no metadata info.
    mt = mt.filter_cols(mt.s != 'CHMI_CHMI3_WGS2')

    # 5. Keep a variant only if at least one remaining sample carries a
    #    non-reference genotype.
    seen_in_remaining_samples = hl.agg.any(mt.GT.is_non_ref())
    mt = mt.filter_rows(seen_in_remaining_samples)

    # 6. Join the updated population labels onto the columns by sample ID.
    metadata = hl.import_table(metadata_path, key='s', impute=True)
    mt = mt.annotate_cols(meta_updated=metadata[mt.s])

    # 7. Genotype QC comes last, using the helper imported from gnomAD.
    return filter_to_adj(mt)

def remove_pca_outliers(mt, outlier_path):
    """Remove PCA outlier samples, then drop variants left without a carrier.

    Args:
        mt: MatrixTable with the column key field `s` and the entry field `GT`.
        outlier_path: Path of a text file opened with `hl.utils.hadoop_open`;
            every non-blank line is treated as one sample ID to remove.

    Returns:
        The MatrixTable without the listed samples, restricted to variants
        that still have at least one non-reference genotype.
    """
    # Strip all surrounding whitespace (not just '\n') so CRLF line endings or
    # trailing spaces cannot make an ID silently fail to match, and skip blank
    # lines so they do not become an empty-string entry.
    with hl.utils.hadoop_open(outlier_path) as file:
        stripped_lines = (line.strip() for line in file)
        outliers = {sample_id for sample_id in stripped_lines if sample_id}

    # Convert the Python set to a Hail expression so it can be used in a filter.
    # The explicit dtype keeps this working when the file lists no outliers,
    # because Hail cannot impute the element type of an empty collection.
    outlier_ids = hl.literal(outliers, dtype='set<str>')

    # Keep the samples which are not contained in the PCA outlier set
    mt = mt.filter_cols(~outlier_ids.contains(mt['s']))

    # Only keep the variants which are found in the samples that are left
    mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

    return mt

In [5]:
# Load the HGDP+1kGP MatrixTable as released, before any QC
hgdp_1kg_preqc_mt = hl.read_matrix_table(pre_qc_path)

# Run QC first, then remove the PCA outliers from the QC'd dataset
hgdp_1kg_mt = remove_pca_outliers(
    run_qc(hgdp_1kg_preqc_mt, metadata_path),
    outliers_path,
)

2024-02-20 16:44:19.373 Hail: INFO: Reading table to impute column types 1) / 1]
2024-02-20 16:44:23.733 Hail: INFO: Loading <StructExpression of type struct{s: str, `project_meta.sample_id`: str, `project_meta.research_project_key`: str, `project_meta.seq_project`: str, `project_meta.ccdg_alternate_sample_id`: str, `project_meta.ccdg_gender`: str, `project_meta.ccdg_center`: str, `project_meta.ccdg_study`: str, `project_meta.cram_path`: str, `project_meta.project_id`: str, `project_meta.v2_age`: str, `project_meta.v2_sex`: str, `project_meta.v2_hard_filters`: str, `project_meta.v2_perm_filters`: str, `project_meta.v2_pop_platform_filters`: str, `project_meta.v2_related`: str, `project_meta.v2_data_type`: str, `project_meta.v2_product`: str, `project_meta.v2_product_simplified`: str, `project_meta.v2_qc_platform`: str, `project_meta.v2_project_id`: str, `project_meta.v2_project_description`: str, `project_meta.v2_internal`: str, `project_meta.v2_investigator`: str, `project_meta.v2_kno

In [8]:
# Compute per-sample QC metrics; the next cell reads `sample_qc.n_snp` from the result
hgdp_1kg_mt = hl.sample_qc(hgdp_1kg_mt)

In [9]:
# Per-sample (column) table of the MatrixTable
mt_col_table = hgdp_1kg_mt.cols()

# Reduce it to the fields exported to final_table_path: genetic region,
# population, SNP count and the 20x / 10x coverage percentages.
# Table.select keeps the key field `s`, so the sample ID stays in the table.
mt_col_table = mt_col_table.select(
    mt_col_table['meta_updated']['hgdp_tgp_meta.Genetic.region'],
    mt_col_table['meta_updated']['population'],
    mt_col_table['sample_qc']['n_snp'],
    mt_col_table['bam_metrics']['pct_bases_20x'],
    mt_col_table['bam_metrics']['pct_bases_10x'],
)

2024-02-20 16:47:12.378 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


In [10]:
# Number of samples left after QC and PCA outlier removal (one table row per sample)
mt_col_table.count()

4094

In [None]:
# Planning note (this cell has no execution count): fields wanted in the final table.
# NOTE(review): the select() in the earlier cell only keeps the genetic region,
# population, n_snp, pct_bases_20x and pct_bases_10x; n_indel, # SV and # CNV are
# not produced by any cell visible here - confirm whether they are still TODO.
'''
get table with:
    - pct_bases10x
    - pct_bases20x
    - n_snp
    - n_indel (n_insertion+n_deletion)
    - genetic_region
    - population
    - sample id
    - # SV
    - # CNV
'''

In [11]:
# Write the per-sample table to final_table_path as a single text file with a header row
mt_col_table.export(final_table_path, header=True)

2024-02-20 17:13:37.825 Hail: INFO: Coerced sorted dataset     (143 + 33) / 176]
2024-02-20 17:13:39.055 Hail: INFO: merging 177 files totalling 190.0K... / 176]
2024-02-20 17:13:39.819 Hail: INFO: while writing:
    gs://hgdp-1kg/hgdp_tgp/qc_and_figure_generation/fig1_plot_data.tsv
  merge time: 763.198ms
