In [None]:
import hail as hl

# *setup_dataset* 

In [None]:
# Load the dataset Zan produced.
# Contents: two fields from Alicia's metadata + Julia's sample QC metadata + variant QC metadata + Konrad's densified mt.
mt = hl.read_matrix_table(
    'gs://african-seq-data/hgdp_tgp/hgdp_tgp_dense_meta_filt.mt'
)

In [None]:
# Gather the gnomAD QC filter names (the field names nested under "sample_filters") into a Python set,
# e.g. {'sex_aneuploidy', 'insert_size', ...}; a set carries no particular ordering.
# Holding them as a set keeps the later difference()-based filtering of the matrixTable simple.
all_sample_filters = set(mt.sample_filters)

In [None]:
import re # for renaming purposes

# bad_sample_filters are filters that removed whole populations despite them passing all other gnomAD filters (mostly AFR and OCE popns)
# pick out the filter names that start with 'fail_' (9 filters) and strip that prefix so the names can be
# compared against the entries of 'qc_metrics_filters' when filtering samples
# the pattern is anchored with '^' so only the leading 'fail_' is removed, never a later occurrence inside the name
bad_sample_filters = {re.sub(r'^fail_', '', x) for x in all_sample_filters if x.startswith('fail_')}

In [None]:
# Keep only SAMPLES (columns) that either passed every gnomAD QC filter or failed nothing but
# the filters listed in bad_sample_filters.
# 'qc_metrics_filters' (nested under 'sample_filters') holds, per sample, the set of QC filters that sample failed;
# an empty set means the sample passed all of them.
# difference(bad_sample_filters) drops the tolerated failures from that set, so:
#   - remaining length == 0 -> the sample failed nothing outside bad_sample_filters -> keep it
#   - remaining length  > 0 -> the sample failed at least one other filter -> remove it
mt_filt = mt.filter_cols(
    mt.sample_filters.qc_metrics_filters.difference(bad_sample_filters).length() == 0
)

In [None]:
# see how many samples were removed
# a notebook cell only displays its LAST expression, so both counts are returned together as one tuple;
# as two separate statements the unfiltered count would be computed but never shown
mt.count(), mt_filt.count() # ((211358784, 4151), (211358784, 4017))

In [None]:
# Temporarily persist the sample-filtered matrixTable to a cloud bucket (took ~29 min to run).
# With _read_if_exists=True an already-written checkpoint is reused instead of being recomputed.
mt_filt.checkpoint(
    'gs://african-seq-data/hgdp_tgp/intersect_data_output.mt',
    overwrite=False,
    _read_if_exists=True,
)

In [None]:
# Reload the sample-filtered matrixTable from the bucket it was checkpointed to.
mt_filt = hl.read_matrix_table(
    'gs://african-seq-data/hgdp_tgp/intersect_data_output.mt'
)

# *ld_prune_filter* 

In [None]:
# Annotate each variant with common QC statistics under the row field 'variant_qc'.
# More info: https://hail.is/docs/0.2/methods/genetics.html#hail.methods.variant_qc
mt_var = hl.variant_qc(mt_filt, name='variant_qc')

In [None]:
# Goal: end up with ~100-300k SNPs after pruning - these thresholds may need adjusting later.
# AF is the allele frequency and call_rate is the fraction of calls neither missing nor filtered.
# variant_qc.AF[0] picks the first element of the AF array for each variant.
mt_var_filt = mt_var.filter_rows(
    (mt_var.variant_qc.AF[0] > 0.05)
    & (mt_var.variant_qc.AF[0] < 0.95)
    & (mt_var.variant_qc.call_rate > 0.999)
)

In [None]:
# count (variants, samples) remaining after the AF / call-rate filter - ~14 min to run
mt_var_filt.count() # (6844706, 4017) - 6844706 SNPs

In [None]:
# LD-prune the filtered variants (r2 threshold 0.1, 500 kb window) - ~76 min to run.
pruned = hl.ld_prune(
    call_expr=mt_var_filt.GT,
    r2=0.1,
    bp_window_size=500_000,
)

In [None]:
# Subset further: keep only the variants whose row key is present in the LD-pruned table.
mt_var_pru_filt = mt_var_filt.filter_rows(
    hl.is_defined(pruned[mt_var_filt.row_key]),
    keep=True,
)

In [None]:
# Save the filtered + pruned matrixTable as a temp file; do not skip this step because
# the pruning above takes a long time to run (saving took ~22 min).
mt_var_pru_filt.write(
    'gs://african-seq-data/hgdp_tgp/filtered_n_pruned_output.mt',
    overwrite=False,
)

In [None]:
# Reload the pruned matrixTable from the cloud for the next steps.
mt_var_pru_filt = hl.read_matrix_table(
    'gs://african-seq-data/hgdp_tgp/filtered_n_pruned_output.mt'
)

In [None]:
# how many SNPs are left after filtering and pruning?
mt_var_pru_filt.count() # (255666, 4017) - 255666 SNPs
# this is within the ~100-300k target, so we proceed without adjusting any thresholds

# *run_pc_relate*  

In [None]:
# Estimate pairwise kinship with PC-Relate; the result is a hail table (~4-5 min to run).
# key_by() is called with no fields on the result before it is used below.
relatedness_ht = hl.pc_relate(
    mt_var_pru_filt.GT,
    min_individual_maf=0.05,
    k=20,
    min_kinship=0.05,
    statistics='kin',
).key_by()

In [None]:
# From the related pairs (i, j), identify the samples to remove (took ~13 min to run).
# keep=False asks for the samples to drop; the result is indexed by column key in the following cells.
related_samples_to_remove = hl.maximal_independent_set(
    relatedness_ht.i,
    relatedness_ht.j,
    keep=False,
)

In [None]:
# Look each sample up in 'related_samples_to_remove' by its sample ID (the col_key of the matrixTable)
# and keep only the samples that are NOT found there, i.e. the unrelated samples.
mt_unrel = mt_var_pru_filt.filter_cols(
    hl.is_missing(related_samples_to_remove[mt_var_pru_filt.col_key])
)

In [None]:
# Mirror image of the unrelated subset: keep only the samples that ARE found in 'related_samples_to_remove'.
mt_rel = mt_var_pru_filt.filter_cols(
    hl.is_defined(related_samples_to_remove[mt_var_pru_filt.col_key])
)

In [None]:
# Persist both sample subsets to the cloud.

# unrelated samples
mt_unrel.write(
    'gs://african-seq-data/hgdp_tgp/unrel.mt',
    overwrite=False,
)

# related samples
mt_rel.write(
    'gs://african-seq-data/hgdp_tgp/rel.mt',
    overwrite=False,
)

In [None]:
# Reload both saved sample subsets.

# unrelated samples
mt_unrel = hl.read_matrix_table(
    'gs://african-seq-data/hgdp_tgp/unrel.mt'
)

# related samples
mt_rel = hl.read_matrix_table(
    'gs://african-seq-data/hgdp_tgp/rel.mt'
)

# code addition for subcontinental pca 

# *run_pca* 

In [None]:
def run_pca(mt: hl.MatrixTable, reg_name: str, out_prefix: str, overwrite: bool = False, n_pcs: int = 20):
    """
    Runs HWE-normalized PCA on a dataset and saves the scores and loadings
    :param mt: dataset to run PCA on
    :param reg_name: region name, used to build the output file names
    :param out_prefix: path prefix for where to save the outputs
    :param overwrite: whether an existing loadings table may be overwritten (not applied to the scores export)
    :param n_pcs: number of principal components to compute and export (default 20)
    :return: None - results are written to `out_prefix + reg_name + ...`
    """

    # the eigenvalues (first element of the result) are not used here
    _, pca_scores, pca_loadings = hl.hwe_normalized_pca(mt.GT, k=n_pcs, compute_loadings=True)

    # per-variant alt allele frequency of the samples PCA was run on, stored next to the loadings
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    # spread the 'scores' array into one column per PC: PC1 ... PC<n_pcs>
    pca_scores = pca_scores.transmute(**{f'PC{i}': pca_scores.scores[i - 1] for i in range(1, n_pcs + 1)})

    pca_scores.export(out_prefix + reg_name + '_scores.txt.bgz')  # save individual-level genetic region PCs
    pca_loadings.write(out_prefix + reg_name + '_loadings.ht', overwrite=overwrite)  # save PCA loadings

# *project_individuals*

In [None]:
#if running on GCP, need to add "--packages gnomad" when starting a cluster in order for the import to work  
from gnomad.sample_qc.ancestry import *  # NOTE(review): wildcard kept in case other cells rely on names it brings in
from gnomad.sample_qc.ancestry import pc_project  # the only name this function actually needs

def project_individuals(pca_loadings, project_mt, reg_name: str, out_prefix: str, overwrite: bool = False, n_pcs: int = 20):
    """
    Project samples into predefined PCA space
    :param pca_loadings: existing PCA space - unrelated samples 
    :param project_mt: matrixTable of data to project - related samples 
    :param reg_name: region name for saving output purposes
    :param out_prefix: path prefix for where to save PCA projection outputs
    :param overwrite: currently unused; kept so the call signature parallels run_pca
    :param n_pcs: number of PCs to export; must not exceed the number of PCs in pca_loadings (default 20)
    :return: None - the projected scores are exported to `out_prefix + reg_name + '_projected_scores.txt.bgz'`
    """
    ht_projections = pc_project(project_mt, pca_loadings)
    # spread the 'scores' array into one column per PC: PC1 ... PC<n_pcs>
    ht_projections = ht_projections.transmute(**{f'PC{i}': ht_projections.scores[i - 1] for i in range(1, n_pcs + 1)})
    ht_projections.export(out_prefix + reg_name + '_projected_scores.txt.bgz') # save output

In [None]:
# Build the list of distinct genetic regions in the dataset.
# dict.fromkeys() removes duplicates while keeping the order of first appearance.
# 7 regions - ['EUR', 'AFR', 'AMR', 'EAS', 'CSA', 'OCE', 'MID']
regions = list(
    dict.fromkeys(mt_unrel['hgdp_tgp_meta']['Genetic']['region'].collect())
)

In [None]:
# Arguments shared by the PCA and projection runs below.
overwrite = False  # passed through to run_pca / project_individuals
subcont_pca_prefix = 'gs://african-seq-data/hgdp_tgp/subcont_pca/subcont_pca_'  # path prefix for outputs

In [None]:
# Run the 'run_pca' function once per genetic region, on the unrelated samples of that region.
for region in regions:
    subcont_unrel = mt_unrel.filter_cols(
        mt_unrel['hgdp_tgp_meta']['Genetic']['region'] == region
    )  # unrelated samples belonging to this region
    run_pca(subcont_unrel, region, subcont_pca_prefix, overwrite)

In [None]:
# Run the 'project_individuals' function once per genetic region, projecting the related samples.
for region in regions:
    # PCA loadings written for this region by the 'run_pca' function
    loadings = hl.read_table(subcont_pca_prefix + region + '_loadings.ht')
    # related samples belonging to this region
    subcont_rel = mt_rel.filter_cols(
        mt_rel['hgdp_tgp_meta']['Genetic']['region'] == region
    )
    project_individuals(loadings, subcont_rel, region, subcont_pca_prefix, overwrite)

## After plotting the PCAs, 20 outliers that needed to be removed were identified


| s | Genetic region | Population | Note |
| --- | --- | --- | -- |
| NA20314 | AFR | ASW | Clusters with AMR in global PCA | 
| NA20299 | - | - | - |
| HG01880 | - | - | - |
| HG01881 | - | - | - |
| HGDP00013 | - | - | - |
| HGDP00150 | - | - | - |
| HGDP00029 | - | - | - |
| HGDP01298 | - | - | - |
| HGDP00130 | CSA | Makrani | Closer to AFR than most CSA |
| HGDP01303 | - | - | - |
| LP6005443-DNA_B02 | - | - | - |
| HGDP01300 | - | - | - |
| HG01628 | - | - | - |
| HG01629 | - | - | - |
| HG01630 | - | - | - |
| HG01694 | - | - | - |
| HG01696 | - | - | - |
| HGDP00621 | MID | Bedouin | Closer to AFR than most MID |
| HGDP01270 | MID | Mozabite | Closer to AFR than most MID |
| HGDP01271 | MID | Mozabite | Closer to AFR than most MID |


















	


In [None]:
# Reload the unrelated and related matrixTables so the outliers can be removed before re-running PCA.
mt_unrel_unfiltered = hl.read_matrix_table(
    'gs://african-seq-data/hgdp_tgp/unrel.mt'
)
mt_rel_unfiltered = hl.read_matrix_table(
    'gs://african-seq-data/hgdp_tgp/rel.mt'
)

In [None]:
# read the outliers file into a list, one sample ID per line
# strip() removes surrounding whitespace (including a '\r' from Windows line endings) and blank lines are
# skipped, so stray whitespace in the file cannot produce IDs that silently fail to match any sample
with hl.utils.hadoop_open('gs://african-seq-data/hgdp_tgp/pca_outliers.txt') as file: 
    outliers = [line.strip() for line in file if line.strip()]
    
# capture and broadcast the list as an expression
outliers_list = hl.literal(outliers)

In [None]:
# Drop every sample whose ID 's' appears in the outlier list, from both the unrelated and related sets.
mt_unrel = mt_unrel_unfiltered.filter_cols(
    ~outliers_list.contains(mt_unrel_unfiltered.s)
)
mt_rel = mt_rel_unfiltered.filter_cols(
    ~outliers_list.contains(mt_rel_unfiltered.s)
)

In [94]:
# sanity check 
# count_cols() counts samples only, so the rows are not counted just to read the column total;
# each count is computed once and reused below
n_unrel_before = mt_unrel_unfiltered.count_cols()
n_unrel_after = mt_unrel.count_cols()
n_rel_before = mt_rel_unfiltered.count_cols()
n_rel_after = mt_rel.count_cols()

print('Unrelated: Before filtering ' + str(n_unrel_before) + ' | After filtering ' + str(n_unrel_after))
print('Related: Before filtering: ' + str(n_rel_before) + ' | After filtering ' + str(n_rel_after))

num_outliers = (n_unrel_before - n_unrel_after) + (n_rel_before - n_rel_after)
print('Total outliers removed = ' + str(num_outliers))

Unrelated: Before filtering 3344 | After filtering 3327
Related: Before filtering: 673 | After filtering 670
Total outliers removed = 20


## The following steps repeat the ones run before removing the outliers, except that they now use the outlier-filtered unrelated and related datasets and a new GCS bucket path to save the outputs

In [None]:
# Build the list of distinct genetic regions - taken from the unrelated dataset since it has more samples.
# dict.fromkeys() removes duplicates while keeping the order of first appearance.
# 7 regions - ['EUR', 'AFR', 'AMR', 'EAS', 'CSA', 'OCE', 'MID']
regions = list(
    dict.fromkeys(mt_unrel['hgdp_tgp_meta']['Genetic']['region'].collect())
)

In [None]:
# Arguments shared by the PCA and projection runs below (new output location for the outlier-free run).
overwrite = False  # passed through to run_pca / project_individuals
subcont_pca_prefix = 'gs://african-seq-data/hgdp_tgp/subcont_pca_outliers_removed/subcont_pca_'  # path prefix for outputs

In [None]:
# Run the 'run_pca' function (defined above) once per genetic region on the outlier-free unrelated samples.
# Took roughly 25-30 min (the notebook became slow).
for region in regions:
    subcont_unrel = mt_unrel.filter_cols(
        mt_unrel['hgdp_tgp_meta']['Genetic']['region'] == region
    )  # unrelated samples belonging to this region
    run_pca(subcont_unrel, region, subcont_pca_prefix, overwrite)

In [None]:
# Run the 'project_individuals' function (defined above) once per genetic region,
# projecting the outlier-free related samples - took ~3 min.
for region in regions:
    # PCA loadings written for this region by the 'run_pca' function
    loadings = hl.read_table(subcont_pca_prefix + region + '_loadings.ht')
    # related samples belonging to this region
    subcont_rel = mt_rel.filter_cols(
        mt_rel['hgdp_tgp_meta']['Genetic']['region'] == region
    )
    project_individuals(loadings, subcont_rel, region, subcont_pca_prefix, overwrite)