In [1]:
import hail as hl

# *set up dataset* 

In [2]:
# Load the starting dataset produced by Zan:
# metadata from Alicia + sample QC metadata from Julia + densified mt from Konrad.
# No samples or variants have been removed yet (211358784 snps & 4151 samples).
mt = hl.read_matrix_table(
    'gs://african-seq-data/hgdp_tgp/hgdp_tgp_dense_meta_preQC.mt'
)

Initializing Hail with default parameters...
Running on Apache Spark version 3.1.1
SparkUI available at http://mty-m.c.neurogap-analysis.internal:39245
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.65-367cf1874d85
LOGGING: writing to /home/hail/hail-20210726-2222-0.2.65-367cf1874d85.log


In [3]:
# gnomAD v3.1.1 sites table - source of the variant QC metadata
var_meta = hl.read_table(
    'gs://gcp-public-data--gnomad/release/3.1.1/ht/genomes/gnomad.genomes.v3.1.1.sites.ht'
)

# look up each variant of mt (by locus and alleles) in the sites table and
# copy every field of the matching record onto the rows of mt
gnomad_site_fields = var_meta[mt.locus, mt.alleles]
mt = mt.annotate_rows(**gnomad_site_fields)

In [3]:
# read in the new dataset (including samples that were removed unknowingly)
mt_post = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/new_hgdp_tgp_postQC.mt') # (155648020, 4099)

# *gnomAD filter QC*

In [4]:
# Gather the names of the gnomAD sample QC filters (the fields under 'sample_filters')
# into a Python set, e.g. {'sex_aneuploidy', 'insert_size', ...}.
# A set is unordered; it is used so that the names can later be handed to difference()
# when filtering the matrixTable.
sample_filters_struct = mt['sample_filters']
all_sample_filters = {field_name for field_name in sample_filters_struct}

In [5]:
import re # for renaming purposes

# bad_sample_filters are filters that removed whole populations despite them passing all other gnomAD filters (mostly AFR and OCE popns)
# pick out the filter names that start with 'fail_' (9 filters) and strip that prefix
# the pattern is anchored with '^' so only the leading 'fail_' is removed - an unanchored
# pattern would also delete any later 'fail_' occurring inside the rest of the name
bad_sample_filters = {re.sub(r'^fail_', '', x) for x in all_sample_filters if x.startswith('fail_')}

In [6]:
# Keep only samples that passed all gnomAD QC, or that failed nothing but filters listed
# in bad_sample_filters.
# 'qc_metrics_filters' (under 'sample_filters') is the set of QC filters a sample failed;
# it is an empty set for a sample that passed every gnomAD QC filter.
# difference() removes the bad_sample_filters names from that set, so:
#   - nothing left  -> the sample passed, or only failed "bad" filters -> keep it
#   - anything left -> the sample failed a filter we still trust       -> remove it
# same as gs://african-seq-data/hgdp_tgp/hgdp_tgp_dense_meta_filt.mt - 211358784 snps & 4120 samples
trusted_failures = mt.sample_filters.qc_metrics_filters.difference(bad_sample_filters)
mt_filt = mt.filter_cols(trusted_failures.length() == 0)

In [7]:
# How many samples were removed by the initial QC?
# count() returns (n_rows, n_cols), so asking it for the sample count also counts every
# variant. count_cols() counts samples only, and each table is counted once and reused.
n_samples_pre_qc = mt.count_cols()  # 4151
n_samples_post_qc = mt_filt.count_cols()  # 4120

print('Num of samples before initial QC = ' + str(n_samples_pre_qc))
print('Num of samples after initial QC = ' + str(n_samples_post_qc))
print('Samples removed = ' + str(n_samples_pre_qc - n_samples_post_qc))  # 31

Num of samples before initial QC = 4151
Num of samples after initial QC = 4120
Samples removed = 31


# *remove duplicate sample*

In [8]:
# duplicate sample - NA06985
# distinct_by_col() keeps a single column per column key
mt_filt = mt_filt.distinct_by_col()
# count_cols() counts samples only, without also counting the variants as count() would
print('Num of samples after removal of duplicate sample = ' + str(mt_filt.count_cols())) # 4119

Num of samples after removal of duplicate sample = 4119


# *keep only PASS variants*

In [9]:
# keep only PASS variants, i.e. those whose 'filters' set is empty (passed variant QC)
# ~6min to run
has_no_filters = hl.len(mt_filt.filters) == 0
mt_filt = mt_filt.filter_rows(has_no_filters)
print('Num of only PASS variants = ' + str(mt_filt.count_rows())) # 155648020

Num of only PASS variants = 155648020


# *variant filter and ld pruning* 

In [10]:
# run common variant statistics (quality control metrics) - more info https://hail.is/docs/0.2/methods/genetics.html#hail.methods.variant_qc  
mt_var = hl.variant_qc(mt_filt) 

In [11]:
# Aim: get down to ~100-300k SNPs - the thresholds below may need adjusting later.
# AF: allele freq; call_rate: fraction of calls neither missing nor filtered.
# variant_qc.AF is an array - AF[0] is its first element.
af0 = mt_var.variant_qc.AF[0]
af_in_range = (af0 > 0.05) & (af0 < 0.95)
high_call_rate = mt_var.variant_qc.call_rate > 0.999
mt_var_filt = mt_var.filter_rows(af_in_range & high_call_rate)

In [12]:
# ~20min to run 
mt_var_filt.count() # started with 155648020 snps and ended up with 6787034 snps 

(6787034, 4119)

In [13]:
# LD pruning (~113 min to run) 
pruned = hl.ld_prune(mt_var_filt.GT, r2=0.1, bp_window_size=500000) 

2021-07-26 22:52:54 Hail: INFO: ld_prune: running local pruning stage with max queue size of 62138 variants
2021-07-26 23:16:39 Hail: INFO: wrote table with 274800 rows in 5000 partitions to /tmp/SzTfV8GDfzsG9mBobMHawS
    Total size: 9.59 MiB
    * Rows: 9.59 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  359 rows (11.46 KiB)
2021-07-27 00:41:45 Hail: INFO: Wrote all 136 blocks of 274800 x 4119 matrix with block size 4096.
2021-07-27 00:46:32 Hail: INFO: wrote table with 43257 rows in 135 partitions to /tmp/3kwmZug0i0cztPhCBfhJwk
    Total size: 4.38 MiB
    * Rows: 749.66 KiB
    * Globals: 3.65 MiB
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  1263 rows (22.23 KiB)


In [14]:
# subset further: keep only the variants that are present in the LD-pruned table
survived_pruning = hl.is_defined(pruned[mt_var_filt.row_key])
mt_var_pru_filt = mt_var_filt.filter_rows(survived_pruning)

In [16]:
# write out the output as a temp file - make sure to save the file on this step b/c the pruning step takes a while to run
# saving took ~23 min 
mt_var_pru_filt.write('gs://african-seq-data/hgdp_tgp/filtered_n_pruned_output_updated.mt', overwrite=False)

2021-07-27 01:18:32 Hail: INFO: wrote matrix table with 248634 rows and 4119 columns in 5000 partitions to gs://african-seq-data/hgdp_tgp/filtered_n_pruned_output_updated.mt
    Total size: 15.53 GiB
    * Rows/entries: 15.53 GiB
    * Columns: 1.70 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  357 rows (23.90 MiB)


In [2]:
# after saving the pruned file to the cloud, reading it back in for the next steps 
mt_var_pru_filt = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/filtered_n_pruned_output_updated.mt') 

Initializing Hail with default parameters...
Running on Apache Spark version 3.1.1
SparkUI available at http://mty-m.c.neurogap-analysis.internal:40349
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.74-0c3a74d12093
LOGGING: writing to /home/hail/hail-20210729-1724-0.2.74-0c3a74d12093.log


In [3]:
# how many snps are left after filtering and pruning?
mt_var_pru_filt.count() # 248,634 snps 
# between ~100-300k so we proceed without any value adjustments  

(248634, 4119)

# *run pc_relate*  

In [11]:
# Estimate relatedness between individuals with Hail's variant of the PC-Relate method
# (https://hail.is/docs/0.2/methods/relatedness.html#hail.methods.pc_relate).
# Only the kinship statistic is computed, using:
#   - a minimum minor allele frequency filter of 0.05
#   - exclusion of sample-pairs with kinship less than 0.05
#   - 20 principal components to control for population structure
# The result is a hail table (~4min to run), which is then unkeyed with key_by().
pc_relate_ht = hl.pc_relate(
    mt_var_pru_filt.GT,
    min_individual_maf=0.05,
    min_kinship=0.05,
    statistics='kin',
    k=20,
)
relatedness_ht = pc_relate_ht.key_by()

2021-07-29 20:05:08 Hail: INFO: hwe_normalized_pca: running PCA using 248634 variants.
2021-07-29 20:05:18 Hail: INFO: pca: running PCA with 20 components...
2021-07-29 20:09:25 Hail: INFO: Wrote all 122 blocks of 248634 x 4119 matrix with block size 4096.


In [5]:
# write out result - for Julia (~2hr 19min to run)
# includes i – first sample, j – second sample, and kin – kinship estimate
relatedness_ht.write('gs://african-seq-data/hgdp_tgp/relatedness.ht')

2021-07-29 17:30:34 Hail: INFO: wrote matrix with 21 rows and 248634 columns as 61 blocks of size 4096 to /tmp/pcrelate-write-read-OzfLBhWkelepw7GaY2fPwS.bm
2021-07-29 17:30:39 Hail: INFO: wrote matrix with 248634 rows and 4119 columns as 122 blocks of size 4096 to /tmp/pcrelate-write-read-MH5U1wCqjkPwn4btHf78GX.bm
2021-07-29 18:39:57 Hail: INFO: wrote matrix with 4119 rows and 4119 columns as 4 blocks of size 4096 to /tmp/pcrelate-write-read-17rZmsqlzGxTPJV4RYvzm6.bm
2021-07-29 19:49:07 Hail: INFO: wrote matrix with 4119 rows and 4119 columns as 4 blocks of size 4096 to /tmp/pcrelate-write-read-KN9lD8cZysgwjNlZF99mMJ.bm
2021-07-29 19:49:09 Hail: INFO: wrote matrix with 4119 rows and 4119 columns as 4 blocks of size 4096 to /tmp/pcrelate-write-read-jwgmibzfuGIFM1XojhxyfY.bm
2021-07-29 19:49:09 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-29 19:49:14 Hail: INFO: wrote table with 1389 rows in 4 partitions to gs://african-seq-data/hgdp_tgp/relatedness.ht
    Total si

In [14]:
# read back in
relatedness_ht = hl.read_table('gs://african-seq-data/hgdp_tgp/relatedness.ht')

In [7]:
# from the related pairs (i, j), work out which individuals to remove
# keep=False asks for the samples to drop rather than the ones to keep; the result is a
# table that is indexed by sample key further down
# (~2hr & 22 min to run) - previous one took ~13min
related_samples_to_remove = hl.maximal_independent_set(
    relatedness_ht.i,
    relatedness_ht.j,
    keep=False,
)

2021-07-29 20:03:45 Hail: INFO: wrote table with 1389 rows in 4 partitions to /tmp/Hj7jfbWKn4ZHoXUOs6merW
    Total size: 13.54 KiB
    * Rows: 13.53 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  849 rows (8.15 KiB)


In [21]:
# unkey table for exporting purposes - for Julia 
unkeyed_tbl = related_samples_to_remove.expand_types()

# export sample IDs of related individuals  
unkeyed_tbl.node.s.export('gs://african-seq-data/hgdp_tgp/related_sample_ids.txt', header=False)

# import back to see if format is correct  
#tbl = hl.import_table('gs://african-seq-data/hgdp_tgp/related_sample_ids.txt', impute=True, no_header=True)
#tbl.show()

2021-07-27 04:20:36 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-27 04:20:37 Hail: INFO: merging 4 files totalling 5.8K...
2021-07-27 04:20:37 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/related_sample_ids.txt
  merge time: 167.695ms


In [22]:
# look each sample up (by col_key) in 'related_samples_to_remove';
# samples that are NOT found there are the unrelated ones - subset the mt to those only
flagged_for_removal = hl.is_defined(related_samples_to_remove[mt_var_pru_filt.col_key])
mt_unrel = mt_var_pru_filt.filter_cols(flagged_for_removal, keep=False)

2021-07-27 04:30:06 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


In [23]:
# the complement of the unrelated subset: keep only the samples that ARE found in
# 'related_samples_to_remove'
flagged_for_removal = hl.is_defined(related_samples_to_remove[mt_var_pru_filt.col_key])
mt_rel = mt_var_pru_filt.filter_cols(flagged_for_removal, keep=True)

In [25]:
# write out mts of unrelated and related samples on to the cloud 

# unrelated mt
mt_unrel.write('gs://african-seq-data/hgdp_tgp/unrel_updated.mt', overwrite=False) 

# related mt 
mt_rel.write('gs://african-seq-data/hgdp_tgp/rel_updated.mt', overwrite=False) 

2021-07-27 04:30:35 Hail: INFO: Coerced sorted dataset
2021-07-27 04:30:35 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-27 04:31:36 Hail: INFO: wrote matrix table with 248634 rows and 3399 columns in 5000 partitions to gs://african-seq-data/hgdp_tgp/unrel_updated.mt
    Total size: 13.14 GiB
    * Rows/entries: 13.14 GiB
    * Columns: 1.41 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  357 rows (20.17 MiB)
2021-07-27 04:31:38 Hail: INFO: Coerced sorted dataset
2021-07-27 04:31:40 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-27 04:32:25 Hail: INFO: wrote matrix table with 248634 rows and 720 columns in 5000 partitions to gs://african-seq-data/hgdp_tgp/rel_updated.mt
    Total size: 4.11 GiB
    * Rows/entries: 4.11 GiB
    * Columns: 301.36 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  357 rows (6.21 MiB)


In [16]:
# read saved mts back in 

# unrelated mt
mt_unrel = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/unrel_updated.mt') 

# related mt 
mt_rel = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/rel_updated.mt') 

# PCA

# *run pca* 

In [8]:
def run_pca(mt: hl.MatrixTable, reg_name: str, out_prefix: str, overwrite: bool = False, k: int = 20):
    """
    Runs HWE-normalized PCA on a dataset and saves the per-sample scores and the loadings.

    :param mt: dataset to run PCA on; its 'GT' entry field is used
    :param reg_name: region name for saving output purposes
    :param out_prefix: path prefix for the outputs; reg_name and a suffix are appended to it
    :param overwrite: passed to the loadings table write; the scores export does not use it
    :param k: number of principal components to compute and to unpack into PC1..PCk columns
    :return: None. Outputs are written to
        out_prefix + reg_name + '_scores.txt.bgz' and out_prefix + reg_name + '_loadings.ht'
    """
    # the eigenvalues (first element) are not saved, so they are discarded here
    _, pca_scores, pca_loadings = hl.hwe_normalized_pca(mt.GT, k=k, compute_loadings=True)

    # per-variant alt allele frequency, stored alongside the loadings as 'pca_af'
    # NOTE(review): presumably read by pc_project when projecting samples - confirm
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    # replace the 'scores' array with one column per PC (PC1 is scores[0], ...)
    pca_scores = pca_scores.transmute(**{f'PC{i}': pca_scores.scores[i - 1] for i in range(1, k + 1)})

    pca_scores.export(out_prefix + reg_name + '_scores.txt.bgz')  # save individual-level genetic region PCs
    pca_loadings.write(out_prefix + reg_name + '_loadings.ht', overwrite=overwrite)  # save PCA loadings

# *project related individuals*

In [9]:
#if running on GCP, need to add "--packages gnomad" when starting a cluster in order for the import to work  
from gnomad.sample_qc.ancestry import *

def project_individuals(pca_loadings, project_mt, reg_name: str, out_prefix: str, overwrite: bool = False, k: int = 20):
    """
    Project samples into predefined PCA space and export the projected scores
    :param pca_loadings: existing PCA space - loadings from the unrelated samples
    :param project_mt: matrixTable of data to project - related samples
    :param reg_name: region name for saving output purposes
    :param out_prefix: path prefix for where to save the PCA projection output
    :param overwrite: currently unused by this function (the export call does not take it);
        kept so that the signature mirrors run_pca
    :param k: number of PCs to unpack into PC1..PCk columns; should match the number of
        PCs the loadings were computed with
    :return: None. Output is written to out_prefix + reg_name + '_projected_scores.txt.bgz'
    """
    ht_projections = pc_project(project_mt, pca_loadings)
    # replace the 'scores' array with one column per PC (PC1 is scores[0], ...)
    ht_projections = ht_projections.transmute(**{f'PC{i}': ht_projections.scores[i - 1] for i in range(1, k + 1)})
    ht_projections.export(out_prefix + reg_name + '_projected_scores.txt.bgz') # save output
    #return ht_projections # return to user

# *global pca*

In [8]:
# global pca on the unrelated samples, using the 'run_pca' function defined above
run_pca(
    mt_unrel,
    reg_name='global',
    out_prefix='gs://african-seq-data/hgdp_tgp/pca_preoutlier/',
    overwrite=False,
)

2021-07-27 16:05:30 Hail: INFO: hwe_normalized_pca: running PCA using 248634 variants.
2021-07-27 16:05:41 Hail: INFO: pca: running PCA with 20 components...
2021-07-27 16:07:59 Hail: INFO: Coerced sorted dataset
2021-07-27 16:08:00 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-27 16:08:03 Hail: INFO: merging 16 files totalling 272.0K...
2021-07-27 16:08:03 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_preoutlier/global_scores.txt.bgz
  merge time: 279.465ms
2021-07-27 16:08:42 Hail: INFO: wrote table with 248634 rows in 4916 partitions to gs://african-seq-data/hgdp_tgp/pca_preoutlier/global_loadings.ht
    Total size: 45.23 MiB
    * Rows: 45.23 MiB
    * Globals: 11.00 B
    * Smallest partition: 1 rows (261.00 B)
    * Largest partition:  357 rows (65.26 KiB)


In [9]:
# global pca: project the related samples with the 'project_individuals' function
# first read in the PCA loadings that were obtained from the 'run_pca' function
loadings = hl.read_table('gs://african-seq-data/hgdp_tgp/pca_preoutlier/global_loadings.ht')
project_individuals(
    loadings,
    mt_rel,
    reg_name='global',
    out_prefix='gs://african-seq-data/hgdp_tgp/pca_preoutlier/',
    overwrite=False,
)

2021-07-27 16:10:42 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2021-07-27 16:11:02 Hail: INFO: Coerced sorted dataset
2021-07-27 16:11:04 Hail: INFO: merging 16 files totalling 60.7K...
2021-07-27 16:11:04 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_preoutlier/global_projected_scores.txt.bgz
  merge time: 211.694ms


# *subcontinental pca* 

In [None]:
# list of the genetic regions in the dataset - taken from the unrelated dataset since it had more samples
# dict.fromkeys() drops repeated values while keeping the order of first appearance
region_per_sample = mt_unrel['hgdp_tgp_meta']['Genetic']['region'].collect()
regions = list(dict.fromkeys(region_per_sample)) # 7 regions - ['EUR', 'AFR', 'AMR', 'EAS', 'CSA', 'OCE', 'MID']

In [11]:
# set argument values 
subcont_pca_prefix = 'gs://african-seq-data/hgdp_tgp/pca_preoutlier/subcont_pca/subcont_pca_' # path for outputs 
overwrite = False 

In [None]:
# Run 'run_pca' once per genetic region (~27min to run).
# NB: the notebook freezes after printing the log for AMR. Don't restart it - just let it
# run and follow the progress through the SparkUI. Even after all the outputs are produced
# and the run is complete, the cell will still look like it is running (* in the left
# square bracket). Check that the run is complete via the output files in the Google cloud
# bucket or the SparkUI; then exit the current nb, open a new session, and proceed to the
# next step.
region_field = mt_unrel['hgdp_tgp_meta']['Genetic']['region']
for region in regions:
    subcont_unrel = mt_unrel.filter_cols(region_field == region)  # unrelateds of this region only
    run_pca(subcont_unrel, region, subcont_pca_prefix, overwrite)

2021-07-27 16:30:27 Hail: INFO: hwe_normalized_pca: running PCA using 245567 variants.
2021-07-27 16:30:44 Hail: INFO: pca: running PCA with 20 components...
2021-07-27 16:34:59 Hail: INFO: Coerced sorted dataset
2021-07-27 16:35:00 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-27 16:35:01 Hail: INFO: merging 16 files totalling 55.6K...
2021-07-27 16:35:01 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_preoutlier/subcont_pca/subcont_pca_EUR_scores.txt.bgz
  merge time: 202.187ms
2021-07-27 16:35:40 Hail: INFO: wrote table with 245567 rows in 4916 partitions to gs://african-seq-data/hgdp_tgp/pca_preoutlier/subcont_pca/subcont_pca_EUR_loadings.ht
    Total size: 44.42 MiB
    * Rows: 44.42 MiB
    * Globals: 11.00 B
    * Smallest partition: 1 rows (261.00 B)
    * Largest partition:  354 rows (64.04 KiB)
2021-07-27 16:35:48 Hail: INFO: hwe_normalized_pca: running PCA using 247516 variants.
2021-07-27 16:35:56 Hail: INFO: pca: running PCA with 20 c

In [13]:
# run the 'project_individuals' function for each region (~2min to run)
for region in regions:
    # PCA loadings that 'run_pca' saved for this region
    loadings = hl.read_table(subcont_pca_prefix + region + '_loadings.ht')
    # relateds of this region only
    subcont_rel = mt_rel.filter_cols(mt_rel['hgdp_tgp_meta']['Genetic']['region'] == region)
    project_individuals(loadings, subcont_rel, region, subcont_pca_prefix, overwrite)

2021-07-27 17:14:52 Hail: INFO: Coerced sorted dataset
2021-07-27 17:14:52 Hail: INFO: merging 16 files totalling 10.3K...
2021-07-27 17:14:52 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_preoutlier/subcont_pca/subcont_pca_EUR_projected_scores.txt.bgz
  merge time: 246.192ms
2021-07-27 17:15:07 Hail: INFO: Coerced sorted dataset
2021-07-27 17:15:08 Hail: INFO: merging 16 files totalling 20.7K...
2021-07-27 17:15:08 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_preoutlier/subcont_pca/subcont_pca_AFR_projected_scores.txt.bgz
  merge time: 242.575ms
2021-07-27 17:15:24 Hail: INFO: Coerced sorted dataset
2021-07-27 17:15:25 Hail: INFO: merging 16 files totalling 14.2K...
2021-07-27 17:15:25 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_preoutlier/subcont_pca/subcont_pca_AMR_projected_scores.txt.bgz
  merge time: 243.723ms
2021-07-27 17:15:39 Hail: INFO: Coerced sorted dataset
2021-07-27 17:15:40 Hail: INFO: merging 16 files total

# *outlier removal* 
#### After plotting the PCs, 22 outliers that need to be removed were identified (the table below will be completed for the final report)


| s | Genetic region | Population | Note |
| --- | --- | --- | -- |
| NA20314 | AFR | ASW | Clusters with AMR in global PCA | 
| NA20299 | - | - | - |
| NA20274 | - | - | - |
| HG01880 | - | - | - |
| HG01881 | - | - | - |
| HG01628 | - | - | - |
| HG01629 | - | - | - |
| HG01630 | - | - | - |
| HG01694 | - | - | - |
| HG01696 | - | - | - |
| HGDP00013 | - | - | - |
| HGDP00150 | - | - | - |
| HGDP00029 | - | - | - |
| HGDP01298 | - | - | - |
| HGDP00130 | CSA | Makrani | Closer to AFR than most CSA |
| HGDP01303 | - | - | - |
| HGDP01300 | - | - | - |
| HGDP00621 | MID | Bedouin | Closer to AFR than most MID |
| HGDP01270 | MID | Mozabite | Closer to AFR than most MID |
| HGDP01271 | MID | Mozabite | Closer to AFR than most MID |
| HGDP00057 | - | - | - | 
| LP6005443-DNA_B02 | - | - | - |


















	


In [2]:
# read back in the unrelated and related mts to remove outliers and run pca 
mt_unrel_unfiltered = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/unrel_updated.mt') # unrelated mt
mt_rel_unfiltered = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/rel_updated.mt') # related mt  

Initializing Hail with default parameters...
Running on Apache Spark version 3.1.1
SparkUI available at http://mty-m.c.neurogap-analysis.internal:35449
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.74-0c3a74d12093
LOGGING: writing to /home/hail/hail-20210803-1848-0.2.74-0c3a74d12093.log


In [3]:
# read the outliers file into a list - one sample ID per line
# strip() removes all surrounding whitespace (not just '\n', so also '\r' and trailing
# spaces) and blank lines are skipped; otherwise stray formatting in the file would
# produce IDs that silently never match a sample
with hl.utils.hadoop_open('gs://african-seq-data/hgdp_tgp/pca_outliers_v2.txt') as file:
    outliers = [line.strip() for line in file if line.strip()]

# capture and broadcast the list as an expression
outliers_list = hl.literal(outliers)

In [4]:
# drop the 22 outliers: keep only the samples whose ID ('s') is not in the outliers list
unrel_is_outlier = outliers_list.contains(mt_unrel_unfiltered['s'])
rel_is_outlier = outliers_list.contains(mt_rel_unfiltered['s'])
mt_unrel = mt_unrel_unfiltered.filter_cols(~unrel_is_outlier)
mt_rel = mt_rel_unfiltered.filter_cols(~rel_is_outlier)

In [5]:
# sanity check
# each table is counted once with count_cols() (samples only) and the numbers are reused,
# instead of calling count() - which also counts every variant - eight times
n_unrel_before = mt_unrel_unfiltered.count_cols()
n_unrel_after = mt_unrel.count_cols()
n_rel_before = mt_rel_unfiltered.count_cols()
n_rel_after = mt_rel.count_cols()

print('Unrelated: Before filtering ' + str(n_unrel_before) + ' | After filtering ' + str(n_unrel_after))
print('Related: Before filtering: ' + str(n_rel_before) + ' | After filtering ' + str(n_rel_after))

num_outliers = (n_unrel_before - n_unrel_after) + (n_rel_before - n_rel_after)
print('Total samples removed = ' + str(num_outliers))

Unrelated: Before filtering 3399 | After filtering 3380
Related: Before filtering: 720 | After filtering 717
Total samples removed = 22


# rerun PCA
### - The following steps are similar to the ones prior to removing the outliers except now we are using the updated unrelated & related dataset and a new GCS bucket path to save the outputs 

# *global pca*

In [10]:
# global pca on the outlier-free unrelated samples
# make sure the code block defining 'run_pca' (located above) is run prior to running this
run_pca(
    mt_unrel,
    reg_name='global',
    out_prefix='gs://african-seq-data/hgdp_tgp/pca_postoutlier/',
    overwrite=False,
)

2021-08-03 17:45:09 Hail: INFO: hwe_normalized_pca: running PCA using 248634 variants.
2021-08-03 17:45:22 Hail: INFO: pca: running PCA with 20 components...
2021-08-03 17:47:33 Hail: INFO: Coerced sorted dataset
2021-08-03 17:47:34 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-08-03 17:47:37 Hail: INFO: merging 16 files totalling 270.3K...
2021-08-03 17:47:37 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_postoutlier/global_scores.txt.bgz
  merge time: 621.536ms
2021-08-03 17:48:11 Hail: INFO: wrote table with 248634 rows in 4916 partitions to gs://african-seq-data/hgdp_tgp/pca_postoutlier/global_loadings.ht
    Total size: 45.21 MiB
    * Rows: 45.21 MiB
    * Globals: 11.00 B
    * Smallest partition: 1 rows (261.00 B)
    * Largest partition:  357 rows (65.13 KiB)


In [11]:
# global pca: project the outlier-free related samples with the 'project_individuals' function
# make sure the code block defining it (located above) is run prior to running this
# first read in the PCA loadings that were obtained from the 'run_pca' function
loadings = hl.read_table('gs://african-seq-data/hgdp_tgp/pca_postoutlier/global_loadings.ht')
project_individuals(
    loadings,
    mt_rel,
    reg_name='global',
    out_prefix='gs://african-seq-data/hgdp_tgp/pca_postoutlier/',
    overwrite=False,
)

2021-08-03 17:50:20 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2021-08-03 17:51:23 Hail: INFO: Coerced sorted dataset
2021-08-03 17:51:25 Hail: INFO: merging 16 files totalling 60.3K...
2021-08-03 17:51:25 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_postoutlier/global_projected_scores.txt.bgz
  merge time: 205.077ms


# *subcontinental pca* 

In [6]:
# list of the genetic regions in the dataset - taken from the unrelated dataset since it had more samples
# dict.fromkeys() drops repeated values while keeping the order of first appearance
region_per_sample = mt_unrel['hgdp_tgp_meta']['Genetic']['region'].collect()
regions = list(dict.fromkeys(region_per_sample)) # 7 regions - ['EUR', 'AFR', 'AMR', 'EAS', 'CSA', 'OCE', 'MID']

In [7]:
# set argument values 
subcont_pca_prefix = 'gs://african-seq-data/hgdp_tgp/pca_postoutlier/subcont_pca/subcont_pca_' # path for outputs 
overwrite = False 

In [None]:
# Run the 'run_pca' function (located above) once per genetic region - took roughly 25-27 min.
# The notebook became slow and got stuck - don't restart it, just let it run and follow the
# progress through the SparkUI. After checking the desired outputs are generated (GCS bucket)
# and the run is done (SparkUI), exit the current nb, open a new session, and proceed to the
# next step.
region_field = mt_unrel['hgdp_tgp_meta']['Genetic']['region']
for region in regions:
    subcont_unrel = mt_unrel.filter_cols(region_field == region)  # unrelateds of this region only
    run_pca(subcont_unrel, region, subcont_pca_prefix, overwrite)

In [10]:
# run the 'project_individuals' function (located above) for each region - took ~3min
for region in regions:
    # PCA loadings that 'run_pca' saved for this region
    loadings = hl.read_table(subcont_pca_prefix + region + '_loadings.ht')
    # relateds of this region only
    subcont_rel = mt_rel.filter_cols(mt_rel['hgdp_tgp_meta']['Genetic']['region'] == region)
    project_individuals(loadings, subcont_rel, region, subcont_pca_prefix, overwrite)

2021-08-03 18:49:41 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2021-08-03 18:50:38 Hail: INFO: Coerced sorted dataset
2021-08-03 18:50:39 Hail: INFO: merging 16 files totalling 10.2K...
2021-08-03 18:50:40 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_postoutlier/subcont_pca/subcont_pca_EUR_projected_scores.txt.bgz
  merge time: 439.201ms
2021-08-03 18:51:13 Hail: INFO: Coerced sorted dataset
2021-08-03 18:51:14 Hail: INFO: merging 16 files totalling 20.8K...
2021-08-03 18:51:14 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_postoutlier/subcont_pca/subcont_pca_AFR_projected_scores.txt.bgz
  merge time: 205.399ms
2021-08-03 18:51:45 Hail: INFO: Coerced sorted dataset
2021-08-03 18:51:46 Hail: INFO: merging 16 files totalling 14.2K...
2021-08-03 18:51:46 Hail: INFO: while writing:
    gs://african-seq-data/hgdp_tgp/pca_postoutlier/subcont_pca/

# FST
### For FST, we are using the data we had prior to running pc_relate (*filtered_n_pruned_output_updated.mt*)

In [124]:
# read filtered and pruned mt (prior to pc_relate) back in for FST analysis
mt_var_pru_filt = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/filtered_n_pruned_output_updated.mt')

# num of samples before outlier removal
# count_cols() counts samples only, without also counting the variants as count() would
print('Before filtering: ' + str(mt_var_pru_filt.count_cols()))

Before filtering: 4119


In [125]:
# read the outliers file into a list - one sample ID per line
# strip() removes all surrounding whitespace (not just '\n', so also '\r' and trailing
# spaces) and blank lines are skipped; otherwise stray formatting in the file would
# produce IDs that silently never match a sample
with hl.utils.hadoop_open('gs://african-seq-data/hgdp_tgp/pca_outliers_v2.txt') as file:
    outliers = [line.strip() for line in file if line.strip()]

# capture and broadcast the list as an expression
outliers_list = hl.literal(outliers)

In [126]:
# drop the 22 outliers: keep only the samples whose ID ('s') is not in the outliers list
is_outlier = outliers_list.contains(mt_var_pru_filt['s'])
mt_var_pru_filt = mt_var_pru_filt.filter_cols(~is_outlier)

In [127]:
# sanity check
# count_cols() counts samples only, without also counting the variants as count() would
print('After filtering: ' + str(mt_var_pru_filt.count_cols()))

After filtering: 4097


## *pair-wise comparison*

Number of pair-wise comparisons among $k$ populations: $\binom{k}{2} = \frac{k(k-1)}{2}$

In our case $k = 78$ populations, so we expect $\frac{78 \times (78-1)}{2} = \frac{6006}{2} = 3003$ pair-wise comparisons

In [128]:
# population label of every sample, reduced to the distinct labels
# (dict.fromkeys() keeps the order of first appearance)
pop_per_sample = mt_var_pru_filt['hgdp_tgp_meta']['Population'].collect()
pop = list(dict.fromkeys(pop_per_sample))
len(pop) # 78 populations in total

78

In [129]:
# small worked example of the pair-wise comparison
ex = ['a','b','c']
# pair every element with each element that comes after it in the list
ex_pair_com = [
    [ex[first], ex[second]]
    for first in range(len(ex))
    for second in range(first + 1, len(ex))
]
ex_pair_com

[['a', 'b'], ['a', 'c'], ['b', 'c']]

In [130]:
# pair-wise comparison - creating list of lists
# every population in the 'pop' list (ex. 0 CEU, 1 YRI, 2 LWK ...) is paired with each
# population that comes after it in the list, so each pair of two different populations
# appears exactly once
# ex. for populations CEU and YRI, only CEU-YRI is produced - never YRI-CEU, CEU-CEU or YRI-YRI
pair_com = [
    [pop[first], pop[second]]
    for first in range(len(pop))
    for second in range(first + 1, len(pop))
]

In [131]:
# first 5 elements in the list  
pair_com[0:5]

[['CEU', 'YRI'],
 ['CEU', 'LWK'],
 ['CEU', 'ESN'],
 ['CEU', 'TSI'],
 ['CEU', 'CLM']]

In [132]:
# sanity check 
len(pair_com)

3003

## *subset mt into popns according to the pair-wise comparisons and run common variant statistics*

In [133]:
pair_com[0]

['CEU', 'YRI']

In [134]:
## example - pair_com[0] = ['CEU', 'YRI'], so first_pop = 'CEU' and second_pop = 'YRI'
first_pop, second_pop = pair_com[0]
population = mt_var_pru_filt['hgdp_tgp_meta']['Population']

# one mt per population, plus one holding the samples of both populations
CEU_mt = mt_var_pru_filt.filter_cols(population == first_pop)
YRI_mt = mt_var_pru_filt.filter_cols(population == second_pop)
CEU_YRI_mt = mt_var_pru_filt.filter_cols((population == first_pop) | (population == second_pop))

In [135]:
# sanity check - the combined mt must hold exactly the samples of the two populations
# count_cols() counts samples only, without also counting the variants as count() would
CEU_mt.count_cols() + YRI_mt.count_cols() == CEU_YRI_mt.count_cols() # 175 + 170 = 345

True

In [136]:
# run common variant statistics for each population and their combined mt 
CEU_var = hl.variant_qc(CEU_mt) # individual 
YRI_var = hl.variant_qc(YRI_mt) # individual
CEU_YRI_var = hl.variant_qc(CEU_YRI_mt) # total 

### *Set up mt table for FST calculation - the next code is run for each population and their combos*

##### *population 1*

In [137]:
# drop certain fields first to make mt smaller

# keep:
#   - no entry fields
#   - only the column key ('s') from the column fields
#   - only the row keys ('locus' and 'alleles') and the row field 'variant_qc'
# fields are selected by name instead of being dropped by position, so the result does
# not depend on 's' being the first column field or 'variant_qc' being the last row field
CEU_interm = CEU_var.select_entries().select_cols().select_rows('variant_qc')

# only select the row field keys (locus and allele) and row fields 'AF' & 'AN' which are under 'variant_qc'
CEU_interm2 = CEU_interm.select_rows(CEU_interm['variant_qc']['AF'], CEU_interm['variant_qc']['AN'])

# quick look at the condensed mt
CEU_interm2.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'AF': array<float64>
    'AN': int32
----------------------------------------
Entry fields:
    None
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


In [138]:
CEU_interm2.rows().show(5)

locus,alleles,AF,AN
locus<GRCh38>,array<str>,array<float64>,int32
chr1:16487,"[""T"",""C""]","[8.43e-01,1.57e-01]",350
chr1:133160,"[""G"",""A""]","[8.89e-01,1.11e-01]",352
chr1:138593,"[""G"",""T""]","[8.98e-01,1.02e-01]",352
chr1:736852,"[""C"",""T""]","[9.20e-01,7.95e-02]",352
chr1:771265,"[""A"",""C""]","[1.00e+00,0.00e+00]",352


In [139]:
# 'AF' holds one frequency per allele; keep only the element at index 1 (the second allele listed in 'alleles')
CEU_interm3 = CEU_interm2.transmute_rows(AF=CEU_interm2['AF'][1])

# previous approach, kept for reference:
# re-key the rows by 'locus' only so that 'alleles' could be split into one row field per allele,
# while also reducing 'AF' to its second element
#CEU_interm3 = CEU_interm2.key_rows_by('locus')
#CEU_interm3 = CEU_interm3.transmute_rows(AF = CEU_interm3.AF[1], A1 = CEU_interm3.alleles[0], A2 = CEU_interm3.alleles[1])

# label every row with the population name so we can tell which mt it came from
CEU_final = CEU_interm3.annotate_rows(pop=pair_com[0][0])
CEU_final.rows().show(5)

locus,alleles,AF,AN,pop
locus<GRCh38>,array<str>,float64,int32,str
chr1:16487,"[""T"",""C""]",0.157,350,"""CEU"""
chr1:133160,"[""G"",""A""]",0.111,352,"""CEU"""
chr1:138593,"[""G"",""T""]",0.102,352,"""CEU"""
chr1:736852,"[""C"",""T""]",0.0795,352,"""CEU"""
chr1:771265,"[""A"",""C""]",0.0,352,"""CEU"""


##### *population 2*

In [140]:
# Shrink the combined CEU+YRI matrix table the same way as the single-population one, dropping:
#   - every entry field
#   - every column field after the first one (the first is expected to be the key 's')
#   - every row field except the first two (expected to be the keys 'locus' and 'alleles')
#     and the last one (expected to be 'variant_qc')
CEU_YRI_interm = CEU_YRI_var.drop(
    *CEU_YRI_var.entry,
    *list(CEU_YRI_var.col)[1:],
    *list(CEU_YRI_var.row)[2:-1],
)

# lift 'AF' and 'AN' out of the 'variant_qc' struct so they become the only non-key row fields
CEU_YRI_interm2 = CEU_YRI_interm.select_rows(CEU_YRI_interm.variant_qc.AF, CEU_YRI_interm.variant_qc.AN)

# print the schema of the condensed mt
CEU_YRI_interm2.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'AF': array<float64>
    'AN': int32
----------------------------------------
Entry fields:
    None
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


In [141]:
# preview the first 5 rows of the condensed row table (row keys plus 'AF' and 'AN')
CEU_YRI_interm2.rows().show(5)

locus,alleles,AF,AN
locus<GRCh38>,array<str>,array<float64>,int32
chr1:16487,"[""T"",""C""]","[9.19e-01,8.14e-02]",700
chr1:133160,"[""G"",""A""]","[8.54e-01,1.46e-01]",700
chr1:138593,"[""G"",""T""]","[9.22e-01,7.83e-02]",702
chr1:736852,"[""C"",""T""]","[9.47e-01,5.30e-02]",698
chr1:771265,"[""A"",""C""]","[8.66e-01,1.34e-01]",702


In [142]:
# 'AF' holds one frequency per allele; keep only the element at index 1 (the second allele listed in 'alleles')
CEU_YRI_interm3 = CEU_YRI_interm2.transmute_rows(AF=CEU_YRI_interm2['AF'][1])

# previous approach, kept for reference:
# re-key the rows by 'locus' only so that 'alleles' could be split into one row field per allele,
# while also reducing 'AF' to its second element
#CEU_YRI_interm3 = CEU_YRI_interm2.key_rows_by('locus')
#CEU_YRI_interm3 = CEU_YRI_interm3.transmute_rows(AF = CEU_YRI_interm3.AF[1], A1 = CEU_YRI_interm3.alleles[0], A2 = CEU_YRI_interm3.alleles[1])

# label every row with the "<pop1>-<pop2>" pair name so we can tell which mt it came from
CEU_YRI_final = CEU_YRI_interm3.annotate_rows(pop='{}-{}'.format(pair_com[0][0], pair_com[0][1]))
CEU_YRI_final.rows().show(5)

locus,alleles,AF,AN,pop
locus<GRCh38>,array<str>,float64,int32,str
chr1:16487,"[""T"",""C""]",0.0814,700,"""CEU-YRI"""
chr1:133160,"[""G"",""A""]",0.146,700,"""CEU-YRI"""
chr1:138593,"[""G"",""T""]",0.0783,702,"""CEU-YRI"""
chr1:736852,"[""C"",""T""]",0.053,698,"""CEU-YRI"""
chr1:771265,"[""A"",""C""]",0.134,702,"""CEU-YRI"""


### *FST formula pre-setup* - trial run

#### *Variables needed for FST calculation* 

In [285]:
# collect the per-variant values into numpy arrays because they are easier to work with and more readable than lists

# the two tables that play the roles of "population 1" and "population 2" in the formula
pop1, pop2 = CEU_final, CEU_YRI_final

# number of alleles (AN) per variant
n1, n2 = (np.array(pop.AN.collect()) for pop in (pop1, pop2))

# allele frequency (AF) per variant
FREQpop1, FREQpop2 = (np.array(pop.AF.collect()) for pop in (pop1, pop2))

#### *Weighted average allele frequency*

In [286]:
# per-variant average of the two allele frequencies, weighted by each table's allele count
FREQ = (n1 * FREQpop1 + n2 * FREQpop2) / (n1 + n2)

# sanity checks
# 1) recomputing the first element by hand must give the same value as the vectorised result
first_by_hand = (n1[0] * FREQpop1[0] + n2[0] * FREQpop2[0]) / (n1[0] + n2[0])
print(first_by_hand == FREQ[0])
# 2) the output must have as many elements as the arrays we started with
print(len(FREQ) == len(FREQpop1))

True
True


#### *Filter to only freqs between 0 and 1*

In [287]:
# boolean mask: True where the average frequency is strictly between 0 and 1 - started with FREQ = 248634
INCLUDE = (FREQ > 0) & (FREQ < 1)
print(np.count_nonzero(INCLUDE))  # 246984 values passed; 248634 - 246984 = 1650 did not

# apply the mask to the allele frequencies
FREQpop1, FREQpop2, FREQ = FREQpop1[INCLUDE], FREQpop2[INCLUDE], FREQ[INCLUDE]

# sanity check: the subset must be as long as the number of True values in the mask
print(len(FREQpop1) == np.count_nonzero(INCLUDE))  # TRUE

# apply the mask to the allele counts
n1, n2 = n1[INCLUDE], n2[INCLUDE]

# sanity check: same length requirement for the allele counts
print(len(n1) == np.count_nonzero(INCLUDE))  # TRUE

246984
True
True


#### *FST Estimate -  W&C ESTIMATOR*

In [290]:
# Vectorised trial run of the FST estimate (labelled "W&C ESTIMATOR" in the section header above).
# Inputs are the already-filtered arrays n1, n2, FREQpop1, FREQpop2 and FREQ.

# s is the number of populations - since we are calculating pair-wise FSTs, this is always 2.
# It is defined here so that this cell does not depend on a later cell having been run first.
s = 2

## average sample size that incorporates variance
# the 1/(s-1) factor applies to the whole bracket; for s = 2 the factor is 1, so the value is unchanged
nc = (1/(s-1)) * ((n1+n2) - ((np.square(n1) + np.square(n2))/(n1+n2)))

# mean square among populations
msa= (1/(s-1))*((n1*(np.square(FREQpop1-FREQ)))+(n2*(np.square(FREQpop2-FREQ))))

# mean square within populations
msw =  (1/((n1-1)+(n2-1))) * ((n1*(FREQpop1*(1-FREQpop1))) + (n2*(FREQpop2*(1-FREQpop2))))

numer = msa-msw

denom = msa + ((nc-1)*msw)

# one FST value per retained variant
FST_val = numer/denom

# sanity check using the first element: the same formula applied to scalars must match the vectorised result
nc_0 = (1/(s-1)) * ((n1[0]+n2[0]) - ((np.square(n1[0]) + np.square(n2[0]))/(n1[0]+n2[0])))

msa_0= (1/(s-1))*((n1[0]*(np.square(FREQpop1[0]-FREQ[0])))+(n2[0]*(np.square(FREQpop2[0]-FREQ[0]))))

msw_0 =  (1/((n1[0]-1)+(n2[0]-1))) * ((n1[0]*(FREQpop1[0]*(1-FREQpop1[0]))) + (n2[0]*(FREQpop2[0]*(1-FREQpop2[0]))))

numer_0 = msa_0-msw_0

denom_0 = msa_0 + ((nc_0-1)*msw_0)

FST_0 = numer_0/denom_0

print(FST_0 == FST_val[0]) # TRUE

True


In [291]:
# display the array of per-variant FST values from the trial run
FST_val

array([ 0.02750891,  0.00311057,  0.00149225, ..., -0.00213329,
       -0.00203709, -0.00171703])

## *Which FST value is for which locus-allele?* - actual run

In [293]:
# start again from the unfiltered tables for the actual FST run

# the two tables that play the roles of "population 1" and "population 2" in the formula
pop1, pop2 = CEU_final, CEU_YRI_final

# number of alleles (AN) per variant
n1, n2 = (np.array(pop.AN.collect()) for pop in (pop1, pop2))

# allele frequency (AF) per variant
FREQpop1, FREQpop2 = (np.array(pop.AF.collect()) for pop in (pop1, pop2))

# row keys rendered as strings, e.g. 'chr1:16487 ["T","C"]', so each FST value can be traced back to its variant
# (taken from pop1 only - the keys are expected to be the same across all populations)
locus = np.array(hl.str(pop1.locus).collect())
alleles = np.array(hl.str(pop1.alleles).collect())
key = np.array([f'{locus_str} {alleles_str}' for locus_str, alleles_str in zip(locus, alleles)])

In [294]:
s = 2   # number of populations - always 2 because we calculate pair-wise FSTs
key_FST = {}  # maps 'locus alleles' key -> FST value
for i in range(len(key)):
    # average of the two allele frequencies, weighted by allele count
    FREQ = ((n1[i]*FREQpop1[i]) + (n2[i]*FREQpop2[i])) / (n1[i]+n2[i])

    # skip variants whose average frequency is not strictly between 0 and 1
    if not ((FREQ > 0) & (FREQ < 1)):
        continue

    ## average sample size that incorporates variance
    nc = ((1/(s-1)) * (n1[i]+n2[i])) - ((np.square(n1[i]) + np.square(n2[i]))/(n1[i]+n2[i]))

    # mean square among populations
    msa = (1/(s-1))*((n1[i]*(np.square(FREQpop1[i]-FREQ)))+(n2[i]*(np.square(FREQpop2[i]-FREQ))))

    # mean square within populations
    msw = (1/((n1[i]-1)+(n2[i]-1))) * ((n1[i]*(FREQpop1[i]*(1-FREQpop1[i]))) + (n2[i]*(FREQpop2[i]*(1-FREQpop2[i]))))

    numer = msa-msw
    denom = msa + ((nc-1)*msw)
    FST = numer/denom

    key_FST[key[i]] = FST

In [331]:
# display the 'locus alleles' -> FST mapping built by the loop above
key_FST

{'chr1:16487 ["T","C"]': 0.02750890602075159,
 'chr1:133160 ["G","A"]': 0.003110568631892113,
 'chr1:138593 ["G","T"]': 0.0014922480633877278,
 'chr1:736852 ["C","T"]': 0.0039158145324851654,
 'chr1:771265 ["A","C"]': 0.10210109418003628,
 'chr1:797421 ["A","G"]': 0.06245896580362819,
 'chr1:800302 ["G","C"]': 0.00937832955004036,
 'chr1:807641 ["T","C"]': 0.06553706304863907,
 'chr1:817186 ["G","A"]': 0.13830272811268304,
 'chr1:817416 ["C","T"]': 0.00950978566989048,
 'chr1:826950 ["G","T"]': 0.0503733032652502,
 'chr1:841166 ["A","G"]': 0.06400455020158805,
 'chr1:841852 ["C","T"]': 0.015736655613342882,
 'chr1:859236 ["T","C"]': 0.10210109418003628,
 'chr1:861347 ["G","T"]': 0.030561140126427164,
 'chr1:890030 ["G","A"]': 0.05004725217012239,
 'chr1:893503 ["G","A"]': -0.0021285987364131295,
 'chr1:908025 ["A","G"]': -0.0021047665547462274,
 'chr1:910958 ["A","G"]': 0.017644994411523802,
 'chr1:919397 ["A","G"]': 0.0004297609524191273,
 'chr1:928119 ["T","C"]': 0.021022915546556022

In [295]:
# sanity checks: the keyed loop must reproduce the vectorised trial run (FST_val)
fst_from_loop = np.array(list(key_FST.values()))
print(all(fst_from_loop == FST_val))  # True - same values in the same order
print(len(key_FST) == len(FST_val))  # True - same number of variants

True
True


## *other pair*

### population 3

In [296]:
# population - YRI
# repeat the CEU preparation steps

# drop all entry fields, all column fields after the first, and all row fields between the first two and the last
YRI_interm = YRI_var.drop(
    *YRI_var.entry,
    *list(YRI_var.col)[1:],
    *list(YRI_var.row)[2:-1],
)

# lift 'AF' and 'AN' out of the 'variant_qc' struct so they become the only non-key row fields
YRI_interm2 = YRI_interm.select_rows(YRI_interm.variant_qc.AF, YRI_interm.variant_qc.AN)

# keep only the element at index 1 of the 'AF' array
YRI_interm3 = YRI_interm2.transmute_rows(AF=YRI_interm2['AF'][1])

# label every row with the population name so we can tell which mt it came from
YRI_final = YRI_interm3.annotate_rows(pop=pair_com[0][1])
YRI_final.rows().show(5)

locus,alleles,AF,AN,pop
locus<GRCh38>,array<str>,float64,int32,str
chr1:16487,"[""T"",""C""]",0.00571,350,"""YRI"""
chr1:133160,"[""G"",""A""]",0.181,348,"""YRI"""
chr1:138593,"[""G"",""T""]",0.0543,350,"""YRI"""
chr1:736852,"[""C"",""T""]",0.026,346,"""YRI"""
chr1:771265,"[""A"",""C""]",0.269,350,"""YRI"""


### *FST*

In [297]:
# start again from the unfiltered tables for the YRI vs CEU-YRI FST run

# the two tables that play the roles of "population 1" and "population 2" in the formula
pop1, pop2 = YRI_final, CEU_YRI_final

# number of alleles (AN) per variant
n1, n2 = (np.array(pop.AN.collect()) for pop in (pop1, pop2))

# allele frequency (AF) per variant
FREQpop1, FREQpop2 = (np.array(pop.AF.collect()) for pop in (pop1, pop2))

# row keys rendered as strings, e.g. 'chr1:16487 ["T","C"]', so each FST value can be traced back to its variant
# (taken from pop1 only - the keys are expected to be the same across all populations)
locus = np.array(hl.str(pop1.locus).collect())
alleles = np.array(hl.str(pop1.alleles).collect())
key = np.array([f'{locus_str} {alleles_str}' for locus_str, alleles_str in zip(locus, alleles)])

In [298]:
s = 2   # number of populations - always 2 because we calculate pair-wise FSTs
key_FST_YRI = {}  # maps 'locus alleles' key -> FST value
for i in range(len(key)):
    # average of the two allele frequencies, weighted by allele count
    FREQ = ((n1[i]*FREQpop1[i]) + (n2[i]*FREQpop2[i])) / (n1[i]+n2[i])

    # skip variants whose average frequency is not strictly between 0 and 1
    if not ((FREQ > 0) & (FREQ < 1)):
        continue

    ## average sample size that incorporates variance
    nc = ((1/(s-1)) * (n1[i]+n2[i])) - ((np.square(n1[i]) + np.square(n2[i]))/(n1[i]+n2[i]))

    # mean square among populations
    msa = (1/(s-1))*((n1[i]*(np.square(FREQpop1[i]-FREQ)))+(n2[i]*(np.square(FREQpop2[i]-FREQ))))

    # mean square within populations
    msw = (1/((n1[i]-1)+(n2[i]-1))) * ((n1[i]*(FREQpop1[i]*(1-FREQpop1[i]))) + (n2[i]*(FREQpop2[i]*(1-FREQpop2[i]))))

    numer = msa-msw
    denom = msa + ((nc-1)*msw)
    FST = numer/denom

    key_FST_YRI[key[i]] = FST

In [330]:
# display the 'locus alleles' -> FST mapping for the YRI vs CEU-YRI run
key_FST_YRI

{'chr1:16487 ["T","C"]': 0.05044941867701216,
 'chr1:133160 ["G","A"]': 0.0025453980088959126,
 'chr1:138593 ["G","T"]': 0.002281044043290102,
 'chr1:736852 ["C","T"]': 0.006465139020490581,
 'chr1:771265 ["A","C"]': 0.057726365214853366,
 'chr1:797421 ["A","G"]': 0.033108449717149296,
 'chr1:800302 ["G","C"]': 0.02049231948972641,
 'chr1:807641 ["T","C"]': 0.1157238204620783,
 'chr1:817186 ["G","A"]': 0.1236728005169633,
 'chr1:817416 ["C","T"]': 0.0066634585718547724,
 'chr1:826950 ["G","T"]': 0.027738776634440934,
 'chr1:841166 ["A","G"]': 0.11341414703039004,
 'chr1:841852 ["C","T"]': 0.03248716489041391,
 'chr1:859236 ["T","C"]': 0.057726365214853366,
 'chr1:861347 ["G","T"]': 0.02645791642071565,
 'chr1:890030 ["G","A"]': 0.048552156289163526,
 'chr1:893503 ["G","A"]': -0.0021365791011485036,
 'chr1:908025 ["A","G"]': -0.002112687285265169,
 'chr1:910958 ["A","G"]': 0.015757663121868555,
 'chr1:919397 ["A","G"]': 0.0005281259533773563,
 'chr1:928119 ["T","C"]': 0.0135727418795865

## *three popn pairs*

In [306]:
## example using three sample pairs ['CEU', 'YRI'], ['CEU', 'LWK'], ['CEU', 'ESN'] and setting up the function
example_pairs = pair_com[0:3]

# column expression holding each sample's population label
population = mt_var_pru_filt['hgdp_tgp_meta']['Population']

ex_dict = {} # maps "<pop1>-<pop2>" -> [variant_qc(pop1), variant_qc(pop2), variant_qc(pop1 + pop2)]
for pairs in example_pairs:
    pair_name = "-".join(pairs)
    in_first = population == pairs[0]
    in_second = population == pairs[1]

    # the subsetted datasets: first population, second population, first + second = total population
    l = [
        mt_var_pru_filt.filter_cols(in_first),
        mt_var_pru_filt.filter_cols(in_second),
        mt_var_pru_filt.filter_cols(in_first | in_second),
    ]

    # sanity check - the sample count of the first and second subset mts should be equal to the total subset mt
    # count_cols() counts samples only, so the rows are not counted just to be thrown away
    if l[0].count_cols() + l[1].count_cols() == l[2].count_cols():
        # run common variant statistics for each population and their combined mt
        v = [hl.variant_qc(subset_mt) for subset_mt in l]

        # add to dictionary
        ex_dict[pair_name] = v
    else:
        # say so instead of silently leaving the pair out of the dictionary
        print(f'skipping {pair_name}: sample counts of the two populations do not add up to the combined count')

In [307]:
# three mt subsets per comparison pair - set up as a dictionary 
# each value is the list [first population, second population, both populations] after hl.variant_qc
ex_dict

{'CEU-YRI': [<hail.matrixtable.MatrixTable at 0x7fa0746468e0>,
  <hail.matrixtable.MatrixTable at 0x7fa0b47b8730>,
  <hail.matrixtable.MatrixTable at 0x7fa0b49bf7c0>],
 'CEU-LWK': [<hail.matrixtable.MatrixTable at 0x7fa0b4a03df0>,
  <hail.matrixtable.MatrixTable at 0x7fa0b4a01ac0>,
  <hail.matrixtable.MatrixTable at 0x7fa09d6ac790>],
 'CEU-ESN': [<hail.matrixtable.MatrixTable at 0x7fa09d094760>,
  <hail.matrixtable.MatrixTable at 0x7fa09ffa7520>,
  <hail.matrixtable.MatrixTable at 0x7fa0745af1f0>]}

In [None]:
# same as CEU_var['variant_qc'].show(5)
# index 0 is the first population of the pair, i.e. CEU for 'CEU-YRI'
ex_dict['CEU-YRI'][0]['variant_qc'].show(5)

In [365]:
# final fst run basic structure - incomplete
from collections import defaultdict


def _wc_fst(n_a, freq_a, n_b, freq_b, keys, s=2):
    """Per-variant FST between two groups, using the same formula as the cells above.

    Parameters
    ----------
    n_a, n_b : numpy arrays with the number of alleles (AN) per variant in each group
    freq_a, freq_b : numpy arrays with the allele frequency (AF) per variant in each group
    keys : numpy array of 'locus alleles' strings, aligned with the arrays above
    s : number of groups being compared (2 for pair-wise FST)

    Returns
    -------
    dict mapping each key to its FST value, restricted to the variants whose
    allele-count-weighted average frequency is strictly between 0 and 1.
    """
    n_sum = n_a + n_b
    freq_avg = ((n_a * freq_a) + (n_b * freq_b)) / n_sum

    # only include ave freq between 0 and 1
    keep = (freq_avg > 0) & (freq_avg < 1)
    n_a, n_b, n_sum = n_a[keep], n_b[keep], n_sum[keep]
    freq_a, freq_b, freq_avg = freq_a[keep], freq_b[keep], freq_avg[keep]

    ## average sample size that incorporates variance
    # the 1/(s-1) factor applies to the whole bracket; for s = 2 the factor is 1
    nc = (1 / (s - 1)) * (n_sum - ((np.square(n_a) + np.square(n_b)) / n_sum))

    # mean square among groups
    msa = (1 / (s - 1)) * ((n_a * np.square(freq_a - freq_avg)) + (n_b * np.square(freq_b - freq_avg)))

    # mean square within groups
    msw = (1 / ((n_a - 1) + (n_b - 1))) * ((n_a * (freq_a * (1 - freq_a))) + (n_b * (freq_b * (1 - freq_b))))

    fst = (msa - msw) / (msa + ((nc - 1) * msw))
    return dict(zip(keys[keep], fst))


final_dic = {} # maps pair name -> {variant key: [FST(pop1 vs total), FST(pop2 vs total)]}
for pair in ex_dict.keys(): # for each population pair
    u = [] # list to hold updated mts
    for pop_mt in ex_dict[pair]: # for each population (each mt): [pop1, pop2, total]
        # drop certain fields and only keep the ones we need
        interm = pop_mt.drop(*pop_mt.entry, *list(pop_mt.col)[1:], *list(pop_mt.row)[2:-1])
        interm2 = interm.select_rows(interm['variant_qc']['AF'], interm['variant_qc']['AN'])
        interm3 = interm2.transmute_rows(AF = interm2.AF[1])
        #final = interm3.annotate_rows(pop = pair) # keep track of which mt it came from
        u.append(interm3) # add updated mt to list

    # variables for FST run

    # assign populations to formula variables
    pop1 = u[0]
    pop2 = u[1]
    total = u[2]

    # number of alleles
    n1 = np.array(pop1.AN.collect())
    n2 = np.array(pop2.AN.collect())
    total_n = np.array(total.AN.collect())

    # allele frequencies
    FREQpop1 = np.array(pop1.AF.collect())
    FREQpop2 = np.array(pop2.AF.collect())
    total_FREQ = np.array(total.AF.collect())

    # locus + alleles = keys - needed for reference purposes during FST calculations
    # (taken from pop1 only - the keys are expected to be the same across all populations)
    locus = np.array(hl.str(pop1.locus).collect())
    alleles = np.array(hl.str(pop1.alleles).collect())
    key = np.array([f'{locus_str} {alleles_str}' for locus_str, alleles_str in zip(locus, alleles)])

    s=2   # s is the number of populations - since we are calculating pair-wise FSTs, this is always 2

    # NOTE(review): each population is compared against the combined (pop1 + pop2) table rather than
    # against the other population - this mirrors the earlier cells; confirm it is the intended comparison

    # FST pop1 and total popn
    key_pop1_total = _wc_fst(n1, FREQpop1, total_n, total_FREQ, key, s)

    # FST pop2 and total popn
    key_pop2_total = _wc_fst(n2, FREQpop2, total_n, total_FREQ, key, s)

    # merge the two FST results together: pop1-vs-total values are appended first, then pop2-vs-total;
    # a variant kept by only one of the two runs ends up with a single-element list
    dd = defaultdict(list)
    for fst_by_key in (key_pop1_total, key_pop2_total):
        # separate loop names so the `key` array above is not overwritten
        for variant_key, fst_value in fst_by_key.items():
            dd[variant_key].append(fst_value)

    final_dic[pair] = dd

-

## junk code 1

In [None]:
# population - YRI
# same steps we did to CEU

# scratch cell: the truncated `.drop(` statement (unclosed parenthesis) that used to sit here was removed
# so that the cell parses; the complete statement below does the same job

# NOTE(review): this compares two MatrixTable objects and discards the result; index 0 of
# ex_dict['CEU-YRI'] is the first population of the pair, which presumably is CEU rather than YRI - verify
YRI_var == ex_dict['CEU-YRI'][0]

YRI_interm = ex_dict['CEU-YRI'][0].drop(*list(ex_dict['CEU-YRI'][0].entry), *list(ex_dict['CEU-YRI'][0].col)[1:], *list(ex_dict['CEU-YRI'][0].row)[2:-1])

# only select the row field keys (locus and allele) and row fields 'AF' & 'AN' which are under 'variant_qc'
YRI_interm2 = YRI_interm.select_rows(YRI_interm['variant_qc']['AF'], YRI_interm['variant_qc']['AN'])  

# only include the second entry of the array from the row field 'AF' 
YRI_interm3 = YRI_interm2.transmute_rows(AF = YRI_interm2.AF[1])

# add a row field with population name to keep track of which mt it came from 
# NOTE(review): `pairs` is whatever the last loop over population pairs left behind
YRI_final = YRI_interm3.annotate_rows(pop = pairs[0])
YRI_final.rows().show(5)

In [318]:
# number of matrix tables stored for the pair (first population, second population, combined)
len(ex_dict['CEU-YRI'])

3

In [332]:
# toy example of the "pair name -> list of three items" layout: each index is prefixed to the pair name
a = ['CEU-YRI','CEU-LWK', 'CEU-ESN']
b = [0,1,2]
dc = {pair_name: [str(position) + pair_name for position in b] for pair_name in a}
        

In [350]:
# print every index of `v` except the last one
for i in range(len(v)-1):
    print(i)

0
1


In [None]:
# scratch version of the "merge two FST dictionaries" step
from collections import defaultdict

# would hold key -> list of FST values; it stays empty because the merge lines below are commented out
dd = defaultdict(list)

for d in (key_FST, key_FST_YRI):
    # prints each dictionary in full
    print(d)
    #for key, value in d.items():
        #dd[key].append(value)

In [362]:
# index range over the matrix tables stored for `pair` (the value left behind by the last loop over ex_dict)
range(len(ex_dict[pair]))

range(0, 3)

In [369]:
# convert to a table 
import pandas as pd

# final_dic maps pair name -> {variant key: list of FST values}, so each pair name becomes one column
df = pd.DataFrame(final_dic) 


In [377]:
# number of variant keys that received an FST value for this pair
len(final_dic['CEU-YRI']) # 246984

246984

In [378]:
## example - pair_com[1] = ['CEU', 'LWK'] and pair_com[1][0] = 'CEU'
CEU_mt = mt_var_pru_filt.filter_cols(mt_var_pru_filt['hgdp_tgp_meta']['Population'] == pair_com[1][0])
LWK_mt = mt_var_pru_filt.filter_cols(mt_var_pru_filt['hgdp_tgp_meta']['Population'] == pair_com[1][1])
CEU_LWK_mt = mt_var_pru_filt.filter_cols((mt_var_pru_filt['hgdp_tgp_meta']['Population'] == pair_com[1][0]) | (mt_var_pru_filt['hgdp_tgp_meta']['Population'] == pair_com[1][1]))

# run common variant statistics for each population and their combined mt 
CEU_var = hl.variant_qc(CEU_mt) # individual 
LWK_var = hl.variant_qc(LWK_mt) # individual
# the combined statistics must come from the CEU+LWK subset built above, not from the CEU+YRI one
CEU_LWK_var = hl.variant_qc(CEU_LWK_mt) # total 

In [381]:
# (number of variants, number of samples) in the LWK subset
LWK_var.count()

(248634, 97)

In [374]:
# population - YRI
# repeat the CEU preparation steps

# drop all entry fields, all column fields after the first, and all row fields between the first two and the last
YRI_interm = YRI_var.drop(
    *YRI_var.entry,
    *list(YRI_var.col)[1:],
    *list(YRI_var.row)[2:-1],
)

# lift 'AF' and 'AN' out of the 'variant_qc' struct so they become the only non-key row fields
YRI_interm2 = YRI_interm.select_rows(YRI_interm.variant_qc.AF, YRI_interm.variant_qc.AN)

# keep only the element at index 1 of the 'AF' array
YRI_interm3 = YRI_interm2.transmute_rows(AF=YRI_interm2['AF'][1])

# label every row with the population name so we can tell which mt it came from
YRI_final = YRI_interm3.annotate_rows(pop=pair_com[0][1])
YRI_final.rows().show(5)

dict_keys(['CEU-YRI', 'CEU-LWK', 'CEU-ESN'])

In [310]:
for pair in ex_dict.keys(): # for each population pair 
    # the list for this pair is looked up with `pair`; the inner loop variable `i` is not bound yet at this point
    for i in range(len(ex_dict[pair])): # for each population 

        # drop all entry fields, all column fields after the first, and all row fields between the first two and the last
        interm = ex_dict[pair][i].drop(*list(ex_dict[pair][i].entry), *list(ex_dict[pair][i].col)[1:], *list(ex_dict[pair][i].row)[2:-1])

        # only select the row field keys (locus and allele) and row fields 'AF' & 'AN' which are under 'variant_qc'
        interm2 = interm.select_rows(interm['variant_qc']['AF'], interm['variant_qc']['AN'])  

        # only include the second entry of the array from the row field 'AF' 
        interm3 = interm2.transmute_rows(AF = interm2.AF[1])

        # add a row field with population name to keep track of which mt it came from 
        final = interm3.annotate_rows(pop = pair)
        final.rows().show(5)
    

CEU-YRI
CEU-LWK
CEU-ESN


In [None]:
%%time
# actual function/run using all population pairs
dict = {} # empty dictionary to hold final outputs 
for pairs in pair_com:
    l = [] # empty list to hold the subsetted datasets 
    l.append(mt_var_pru_filt.filter_cols(mt_var_pru_filt['hgdp_tgp_meta']['Population'] == pairs[0])) # first population 
    l.append(mt_var_pru_filt.filter_cols(mt_var_pru_filt['hgdp_tgp_meta']['Population'] == pairs[1])) # second population 
    l.append(mt_var_pru_filt.filter_cols((mt_var_pru_filt['hgdp_tgp_meta']['Population'] == pairs[0]) | (mt_var_pru_filt['hgdp_tgp_meta']['Population'] == pairs[1]))) # first + second = total population
    
    # sanity check - the sample count of the first and second subset mts should be equal to the total subset mt 
    if l[0].count()[1] + l[1].count()[1] == l[2].count()[1]: 
        v = [] # empty list to hold output mts from running common variant statistics 
        # run common variant statistics for each population and their combined mt
        v.append(hl.variant_qc(l[0])) # first population  
        v.append(hl.variant_qc(l[1])) # second population 
        v.append(hl.variant_qc(l[2])) # both/total population
        
        # add to dictionary 
        dict["-".join(pairs)] = v

In [None]:
# number of population pairs that made it into the dictionary
len(dict)

In [None]:
# display the pair name -> list of matrix tables mapping
dict

In [None]:
# variant_qc statistics of the first population of the pair (index 0)
dict['CEU-YRI'][0]['variant_qc'].show(5)

In [None]:
# variant_qc statistics of the second population of the pair (index 1)
dict['CEU-YRI'][1]['variant_qc'].show(5)

In [None]:
# variant_qc statistics of both populations combined (index 2)
dict['CEU-YRI'][2]['variant_qc'].show(5)

In [None]:
# accessing dictionary element with index 
# list(ex_dict) is the list of pair names, so [0] picks the first pair and the second [0] its first population
ex_dict[list(ex_dict)[0]][0]['variant_qc'].show(5)

In [None]:
# for every pair, preview the AF element at index 1 for the second population of the pair
for l in list(ex_dict):
    # show() already prints the preview and returns nothing, so wrapping it in print() only added a stray "None"
    ex_dict[l][1]['variant_qc']['AF'][1].show(5)

In [None]:
# the pair names (dictionary keys) as a list
list(ex_dict)

In [None]:
# AF element at index 1 for the first population of the first pair
# (this is a Hail expression taken from the matrix table, not a table of its own)
CEU_af_freq = ex_dict[list(ex_dict)[0]][0]['variant_qc']['AF'][1]

In [None]:
# small synthetic matrix table to experiment with
# NOTE(review): the arguments are presumably (n_rows, n_cols), i.e. 0 rows and 6 columns - confirm this is intended
play_mt = hl.utils.range_matrix_table(0, 6)

In [None]:
# preview the column (sample) table of the first population of the first pair
ex_dict[list(ex_dict)[0]][0].cols().show(5)

In [None]:
# NOTE(review): these look like generic select_rows / select_cols usage examples; the fields r1, r2, c1 and c2
# are not created anywhere in the visible part of this notebook, and the results are not assigned to anything
mt.select_rows(mt.r1, mt.r2,
r3=hl.coalesce(mt.r1, mt.r2))

mt.select_cols(mt.c2,
sum=mt.c2+mt.c1)

In [None]:
# experiment on the first population of the first pair
play_mt = ex_dict[list(ex_dict)[0]][0]

In [None]:
# NOTE(review): `row_subsetted_mt` is not defined anywhere in the visible part of this notebook - verify before running
row_subsetted_mt.cols().show(5)

In [None]:
# NOTE(review): where CEU_af_freq is assigned in this notebook it is a single AF expression rather than a
# matrix table, so annotate_cols is presumably not available on it; 'AN' is also a row-level statistic - verify
CEU_af_freq = CEU_af_freq.annotate_cols(AN=ex_dict[list(ex_dict)[0]][0]['variant_qc']['AN'])

In [None]:
# NOTE(review): `mtA` and `mtB` are not defined anywhere in the visible part of this notebook; this looks like
# a generic example of carrying row annotations of one matrix table over to the columns of another
mtA = mtA.annotate_rows(phenos = hl.dict(hl.agg.collect((mtA.pheno, mtA.value))))
mtB = mtB.annotate_cols(
    phenos = mtA.rows()[mtB.col_key].phenos)

In [None]:
# additional stuff 

CEU_var = hl.variant_qc(CEU_mt) 
# NOTE(review): the name below has a double underscore, so it does not overwrite CEU_YRI_var - confirm which was meant
CEU__YRI_var = hl.variant_qc(CEU_YRI_mt) 

# free-form notes on how populations a, b, c are combined into pairs; they are not Python
# (the arrows are a syntax error), so they are kept as comments to let the cell parse
# a, b, c
# a -> ac
# B -> cb
# a -> ab

CEU_var.row.show()
# product of the two AF elements (index 0 and index 1); the results are not assigned to anything
# NOTE(review): these read 'variant_qc' from the *_mt tables rather than from the *_var tables computed above - verify
CEU_mt['variant_qc']['AF'][0]*CEU_mt['variant_qc']['AF'][1]
CEU_YRI_mt['variant_qc']['AF'][0]*CEU_YRI_mt['variant_qc']['AF'][1]

-

## junk code 2

In [None]:
# this code is if the alleles were split into their separate columns and if we expect a mismatch across popns 

# remove indels - only include single letter varients for each allele in both populations 
# this is b/c the FST formula is set up for single letter alleles 
#pop1 = CEU_final.filter_rows((CEU_final.A1.length() == 1) & (CEU_final.A2.length() == 1))
#pop2 = CEU_YRI_final.filter_rows((CEU_YRI_final.A1.length() == 1) & (CEU_YRI_final.A2.length() == 1))


# sanity check 
#A1 = pop1.A1.collect()
#A1 =  list(set(A1)) # OR can also do: 
### from collections import OrderedDict 
### A1 = list(OrderedDict.fromkeys(A1))

#print(A1) 
#len(A1) == 4

# total # of snps at the beginning - 255666 
# unique snps before removing indels - 2712 
# total # of snps after removing indels - 221017 (34649 snps were indels for A1, A2 or both)
# unique snps after removing indels - 4 ['C', 'A', 'T', 'G'] - which is what we expect 



## *use the same reference allele - A2 is minor allele here*  

# get the minor alleles from both populations  
#pop1_A2 = pop1.A2.collect()
#pop2_A2 = pop2.A2.collect()


# find values that are unequal 
#import numpy as np
#switch1 = (np.array(pop1_A2) != np.array(pop2_A2))
#print(switch1.all()) # all comparisons returned 'FALSE' which means that all variants that were compared are the same 

# sanity check 
#print(len(pop1_A2) == len(pop2_A2) == len(switch1)) # True 


### *if there is a variant mismatch among the minor alleles of the two populations*
# in case there was a comparison that didn't match correctly among the minor alleles of the two populations, we would adjust the allele frequency(AF) accordingly   
#new_frq = pop2.AF.collect() 
#new_frq = np.array(new_frq) # convert to numpy array for the next step

# explanation (with an example) for what this does is right below it 
#new_frq[switch1] = 1-(new_frq[switch1]) 
# Example: for pop_1, A1 and A2 are 'T' and 'C' with AF of 0.25 
# and for pop_2, A1 and A2 are 'C and 'T' with AF of 0.25
# then since the same reference allele is not used (alleles don't correctly align) in this case, 
# we would subtract the AF of pop_2 from 1, to get the correct allele frequency 
# the AF of pop_2 with A1 and A2 oriented the same way as pop_1: 'T' and 'C', would be 1-0.25 = 0.75 (w/c is the correct AF)

# if we wanted to convert array back to list 
#pop2_frq = new_frq.tolist() 


# junk code 
#pop2.rows().show(5)

#p = pop2.filter_rows(str(pop2.locus) =='chr10:38960343')
# NOTE(review): the only assignment of `p` in this cell is the commented-out line above
p.row.show()


# for i in locus:
#     if i =='chr1:94607079':
#         print ("True")
        
# counts the positions at which `locus` and `d` hold equal values
# NOTE(review): `d` is presumably the duplicate-locus list that is only built in commented-out code - verify
sum(num == dup for num,dup in zip(locus, d))

In [None]:
# code to check if there are duplicates in a list and print them out 
#import collections
#dup = [item for item, count in collections.Counter(key).items() if count > 1]
#print('Num of duplicate loci: ' + str(len(dup))) 
#print(dup)

In [None]:
# lookup tables from the 'locus alleles' key to the per-variant inputs of the FST formula

# allele frequency in each of the two populations
key_freq1 = {variant_key: FREQpop1[idx] for idx, variant_key in enumerate(key)}
key_freq2 = {variant_key: FREQpop2[idx] for idx, variant_key in enumerate(key)}

# number of alleles in each of the two populations
key_n1 = {variant_key: n1[idx] for idx, variant_key in enumerate(key)}
key_n2 = {variant_key: n2[idx] for idx, variant_key in enumerate(key)}

# for key,value in zip (locus, FREQpop1):
#     print(dict(key, value))
#for v1,v2 in zip(list(locus_freq1.values())[0:5], list(locus_freq2.values())[0:5]):
    #lq = ((n1*locus_freq1.values()) + (n2*locus_freq2.values())) / (n1+n2)
    #print(key,value)

In [None]:
#locus #220945
#len(set(FREQpop1))


# check if there are duplicates in locus list and print them out - 72 duplicates  
# import collections
# d = [item for item, count in collections.Counter(locus).items() if count > 1]

# list.sort(locus)
#locus

# from collections import Counter
# [k for k,v in Counter(locus).items() if v>1]

# find the positions at which each duplicated locus occurs
from collections import defaultdict

# first pass: locus -> every index at which it appears
D = defaultdict(list)
for i, item in enumerate(locus):
    D[item].append(i)

# second pass: keep only the loci that appear more than once
D = {dup_locus: positions for dup_locus, positions in D.items() if len(positions) > 1}

# look at one specific position
locus[6202]

In [None]:
# loci that were excluded by the frequency filter
# INCLUDE is a boolean mask, so it is inverted with ~ (comparing it with the string 'FALSE' selects nothing)
bad_locus = locus[~INCLUDE]

# ave freq values that were not between 0 and 1 - returned FALSE to the conditions in the above chunk of code
print(np.count_nonzero(~INCLUDE))

# the complement of "strictly between 0 and 1" is "at most 0 OR at least 1"
# NOTE(review): FREQ must be the unfiltered average-frequency array for this count to match the one above
DONT_INCLUDE = (FREQ <= 0) | (FREQ >= 1)
np.count_nonzero(DONT_INCLUDE)

In [None]:
# convert the output from the preimp_qc module (qced.mt) into a vcf file in Hail 
# NOTE(review): this cell re-imports hail and rebinds `mt` to a different dataset than the one read at the top
# of the notebook - anything run afterwards that uses `mt` will see this one
import hail as hl 
mt = hl.read_matrix_table('gs://nepal-geno/GWASpy/Preimp_QC/Nepal_PTSD_GSA_Updated_May2021_qced.mt')
# write the matrix table to the given .vcf.bgz path
hl.export_vcf(mt, 'gs://nepal-geno/Nepal_PTSD_GSA_Updated_May2021_qced.vcf.bgz')