In [1]:
# standard imports
import os
import sys
import pandas as pd
import re

import math
import datetime
from dateutil import parser
from time import sleep
from typing import List
import pprint

from google.cloud import storage
my_storage_client = storage.Client()

from tqdm.notebook import tqdm

In [2]:
import firecloud.api as fapi
import hail as hl

In [3]:
def load_table(namespace, workspace, root_entity_type):
    """Download all entities of one type from a workspace into a DataFrame.

    Parameters
    ----------
    namespace : str
        Workspace namespace, forwarded to ``fapi.get_entities``.
    workspace : str
        Workspace name, forwarded to ``fapi.get_entities``.
    root_entity_type : str
        Entity type (data table) to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per entity built from each entity's ``attributes`` dict, plus
        an ``entity:<root_entity_type>_id`` column holding the entity names.

    Raises
    ------
    RuntimeError
        If the request does not return HTTP 200.
    """
    response = fapi.get_entities(namespace, workspace, root_entity_type)
    # A failed request returns an error document instead of a list of
    # entities; fail loudly here rather than with an opaque TypeError/KeyError
    # when that document is indexed below.
    if response.status_code != 200:
        raise RuntimeError(
            f"Could not list '{root_entity_type}' entities in "
            f"{namespace}/{workspace}: HTTP {response.status_code} {response.text}"
        )

    entities = response.json()
    tbl = pd.DataFrame([entity['attributes'] for entity in entities])
    tbl[f"entity:{root_entity_type}_id"] = [entity['name'] for entity in entities]

    return tbl

In [4]:
current_account = fapi.whoami()

In [5]:
bucket = os.environ['WORKSPACE_BUCKET']       # gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34
workspace = os.environ['WORKSPACE_NAME']      # AoU_DRC_WGS_LongReads_PacBio
namespace = os.environ['WORKSPACE_NAMESPACE'] # allofus-drc-wgs-lr-prod

In [6]:
tbl = load_table(namespace, workspace, 'ha-sample-grch38_set')

In [7]:
tbl

Unnamed: 0,joint_bcf,joint_gvcf_tbi,joint_gvcf,joint_mt,ha-sample-grch38s,entity:ha-sample-grch38_set_id
0,gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1b...,gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1b...,gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1b...,gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1b...,"{'itemsType': 'EntityReference', 'items': [{'e...",cohort_for_GLNexus_2023Q1_1040
1,gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1b...,gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1b...,gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1b...,gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1b...,"{'itemsType': 'EntityReference', 'items': [{'e...",cohort_for_GLNexus_alpha1_704
2,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",ComputeReadLengths_2022-12-01T01-45-18
3,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",ComputeReadLengths_2023-01-04T23-34-55
4,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",CoverageOverWGS-GRCh38-sample_2022-07-14T18-57-54
...,...,...,...,...,...,...
58,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",SummarizeSnifflesSVs-HG38_2022-12-13T02-52-33
59,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",SummarizeSnifflesSVs-HG38_2022-12-13T14-27-26
60,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",SummarizeSnifflesSVs-HG38_2022-12-13T16-12-01
61,,,,,"{'itemsType': 'EntityReference', 'items': [{'e...",VerifyFingerprintCCSSample_2022-11-30T14-19-13


In [13]:
# Pick the `joint_mt` path recorded for the 704-sample GLNexus cohort set.
is_704_cohort = tbl['entity:ha-sample-grch38_set_id'] == 'cohort_for_GLNexus_alpha1_704'
mt_704_path = tbl.loc[is_704_cohort, 'joint_mt'].tolist()[0]
mt_704_path

'gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_alpha1_704/cohort_for_GLNexus_alpha1_704.mt'

In [14]:
# Pick the `joint_mt` path recorded for the 1040-sample GLNexus cohort set.
is_1040_cohort = tbl['entity:ha-sample-grch38_set_id'] == 'cohort_for_GLNexus_2023Q1_1040'
mt_1040_path = tbl.loc[is_1040_cohort, 'joint_mt'].tolist()[0]
mt_1040_path

'gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/cohort_for_GLNexus_2023Q1_1040.mt'

In [15]:
!gsutil du -hcs {mt_704_path}
!gsutil du -hcs {mt_1040_path}

42.24 GiB    gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_alpha1_704/cohort_for_GLNexus_alpha1_704.mt
42.24 GiB    total
62.4 GiB     gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/cohort_for_GLNexus_2023Q1_1040.mt
62.4 GiB     total


In [4]:
#hl.stop()

In [16]:
hl.init(default_reference='GRCh38')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.1.3
SparkUI available at http://saturn-36cf326a-bbf5-4d32-bd8e-5fd9c2e2aa29-m.c.terra-7a376e4e.internal:36455
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.107-2387bb00ceee
LOGGING: writing to /home/jupyter/AoU_DRC_WGS_LongReads_PacBio/edit/hail-20230109-0312-0.2.107-2387bb00ceee.log


In [5]:
from hail.plot import show
from pprint import pprint
hl.plot.output_notebook()

In [17]:
joint_mt_704 = hl.read_matrix_table(mt_704_path)

In [19]:
print('Samples: %d  Variants: %d' % (joint_mt_704.count_cols(), joint_mt_704.count_rows()))

Samples: 704  Variants: 56897081


In [20]:
joint_mt_1040 = hl.read_matrix_table(mt_1040_path)

In [21]:
print('Samples: %d  Variants: %d' % (joint_mt_1040.count_cols(), joint_mt_1040.count_rows()))

Samples: 1040  Variants: 66016771


In [15]:
# `joint_mt` is only assigned in a later cell, so on a fresh kernel this cell
# raised NameError. Summarise the 1040-sample table that was just loaded.
hl.summarize_variants(joint_mt_1040)

ERROR:root:KeyboardInterrupt while sending command.               (0 + 0) / 453]
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 1217, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [17]:
# run sample QC and save into matrix table
# This is the first assignment of `joint_mt` in the notebook, so it must start
# from a table that exists: the 1040-sample cohort loaded above (the downstream
# counts and output paths all refer to that cohort). Reading `joint_mt` here
# only worked with leftover kernel state.
joint_mt = hl.sample_qc(joint_mt_1040)

In [18]:
# run variant QC and save into matrix table
joint_mt = hl.variant_qc(joint_mt)

In [19]:
# Restrict the matrix table to the contigs chr1 .. chr22.
test_intervals = [f'chr{n}' for n in range(1, 23)]
autosome_intervals = [hl.parse_locus_interval(contig) for contig in test_intervals]
joint_mt = hl.filter_intervals(joint_mt, autosome_intervals)

In [20]:
# Keep biallelic variants with call rate > 95%, allele-1 frequency > 1% and
# HWE p-value > 0.005. A row is dropped when any predicate is false or missing,
# exactly as with one filter_rows call per predicate.
site_qc = joint_mt.variant_qc
joint_mt = joint_mt.filter_rows(
    (site_qc.call_rate > 0.95)
    & (site_qc.AF[1] > 0.01)
    & (site_qc.p_value_hwe > 0.005)
    & (hl.len(joint_mt.alleles) == 2)
)

In [21]:
# LD-prune on the genotype calls (r2 threshold 0.1, 500,000 bp window), then keep
# only the rows whose key is present in the pruned variant table.
pruned_variant_table = hl.ld_prune(joint_mt.GT, r2=0.1, bp_window_size=500000)
joint_mt = joint_mt.filter_rows(hl.is_defined(pruned_variant_table[joint_mt.row_key]))

2023-01-02 21:07:59.592 Hail: INFO: ld_prune: running local pruning stage with max queue size of 216481 variants
2023-01-02 22:07:51.773 Hail: INFO: wrote table with 864345 rows in 436 partitions to /tmp/q2qoakD5ORbzusGVBoKu21
    Total size: 25.08 MiB
    * Rows: 25.08 MiB
    * Globals: 11.00 B
    * Smallest partition: 293 rows (9.04 KiB)
    * Largest partition:  3619 rows (106.56 KiB)
2023-01-03 00:02:03.288 Hail: INFO: Wrote all 212 blocks of 864345 x 1040 matrix with block size 4096.
2023-01-03 01:45:38.686 Hail: INFO: wrote table with 15381 rows in 423 partitions to /tmp/EmJ45gSwE8xqvCnC5BtFVT
    Total size: 12.38 MiB
    * Rows: 328.09 KiB
    * Globals: 12.06 MiB
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  170 rows (3.46 KiB)

In [22]:
print('Samples: %d  Variants: %d' % (joint_mt.count_cols(), joint_mt.count_rows()))



Samples: 1040  Variants: 853874


In [23]:
hl.summarize_variants(joint_mt)



Number of alleles,Count
2,853874

Allele type,Count
SNP,807162
Deletion,25921
Insertion,20736
Complex,55

Metric,Value
Transitions,559408.0
Transversions,247754.0
Ratio,2.26

Contig,Count
chr1,67638
chr2,67173
chr3,56875
chr4,54086
chr5,52157
chr6,48576
chr7,47096
chr8,44697
chr9,39775
chr10,42486


In [24]:
# Save the LD-pruned variant table alongside the cohort's joint-calling outputs.
# overwrite=True matches the `pruned.mt` write in this notebook; without it a
# re-run fails as soon as the output path already exists.
pruned_variant_table.write("gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/pruned_tbl.ht", overwrite=True)

2023-01-03 04:39:53.631 Hail: INFO: wrote table with 853874 rows in 436 partitions to gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/pruned_tbl.ht


In [26]:
joint_mt.write("gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/pruned.mt", overwrite=True)

2023-01-03 07:00:45.359 Hail: INFO: wrote matrix table with 853874 rows and 1040 columns in 436 partitions to gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/pruned.mt


In [31]:
!gsutil ls -lh gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/
    

 69.66 GiB  2022-12-30T23:32:15Z  gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/cohort_for_GLNexus_2023Q1_1040.bcf
  56.6 GiB  2022-12-31T07:44:41Z  gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/cohort_for_GLNexus_2023Q1_1040.g.vcf.bgz
  2.64 MiB  2022-12-31T07:44:46Z  gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/cohort_for_GLNexus_2023Q1_1040.g.vcf.bgz.tbi
                                 gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/cohort_for_GLNexus_2023Q1_1040.mt/
                                 gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/pruned.mt/
                                 gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GR

In [6]:
pruned_mt = hl.read_matrix_table('gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/pruned.mt')


In [7]:
hl.summarize_variants(pruned_mt)



Number of alleles,Count
2,853874

Allele type,Count
SNP,807162
Deletion,25921
Insertion,20736
Complex,55

Metric,Value
Transitions,559408.0
Transversions,247754.0
Ratio,2.26

Contig,Count
chr1,67638
chr2,67173
chr3,56875
chr4,54086
chr5,52157
chr6,48576
chr7,47096
chr8,44697
chr9,39775
chr10,42486


In [32]:
#rel = hl.pc_relate(pruned_mt.GT, 0.01, k=10, min_kinship=0.1)

2023-01-03 07:04:00.876 Hail: INFO: hwe_normalize: found 853874 variants after filtering out monomorphic sites.
2023-01-03 07:04:49.038 Hail: INFO: pca: running PCA with 10 components.../ 436]
2023-01-03 07:34:26.957 Hail: INFO: Wrote all 209 blocks of 853874 x 1040 matrix with block size 4096.


In [None]:
#pairs = rel.filter(rel['kin'] > 0.125)

In [None]:
#related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False)
#result = pruned_mt.filter_cols(hl.is_defined(related_samples_to_remove[mt.col_key]), keep=False)

In [8]:
eigenvalues, pcs, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10, compute_loadings=False)

2023-01-03 20:59:35.919 Hail: INFO: hwe_normalize: found 853874 variants after filtering out monomorphic sites.
2023-01-03 21:00:13.014 Hail: INFO: pca: running PCA with 10 components.../ 436]

In [10]:
# PC-Relate kinship estimates on the pruned genotypes. The positional 0.01 is the
# minimum individual-specific MAF; pairs with kinship below 0.1 are dropped. The
# PCA scores computed earlier are reused via scores_expr.
rel = hl.pc_relate(pruned_mt.GT,
                   0.01,
                   scores_expr=pcs[pruned_mt.col_key].scores,
                   min_kinship=0.1)
# The returned table is lazy: each downstream action (rel.show(),
# maximal_independent_set, ...) would otherwise re-run the whole block-matrix
# pipeline. Materialise it once and read the result back.
rel = rel.checkpoint(hl.utils.new_temp_file(prefix='pc_relate', extension='ht'))

2023-01-03 21:32:50.374 Hail: INFO: Wrote all 209 blocks of 853874 x 1040 matrix with block size 4096.


In [17]:
related_samples_to_remove = hl.maximal_independent_set(rel.i, rel.j, keep=False)

2023-01-04 01:45:05.976 Hail: INFO: wrote matrix with 11 rows and 853874 columns as 209 blocks of size 4096 to /tmp/pcrelate-write-read-nbkt1zbujNPAFNOoFXUCpq.bm
2023-01-04 01:46:00.946 Hail: INFO: wrote matrix with 853874 rows and 1040 columns as 209 blocks of size 4096 to /tmp/pcrelate-write-read-R0T1Npmh8gfIAp0ZJhgveu.bm
2023-01-04 01:47:29.071 Hail: INFO: wrote matrix with 853874 rows and 1040 columns as 209 blocks of size 4096 to /tmp/pcrelate-write-read-ICTlxqs5jmIypXYKo3Qugj.bm
2023-01-04 02:08:16.483 Hail: INFO: wrote matrix with 1040 rows and 1040 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-2CowH914Eq6hOC5J1xLFCQ.bm
2023-01-04 02:27:21.232 Hail: INFO: wrote matrix with 1040 rows and 1040 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-YkQHxrARTLw4kUFUOscCAJ.bm
2023-01-04 02:27:22.902 Hail: INFO: wrote matrix with 1040 rows and 1040 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-5n6sKAI4ns5VJwnK5ssaGb.bm
2023-01-04 02:59:43.514 Hail: INF

In [20]:
related_samples_to_remove.count()

[Stage 454:>                                                        (0 + 1) / 1]

0

In [21]:
related_samples_to_remove.write('gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/related_samples_to_remove.ht')


2023-01-04 04:46:21.192 Hail: INFO: wrote table with 0 rows in 0 partitions to gs://fc-secure-f7d80b48-be60-426f-aa6b-f037a1bf7f34/outputs/GRCh38/JointCallGVCFs/cohort_for_GLNexus_2023Q1_1040/related_samples_to_remove.ht


In [12]:
rel.show()

2023-01-03 21:33:43.382 Hail: INFO: wrote matrix with 11 rows and 853874 columns as 209 blocks of size 4096 to /tmp/pcrelate-write-read-hEiLhgOJTXsghPYRjBRK6U.bm
2023-01-03 21:34:42.311 Hail: INFO: wrote matrix with 853874 rows and 1040 columns as 209 blocks of size 4096 to /tmp/pcrelate-write-read-orlweYX3DqkayXZoYaeA2M.bm
2023-01-03 21:36:24.134 Hail: INFO: wrote matrix with 853874 rows and 1040 columns as 209 blocks of size 4096 to /tmp/pcrelate-write-read-nvgXJIPeioJV1QcjFeuDGk.bm
2023-01-03 21:57:08.190 Hail: INFO: wrote matrix with 1040 rows and 1040 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-wPG1bvWsVh1J6eUlVeTNTj.bm
2023-01-03 22:19:13.619 Hail: INFO: wrote matrix with 1040 rows and 1040 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-aHBO4fbx58em8SZBKcZdOq.bm
2023-01-03 22:19:16.106 Hail: INFO: wrote matrix with 1040 rows and 1040 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-riL8RrBJVuPiAK6J25pLlu.bm
2023-01-03 22:49:32.129 Hail: INF

i,j,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0
s,s,kin,ibd0,ibd1,ibd2
str,str,float64,float64,float64,float64


In [13]:
pprint(eigenvalues)

[7.147688957963624,
 1.4392500557665275,
 1.3079805973804715,
 1.2469208350544243,
 1.2321902349701732,
 1.2238326988444663,
 1.219314472217862,
 1.2163613175965091,
 1.2129833239363441,
 1.204569401443862]


In [14]:
pcs.show(5, width=100)

s,scores
str,array<float64>
"""1000151""","[-3.09e-02,1.26e-02,8.74e-03,-1.02e-02,-7.68e-03,2.18e-02,8.62e-03,4.31e-02,-4.03e-03,-5.02e-02]"
"""1000513""","[3.60e-02,7.10e-03,8.22e-03,-1.09e-02,1.16e-02,1.09e-02,-3.58e-02,2.64e-03,1.88e-02,-3.98e-02]"
"""1000920""","[-3.15e-02,-6.97e-03,2.54e-02,9.61e-03,-3.99e-02,-2.23e-02,7.97e-02,-3.45e-02,2.02e-03,-6.40e-02]"
"""1001399""","[-3.48e-02,8.12e-03,5.50e-03,3.69e-02,-3.41e-02,3.71e-02,4.31e-03,4.78e-02,5.29e-02,1.74e-02]"
"""1001980""","[1.72e-01,1.66e-02,-2.70e-03,-5.55e-02,-2.34e-02,-1.40e-02,2.51e-02,2.07e-02,7.42e-02,-8.66e-03]"


In [15]:
pruned_mt = pruned_mt.annotate_cols(scores = pcs[pruned_mt.s].scores)

In [16]:
p = hl.plot.scatter(pruned_mt.scores[0],
                    pruned_mt.scores[1],
                    title='PCA', xlabel='PC1', ylabel='PC2')
show(p)

In [64]:
mt = hl.experimental.load_dataset(name='gnomad_hgdp_1kg_subset_dense',
                                  version='3.1.2',
                                  reference_genome='GRCh38',
                                  region='us',
                                  cloud='gcp')

In [66]:
mt.rows().select().show(5)



locus,alleles
locus<GRCh38>,array<str>
chr1:10055,"[""T"",""C""]"
chr1:10061,"[""T"",""C""]"
chr1:10109,"[""A"",""T""]"
chr1:10109,"[""AACCCT"",""A""]"
chr1:10114,"[""T"",""C""]"


In [67]:
mt.row_key.show(5)



locus,alleles
locus<GRCh38>,array<str>
chr1:10055,"[""T"",""C""]"
chr1:10061,"[""T"",""C""]"
chr1:10109,"[""A"",""T""]"
chr1:10109,"[""AACCCT"",""A""]"
chr1:10114,"[""T"",""C""]"


In [83]:
mt.cols().count()

4151

In [27]:
hl.utils.get_1kg('data/')

2022-12-11 19:08:51.845 Hail: INFO: downloading 1KG VCF ...
  Source: https://storage.googleapis.com/hail-tutorial/1kg.vcf.bgz
2022-12-11 19:08:52.790 Hail: INFO: importing VCF and writing to matrix table...
2022-12-11 19:08:55.927 Hail: INFO: scanning VCF for sortedness...
2022-12-11 19:09:06.617 Hail: INFO: Coerced sorted VCF - no additional import work to do
2022-12-11 19:09:20.008 Hail: INFO: wrote matrix table with 10879 rows and 284 columns in 16 partitions to data/1kg.mt
2022-12-11 19:09:20.168 Hail: INFO: downloading 1KG annotations ...
  Source: https://storage.googleapis.com/hail-tutorial/1kg_annotations.txt
2022-12-11 19:09:20.427 Hail: INFO: downloading Ensembl gene annotations ...
  Source: https://storage.googleapis.com/hail-tutorial/ensembl_gene_annotations.txt
2022-12-11 19:09:21.150 Hail: INFO: Done!


In [28]:
hl.import_vcf('data/1kg.vcf.bgz').write('data/1kg.mt', overwrite=True)

2022-12-11 19:09:36.114 Hail: INFO: scanning VCF for sortedness...
2022-12-11 19:09:37.957 Hail: INFO: Coerced sorted VCF - no additional import work to do
2022-12-11 19:09:45.011 Hail: INFO: wrote matrix table with 10879 rows and 284 columns in 1 partition to data/1kg.mt


In [29]:
mt = hl.read_matrix_table('data/1kg.mt')

In [30]:
mt.rows().select().show(5)

locus,alleles
locus<GRCh37>,array<str>
1:904165,"[""G"",""A""]"
1:909917,"[""G"",""A""]"
1:986963,"[""C"",""T""]"
1:1563691,"[""T"",""G""]"
1:1707740,"[""T"",""G""]"


In [31]:
mt.row_key.show(5)

locus,alleles
locus<GRCh37>,array<str>
1:904165,"[""G"",""A""]"
1:909917,"[""G"",""A""]"
1:986963,"[""C"",""T""]"
1:1563691,"[""T"",""G""]"
1:1707740,"[""T"",""G""]"


In [32]:
mt.s.show(5)

str
"""HG00096"""
"""HG00099"""
"""HG00105"""
"""HG00118"""
"""HG00129"""


In [33]:
mt.entry.take(5)

[Struct(GT=Call(alleles=[0, 0], phased=False), AD=[4, 0], DP=4, GQ=12, PL=[0, 12, 147]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[8, 0], DP=8, GQ=24, PL=[0, 24, 315]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[8, 0], DP=8, GQ=23, PL=[0, 23, 230]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[7, 0], DP=7, GQ=21, PL=[0, 21, 270]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[5, 0], DP=5, GQ=15, PL=[0, 15, 205])]

In [34]:
table = (hl.import_table('data/1kg_annotations.txt', impute=True)
         .key_by('Sample'))

2022-12-11 19:11:52.732 Hail: INFO: Reading table to impute column types
2022-12-11 19:11:54.183 Hail: INFO: Finished type imputation
  Loading field 'Sample' as type str (imputed)
  Loading field 'Population' as type str (imputed)
  Loading field 'SuperPopulation' as type str (imputed)
  Loading field 'isFemale' as type bool (imputed)
  Loading field 'PurpleHair' as type bool (imputed)
  Loading field 'CaffeineConsumption' as type int32 (imputed)


In [35]:
table.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'Sample': str 
    'Population': str 
    'SuperPopulation': str 
    'isFemale': bool 
    'PurpleHair': bool 
    'CaffeineConsumption': int32 
----------------------------------------
Key: ['Sample']
----------------------------------------


In [36]:
table.show(width=100)

Sample,Population,SuperPopulation,isFemale,PurpleHair,CaffeineConsumption
str,str,str,bool,bool,int32
"""HG00096""","""GBR""","""EUR""",False,False,4
"""HG00097""","""GBR""","""EUR""",True,True,4
"""HG00098""","""GBR""","""EUR""",False,False,5
"""HG00099""","""GBR""","""EUR""",True,False,4
"""HG00100""","""GBR""","""EUR""",True,False,5
"""HG00101""","""GBR""","""EUR""",False,True,1
"""HG00102""","""GBR""","""EUR""",True,True,6
"""HG00103""","""GBR""","""EUR""",False,True,5
"""HG00104""","""GBR""","""EUR""",True,False,5
"""HG00105""","""GBR""","""EUR""",False,False,4


In [37]:
print(mt.col.dtype)

struct{s: str}


In [38]:
mt = mt.annotate_cols(pheno = table[mt.s])

In [39]:
mt.col.describe()

--------------------------------------------------------
Type:
        struct {
        s: str, 
        pheno: struct {
            Population: str, 
            SuperPopulation: str, 
            isFemale: bool, 
            PurpleHair: bool, 
            CaffeineConsumption: int32
        }
    }
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7f1fcaa004d0>
Index:
    ['column']
--------------------------------------------------------


In [40]:
pprint(table.aggregate(hl.agg.counter(table.SuperPopulation)))

{'AFR': 1018, 'AMR': 535, 'EAS': 617, 'EUR': 669, 'SAS': 661}


In [41]:
pprint(table.aggregate(hl.agg.stats(table.CaffeineConsumption)))

{'max': 10.0,
 'mean': 3.9837142857142855,
 'min': -1.0,
 'n': 3500,
 'stdev': 1.7021055628070711,
 'sum': 13943.0}


In [42]:
table.count()

3500

In [43]:
mt.count_cols()

284

In [44]:
mt.aggregate_cols(hl.agg.counter(mt.pheno.SuperPopulation))

{'AFR': 76, 'AMR': 34, 'EAS': 72, 'EUR': 47, 'SAS': 55}

In [45]:
pprint(mt.aggregate_cols(hl.agg.stats(mt.pheno.CaffeineConsumption)))

{'max': 9.0,
 'mean': 4.415492957746479,
 'min': 0.0,
 'n': 284,
 'stdev': 1.577763427465917,
 'sum': 1254.0}


In [46]:
snp_counts = mt.aggregate_rows(hl.agg.counter(hl.Struct(ref=mt.alleles[0], alt=mt.alleles[1])))
pprint(snp_counts)

{Struct(ref='C', alt='T'): 2418,
 Struct(ref='T', alt='A'): 77,
 Struct(ref='G', alt='A'): 2367,
 Struct(ref='G', alt='C'): 111,
 Struct(ref='A', alt='C'): 451,
 Struct(ref='C', alt='A'): 494,
 Struct(ref='G', alt='T'): 477,
 Struct(ref='A', alt='T'): 75,
 Struct(ref='C', alt='G'): 150,
 Struct(ref='A', alt='G'): 1929,
 Struct(ref='T', alt='G'): 466,
 Struct(ref='T', alt='C'): 1864}


In [47]:
from collections import Counter
counts = Counter(snp_counts)
counts.most_common()

[(Struct(ref='C', alt='T'), 2418),
 (Struct(ref='G', alt='A'), 2367),
 (Struct(ref='A', alt='G'), 1929),
 (Struct(ref='T', alt='C'), 1864),
 (Struct(ref='C', alt='A'), 494),
 (Struct(ref='G', alt='T'), 477),
 (Struct(ref='T', alt='G'), 466),
 (Struct(ref='A', alt='C'), 451),
 (Struct(ref='C', alt='G'), 150),
 (Struct(ref='G', alt='C'), 111),
 (Struct(ref='T', alt='A'), 77),
 (Struct(ref='A', alt='T'), 75)]

In [48]:
p = hl.plot.histogram(mt.DP, range=(0,30), bins=30, title='DP Histogram', legend='DP')
show(p)

[Stage 24:>                                                         (0 + 1) / 1]

In [49]:
mt.col.describe()

--------------------------------------------------------
Type:
        struct {
        s: str, 
        pheno: struct {
            Population: str, 
            SuperPopulation: str, 
            isFemale: bool, 
            PurpleHair: bool, 
            CaffeineConsumption: int32
        }
    }
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7f1fcaa004d0>
Index:
    ['column']
--------------------------------------------------------


In [50]:
mt = hl.sample_qc(mt)

In [51]:
mt.col.describe()

--------------------------------------------------------
Type:
        struct {
        s: str, 
        pheno: struct {
            Population: str, 
            SuperPopulation: str, 
            isFemale: bool, 
            PurpleHair: bool, 
            CaffeineConsumption: int32
        }, 
        sample_qc: struct {
            dp_stats: struct {
                mean: float64, 
                stdev: float64, 
                min: float64, 
                max: float64
            }, 
            gq_stats: struct {
                mean: float64, 
                stdev: float64, 
                min: float64, 
                max: float64
            }, 
            call_rate: float64, 
            n_called: int64, 
            n_not_called: int64, 
            n_filtered: int64, 
            n_hom_ref: int64, 
            n_het: int64, 
            n_hom_var: int64, 
            n_non_ref: int64, 
            n_singleton: int64, 
            n_snp: int64, 
            n_insertio

In [52]:
p = hl.plot.histogram(mt.sample_qc.call_rate, range=(.88,1), legend='Call Rate')
show(p)

In [53]:
p = hl.plot.histogram(mt.sample_qc.gq_stats.mean, range=(10,70), legend='Mean Sample GQ')
show(p)

In [54]:
p = hl.plot.scatter(mt.sample_qc.dp_stats.mean, mt.sample_qc.call_rate, xlabel='Mean DP', ylabel='Call Rate')
show(p)

In [55]:
# Count samples before filtering so the message reports the real denominator
# instead of a hard-coded 284.
n_samples_before = mt.count_cols()
# Keep samples with mean depth >= 4 and call rate >= 0.97.
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))
print('After filter, %d/%d samples remain.' % (mt.count_cols(), n_samples_before))

[Stage 31:>                                                         (0 + 1) / 1]

After filter, 250/284 samples remain.


In [56]:
# Per-entry allele balance: depth of allele index 1 divided by the summed allelic depths.
ab = mt.AD[1] / hl.sum(mt.AD)

# An entry passes when its allele balance agrees with its genotype call:
# hom-ref <= 0.1, het within [0.25, 0.75], hom-var >= 0.9.
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                        (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                        (mt.GT.is_hom_var() & (ab >= 0.9)))

# Report the fraction of entries failing the condition, then drop them.
fraction_filtered = mt.aggregate_entries(hl.agg.fraction(~filter_condition_ab))
print(f'Filtering {fraction_filtered * 100:.2f}% entries out of downstream analysis.')
mt = mt.filter_entries(filter_condition_ab)

[Stage 32:>                                                         (0 + 1) / 1]

Filtering 3.60% entries out of downstream analysis.


[Stage 33:>                                                         (0 + 1) / 1]

In [57]:
mt = hl.variant_qc(mt)

In [58]:
mt.row.describe()

--------------------------------------------------------
Type:
        struct {
        locus: locus<GRCh37>, 
        alleles: array<str>, 
        rsid: str, 
        qual: float64, 
        filters: set<str>, 
        info: struct {
            AC: array<int32>, 
            AF: array<float64>, 
            AN: int32, 
            BaseQRankSum: float64, 
            ClippingRankSum: float64, 
            DP: int32, 
            DS: bool, 
            FS: float64, 
            HaplotypeScore: float64, 
            InbreedingCoeff: float64, 
            MLEAC: array<int32>, 
            MLEAF: array<float64>, 
            MQ: float64, 
            MQ0: int32, 
            MQRankSum: float64, 
            QD: float64, 
            ReadPosRankSum: float64, 
            set: str
        }, 
        variant_qc: struct {
            dp_stats: struct {
                mean: float64, 
                stdev: float64, 
                min: float64, 
                max: float64
            }, 

In [59]:
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

In [60]:
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)

In [61]:
print('Samples: %d  Variants: %d' % (mt.count_cols(), mt.count_rows()))

[Stage 36:>                                                         (0 + 1) / 1]

Samples: 250  Variants: 7774


In [62]:
gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                 x=mt.GT.n_alt_alleles(),
                                 covariates=[1.0])
gwas.row.describe()

2022-12-11 19:20:13.729 Hail: INFO: linear_regression_rows: running on 250 samples for 1 response variable y,
    with input variable x, and 1 additional covariate...


--------------------------------------------------------
Type:
        struct {
        locus: locus<GRCh37>, 
        alleles: array<str>, 
        n: int32, 
        sum_x: float64, 
        y_transpose_x: float64, 
        beta: float64, 
        standard_error: float64, 
        t_stat: float64, 
        p_value: float64
    }
--------------------------------------------------------
Source:
    <hail.table.Table object at 0x7f1fcb4cabd0>
Index:
    ['row']
--------------------------------------------------------


In [63]:
p = hl.plot.manhattan(gwas.p_value)
show(p)

[Stage 40:>                                                         (0 + 1) / 1]