__author__ = 'konradk'
from typing import Any, Dict, List, Optional, Tuple, Union
from .generic import *
import pickle
import copy
import uuid
root = 'gs://gnomad-public/papers/2019-flagship-lof/v1.0'
# Unprocessed files
fasta_path = f'{root}/context/source/Homo_sapiens_assembly19.fasta'
gerp_annotations_path = f'{root}/annotations/gerp.scores.GRCh37.ht'  # GERP scores are stored in the field 'S'
# Input datasets
raw_context_txt_path = f'{root}/context/source/Homo_sapiens_assembly19.fasta.snps_only.vep.txt.bgz/*'
raw_context_ht_path = f'{root}/context/Homo_sapiens_assembly19.fasta.snps_only.unsplit.ht'
vep_context_ht_path = f'{root}/context/Homo_sapiens_assembly19.fasta.snps_only.unsplit.vep_20181129.ht'
context_ht_path = f'{root}/context/Homo_sapiens_assembly19.fasta.snps_only.vep_20181129.ht'
processed_genomes_ht_path = f'{root}/model/genomes_processed.ht'
processed_exomes_ht_path = f'{root}/model/exomes_processed.ht'
# Possible variant files
all_possible_summary_pickle = f'{root}/model/tmp/all_possible_counts_by_context.pckl'
all_possible_summary_unfiltered_pickle = f'{root}/model/tmp/all_possible_counts_by_context_unfiltered.pckl'
mutation_rate_ht_path = f'{root}/model/mutation_rate_methylation_bins.ht'
po_coverage_ht_path = f'{root}/model/prop_observed_by_coverage_no_common_pass_filtered_bins.ht'
po_ht_path = f'{root}/{{subdir}}/prop_observed_{{subdir}}.ht'
raw_constraint_ht_path = f'{root}/{{subdir}}/constraint_{{subdir}}.ht'
final_constraint_ht_path = f'{root}/{{subdir}}/constraint_final_{{subdir}}.ht'
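# The {subdir} placeholders in the three paths above are filled in per model, e.g. po_ht_path.format(subdir='standard')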
HIGH_COVERAGE_CUTOFF = 40
VARIANT_TYPES_FOR_MODEL = ('ACG', 'TCG', 'CCG', 'GCG', 'non-CpG')
POPS = ('global', 'afr', 'amr', 'eas', 'nfe', 'sas')
MODEL_KEYS = {
'worst_csq': ['gene'],
'tx_annotation': ['gene', 'expressed'],
'standard': ['gene', 'transcript', 'canonical']
}
# Data loads
def get_old_mu_data() -> hl.Table:
old_mu_data = hl.import_table(f'{root}/old_exac_data/fordist_1KG_mutation_rate_table.txt',
delimiter=' ', impute=True)
return old_mu_data.transmute(context=old_mu_data['from'], ref=old_mu_data['from'][1],
alt=old_mu_data.to[1]).key_by('context', 'ref', 'alt')
def load_all_possible_summary(filtered: bool = True) -> Dict[hl.Struct, int]:
fname = all_possible_summary_pickle if filtered else all_possible_summary_unfiltered_pickle
with hl.hadoop_open(fname, 'rb') as f:
return pickle.load(f)
def load_tx_expression_data(context: bool = True) -> hl.Table:
    """
    Load the transcript expression (pext) MatrixTable and return its rows as a Table.
    :param context: Whether to load the Table with proportion expressed across transcripts (pext) score annotations
        for all possible bases or the Table with pext scores for sites found in gnomAD v2.1.1 exomes, defaults to True.
    :return: tx Table with expression data.
    """
tx_ht_file = 'gs://gnomad-public/papers/2019-tx-annotation/pre_computed/all.possible.snvs.tx_annotated.021819.ht' if context \
else 'gs://gnomad-public/papers/2019-tx-annotation/gnomad.exomes.r2.1.1.sites.tx_annotated.021319.ht'
tx_ht = hl.read_matrix_table(tx_ht_file).rows()
def process_expression_data(csq_expression: hl.expr.StructExpression) -> hl.expr.StructExpression:
"""
    Process expression data by adding annotations and dropping tissue columns other than Brain_Cortex.
Function will add the following annotations to tx Table:
- ensg
- csq
- symbol
- lof
- mean_expression
- max_expression
- mean_brain_expression
- Brain_Cortex
:param csq_expression: 'tx_annotation' (a StructExpression) within tx Table
:return: Expression data
"""
exprs_to_drop = ['ensg', 'csq', 'symbol', 'lof', 'lof_flag', 'mean_proportion']
expression_data = csq_expression.drop(*exprs_to_drop)
all_tissues = list(expression_data.values())
expression_data_list = list(zip(list(expression_data), all_tissues))
brain_tissues = [x[1] for x in expression_data_list if 'Brain' in x[0]]
return csq_expression.select('ensg', 'csq', 'symbol', 'lof',
mean_expression=hl.mean(hl.filter(lambda e: ~hl.is_nan(e), all_tissues), filter_missing=True),
max_expression=hl.max(hl.filter(lambda e: ~hl.is_nan(e), all_tissues), filter_missing=True),
mean_brain_expression=hl.mean(hl.filter(lambda k: ~hl.is_nan(k), brain_tissues), filter_missing=True),
Brain_Cortex=csq_expression.Brain_Cortex
)
return tx_ht.annotate(tx_annotation=tx_ht.tx_annotation.map(process_expression_data))
def annotate_distance_to_splice(input_ht: hl.Table) -> hl.Table:
"""
Annotate distance to the nearest splice site.
:param input_ht: Input exome or context Table.
:return: Table with 'nearest_splice' annotation added (distance to nearest splice site).
"""
# Load GTF file
tmp_path = f'/tmp_{uuid.uuid4()}.ht'
ht = hl.experimental.import_gtf('gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz', 'GRCh37', True, min_partitions=100)
ht = ht.filter((ht.feature == 'exon') & (ht.transcript_type == 'protein_coding')).key_by()
ht = ht.select(gene_id=ht.gene_id.split('\\.')[0], transcript_id=ht.transcript_id.split('\\.')[0],
loc=[hl.struct(acceptor=ht.strand == '+', locus=ht.interval.start),
hl.struct(acceptor=ht.strand != '+', locus=ht.interval.end)]).explode('loc')
ht.transmute(**ht.loc).key_by('locus').collect_by_key().write(tmp_path, True)
    # Scan to get bounds, create intervals between each pair of adjacent splice sites
    tmp_path2 = f'/tmp_{uuid.uuid4()}.ht'
    ht = hl.read_table(tmp_path)
    last_locus = hl.scan.take(ht.row, 1, ordering=-ht.locus.global_position())
    # Keep the previous splice-site record so both interval endpoints survive the join
    ht = ht.annotate(last=hl.or_missing((hl.len(last_locus) > 0) &
                                        (last_locus[0].locus.contig == ht.locus.contig),
                                        last_locus[0]))
    ht = ht.key_by(interval=hl.or_missing(hl.is_defined(ht.last),
                                          hl.interval(ht.last.locus, ht.locus)))
    ht.write(tmp_path2)
    ht = hl.read_table(tmp_path2)
    # Join against intervals
    data = ht[input_ht.locus]
    # Pick the nearer of the two flanking splice sites
    return input_ht.transmute(
        nearest_splice=hl.cond(
            hl.abs(input_ht.locus.position - data.locus.position) <= hl.abs(
                input_ht.locus.position - data.last.locus.position),
            hl.struct(dist=input_ht.locus.position - data.locus.position, sites=data.values),
            hl.struct(dist=input_ht.locus.position - data.last.locus.position, sites=data.last.values)
        )
    )
def annotate_tx_expression_data(ht, tx_ht, location):
key = ht.key if isinstance(ht, hl.Table) else ht.row_key
return hl.find(lambda csq: (csq.ensg == location.gene_id) &
(csq.csq == location.most_severe_consequence),
tx_ht[key].tx_annotation)
# Pre-process
def export_fasta(hc) -> None:
    # Done with a Hail 0.1 jar (legacy API)
hc.read('gs://gnomad-resources/Homo_sapiens_assembly19.fasta.snps_only.vep.vds').export_variants('gs://gnomad-resources/context/source/Homo_sapiens_assembly19.fasta.snps_only.vep.txt.bgz', 'v, va.context, va.vep', types=True, parallel=True)
def import_fasta(raw_context_txt_path: str, raw_context_ht_path: str, overwrite: bool = False) -> None:
# Works with --jar gs://hail-common/builds/devel/jars/hail-devel-47de006f2c62-Spark-2.2.0.jar
ht = hl.import_table(raw_context_txt_path, no_header=True, min_partitions=40000,
types={'f0': hl.tstr,
'f1': hl.tstr,
'f2': hl.tstruct(
assembly_name=hl.tstr,
allele_string=hl.tstr,
ancestral=hl.tstr,
colocated_variants=hl.tarray(
hl.tstruct(aa_allele=hl.tstr, aa_maf=hl.tfloat, afr_allele=hl.tstr, afr_maf=hl.tfloat, allele_string=hl.tstr, amr_allele=hl.tstr, amr_maf=hl.tfloat, clin_sig=hl.tarray(hl.tstr),
end=hl.tint, eas_allele=hl.tstr, eas_maf=hl.tfloat, ea_allele=hl.tstr, ea_maf=hl.tfloat, eur_allele=hl.tstr, eur_maf=hl.tfloat, exac_adj_allele=hl.tstr, exac_adj_maf=hl.tfloat,
exac_allele=hl.tstr, exac_afr_allele=hl.tstr, exac_afr_maf=hl.tfloat, exac_amr_allele=hl.tstr, exac_amr_maf=hl.tfloat, exac_eas_allele=hl.tstr, exac_eas_maf=hl.tfloat,
exac_fin_allele=hl.tstr, exac_fin_maf=hl.tfloat, exac_maf=hl.tfloat, exac_nfe_allele=hl.tstr, exac_nfe_maf=hl.tfloat, exac_oth_allele=hl.tstr, exac_oth_maf=hl.tfloat,
exac_sas_allele=hl.tstr, exac_sas_maf=hl.tfloat, id=hl.tstr, minor_allele=hl.tstr, minor_allele_freq=hl.tfloat, phenotype_or_disease=hl.tint, pubmed=hl.tarray(hl.tint),
sas_allele=hl.tstr, sas_maf=hl.tfloat, somatic=hl.tint, start=hl.tint, strand=hl.tint),
),
context=hl.tstr, end=hl.tint, id=hl.tstr, input=hl.tstr,
intergenic_consequences=hl.tarray(
hl.tstruct(allele_num=hl.tint, consequence_terms=hl.tarray(hl.tstr), impact=hl.tstr, minimised=hl.tint, variant_allele=hl.tstr)),
most_severe_consequence=hl.tstr, motif_feature_consequences=hl.tarray(
hl.tstruct(allele_num=hl.tint, consequence_terms=hl.tarray(hl.tstr), high_inf_pos=hl.tstr,impact=hl.tstr, minimised=hl.tint, motif_feature_id=hl.tstr, motif_name=hl.tstr, motif_pos=hl.tint, motif_score_change=hl.tfloat, strand=hl.tint, variant_allele=hl.tstr)),
regulatory_feature_consequences=hl.tarray(hl.tstruct(
allele_num=hl.tint, biotype=hl.tstr, consequence_terms=hl.tarray(hl.tstr), impact=hl.tstr, minimised=hl.tint, regulatory_feature_id=hl.tstr, variant_allele=hl.tstr)
), seq_region_name=hl.tstr, start=hl.tint, strand=hl.tint, transcript_consequences=hl.tarray(hl.tstruct(
allele_num=hl.tint, amino_acids=hl.tstr, biotype=hl.tstr, canonical=hl.tint, ccds=hl.tstr, cdna_start=hl.tint, cdna_end=hl.tint, cds_end=hl.tint, cds_start=hl.tint,
codons=hl.tstr, consequence_terms=hl.tarray(hl.tstr), distance=hl.tint, domains=hl.tarray(hl.tstruct(db=hl.tstr, name=hl.tstr)), exon=hl.tstr,
gene_id=hl.tstr, gene_pheno=hl.tint, gene_symbol=hl.tstr, gene_symbol_source=hl.tstr, hgnc_id=hl.tint, hgvsc=hl.tstr, hgvsp=hl.tstr, hgvs_offset=hl.tint,
impact=hl.tstr, intron=hl.tstr, lof=hl.tstr, lof_flags=hl.tstr, lof_filter=hl.tstr, lof_info=hl.tstr, minimised=hl.tint, polyphen_prediction=hl.tstr, polyphen_score=hl.tfloat,
protein_end=hl.tint, protein_start=hl.tint, protein_id=hl.tstr, sift_prediction=hl.tstr, sift_score=hl.tfloat, strand=hl.tint, swissprot=hl.tstr, transcript_id=hl.tstr, trembl=hl.tstr, uniparc=hl.tstr, variant_allele=hl.tstr
)), variant_class=hl.tstr
)}
).rename({'f0': 'v', 'f1': 'context', 'f2': 'vep'})
ht.transmute(**hl.parse_variant(ht.v)).key_by('locus', 'alleles').write(raw_context_ht_path, overwrite)
def vep_context_ht(raw_context_ht_path: str, vep_context_ht_path: str, overwrite: bool = False):
ht_full = hl.read_table(raw_context_ht_path)
chunks = ('1-2', '3', '4', '5-6', '7-9', '10-12', '13-15', '16-18', '19-22', 'X-Y')
for i in chunks:
print(i)
ht = hl.filter_intervals(ht_full, [hl.parse_locus_interval(i)])
hl.vep(ht, vep_config).write(f'{root}/context/parts/Homo_sapiens_assembly19.fasta.snps_only.{i}.ht', overwrite=overwrite, stage_locally=True)
hts = [hl.read_table(f'{root}/context/parts/Homo_sapiens_assembly19.fasta.snps_only.{i}.ht') for i in chunks]
ht = hts[0].union(*hts[1:])
ht.write(vep_context_ht_path, overwrite)
def split_context_mt(raw_context_ht_path: str, coverage_ht_paths: Dict[str, str], methylation_ht_path: str,
context_ht_path: str, overwrite: bool = False) -> None:
raw_context_ht = hl.split_multi_hts(hl.read_table(raw_context_ht_path))
raw_context_ht.write(f'{raw_context_ht_path}.temp_split.ht', overwrite)
raw_context_ht = hl.read_table(f'{raw_context_ht_path}.temp_split.ht')
coverage_hts = {loc: hl.read_table(coverage_ht_path) for loc, coverage_ht_path in coverage_ht_paths.items()}
coverage_hts = {loc: coverage_ht.drop('#chrom', 'pos') if '#chrom' in list(coverage_ht.row) else coverage_ht
for loc, coverage_ht in coverage_hts.items()}
methylation_ht = hl.read_table(methylation_ht_path)
gerp_ht = hl.read_table(gerp_annotations_path)
raw_context_ht = raw_context_ht.annotate(
methylation=methylation_ht[raw_context_ht.locus],
coverage=hl.struct(**{loc: coverage_ht[raw_context_ht.locus] for loc, coverage_ht in coverage_hts.items()}),
gerp=gerp_ht[raw_context_ht.locus].S)
raw_context_ht.write(context_ht_path, overwrite)
def pre_process_data(ht: hl.Table, split_context_ht_path: str,
output_ht_path: str, overwrite: bool = False) -> None:
"""
Add annotations from VEP context Table to gnomAD data.
Function adds the following annotations:
- context
- methylation
- coverage
- gerp
- pass_filters
    Function drops `a_index`, `was_split`, and `colocated_variants` annotations from gnomAD data.
.. note::
Function expects that multiallelic variants in the VEP context Table have been split.
:param ht: gnomAD exomes or genomes public Hail Table.
:param split_context_ht_path: Path to VEP context Table.
:param output_ht_path: Path to output Table.
:param overwrite: Whether to overwrite existing data. Defaults to False.
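    Example (a minimal usage sketch; `gnomad_exomes_path` is a hypothetical path to the public gnomAD exomes sites Table):

        >>> exomes_ht = hl.read_table(gnomad_exomes_path)
        >>> pre_process_data(exomes_ht, context_ht_path, processed_exomes_ht_path)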
"""
context_ht = hl.read_table(split_context_ht_path).drop('a_index', 'was_split')
context_ht = context_ht.annotate(vep=context_ht.vep.drop('colocated_variants'))
ht.annotate(**context_ht[ht.key], pass_filters=hl.len(ht.filters) == 0).write(output_ht_path, overwrite)
def prepare_ht(ht: Union[hl.Table, hl.MatrixTable], trimer: bool = False, annotate_coverage: bool = True) -> Union[hl.Table, hl.MatrixTable]:
"""
Filter input Table and add annotations used in constraint calculations.
Function filters to SNPs, removes rows with undefined contexts, collapses strands
to deduplicate trimer or heptamer contexts, and annotates the input Table.
Function adds the following annotations:
- ref
- alt
- was_flipped
- methylation_level
- cpg
- transition
- variant_type
- variant_type_model
- exome_coverage
:param ht: Input Table to be annotated.
:param trimer: Whether to use trimers or heptamers. Defaults to False.
    :param annotate_coverage: Whether to annotate median exome coverage ('exome_coverage'). Defaults to True.
:return: Table with annotations.
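    Example (a minimal usage sketch; assumes the context Table at `context_ht_path` has been generated):

        >>> context_ht = hl.read_table(context_ht_path)
        >>> context_ht = prepare_ht(context_ht, trimer=True)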
"""
if trimer:
ht = trimer_from_heptamer(ht)
str_len = 3 if trimer else 7
if isinstance(ht, hl.Table):
ht = ht.annotate(ref=ht.alleles[0], alt=ht.alleles[1])
ht = ht.filter((hl.len(ht.ref) == 1) & (hl.len(ht.alt) == 1) & ht.context.matches(f'[ATCG]{{{str_len}}}'))
ht = annotate_variant_types(collapse_strand(ht), not trimer)
else:
ht = ht.annotate_rows(ref=ht.alleles[0], alt=ht.alleles[1])
ht = ht.filter_rows((hl.len(ht.ref) == 1) & (hl.len(ht.alt) == 1) & ht.context.matches(f'[ATCG]{{{str_len}}}'))
ht = annotate_variant_types(collapse_strand(ht), not trimer)
annotation = {
'methylation_level': hl.case().when(
ht.cpg & (ht.methylation.MEAN > 0.6), 2
).when(
ht.cpg & (ht.methylation.MEAN > 0.2), 1
).default(0)
}
if annotate_coverage:
annotation['exome_coverage'] = ht.coverage.exomes.median
return ht.annotate(**annotation) if isinstance(ht, hl.Table) else ht.annotate_rows(**annotation)
def filter_to_autosomes_par(ht: Union[hl.Table, hl.MatrixTable]) -> Union[hl.Table, hl.MatrixTable]:
    criteria = ht.locus.in_autosome_or_par()
    return ht.filter(criteria) if isinstance(ht, hl.Table) else ht.filter_rows(criteria)
def annotate_with_mu(ht: hl.Table, mutation_ht: hl.Table, output_loc: str = 'mu_snp',
                     keys: Tuple[str, ...] = ('context', 'ref', 'alt', 'methylation_level')) -> hl.Table:
"""
Annotate SNP mutation rate for the input Table.
:param ht: Input Table.
:param mutation_ht: Mutation rate Table.
:param output_loc: Name for mutational rate annotation. Defaults to 'mu_snp'.
:param keys: Common keys between mutation rate Table and input Table. Defaults to ('context', 'ref', 'alt', 'methylation_level').
:return: Table with mutational rate annotation added (default name for annotation is 'mu_snp').
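    Example (a minimal sketch; assumes the mutation rate Table exists and `prepared_ht` is the output of `prepare_ht`):

        >>> mutation_ht = hl.read_table(mutation_rate_ht_path)
        >>> ht = annotate_with_mu(prepared_ht, mutation_ht)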
"""
mu = hl.literal(mutation_ht.aggregate(hl.dict(hl.agg.collect(
(hl.struct(**{k: mutation_ht[k] for k in keys}), mutation_ht.mu_snp)))))
mu = mu.get(hl.struct(**{k: ht[k] for k in keys}))
return ht.annotate(**{output_loc: hl.case().when(hl.is_defined(mu), mu).or_error('Missing mu')})
def calculate_mu_by_downsampling(genome_ht: hl.Table, raw_context_ht: hl.MatrixTable,
recalculate_all_possible_summary: bool = True,
recalculate_all_possible_summary_unfiltered: bool = False,
omit_methylation: bool = False, count_singletons: bool = False,
remove_common_downsampled: bool = True, remove_common_ordinary: bool = False,
                                 grouping_variables: Optional[Tuple[str, ...]] = (), summary_file: Optional[str] = None
) -> hl.Table:
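    """
    Calculate SNP mutation rates per context, substitution, and methylation level from genome downsamplings.
    By default, counts rare variants (AC <= 5) in the genomes against all possible variants in the context Table,
    then scales the proportions observed in the 1,000-genome downsampling so that the genome-wide average matches
    a total mutation rate of 1.2e-8 per base per generation.
    """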
context_ht = filter_to_autosomes(remove_coverage_outliers(raw_context_ht))
genome_ht = filter_to_autosomes(remove_coverage_outliers(genome_ht))
if grouping_variables:
context_ht = context_ht.filter(hl.all(lambda x: x, [hl.is_defined(context_ht[x]) for x in grouping_variables]))
genome_ht = genome_ht.filter(hl.all(lambda x: x, [hl.is_defined(genome_ht[x]) for x in grouping_variables]))
else:
context_ht = filter_for_mu(context_ht)
genome_ht = filter_for_mu(genome_ht)
context_ht = context_ht.select('context', 'ref', 'alt', 'methylation_level', *grouping_variables)
genome_ht = genome_ht.select('context', 'ref', 'alt', 'methylation_level', 'freq', 'pass_filters', *grouping_variables)
genome_join = genome_ht[context_ht.key]
if remove_common_downsampled:
ac_cutoff = 5
downsampling_level = 1000
freq_index = hl.eval(
hl.zip_with_index(genome_ht.freq_meta).find(lambda f:
(f[1].get('downsampling') == str(downsampling_level)) &
(f[1].get('pop') == 'global') &
(f[1].get('group') == 'adj') &
(f[1].size() == 3)))[0]
# In the denominator, only keep variants not in the genome dataset, or with AC <= ac_cutoff and passing filters
context_ht = context_ht.filter(
hl.is_missing(genome_join) | ((genome_join.freq[freq_index].AC <= ac_cutoff) & genome_join.pass_filters)
)
# Keep only AC <= ac_cutoff in numerator
genome_ht = genome_ht.filter(genome_ht.freq[freq_index].AC <= ac_cutoff)
elif remove_common_ordinary:
af_cutoff = 0.001
context_ht = context_ht.filter(
hl.is_missing(genome_join) | (
(genome_join.freq[0].AF <= af_cutoff) & genome_join.pass_filters
)
)
# Keep only AF < af_cutoff in numerator
genome_ht = filter_by_frequency(genome_ht, 'below', frequency=af_cutoff, adj=True)
else:
context_ht = context_ht.filter(
hl.is_missing(genome_join) | genome_join.pass_filters
)
genome_ht = genome_ht.filter(genome_ht.pass_filters)
    if not summary_file:
        summary_file = all_possible_summary_pickle
all_possible_dtype = count_variants(context_ht, omit_methylation=omit_methylation,
additional_grouping=grouping_variables, return_type_only=True)
if recalculate_all_possible_summary:
all_possible = count_variants(context_ht, omit_methylation=omit_methylation,
additional_grouping=grouping_variables).variant_count
with hl.hadoop_open(summary_file, 'wb') as f:
pickle.dump(all_possible, f)
with hl.hadoop_open(summary_file, 'rb') as f:
all_possible = pickle.load(f)
if recalculate_all_possible_summary_unfiltered:
all_possible_unfiltered = count_variants(raw_context_ht.rows(), omit_methylation=omit_methylation).variant_count
all_possible_unfiltered = {x: y for x, y in list(all_possible_unfiltered.items()) if x.context is not None}
with hl.hadoop_open(all_possible_summary_unfiltered_pickle, 'wb') as f:
pickle.dump(all_possible_unfiltered, f)
# Uncomment this line and next few commented lines to back-calculate total_mu from the old mutation rate dataset
# all_possible_unfiltered = load_all_possible_summary(filtered=False)
genome_ht = count_variants(genome_ht, count_downsamplings=['global', 'nfe', 'afr'],
count_singletons=count_singletons,
additional_grouping=grouping_variables, omit_methylation=omit_methylation)
old_mu_data = get_old_mu_data()
ht = genome_ht.annotate(possible_variants=hl.literal(all_possible, dtype=all_possible_dtype)[genome_ht.key],
# possible_variants_unfiltered=hl.literal(all_possible_unfiltered, dtype=all_possible_dtype)[genome_ht.key],
old_mu_snp=old_mu_data[hl.struct(
context=genome_ht.context, ref=genome_ht.ref, alt=genome_ht.alt)].mu_snp)
ht = ht.persist()
total_bases = ht.aggregate(hl.agg.sum(ht.possible_variants)) // 3
# total_bases_unfiltered = ht.aggregate(hl.agg.sum(ht.possible_variants_unfiltered)) // 3
# total_mu = ht.aggregate(hl.agg.sum(ht.old_mu_snp * ht.possible_variants_unfiltered) / total_bases_unfiltered)
total_mu = 1.2e-08
correction_factors = ht.aggregate(total_mu / (hl.agg.array_sum(ht.downsampling_counts_global) / total_bases))
correction_factors_nfe = ht.aggregate(total_mu / (hl.agg.array_sum(ht.downsampling_counts_nfe) / total_bases))
correction_factors_afr = ht.aggregate(total_mu / (hl.agg.array_sum(ht.downsampling_counts_afr) / total_bases))
ht = annotate_variant_types(ht.annotate(downsamplings_frac_observed=ht.downsampling_counts_global / ht.possible_variants,
downsamplings_mu_snp=hl.literal(correction_factors) * ht.downsampling_counts_global / ht.possible_variants,
downsamplings_mu_nfe=hl.literal(correction_factors_nfe) * ht.downsampling_counts_nfe / ht.possible_variants,
downsamplings_mu_afr=hl.literal(correction_factors_afr) * ht.downsampling_counts_afr / ht.possible_variants))
downsamplings = list(map(lambda x: x[1], get_downsamplings(ht)))
index_1kg = downsamplings.index(1000)
return ht.annotate(proportion_observed_1kg=ht.downsampling_counts_global[index_1kg] / ht.possible_variants,
proportion_observed=ht.variant_count / ht.possible_variants,
mu_snp=ht.downsamplings_mu_snp[index_1kg],
mu_snp_nfe=ht.downsamplings_mu_nfe[index_1kg],
mu_snp_afr=ht.downsamplings_mu_afr[index_1kg])
def get_proportion_observed_by_coverage(exome_ht: hl.Table, context_ht: hl.Table, mutation_ht: hl.Table,
recompute_possible: bool = False, dataset: str = 'gnomad',
impose_high_af_cutoff_upfront: bool = True) -> hl.Table:
"""
Count the observed variants and possible variants by exome coverage.
:param exome_ht: Preprocessed exome Table.
:param context_ht: Preprocessed context Table.
:param mutation_ht: Preprocessed mutation rate Table.
:param recompute_possible: Whether to use context Table to recompute the number of possible variants instead of using a precomputed intermediate Table if it exists. Defaults to False.
:param dataset: Dataset to use when computing frequency index. Defaults to 'gnomad'.
:param impose_high_af_cutoff_upfront: Whether to remove high frequency alleles. Defaults to True.
:return: Table with observed variant and possible variant count.
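    Example (a minimal sketch; assumes the processed exome, context, and mutation rate Tables have been generated):

        >>> exome_ht = prepare_ht(hl.read_table(processed_exomes_ht_path), trimer=True)
        >>> context_ht = prepare_ht(hl.read_table(context_ht_path), trimer=True)
        >>> mutation_ht = hl.read_table(mutation_rate_ht_path)
        >>> po_coverage_ht = get_proportion_observed_by_coverage(exome_ht, context_ht, mutation_ht, recompute_possible=True)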
"""
exome_ht = add_most_severe_csq_to_tc_within_ht(exome_ht)
context_ht = add_most_severe_csq_to_tc_within_ht(context_ht)
context_ht = fast_filter_vep(context_ht).select('context', 'ref', 'alt', 'methylation_level', 'exome_coverage')
context_ht = context_ht.filter(hl.is_defined(context_ht.exome_coverage))
exome_ht = fast_filter_vep(exome_ht).select('context', 'ref', 'alt', 'methylation_level', 'exome_coverage', 'freq', 'pass_filters')
grouping = ('exome_coverage',)
af_cutoff = 0.001
exome_join = exome_ht[context_ht.key]
freq_index = exome_ht.freq_index_dict.collect()[0][dataset]
def keep_criteria(ht: Union[hl.Table, hl.expr.StructExpression]) -> hl.expr.BooleanExpression:
"""
Generate expression to keep PASS variants with an allele count greater than 0.
If `impose_high_af_cutoff_upfront` is True, only keep variants with an AF less than or equal to
`af_cutoff` (default is 0.001).
:param ht: Table or StructExpression of variants that have a defined key in the `context_ht`.
:return: BooleanExpression indicating if the row should be kept.
"""
crit = (ht.freq[freq_index].AC > 0) & ht.pass_filters
if impose_high_af_cutoff_upfront:
crit &= (ht.freq[freq_index].AF <= af_cutoff)
return crit
context_ht = context_ht.filter(hl.is_missing(exome_join) | keep_criteria(exome_join))
exome_ht = exome_ht.filter(keep_criteria(exome_ht))
possible_file = f'{root}/tmp/possible_coverage.ht'
if recompute_possible:
possible_ht = count_variants(context_ht, additional_grouping=grouping, force_grouping=True)
possible_ht = annotate_with_mu(possible_ht, mutation_ht)
possible_ht.transmute(possible_variants=possible_ht.variant_count).write(possible_file, True)
possible_ht = hl.read_table(possible_file)
ht = count_variants(exome_ht, additional_grouping=grouping, partition_hint=100, count_downsamplings=POPS,
impose_high_af_cutoff_here=not impose_high_af_cutoff_upfront)
ht = ht.join(possible_ht, 'outer')
return ht
def build_models(coverage_ht: hl.Table, trimers: bool = False, weighted: bool = False,
                 half_cutoff: bool = False) -> Tuple[Tuple[float, float], Dict[str, Tuple[float, float]]]:
"""
Build coverage model and plateau models.
This function builds plateau models to calibrate mutation rate estimates against the proportion observed
of each substitution, context, and methylation level in `coverage_ht` considering only high coverage sites,
or sites above a median coverage of `HIGH_COVERAGE_CUTOFF` (or half of `HIGH_COVERAGE_CUTOFF` if `half_cutoff`
is True). If using the output of `get_proportion_observed_by_coverage` as `coverage_ht`, the proportion observed
will be high-quality variants below 0.1% frequency at synonymous sites. Two plateau models are fit, one for CpG
transitions and one for the remainder of sites (transversions and non-CpG transitions).
    The x and y of the plateau models:
        x: `mu_snp` - mutation rate
        y: proportion observed ('variant_count' / 'possible_variants', or 'observed_{pop}' / 'possible_variants' per population)
    For low coverage sites, or sites below `HIGH_COVERAGE_CUTOFF` (or half of `HIGH_COVERAGE_CUTOFF` if `half_cutoff` is
    True), this function computes a coverage correction factor at base-level rather than exon-level resolution to reduce
    the inaccuracy of expected variant counts caused by low coverage at each base. The coverage model is built by first
    defining a metric: the number of observed variants divided by the total number of possible variants times the
    mutation rate, summed across all substitutions, contexts, and methylation levels. If using the output of
    `get_proportion_observed_by_coverage` as `coverage_ht`, the observed and possible variant counts will be at
    synonymous sites. The function computes this metric at high coverage sites as a global scaling factor, and divides
    the metric at each low coverage level by this scaling factor to create a scaled observed:expected ratio. The
    coverage model is then a linear regression of this scaled ratio on log10(coverage), used as a correction factor
    for low coverage sites.
    The x and y of the coverage model:
        x: log10('exome_coverage') at low coverage sites
        y: sum('variant_count') / (`high_coverage_scale_factor` * sum('possible_variants' * 'mu_snp')) at low coverage sites
    where `high_coverage_scale_factor` = sum('variant_count') / sum('possible_variants' * 'mu_snp') at high coverage sites
.. note::
This function expects that the input `coverage_ht` is the output of `get_proportion_observed_by_coverage`, and
therefore the following fields should be present in `coverage_ht`:
- context - trinucleotide genomic context
- ref - the middle base of `context`
- alt - the alternate base
- methylation_level - methylation level
- exome_coverage - median exome coverage at integer values between 1-100
- variant_count - the number of observed variants in the dataset for each substitution (`alt`), `context`,
`methylation_level`, and median exome coverage (`exome_coverage`)
- downsampling_counts_{pop} (pop defaults to `POPS`) - array of observed variant counts per population after downsampling
- mu_snp - mutation rate
- possible_variants - the number of possible variants in the dataset for each substitution (`alt`), `context`,
`methylation_level`, and median exome coverage (`exome_coverage`)
:param coverage_ht: Input coverage Table.
:param trimers: Whether the contexts were trimmed or not. Defaults to False.
:param weighted: Whether to weight the high coverage model (a linear regression model) by 'possible_variants'. Defaults to False.
:param half_cutoff: Whether to use half of `HIGH_COVERAGE_CUTOFF` as coverage cutoff. Otherwise `HIGH_COVERAGE_CUTOFF` will be used. Defaults to False.
:return: Coverage model and plateau models.
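    Example (a minimal sketch; assumes `po_coverage_ht_path` holds the output of `get_proportion_observed_by_coverage`):

        >>> coverage_ht = hl.read_table(po_coverage_ht_path)
        >>> coverage_model, plateau_models = build_models(coverage_ht, trimers=True, weighted=True)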
"""
keys = ['context', 'ref', 'alt', 'methylation_level', 'mu_snp']
    cov_cutoff = (HIGH_COVERAGE_CUTOFF // 2) if half_cutoff else HIGH_COVERAGE_CUTOFF
all_high_coverage_ht = coverage_ht.filter(coverage_ht.exome_coverage >= cov_cutoff)
agg_expr = {
'observed_variants': hl.agg.sum(all_high_coverage_ht.variant_count),
'possible_variants': hl.agg.sum(all_high_coverage_ht.possible_variants)
}
for pop in POPS:
agg_expr[f'observed_{pop}'] = hl.agg.array_sum(all_high_coverage_ht[f'downsampling_counts_{pop}'])
high_coverage_ht = all_high_coverage_ht.group_by(*keys).aggregate(**agg_expr)
high_coverage_ht = annotate_variant_types(high_coverage_ht, not trimers)
plateau_models = build_plateau_models_pop(high_coverage_ht, weighted=weighted)
high_coverage_scale_factor = all_high_coverage_ht.aggregate(
hl.agg.sum(all_high_coverage_ht.variant_count) /
hl.agg.sum(all_high_coverage_ht.possible_variants * all_high_coverage_ht.mu_snp))
all_low_coverage_ht = coverage_ht.filter((coverage_ht.exome_coverage < cov_cutoff) &
(coverage_ht.exome_coverage > 0))
low_coverage_ht = all_low_coverage_ht.group_by(log_coverage=hl.log10(all_low_coverage_ht.exome_coverage)).aggregate(
low_coverage_obs_exp=hl.agg.sum(all_low_coverage_ht.variant_count) /
(high_coverage_scale_factor * hl.agg.sum(all_low_coverage_ht.possible_variants * all_low_coverage_ht.mu_snp)))
coverage_model = build_coverage_model(low_coverage_ht)
# TODO: consider weighting here as well
return coverage_model, plateau_models
def add_most_severe_csq_to_tc_within_ht(t: Union[hl.Table, hl.MatrixTable]) -> Union[hl.Table, hl.MatrixTable]:
"""
Add most_severe_consequence annotation to 'transcript_consequences' within the vep annotation.
:param t: Input Table or MatrixTable.
:return: Input Table or MatrixTable with most_severe_consequence annotation added.
"""
annotation = t.vep.annotate(transcript_consequences=t.vep.transcript_consequences.map(
add_most_severe_consequence_to_consequence))
return t.annotate_rows(vep=annotation) if isinstance(t, hl.MatrixTable) else t.annotate(vep=annotation)
def take_one_annotation_from_tc_within_ht(t: Union[hl.Table, hl.MatrixTable]) -> Union[hl.Table, hl.MatrixTable]:
"""
Annotate 'transcript_consequences' using only the first consequence (index 0) of 'transcript_consequences' within the
vep annotation of the input Table.
:param t: Input Table or MatrixTable.
:return: Table or MatrixTable with only the first consequence for the 'transcript_consequences' annotation.
"""
annotation = t.vep.annotate(transcript_consequences=t.vep.transcript_consequences[0])
return t.annotate_rows(vep=annotation) if isinstance(t, hl.MatrixTable) else t.annotate(vep=annotation)
def get_proportion_observed(exome_ht: hl.Table, context_ht: hl.Table, mutation_ht: hl.Table,
plateau_models: Dict[str, Tuple[float, float]], coverage_model: Tuple[float, float],
recompute_possible: bool = False, remove_from_denominator: bool = True,
                            custom_model: Optional[str] = None, dataset: str = 'gnomad',
                            impose_high_af_cutoff_upfront: bool = True, half_cutoff: bool = False) -> hl.Table:
"""
Compute the expected number of variants and observed:expected ratio using plateau models and coverage model.
This function sums the number of possible variants times the mutation rate for all variants, and applies the calibration
model separately for CpG transitions and other sites. For sites with coverage lower than the coverage cutoff, the value obtained
from the previous step is multiplied by the coverage correction factor. These values are summed across the set of variants
of interest to obtain the expected number of variants.
A brief view of how to get the expected number of variants:
mu_agg = the number of possible variants * the mutation rate (all variants)
    adjusted_mutation_rate = sum(plateau model slope * mu_agg + plateau model intercept) (separately for CpG transitions and other sites)
if 0 < coverage < coverage cutoff:
coverage_correction = coverage_model slope * log10(coverage) + coverage_model intercept
expected_variants = sum(adjusted_mutation_rate * coverage_correction)
else:
expected_variants = sum(adjusted_mutation_rate)
The expected_variants are summed across the set of variants of interest to obtain the final expected number of variants.
Function adds the following annotations:
- variant_count - observed variant counts annotated by `count_variants` function grouped by groupings (output of `annotate_constraint_groupings`)
- adjusted_mutation_rate (including those for each population) - the sum of mutation rate adjusted by plateau models and possible variant counts grouped by groupings
- possible_variants (including those for each population) - the sum of possible variant counts derived from the context Table grouped by groupings
- expected_variants (including those for each population) - the sum of expected variant counts grouped by groupings
- mu - sum(mu_snp * possible_variant * coverage_correction) grouped by groupings
- obs_exp - observed:expected ratio
- annotations annotated by `annotate_constraint_groupings`
:param exome_ht: Exome site Table (output of `prepare_ht`) filtered to autosomes and pseudoautosomal regions.
:param context_ht: Context Table (output of `prepare_ht`) filtered to autosomes and pseudoautosomal regions.
:param mutation_ht: Mutation rate Table with 'mu_snp' field.
:param plateau_models: Linear models (output of `build_plateau_models_pop`), with the values of the dictionary formatted as a Tuple of intercept and slope, that calibrates mutation rate to proportion observed for high coverage exome. It includes models for CpG site, non-CpG site, and each population in `POPS`.
:param coverage_model: A linear model (output of `build_coverage_model`), formatted as a Tuple of intercept and slope, that calibrates a given coverage level to observed:expected ratio. It's a correction factor for low coverage sites.
:param recompute_possible: Whether to use context Table to recompute the number of possible variants instead of using a precomputed intermediate Table if it exists. Defaults to False.
    :param remove_from_denominator: Whether to remove an allele from the context Table (the denominator) when it appears in `exome_ht` but fails `keep_criteria` (i.e., is not a PASS variant with an allele count greater than 0), defaults to True.
:param custom_model: The customized model (one of "standard" or "worst_csq" for now), defaults to None.
:param dataset: Dataset to use when computing frequency index, defaults to 'gnomad'.
:param impose_high_af_cutoff_upfront: Whether to remove alleles with allele frequency larger than `af_cutoff` (0.001), defaults to True.
:param half_cutoff: Whether to use half of `HIGH_COVERAGE_CUTOFF` as coverage cutoff. Otherwise `HIGH_COVERAGE_CUTOFF` will be used, defaults to False.
:return: Table with `expected_variants` (expected variant counts) and `obs_exp` (observed:expected ratio) annotations.
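    Example (a minimal sketch; reuses the Tables and models from the functions above):

        >>> po_ht = get_proportion_observed(exome_ht, context_ht, mutation_ht,
        ...                                 plateau_models, coverage_model, recompute_possible=True)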
"""
exome_ht = add_most_severe_csq_to_tc_within_ht(exome_ht)
context_ht = add_most_severe_csq_to_tc_within_ht(context_ht)
if custom_model == 'syn_canonical':
context_ht = take_one_annotation_from_tc_within_ht(fast_filter_vep(context_ht))
context_ht = context_ht.transmute(transcript_consequences=context_ht.vep.transcript_consequences)
exome_ht = take_one_annotation_from_tc_within_ht(fast_filter_vep(exome_ht))
exome_ht = exome_ht.transmute(transcript_consequences=exome_ht.vep.transcript_consequences)
elif custom_model == 'worst_csq':
context_ht = process_consequences(context_ht)
context_ht = context_ht.transmute(worst_csq_by_gene=context_ht.vep.worst_csq_by_gene)
context_ht = context_ht.explode(context_ht.worst_csq_by_gene)
exome_ht = process_consequences(exome_ht)
exome_ht = exome_ht.transmute(worst_csq_by_gene=exome_ht.vep.worst_csq_by_gene)
exome_ht = exome_ht.explode(exome_ht.worst_csq_by_gene)
elif custom_model == 'tx_annotation':
tx_ht = load_tx_expression_data()
context_ht = context_ht.annotate(**tx_ht[context_ht.key])
context_ht = context_ht.explode(context_ht.tx_annotation)
tx_ht = load_tx_expression_data(context=False)
exome_ht = exome_ht.annotate(**tx_ht[exome_ht.key])
exome_ht = exome_ht.explode(exome_ht.tx_annotation)
elif custom_model == 'distance_to_splice':
context_ht = annotate_distance_to_splice(context_ht)
exome_ht = annotate_distance_to_splice(exome_ht)
# TODO:
# context_ht = context_ht.explode(context_ht.tx_annotation)
# exome_ht = exome_ht.explode(exome_ht.tx_annotation)
else:
context_ht = context_ht.transmute(transcript_consequences=context_ht.vep.transcript_consequences)
context_ht = context_ht.explode(context_ht.transcript_consequences)
exome_ht = exome_ht.transmute(transcript_consequences=exome_ht.vep.transcript_consequences)
exome_ht = exome_ht.explode(exome_ht.transcript_consequences)
context_ht, _ = annotate_constraint_groupings(context_ht, custom_model=custom_model)
exome_ht, grouping = annotate_constraint_groupings(exome_ht, custom_model=custom_model)
context_ht = context_ht.filter(hl.is_defined(context_ht.exome_coverage)).select(
'context', 'ref', 'alt', 'methylation_level', *grouping)
exome_ht = exome_ht.select(
'context', 'ref', 'alt', 'methylation_level', 'freq', 'pass_filters', *grouping)
af_cutoff = 0.001
freq_index = exome_ht.freq_index_dict.collect()[0][dataset]
def keep_criteria(ht):
crit = (ht.freq[freq_index].AC > 0) & ht.pass_filters & (ht.coverage > 0)
if impose_high_af_cutoff_upfront:
crit &= (ht.freq[freq_index].AF <= af_cutoff)
return crit
exome_join = exome_ht[context_ht.key]
if remove_from_denominator:
context_ht = context_ht.filter(hl.is_missing(exome_join) | keep_criteria(exome_join))
exome_ht = exome_ht.filter(keep_criteria(exome_ht))
possible_file = f'{root}/model/possible_data/possible_transcript_pop_{custom_model}.ht'
if recompute_possible:
ht = count_variants(context_ht, additional_grouping=grouping, partition_hint=2000, force_grouping=True)
ht = annotate_with_mu(ht, mutation_ht)
ht = ht.transmute(possible_variants=ht.variant_count)
ht = annotate_variant_types(ht.annotate(mu_agg=ht.mu_snp * ht.possible_variants))
model = hl.literal(plateau_models.total)[ht.cpg]
        cov_cutoff = (HIGH_COVERAGE_CUTOFF // 2) if half_cutoff else HIGH_COVERAGE_CUTOFF
ann_expr = {
'adjusted_mutation_rate': ht.mu_agg * model[1] + model[0],
'coverage_correction': hl.case()
.when(ht.coverage == 0, 0)
.when(ht.coverage >= cov_cutoff, 1)
.default(coverage_model[1] * hl.log10(ht.coverage) + coverage_model[0])
}
for pop in POPS:
pop_model = hl.literal(plateau_models[pop])
slopes = hl.map(lambda f: f[ht.cpg][1], pop_model)
intercepts = hl.map(lambda f: f[ht.cpg][0], pop_model)
ann_expr[f'adjusted_mutation_rate_{pop}'] = ht.mu_agg * slopes + intercepts
ht = ht.annotate(**ann_expr)
ann_expr = {
'expected_variants': ht.adjusted_mutation_rate * ht.coverage_correction,
'mu': ht.mu_agg * ht.coverage_correction
}
for pop in POPS:
ann_expr[f'expected_variants_{pop}'] = ht[f'adjusted_mutation_rate_{pop}'] * ht.coverage_correction
ht = ht.annotate(**ann_expr)
ht.write(possible_file, True)
possible_variants_ht = hl.read_table(possible_file)
ht = count_variants(exome_ht, additional_grouping=grouping, partition_hint=2000, force_grouping=True,
count_downsamplings=POPS, impose_high_af_cutoff_here=not impose_high_af_cutoff_upfront)
ht = ht.join(possible_variants_ht, 'outer')
ht.write(f'{root}/model/possible_data/all_data_transcript_pop_{custom_model}.ht', True)
ht = hl.read_table(f'{root}/model/possible_data/all_data_transcript_pop_{custom_model}.ht')
grouping.remove('coverage')
agg_expr = {
'variant_count': hl.agg.sum(ht.variant_count),
'adjusted_mutation_rate': hl.agg.sum(ht.adjusted_mutation_rate),
'possible_variants': hl.agg.sum(ht.possible_variants),
'expected_variants': hl.agg.sum(ht.expected_variants),
'mu': hl.agg.sum(ht.mu)
}
for pop in POPS:
agg_expr[f'adjusted_mutation_rate_{pop}'] = hl.agg.array_sum(ht[f'adjusted_mutation_rate_{pop}'])
agg_expr[f'expected_variants_{pop}'] = hl.agg.array_sum(ht[f'expected_variants_{pop}'])
agg_expr[f'downsampling_counts_{pop}'] = hl.agg.array_sum(ht[f'downsampling_counts_{pop}'])
ht = ht.group_by(*grouping).partition_hint(1000).aggregate(**agg_expr)
return ht.annotate(obs_exp=ht.variant_count / ht.expected_variants)
def finalize_dataset(po_ht: hl.Table, keys: Tuple[str, ...] = ('gene', 'transcript', 'canonical'),
                     n_partitions: int = 1000) -> hl.Table:
"""
Compute the pLI scores, 90% confidence interval around the observed:expected ratio, and z scores
for synonymous variants, missense variants, and predicted loss-of-function (pLoF) variants.
.. note::
The following annotations should be present in `po_ht`:
- modifier
- annotation
- variant_count
- mu
- possible_variants
- expected_variants
- expected_variants_{pop} (pop defaults to `POPS`)
- downsampling_counts_{pop} (pop defaults to `POPS`)
:param po_ht: Input Table with the number of expected variants (output of `get_proportion_observed`).
:param keys: The keys of the output Table, defaults to ('gene', 'transcript', 'canonical').
:param n_partitions: Desired number of partitions for `Table.repartition()`, defaults to 1000.
:return: Table with pLI scores, confidence interval of the observed:expected ratio, and z scores.
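    Example (a minimal sketch; `po_ht` is the output of `get_proportion_observed` with the default groupings):

        >>> constraint_ht = finalize_dataset(po_ht)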
"""
# This function aggregates over genes in all cases, as XG spans PAR and non-PAR X
po_ht = po_ht.repartition(n_partitions).persist()
# Getting classic LoF annotations (no LOFTEE)
classic_lof_annotations = hl.literal({'stop_gained', 'splice_donor_variant', 'splice_acceptor_variant'})
lof_ht_classic = po_ht.filter(classic_lof_annotations.contains(po_ht.annotation) &
((po_ht.modifier == 'HC') | (po_ht.modifier == 'LC')))
lof_ht_classic = collapse_lof_ht(lof_ht_classic, keys, False)
lof_ht_classic = lof_ht_classic.rename({x: f'{x}_classic' for x in list(lof_ht_classic.row_value)})
# Getting all LoF annotations (LOFTEE HC + OS)
lof_ht_classic_hc = po_ht.filter((po_ht.modifier == 'HC') | (po_ht.modifier == 'OS'))
lof_ht_classic_hc = collapse_lof_ht(lof_ht_classic_hc, keys, False)
lof_ht_classic_hc = lof_ht_classic_hc.rename({x: f'{x}_with_os' for x in list(lof_ht_classic_hc.row_value)})
# Getting all LoF annotations (LOFTEE HC)
lof_ht = po_ht.filter(po_ht.modifier == 'HC')
lof_ht = collapse_lof_ht(lof_ht, keys, False)
mis_ht = po_ht.filter(po_ht.annotation == 'missense_variant')
agg_expr = {
'obs_mis': hl.agg.sum(mis_ht.variant_count),
'exp_mis': hl.agg.sum(mis_ht.expected_variants),
'oe_mis': hl.agg.sum(mis_ht.variant_count) / hl.agg.sum(mis_ht.expected_variants),
'mu_mis': hl.agg.sum(mis_ht.mu),
'possible_mis': hl.agg.sum(mis_ht.possible_variants)
}
for pop in POPS:
agg_expr[f'exp_mis_{pop}'] = hl.agg.array_sum(mis_ht[f'expected_variants_{pop}'])
agg_expr[f'obs_mis_{pop}'] = hl.agg.array_sum(mis_ht[f'downsampling_counts_{pop}'])
mis_ht = mis_ht.group_by(*keys).aggregate(**agg_expr)
pphen_mis_ht = po_ht.filter(po_ht.modifier == 'probably_damaging')
pphen_mis_ht = pphen_mis_ht.group_by(*keys).aggregate(obs_mis_pphen=hl.agg.sum(pphen_mis_ht.variant_count),
exp_mis_pphen=hl.agg.sum(pphen_mis_ht.expected_variants),
oe_mis_pphen=hl.agg.sum(pphen_mis_ht.variant_count) / hl.agg.sum(pphen_mis_ht.expected_variants),
possible_mis_pphen=hl.agg.sum(pphen_mis_ht.possible_variants))
syn_ht = po_ht.filter(po_ht.annotation == 'synonymous_variant').key_by(*keys)
agg_expr = {
'obs_syn': hl.agg.sum(syn_ht.variant_count),
'exp_syn': hl.agg.sum(syn_ht.expected_variants),
'oe_syn': hl.agg.sum(syn_ht.variant_count) / hl.agg.sum(syn_ht.expected_variants),
'mu_syn': hl.agg.sum(syn_ht.mu),
'possible_syn': hl.agg.sum(syn_ht.possible_variants)
}
for pop in POPS:
agg_expr[f'exp_syn_{pop}'] = hl.agg.array_sum(syn_ht[f'expected_variants_{pop}'])
agg_expr[f'obs_syn_{pop}'] = hl.agg.array_sum(syn_ht[f'downsampling_counts_{pop}'])
syn_ht = syn_ht.group_by(*keys).aggregate(**agg_expr)
ht = lof_ht_classic.annotate(**mis_ht[lof_ht_classic.key], **pphen_mis_ht[lof_ht_classic.key],
**syn_ht[lof_ht_classic.key], **lof_ht[lof_ht_classic.key],
**lof_ht_classic_hc[lof_ht_classic.key])
syn_cis = oe_confidence_interval(ht, ht.obs_syn, ht.exp_syn, prefix='oe_syn')
mis_cis = oe_confidence_interval(ht, ht.obs_mis, ht.exp_mis, prefix='oe_mis')
lof_cis = oe_confidence_interval(ht, ht.obs_lof, ht.exp_lof, prefix='oe_lof')
ht = ht.annotate(**syn_cis[ht.key], **mis_cis[ht.key], **lof_cis[ht.key])
return calculate_all_z_scores(ht) # .annotate(**oe_confidence_interval(ht, ht.obs_lof, ht.exp_lof)[ht.key])
def collapse_lof_ht(lof_ht: hl.Table, keys: Tuple[str, ...], calculate_pop_pLI: bool = False) -> hl.Table:
"""
Collapse the `lof_ht` by `keys` and annotate pLI scores and observed:expected ratio for pLoF variants.
Function sums the number of observed pLoF variants, possible pLoF variants, and expected pLoF variants
across all the combinations of `keys`, and uses the expected variant counts and observed variant counts
to compute the pLI scores and observed:expected ratio.
The following annotations are added to the output Table:
- obs_lof - the sum of observed pLoF variants grouped by `keys`
- mu_lof - the sum of mutation rate at pLoF variants grouped by `keys`
- possible_lof - possible number of pLoF variants grouped by `keys`
- exp_lof - expected number of pLoF variants grouped by `keys`
- exp_lof_{pop} (pop defaults to `POPS`) - expected number of pLoF variants per population grouped by `keys`
- obs_lof_{pop} (pop defaults to `POPS`) - observed number of pLoF variants per population grouped by `keys`
- oe_lof - observed:expected ratio for pLoF variants (oe_lof=lof_ht.obs_lof / lof_ht.exp_lof)
- annotations added by function `pLI()`
.. note::
The following annotations should be present in `lof_ht`:
- variant_count
- mu
- possible_variants
- expected_variants
:param lof_ht: Table with specific pLoF annotations.
:param keys: The keys used to collapse `lof_ht` and use as keys for the output Table.
:param calculate_pop_pLI: Whether to calculate the pLI score for each population, defaults to False.
:return: A collapsed Table with pLI scores and observed:expected ratio for pLoF variants.
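    Example (a minimal sketch mirroring the LOFTEE HC call in `finalize_dataset`):

        >>> lof_ht = po_ht.filter(po_ht.modifier == 'HC')
        >>> lof_ht = collapse_lof_ht(lof_ht, ('gene', 'transcript', 'canonical'))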
"""
agg_expr = {
'obs_lof': hl.agg.sum(lof_ht.variant_count),
'mu_lof': hl.agg.sum(lof_ht.mu),
'possible_lof': hl.agg.sum(lof_ht.possible_variants),
'exp_lof': hl.agg.sum(lof_ht.expected_variants)
}
for pop in POPS:
agg_expr[f'exp_lof_{pop}'] = hl.agg.array_sum(lof_ht[f'expected_variants_{pop}'])
agg_expr[f'obs_lof_{pop}'] = hl.agg.array_sum(lof_ht[f'downsampling_counts_{pop}'])
lof_ht = lof_ht.group_by(*keys).aggregate(**agg_expr).persist()
lof_ht = lof_ht.filter(lof_ht.exp_lof > 0)
if calculate_pop_pLI:
pop_lengths = get_all_pop_lengths(lof_ht, 'obs_lof_')
print(pop_lengths)
for pop_length, pop in pop_lengths:
print(f'Calculating pLI for {pop}...')
plis = []
for i in range(8, pop_length):
print(i)
ht = lof_ht.filter(lof_ht[f'exp_lof_{pop}'][i] > 0)
pli_ht = pLI(ht, ht[f'obs_lof_{pop}'][i], ht[f'exp_lof_{pop}'][i])
plis.append(pli_ht[lof_ht.key])
lof_ht = lof_ht.annotate(**{
f'pLI_{pop}': [pli.pLI for pli in plis],
f'pRec_{pop}': [pli.pRec for pli in plis],
f'pNull_{pop}': [pli.pNull for pli in plis],
})
return lof_ht.annotate(
**pLI(lof_ht, lof_ht.obs_lof, lof_ht.exp_lof)[lof_ht.key],
oe_lof=lof_ht.obs_lof / lof_ht.exp_lof).key_by(*keys)
def annotate_constraint_groupings(ht: Union[hl.Table, hl.MatrixTable],
                                  custom_model: Optional[str] = None) -> Tuple[Union[hl.Table, hl.MatrixTable], List[str]]:
"""
Add constraint annotations to be used for groupings.
Function adds the following annotations:
- annotation - could be 'most_severe_consequence' of either 'worst_csq_by_gene' or 'transcript_consequences', or 'csq' of 'tx_annotation'
- modifier - classic lof annotation, LOFTEE annotation, or PolyPhen annotation
- gene
- coverage
- expressed (added when custom model specified as tx_annotation)
- transcript (added when custom model isn't specified as worst_csq or tx_annotation)
- canonical (added when custom model isn't specified as worst_csq or tx_annotation)
    .. note::
        The input `ht` must already be exploded on the relevant consequence annotation
        (e.g. 'transcript_consequences', 'worst_csq_by_gene', or 'tx_annotation').
:param ht: Input Table or MatrixTable.
:param custom_model: The customized model (one of "standard" or "worst_csq" for now), defaults to None.
:return: A tuple of input Table or MatrixTable with grouping annotations added and the names of added annotations.
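    Example (a minimal sketch; `context_ht` must already be exploded on 'transcript_consequences'):

        >>> context_ht, grouping = annotate_constraint_groupings(context_ht)
        >>> context_ht = context_ht.select('context', 'ref', 'alt', 'methylation_level', *grouping)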
"""
if custom_model == 'worst_csq':
groupings = {
'annotation': ht.worst_csq_by_gene.most_severe_consequence,
'modifier': hl.case()
.when(hl.is_defined(ht.worst_csq_by_gene.lof),
ht.worst_csq_by_gene.lof)
.when(hl.is_defined(ht.worst_csq_by_gene.polyphen_prediction),
ht.worst_csq_by_gene.polyphen_prediction)
.default('None'),
'gene': ht.worst_csq_by_gene.gene_symbol,
'coverage': ht.exome_coverage
}
elif custom_model == 'tx_annotation':
groupings = {
'annotation': ht.tx_annotation.csq,
'modifier': ht.tx_annotation.lof,
'gene': ht.tx_annotation.symbol,
'expressed': hl.case(missing_false=True).when(
ht.tx_annotation.mean_expression >= 0.9, 'high').when(
ht.tx_annotation.mean_expression > 0.1, 'medium').when(
hl.is_defined(ht.tx_annotation.mean_expression), 'low').default('missing'),
'coverage': ht.exome_coverage
}
else:
groupings = {
'annotation': ht.transcript_consequences.most_severe_consequence,
'modifier': hl.case()
.when(hl.is_defined(ht.transcript_consequences.lof),
ht.transcript_consequences.lof)
.when(hl.is_defined(ht.transcript_consequences.polyphen_prediction),
ht.transcript_consequences.polyphen_prediction)
.default('None'),
'transcript': ht.transcript_consequences.transcript_id,
'gene': ht.transcript_consequences.gene_symbol,
'canonical': hl.or_else(ht.transcript_consequences.canonical == 1, False),
'coverage': ht.exome_coverage
}
    if custom_model == 'splice_region':
        # Group by the splice-site distance annotated by `annotate_distance_to_splice`
        groupings['distance_splice'] = ht.nearest_splice.dist
ht = ht.annotate(**groupings) if isinstance(ht, hl.Table) else ht.annotate_rows(**groupings)
return ht, list(groupings.keys())
# Model building
def build_coverage_model(coverage_ht: hl.Table) -> Tuple[float, float]:
"""
Calibrate coverage model.
This function uses linear regression to build a model of log10(coverage) to this scaled ratio as a correction
factor for low coverage sites.
.. note::
The following annotations should be present in `coverage_ht`:
- low_coverage_obs_exp - an observed:expected ratio for a given coverage level
- log_coverage - log10 coverage
:param coverage_ht: Low coverage Table.
:return: Tuple with intercept and slope of the model.
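    Example (a minimal sketch; applying the returned model the same way `get_proportion_observed` does, where
    `coverage_expr` is a hypothetical coverage expression):

        >>> coverage_model = build_coverage_model(low_coverage_ht)
        >>> correction = coverage_model[1] * hl.log10(coverage_expr) + coverage_model[0]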
"""
return tuple(coverage_ht.aggregate(hl.agg.linreg(coverage_ht.low_coverage_obs_exp, [1, coverage_ht.log_coverage])).beta)
def build_plateau_models(ht: hl.Table, weighted: bool = False) -> Dict[str, Tuple[float, float]]:
"""
Calibrate high coverage model.
The function fits two models, one for CpG transitions and one for the remainder of sites, to calibrate
from the mutation rate to proportion observed.
.. note::
The following annotations should be present in `ht`:
        - observed_variants - observed variant counts for each combination of 'context', 'ref', 'alt', 'methylation_level', 'mu_snp'
        - possible_variants - possible variant counts for each combination of 'context', 'ref', 'alt', 'methylation_level', 'mu_snp'
- cpg - whether it's a cpg site or not
- mu_snp - mutation rate
:param ht: High coverage Table.
:param weighted: Whether to generalize the model to weighted least squares using 'possible_variants'.
    :return: A dictionary keyed by CpG status, mapping to the (intercept, slope) of the fitted model.
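    Example (a minimal sketch; `high_coverage_ht` is grouped and aggregated as in `build_models`):

        >>> models = build_plateau_models(high_coverage_ht, weighted=True)
        >>> intercept, slope = models[True]   # CpG transitions
        >>> intercept, slope = models[False]  # all other sites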
"""
# TODO: try square weighting
ht = ht.annotate(high_coverage_proportion_observed=ht.observed_variants / ht.possible_variants)
return ht.aggregate(hl.agg.group_by(ht.cpg,
hl.agg.linreg(ht.high_coverage_proportion_observed, [1, ht.mu_snp],
weight=ht.possible_variants if weighted else None)
).map_values(lambda x: x.beta))
def build_plateau_models_pop(ht: hl.Table, weighted: bool = False) -> Dict[str, Tuple[float, float]]:
"""
Calibrate high coverage model (returns intercept and slope).