-
Notifications
You must be signed in to change notification settings - Fork 26
/
summary_stats.py
671 lines (606 loc) · 27 KB
/
summary_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
# noqa: D100
import logging
from typing import Dict, Optional, Set
import hail as hl
from gnomad.utils.filtering import filter_low_conf_regions
from gnomad.utils.vep import (
LOF_CSQ_SET,
add_most_severe_consequence_to_consequence,
filter_vep_to_canonical_transcripts,
filter_vep_to_mane_select_transcripts,
get_most_severe_consequence_for_summary,
process_consequences,
)
logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def freq_bin_expr(
freq_expr: hl.expr.ArrayExpression, index: int = 0
) -> hl.expr.StringExpression:
"""
Return frequency string annotations based on input AC or AF.
.. note::
- Default index is 0 because function assumes freq_expr was calculated with `annotate_freq`.
- Frequency index 0 from `annotate_freq` is frequency for all pops calculated on adj genotypes only.
:param freq_expr: Array of structs containing frequency information.
:param index: Which index of freq_expr to use for annotation. Default is 0.
:return: StringExpression containing bin name based on input AC or AF.
"""
return (
hl.case(missing_false=True)
.when(freq_expr[index].AC == 0, "Not found")
.when(freq_expr[index].AC == 1, "Singleton")
.when(freq_expr[index].AC == 2, "Doubleton")
.when(freq_expr[index].AC <= 5, "AC 3 - 5")
.when(freq_expr[index].AF < 1e-4, "AC 6 - 0.01%")
.when(freq_expr[index].AF < 1e-3, "0.01% - 0.1%")
.when(freq_expr[index].AF < 1e-2, "0.1% - 1%")
.when(freq_expr[index].AF < 1e-1, "1% - 10%")
.when(freq_expr[index].AF > 0.95, ">95%")
.default("10% - 95%")
)
def get_summary_counts_dict(
locus_expr: hl.expr.LocusExpression,
allele_expr: hl.expr.ArrayExpression,
lof_expr: hl.expr.StringExpression,
no_lof_flags_expr: hl.expr.BooleanExpression,
most_severe_csq_expr: hl.expr.StringExpression,
prefix_str: str = "",
) -> Dict[str, hl.expr.Int64Expression]:
"""
Return dictionary containing containing counts of multiple variant categories.
Categories are:
- Number of variants
- Number of indels
- Number of SNVs
- Number of LoF variants
- Number of LoF variants that pass LOFTEE
- Number of LoF variants that pass LOFTEE without any flgs
- Number of LoF variants annotated as 'other splice' (OS) by LOFTEE
- Number of LoF variants that fail LOFTEE
- Number of missense variants
- Number of synonymous variants
- Number of autosomal variants
- Number of allosomal variants
.. warning::
Assumes `allele_expr` contains only two variants (multi-allelics have been split).
:param locus_expr: LocusExpression.
:param allele_expr: ArrayExpression containing alleles.
:param lof_expr: StringExpression containing LOFTEE annotation.
:param no_lof_flags_expr: BooleanExpression indicating whether LoF variant has any flags.
:param most_severe_csq_expr: StringExpression containing most severe consequence annotation.
:param prefix_str: Desired prefix string for category names. Default is empty str.
:return: Dict of categories and counts per category.
"""
logger.warning("This function expects that multi-allelic variants have been split!")
return {
f"{prefix_str}num_variants": hl.agg.count(),
f"{prefix_str}indels": hl.agg.count_where(
hl.is_indel(allele_expr[0], allele_expr[1])
),
f"{prefix_str}snps": hl.agg.count_where(
hl.is_snp(allele_expr[0], allele_expr[1])
),
f"{prefix_str}LOF": hl.agg.count_where(hl.is_defined(lof_expr)),
f"{prefix_str}pass_loftee": hl.agg.count_where(lof_expr == "HC"),
f"{prefix_str}pass_loftee_no_flag": hl.agg.count_where(
(lof_expr == "HC") & (no_lof_flags_expr)
),
f"{prefix_str}loftee_os": hl.agg.count_where(lof_expr == "OS"),
f"{prefix_str}fail_loftee": hl.agg.count_where(lof_expr == "LC"),
f"{prefix_str}num_missense": hl.agg.count_where(
most_severe_csq_expr == "missense_variant"
),
f"{prefix_str}num_synonymous": hl.agg.count_where(
most_severe_csq_expr == "synonymous_variant"
),
f"{prefix_str}num_autosomal_variants": hl.agg.filter(
locus_expr.in_autosome_or_par(), hl.agg.count()
),
f"{prefix_str}num_allosomal_variants": hl.agg.filter(
locus_expr.in_x_nonpar() | locus_expr.in_y_nonpar(), hl.agg.count()
),
}
def get_summary_ac_dict(
ac_expr: hl.expr.Int64Expression,
lof_expr: hl.expr.StringExpression,
no_lof_flags_expr: hl.expr.BooleanExpression,
most_severe_csq_expr: hl.expr.StringExpression,
) -> Dict[str, hl.expr.Int64Expression]:
"""
Return dictionary containing containing total allele counts for variant categories.
Categories are:
- All variants
- LoF variants
- LoF variants that pass LOFTEE
- LoF variants that pass LOFTEE without any flags
- LoF variants that are annotate as 'other splice' (OS) by LOFTEE
- LoF variants that fail LOFTEE
- Missense variants
- Synonymous variants
.. warning::
Assumes `allele_expr` contains only two variants (multi-allelics have been split).
:param allele_expr: ArrayExpression containing alleles.
:param lof_expr: StringExpression containing LOFTEE annotation.
:param no_lof_flags_expr: BooleanExpression indicating whether LoF variant has any flags.
:return: Dict of variant categories and their total allele counts.
"""
logger.warning("This function expects that multi-allelic variants have been split!")
return {
"total_ac": hl.agg.sum(ac_expr),
"total_ac_LOF": hl.agg.filter(hl.is_defined(lof_expr), hl.agg.sum(ac_expr)),
"total_ac_pass_loftee": hl.agg.filter(lof_expr == "HC", hl.agg.sum(ac_expr)),
"total_ac_pass_loftee_no_flag": hl.agg.filter(
(lof_expr == "HC") & (no_lof_flags_expr), hl.agg.sum(ac_expr)
),
"total_ac_loftee_os": hl.agg.filter(lof_expr == "OS", hl.agg.sum(ac_expr)),
"total_ac_fail_loftee": hl.agg.filter(lof_expr == "LC", hl.agg.sum(ac_expr)),
"total_ac_missense": hl.agg.filter(
most_severe_csq_expr == "missense_variant", hl.agg.sum(ac_expr)
),
"total_ac_synonymous": hl.agg.filter(
most_severe_csq_expr == "synonymous_variant", hl.agg.sum(ac_expr)
),
}
def get_summary_counts(
ht: hl.Table,
freq_field: str = "freq",
filter_field: str = "filters",
filter_decoy: bool = False,
canonical_only: bool = True,
mane_select_only: bool = False,
index: int = 0,
) -> hl.Table:
"""
Generate a struct with summary counts across variant categories.
Summary counts:
- Number of variants
- Number of indels
- Number of SNVs
- Number of LoF variants
- Number of LoF variants that pass LOFTEE (including with LoF flags)
- Number of LoF variants that pass LOFTEE without LoF flags
- Number of OS (other splice) variants annotated by LOFTEE
- Number of LoF variants that fail LOFTEE filters
Also annotates Table's globals with total variant counts.
Before calculating summary counts, function:
- Filters out low confidence regions
- Uses the most severe consequence
- Filters to canonical transcripts (if `canonical_only` is True) or MANE Select
transcripts (if `mane_select_only` is True)
Assumes that:
- Input HT is annotated with VEP.
- Multiallelic variants have been split and/or input HT contains bi-allelic variants only.
- freq_expr was calculated with `annotate_freq`.
- (Frequency index 0 from `annotate_freq` is frequency for all pops calculated on adj genotypes only.)
:param ht: Input Table.
:param freq_field: Name of field in HT containing frequency annotation (array of structs). Default is "freq".
:param filter_field: Name of field in HT containing variant filter information. Default is "filters".
:param canonical_only: Whether to filter to canonical transcripts. Default is True.
:param mane_select_only: Whether to filter to MANE Select transcripts. Default is False.
:param filter_decoy: Whether to filter decoy regions. Default is False.
:param index: Which index of freq_expr to use for annotation. Default is 0.
:return: Table grouped by frequency bin and aggregated across summary count categories.
"""
if canonical_only and mane_select_only:
raise ValueError(
"Only one of `canonical_only` and `mane_select_only` can be True."
)
logger.info("Checking if multi-allelic variants have been split...")
max_alleles = ht.aggregate(hl.agg.max(hl.len(ht.alleles)))
if max_alleles > 2:
logger.info("Splitting multi-allelics and VEP transcript consequences...")
ht = hl.split_multi_hts(ht)
logger.info("Filtering to PASS variants in high confidence regions...")
ht = ht.filter((hl.len(ht[filter_field]) == 0))
ht = filter_low_conf_regions(ht, filter_decoy=filter_decoy)
if canonical_only:
logger.info("Filtering to canonical transcripts...")
ht = filter_vep_to_canonical_transcripts(ht)
elif mane_select_only:
logger.info("Filtering to mane select transcripts...")
ht = filter_vep_to_mane_select_transcripts(ht)
logger.info("Getting VEP summary annotations...")
ht = get_most_severe_consequence_for_summary(ht)
logger.info("Annotating with frequency bin information...")
ht = ht.annotate(freq_bin=freq_bin_expr(ht[freq_field], index))
logger.info(
"Annotating HT globals with total counts/total allele counts per variant"
" category..."
)
summary_counts = ht.aggregate(
hl.struct(
**get_summary_counts_dict(
ht.locus,
ht.alleles,
ht.lof,
ht.no_lof_flags,
ht.most_severe_csq,
prefix_str="total_",
)
)
)
summary_ac_counts = ht.aggregate(
hl.struct(
**get_summary_ac_dict(
ht[freq_field][index].AC,
ht.lof,
ht.no_lof_flags,
ht.most_severe_csq,
)
)
)
ht = ht.annotate_globals(
summary_counts=summary_counts.annotate(**summary_ac_counts)
)
return ht.group_by("freq_bin").aggregate(
**get_summary_counts_dict(
ht.locus,
ht.alleles,
ht.lof,
ht.no_lof_flags,
ht.most_severe_csq,
)
)
def get_an_criteria(
mt: hl.MatrixTable,
samples_by_sex: Optional[Dict[str, int]] = None,
meta_root: str = "meta",
sex_field: str = "sex_imputation.sex_karyotype",
xy_str: str = "XY",
xx_str: str = "XX",
freq_field: str = "freq",
freq_index: int = 0,
an_proportion_cutoff: float = 0.8,
) -> hl.expr.BooleanExpression:
"""
Generate criteria to filter samples based on allele number (AN).
Uses allele number as proxy for call rate.
:param mt: Input MatrixTable.
:param samples_by_sex: Optional Dictionary containing number of samples (value) for each sample sex (key).
:param meta_root: Name of field in MatrixTable containing sample metadata information. Default is 'meta'.
:param sex_field: Name of field in MatrixTable containing sample sex assignment. Defualt is 'sex_imputation.sex_karyotype'.
:param xy_str: String marking whether a sample has XY sex. Default is 'XY'.
:param xx_str: String marking whether a sample has XX sex. Default is 'XX'.
:param freq_field: Name of field in MT that contains frequency information. Default is 'freq'.
:param freq_index: Which index of frequency struct to use. Default is 0.
:param an_proportion_cutoff: Desired allele number proportion cutoff. Default is 0.8.
"""
if samples_by_sex is None:
samples_by_sex = mt.aggregate_cols(hl.agg.counter(mt[meta_root][sex_field]))
return (
hl.case()
.when(
mt.locus.in_autosome_or_par(),
mt[freq_field][freq_index].AN
>= an_proportion_cutoff * 2 * sum(samples_by_sex.values()),
)
.when(
mt.locus.in_x_nonpar(),
mt[freq_field][freq_index].AN
>= an_proportion_cutoff
* (samples_by_sex[xy_str] + samples_by_sex[xx_str] * 2),
)
.when(
mt.locus.in_y_nonpar(),
mt[freq_field][freq_index].AN
>= an_proportion_cutoff * samples_by_sex[xy_str],
)
.or_missing()
)
def get_tx_expression_expr(
key_expr: hl.expr.StructExpression,
tx_ht: hl.Table,
csq_expr: hl.expr.StructExpression,
gene_field: str = "ensg",
csq_field: str = "csq",
tx_struct: str = "tx_annotation",
) -> hl.expr.Float64Expression:
"""
Pull appropriate transcript expression annotation struct given a specific locus and alleles (provided in `key_expr`).
Assumes that `key_expr` contains a locus and alleles.
Assumes that multi-allelic variants have been split in both `tx_ht` and `key_expr`.
:param row_key_expr: StructExpression containing locus and alleles to search in `tx_ht`.
:param tx_ht: Input Table containing transcript expression information.
:param csq_expr: Input StructExpression that contains VEP consequence information.
:param gene_field: Field in `csq_expr` that contains gene ID.
:param csq_field: Field in `csq_expr` that contains `most_severe_consequence` annotation.
:param tx_struct: StructExpression that contains transcript expression information.
:return: StructExpression that contains transcript expression information for given gene ID in `csq_expr`.
"""
return hl.find(
lambda csq: (csq[gene_field] == csq_expr.gene_id)
& (csq[csq_field] == csq_expr.most_severe_consequence),
tx_ht[key_expr][tx_struct],
)
def default_generate_gene_lof_matrix(
mt: hl.MatrixTable,
tx_ht: Optional[hl.Table],
high_expression_cutoff: float = 0.9,
low_expression_cutoff: float = 0.1,
filter_field: str = "filters",
freq_field: str = "freq",
freq_index: int = 0,
additional_csq_set: Set[str] = {"missense_variant", "synonymous_variant"},
all_transcripts: bool = False,
filter_an: bool = False,
filter_to_rare: bool = False,
pre_loftee: bool = False,
lof_csq_set: Set[str] = LOF_CSQ_SET,
remove_ultra_common: bool = False,
) -> hl.MatrixTable:
"""
Generate loss-of-function gene matrix.
Used to generate summary metrics on LoF variants.
:param mt: Input MatrixTable.
:param tx_ht: Optional Table containing expression levels per transcript.
:param high_expression_cutoff: Minimum mean proportion expressed cutoff for a transcript to be considered highly expressed. Default is 0.9.
:param low_expression_cutoff: Upper mean proportion expressed cutoff for a transcript to lowly expressed. Default is 0.1.
:param filter_field: Name of field in MT that contains variant filters. Default is 'filters'.
:param freq_field: Name of field in MT that contains frequency information. Default is 'freq'.
:param freq_index: Which index of frequency struct to use. Default is 0.
:param additional_csq_set: Set of additional consequences to keep. Default is {'missense_variant', 'synonymous_variant'}.
:param all_transcripts: Whether to use all transcripts instead of just the transcript with most severe consequence. Default is False.
:param filter_an: Whether to filter using allele number as proxy for call rate. Default is False.
:param filter_to_rare: Whether to filter to rare (AF < 5%) variants. Default is False.
:param pre_loftee: Whether LoF consequences have been annotated with LOFTEE. Default is False.
:param lof_csq_set: Set of LoF consequence strings. Default is {"splice_acceptor_variant", "splice_donor_variant", "stop_gained", "frameshift_variant"}.
:param remove_ultra_common: Whether to remove ultra common (AF > 95%) variants. Default is False.
"""
logger.info("Filtering to PASS variants...")
filt_criteria = hl.len(mt[filter_field]) == 0
if filter_an:
logger.info(
"Using AN (as a call rate proxy) to filter to variants that meet a minimum"
" call rate..."
)
mt = mt.filter_rows(get_an_criteria(mt))
if remove_ultra_common:
logger.info("Removing ultra common (AF > 95%) variants...")
filt_criteria &= mt[freq_field][freq_index].AF < 0.95
if filter_to_rare:
logger.info("Filtering to rare (AF < 5%) variants...")
filt_criteria &= mt[freq_field][freq_index].AF < 0.05
mt = mt.filter_rows(filt_criteria)
if all_transcripts:
logger.info("Exploding transcript_consequences field...")
explode_field = "transcript_consequences"
else:
logger.info(
"Adding most severe (worst) consequence and expoding worst_csq_by_gene"
" field..."
)
mt = process_consequences(mt)
explode_field = "worst_csq_by_gene"
if additional_csq_set:
logger.info("Including these consequences: %s", additional_csq_set)
additional_cats = hl.literal(additional_csq_set)
if pre_loftee:
logger.info("Filtering to LoF consequences: %s", lof_csq_set)
lof_cats = hl.literal(lof_csq_set)
criteria = lambda x: lof_cats.contains(
add_most_severe_consequence_to_consequence(x).most_severe_consequence
)
if additional_csq_set:
criteria = lambda x: lof_cats.contains(
add_most_severe_consequence_to_consequence(x).most_severe_consequence
) | additional_cats.contains(
add_most_severe_consequence_to_consequence(x).most_severe_consequence
)
else:
logger.info("Filtering to LoF variants that pass LOFTEE with no LoF flags...")
criteria = lambda x: (x.lof == "HC") & hl.is_missing(x.lof_flags)
if additional_csq_set:
criteria = lambda x: (x.lof == "HC") & hl.is_missing(
x.lof_flags
) | additional_cats.contains(
add_most_severe_consequence_to_consequence(x).most_severe_consequence
)
csqs = mt.vep[explode_field].filter(criteria)
mt = mt.select_rows(mt[freq_field], csqs=csqs)
mt = mt.explode_rows(mt.csqs)
annotation_expr = {
"gene_id": mt.csqs.gene_id,
"gene": mt.csqs.gene_symbol,
"indel": hl.is_indel(mt.alleles[0], mt.alleles[1]),
"most_severe_consequence": mt.csqs.most_severe_consequence,
}
if tx_ht:
logger.info("Adding transcript expression annotation...")
tx_annotation = get_tx_expression_expr(
mt.row_key,
tx_ht,
mt.csqs,
).mean_proportion
annotation_expr["expressed"] = (
hl.case()
.when(tx_annotation >= high_expression_cutoff, "high")
.when(tx_annotation > low_expression_cutoff, "medium")
.when(hl.is_defined(tx_annotation), "low")
.default("missing")
)
else:
annotation_expr["transcript_id"] = mt.csqs.transcript_id
annotation_expr["canonical"] = hl.is_defined(mt.csqs.canonical)
mt = mt.annotate_rows(**annotation_expr)
return (
mt.group_rows_by(*list(annotation_expr.keys()))
.aggregate_rows(
n_sites=hl.agg.count(),
n_sites_array=hl.agg.array_sum(mt.freq.map(lambda x: hl.int(x.AC > 0))),
classic_caf=hl.agg.sum(mt[freq_field][freq_index].AF),
max_af=hl.agg.max(mt[freq_field][freq_index].AF),
classic_caf_array=hl.agg.array_sum(mt[freq_field].map(lambda x: x.AF)),
)
.aggregate_entries(
num_homs=hl.agg.count_where(mt.GT.is_hom_var()),
num_hets=hl.agg.count_where(mt.GT.is_het()),
defined_sites=hl.agg.count_where(hl.is_defined(mt.GT)),
)
.result()
)
def get_het_hom_summary_dict(
csq_set: Set[str],
most_severe_csq_expr: hl.expr.StringExpression,
defined_sites_expr: hl.expr.Int64Expression,
num_homs_expr: hl.expr.Int64Expression,
num_hets_expr: hl.expr.Int64Expression,
pop_expr: hl.expr.StringExpression,
) -> Dict[str, hl.expr.Int64Expression]:
"""
Generate dictionary containing summary counts.
Summary counts are:
- Number of sites with defined genotype calls
- Number of samples with heterozygous calls
- Number of samples with homozygous calls
Function has option to generate counts by population.
:param csq_set: Set containing transcript consequence string(s).
:param most_severe_csq_expr: StringExpression containing most severe consequence.
:param defined_sites_expr: Int64Expression containing number of sites with defined genotype calls.
:param num_homs_expr: Int64Expression containing number of samples with homozygous genotype calls.
:param num_hets_expr: Int64Expression containing number of samples with heterozygous genotype calls.
:param pop_expr: StringExpression containing sample population labels.
:return: Dictionary of summary annotation names and their values.
"""
csq_filter_expr = hl.literal(csq_set).contains(most_severe_csq_expr)
return {
"no_alt_calls": hl.agg.count_where(
(csq_filter_expr)
& (defined_sites_expr > 0)
& (num_homs_expr + num_hets_expr == 0)
),
"obs_het": hl.agg.count_where(
(csq_filter_expr) & (num_homs_expr == 0) & (num_hets_expr > 0)
),
"obs_hom": hl.agg.count_where((csq_filter_expr) & (num_homs_expr > 0)),
"defined": hl.agg.count_where((csq_filter_expr) & (defined_sites_expr > 0)),
"pop_no_alt_calls": hl.agg.group_by(
pop_expr,
hl.agg.count_where(
(csq_filter_expr)
& (defined_sites_expr > 0)
& (num_homs_expr + num_hets_expr == 0)
),
),
"pop_obs_het": hl.agg.group_by(
pop_expr,
hl.agg.count_where(
(csq_filter_expr) & (num_homs_expr == 0) & (num_hets_expr > 0)
),
),
"pop_obs_hom": hl.agg.group_by(
pop_expr,
hl.agg.count_where((csq_filter_expr) & (num_homs_expr > 0)),
),
"pop_defined": hl.agg.group_by(
pop_expr,
hl.agg.count_where((csq_filter_expr) & (defined_sites_expr > 0)),
),
}
def default_generate_gene_lof_summary(
mt: hl.MatrixTable,
collapse_indels: bool = False,
tx: bool = False,
lof_csq_set: Set[str] = LOF_CSQ_SET,
meta_root: str = "meta",
pop_field: str = "pop",
filter_loftee: bool = False,
) -> hl.Table:
"""
Generate summary counts for loss-of-function (LoF), missense, and synonymous variants.
Also calculates p, proportion of of haplotypes carrying a putative LoF (pLoF) variant,
and observed/expected (OE) ratio of samples with homozygous pLoF variant calls.
Summary counts are (all per gene):
- Number of samples with no pLoF variants.
- Number of samples with heterozygous pLoF variants.
- Number of samples with homozygous pLoF variants.
- Total number of sites with genotype calls.
- All of the above stats grouped by population.
Assumes MT was created using `default_generate_gene_lof_matrix`.
.. note::
Assumes LoF variants in MT were filtered (LOFTEE pass and no LoF flag only).
If LoF variants have not been filtered and `filter_loftee` is True,
expects MT has the row annotation `vep`.
:param mt: Input MatrixTable.
:param collapse_indels: Whether to collapse indels. Default is False.
:param tx: Whether input MT has transcript expression data. Default is False.
:param lof_csq_set: Set containing LoF transcript consequence strings. Default is LOF_CSQ_SET.
:param meta_root: String indicating top level name for sample metadata. Default is 'meta'.
:param pop_field: String indiciating field with sample population assignment information. Default is 'pop'.
:param filter_loftee: Filters to LOFTEE pass variants (and no LoF flags) only. Default is False.
:return: Table with het/hom summary counts.
"""
if collapse_indels:
grouping = ["gene_id", "gene", "most_severe_consequence"]
if tx:
grouping.append("expressed")
else:
grouping.extend(["transcript_id", "canonical"])
mt = (
mt.group_rows_by(*grouping)
.aggregate_rows(
n_sites=hl.agg.sum(mt.n_sites),
n_sites_array=hl.agg.array_sum(mt.n_sites_array),
classic_caf=hl.agg.sum(mt.classic_caf),
max_af=hl.agg.max(mt.max_af),
classic_caf_array=hl.agg.array_sum(mt.classic_caf_array),
)
.aggregate_entries(
num_homs=hl.agg.sum(mt.num_homs),
num_hets=hl.agg.sum(mt.num_hets),
defined_sites=hl.agg.sum(mt.defined_sites),
)
.result()
)
if filter_loftee:
lof_ht = get_most_severe_consequence_for_summary(mt.rows())
mt = mt.filter_rows(
hl.is_defined(lof_ht[mt.row_key].lof)
& (lof_ht[mt.row_key].lof == "HC")
& (lof_ht[mt.row_key].no_lof_flags)
)
ht = mt.annotate_rows(
lof=hl.struct(
**get_het_hom_summary_dict(
csq_set=lof_csq_set,
most_severe_csq_expr=mt.most_severe_consequence,
defined_sites_expr=mt.defined_sites,
num_homs_expr=mt.num_homs,
num_hets_expr=mt.num_hets,
pop_expr=mt[meta_root][pop_field],
),
),
missense=hl.struct(
**get_het_hom_summary_dict(
csq_set={"missense_variant"},
most_severe_csq_expr=mt.most_severe_consequence,
defined_sites_expr=mt.defined_sites,
num_homs_expr=mt.num_homs,
num_hets_expr=mt.num_hets,
pop_expr=mt[meta_root][pop_field],
),
),
synonymous=hl.struct(
**get_het_hom_summary_dict(
csq_set={"synonymous_variant"},
most_severe_csq_expr=mt.most_severe_consequence,
defined_sites_expr=mt.defined_sites,
num_homs_expr=mt.num_homs,
num_hets_expr=mt.num_hets,
pop_expr=mt[meta_root][pop_field],
),
),
).rows()
ht = ht.annotate(
p=(1 - hl.sqrt(hl.float64(ht.lof.no_alt_calls) / ht.lof.defined)),
pop_p=hl.dict(
hl.array(ht.lof.pop_defined).map(
lambda x: (
x[0],
1 - hl.sqrt(hl.float64(ht.lof.pop_no_alt_calls.get(x[0])) / x[1]),
)
)
),
)
ht = ht.annotate(exp_hom_lof=ht.lof.defined * ht.p * ht.p)
return ht.annotate(oe=ht.lof.obs_hom / ht.exp_hom_lof)