In [1]:
import hail as hl
import gnomad

In [2]:
def create_grch38_revel_info(path, ow = False, give_object = False):
    revel_information = hl.import_table("gs://gnomad-wphu/revel_grch38_all_chromosomes.csv", delimiter=",", types={'hg19_pos':hl.tint,'grch38_pos':hl.tstr,'REVEL': hl.tfloat64})
    revel_information_grch38 = revel_information.drop("hg19_pos")
    revel_information_grch38 = revel_information_grch38.filter(revel_information_grch38.grch38_pos.contains("."),keep=False)
    revel_information_grch38 = revel_information_grch38.annotate(grch38_pos_int = hl.int(revel_information_grch38.grch38_pos))
    revel_information_grch38 = revel_information_grch38.transmute(grch38_pos = revel_information_grch38.grch38_pos_int)
    revel_information_grch38 = revel_information_grch38.transmute(chr = "chr" + revel_information_grch38.chr)
    revel_information_grch38 = revel_information_grch38.annotate(locus = hl.locus(revel_information_grch38.chr, revel_information_grch38.grch38_pos, reference_genome="GRCh38"))
    revel_information_grch38 = revel_information_grch38.annotate(alleles = hl.array([revel_information_grch38.ref, revel_information_grch38.alt]))
    revel_information_grch38 = revel_information_grch38.select("locus","alleles","REVEL", "aaref", "aaalt")
    revel_information_grch38 = revel_information_grch38.key_by("locus", "alleles")
    if ow:
        revel_information_grch38.write(path, overwrite=ow)
    if give_object:
        return revel_information_grch38
    
def load_grch38_revel_info():
    return(hl.read_table("gs://gnomad-wphu/revel_annotations_grch38.ht"))

In [3]:
from gnomad.utils.vcf import ht_to_vcf_mt
def get_chr():
    info_ht = gnomad_vars.select()
    info_ht = info_ht.filter(info_ht.locus.contig=="chr1")
    info_ht = info_ht.head(100000000)
    hl.export_vcf(info_ht, "gs://gnomad-wphu/info_split_large_chr1.vcf")

In [4]:
def create_cadd_info():
    cadd = hl.import_table("gs://gnomad-wphu/gnomad.genomes.r3.0.indel.tsv", comment = "#", types={'Pos':hl.tint32, "RawScore":hl.tfloat, "PHRED":hl.tfloat})
    cadd = cadd.transmute(Chrom = "chr" + cadd.Chrom)
    cadd = cadd.annotate(locus = hl.locus(cadd.Chrom, cadd.Pos, reference_genome="GRCh38"))
    cadd = cadd.annotate(alleles = hl.array([cadd.Ref, cadd.Alt]))
    cadd = cadd.select("locus","alleles","RawScore", "PHRED")
    cadd = cadd.key_by("locus", "alleles")
    return cadd


In [5]:
def load_CADD(path, n_partitions, force_bgz = False):
    column_names = {'f0': 'chrom', 'f1': 'pos', 'f2': 'ref', 'f3': 'alt', 'f4': 'RawScore', 'f5': 'PHRED'}
    types = {'f0': hl.tstr, 'f1': hl.tint32, 'f4': hl.tfloat32, 'f5': hl.tfloat32}
    cadd_ht = hl.import_table(path, comment="#", no_header=True, types=types, min_partitions=n_partitions, force_bgz = force_bgz)
    cadd_ht = cadd_ht.rename(column_names)
    chrom = hl.format("chr%s", cadd_ht.chrom)
    locus = hl.locus(chrom, cadd_ht.pos, reference_genome="GRCh38")
    alleles = hl.array([cadd_ht.ref, cadd_ht.alt])
    cadd_ht = cadd_ht.transmute(locus=locus, alleles=alleles)
    cadd_union_ht = cadd_ht.head(0)
    for contigs in (range(1, 10), list(range(10, 23)) + ["X", "Y", "MT"]):
        contigs = ["chr%s" % contig for contig in contigs]
        cadd_ht_subset = cadd_ht.filter(hl.array(list(map(str, contigs))).contains(cadd_ht.locus.contig))
        cadd_union_ht = cadd_union_ht.union(cadd_ht_subset)

    cadd_union_ht = cadd_union_ht.select("locus", "alleles", "RawScore", "PHRED")
    cadd_union_ht = cadd_union_ht.key_by("locus", "alleles")
    cadd_union_ht = cadd_union_ht.annotate_globals(source_file_path = path)

    cadd_union_ht.describe()

    return cadd_union_ht

def make_unified_CADD():
    snvs = hl.read_table("gs://gnomad-wphu/CADD-v1.6-SNVs.ht")
    release3_indels = hl.read_table("gs://gnomad-wphu/CADD-v1.6-indels-updated.ht")
    raw31_indels = hl.read_table("gs://gnomad-wphu/CADD-indels-gnomad.3.1.ht")
    raw31_complex = hl.read_table("gs://gnomad-wphu/CADD-1.6-gnomad-complex-variants.ht")
    unified = snvs.head(0)
    unified = unified.union(snvs,release3_indels,raw31_indels, raw31_complex)
    unified = unified.annotate_globals(source_file_path = {"snvs":"gs://gnomad-wphu/CADD-v1.6-SNVs.ht",
                                                         "v3-indels":"gs://gnomad-wphu/CADD-v1.6-indels-updated.ht",
                                                         "v3.1-indels":"gs://gnomad-wphu/CADD-indels-gnomad.3.1.ht",
                                                         "v3.1-complex":"gs://gnomad-wphu/CADD-1.6-gnomad-complex-variants.ht"
                                                        })
    #unified.describe()
    return unified

def convert_CADD_indels_64_32():
    release3_indels = hl.read_table("gs://gnomad-wphu/CADD-v1.6-indels.ht")
    release3_indels = release3_indels.transmute(RawScore = hl.float32(release3_indels.RawScore))
    release3_indels = release3_indels.transmute(PHRED = hl.float32(release3_indels.PHRED))
    release3_indels.describe()
    release3_indels.write("gs://gnomad-wphu/CADD-v1.6-indels-updated.ht", overwrite = True)
    
#unified_CADD = make_unified_CADD()
#unified_CADD = unified_CADD.write("gs://gnomad-wphu/complete-CADD-v1.6-annotations.ht", overwrite=True)

In [6]:
def export_for_CADD_analysis(hl_tbl,path):
    export = hl_tbl.select()
    export = export.filter(export.locus.contig=="chrM",keep=False)
    hl.methods.export_vcf(export, path)

In [7]:
def combine_splice_ai():
    recode = {f"{i}":f"chr{i}" for i in (list(range(1, 23)) + ['X', 'Y'])}
    splice_snps_skip_invalid = hl.import_vcf("gs://gnomad-wphu/splice_ai_data/splice_ai_data/genome_scores_v1.3_ds.20a701bc58ab45b59de2576db79ac8d0/spliceai_scores.masked.snv.hg38.vcf.gz",
                                force_bgz= True,
                                min_partitions=3000,
                                reference_genome='GRCh38', contig_recoding=recode, skip_invalid_loci= True
                               )
    splice_snps_skip_invalid.annotate_globals(source_file_path="gs://gnomad-wphu/splice_ai_data/splice_ai_data/genome_scores_v1.3_ds.20a701bc58ab45b59de2576db79ac8d0/spliceai_scores.masked.snv.hg38.vcf.gz")
    
    splice_indels_skip_invalid = hl.import_vcf("gs://gnomad-wphu/splice_ai_data/gnomAD_v3.1_SpliceAI_scores-selected/spliceai_scores.masked.gnomad_indel.hg38.vcf.gz",
                                   force_bgz=True,
                                   reference_genome='GRCh38', contig_recoding=recode,skip_invalid_loci=True,
                                   min_partitions=1000)
    splice_indels_skip_invalid.annotate_globals(source_file_path = "gs://gnomad-wphu/splice_ai_data/gnomAD_v3.1_SpliceAI_scores-selected/spliceai_scores.masked.gnomad_indel.hg38.vcf.gz")
    
    spliceAi_info_skip_invalid = splice_snps_skip_invalid.union_rows(splice_indels_skip_invalid)
    return spliceAi_info_skip_invalid

def annotate_spliceAi(mt):
    delta_scores = mt.info.SpliceAI[0].split(delim="\\|")[2:6]
    splice_split = mt.info.annotate(
        SpliceAI=hl.map(lambda x: hl.float32(x), delta_scores)
    ).rename({"SpliceAI":"splice_ai"})
    mt = mt.annotate_rows(info=splice_split)

    # Annotate info.max_DS with the max of DS_AG, DS_AL, DS_DG, DS_DL in info.
    # delta_score array is |DS_AG|DS_AL|DS_DG|DS_DL
    consequences = hl.literal(
        ["acceptor_gain", "acceptor_loss", "donor_gain", "donor_loss"]
    )
    mt = mt.annotate_rows(info=mt.info.annotate(max_ds=hl.max(mt.info.splice_ai)))
    mt = mt.annotate_rows(
        info=mt.info.annotate(
            splice_consequence=hl.if_else(
                mt.info.max_ds > 0,
                consequences[mt.info.splice_ai.index(mt.info.max_ds)],
                "no_consequence",
            )
        )
    )

    return mt
#spliceAi_info_skip_invalid = combine_splice_ai()
#spliceAi_info_skip_invalid = annotate_spliceAi(spliceAi_info_skip_invalid)
#spliceAi_info_skip_invalid.write("gs://gnomad-wphu/spliceai-scores-updated.ht", overwrite=True)


In [8]:
#chr, pos, ref, alt, refAA, altAA, strand_1pos_0neg, trinucleotide_context, UCSC_gene, ExAC_coverage,primateDL_score
def create_primate_ai_info(write = False, path = "", rewrite = False):
    primate_ai= hl.import_table("gs://gnomad-wphu/PrimateAI_scores_v0.2_hg38.tsv.gz",
                                          force=True, comment="#", skip_blank_lines=True,
                                          types={"pos":hl.tint32, 'primateDL_score':hl.tfloat32, 'ExAC_coverage':hl.tfloat32}
                                         )
    primate_ai = primate_ai.annotate_globals(source_file_path="gs://gnomad-wphu/PrimateAI_scores_v0.2_hg38.tsv.gz")
    primate_ai = primate_ai.annotate(locus = hl.locus(primate_ai.chr,primate_ai.pos, reference_genome="GRCh38"))
    primate_ai = primate_ai.annotate(alleles = hl.array([primate_ai.ref, primate_ai.alt]))
    primate_ai = primate_ai.select("locus", "alleles", "primateDL_score", "ExAC_coverage","strand_1pos_0neg","refAA", "altAA", "trinucleotide_context", "UCSC_gene")
    primate_ai = primate_ai.key_by("locus","alleles")
    if primate_ai.n_partitions() < 10:
        primate_ai = primate_ai.repartition(500)
    if write:
        if overwrite:
            primate_ai.write(path, overwrite = rewrite)
        else:
            try:
                primate_ai.write(path)
            except Exception as e: print(e)
    primate_ai.describe()
    return primate_ai


In [9]:
gnomad_variants = hl.read_table("gs://gnomad/annotations/hail-0.2/ht/genomes_v3.1/gnomad_genomes_v3.1_info.split.ht")
gnomad_indels = hl.read_table("gs://gnomad-wphu/gnomad_indels.ht")
gnomad_variants_v3 = hl.read_table("gs://gnomad/annotations/hail-0.2/ht/genomes_v3/gnomad_genomes_v3_info.split.ht")
gnomad3_release = hl.read_table("gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht")

#seqr_annotations_tbl = hl.read_table("gs://seqr-reference-data/GRCh38/all_reference_data/v2/combined_reference_data_grch38-2.0.3.ht")
#gnomad_vars_in_seqr = gnomad_variants.semi_join(seqr_annotations_tbl)
#gnomad_vars_not_in_seqr = gnomad_variants.anti_join(seqr_annotations_tbl)

#cadd_tbl = hl.read_table("gs://seqr-reference-data/GRCh38/CADD/CADD_snvs_and_indels.v1.4.ht")
#CADD_indels = create_cadd_indel_info()
CADD_indels = hl.read_table("gs://gnomad-wphu/CADD-v1.6-indels-updated.ht")
gnomad_indels_not_in_CADD = hl.read_table("gs://gnomad-wphu/gnomad-indels-anti-CADD.ht")
gnomad_indels_in_CADD = hl.read_table("gs://gnomad-wphu/gnomad-indels-semi-CADD.ht")
CADD_snps = hl.read_table("gs://gnomad-wphu/CADD-v1.6-SNVs.ht")
cadd = hl.read_table("gs://gnomad-wphu/complete-CADD-v1.6-annotations.ht")

primate_ai_info_old = hl.read_table("gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.ht")
primate_ai_info = hl.read_table("gs://gnomad-wphu/primate-ai-info.ht")
gnomad_vars_not_in_primate_ai = hl.read_table("gs://gnomad-wphu/gnomad-vars-anti-primate.ht")


#revel_information = hl.import_table("gs://gnomad-wphu/revel_grch38_all_chromosomes.csv", delimiter=",", types={'hg19_pos':hl.tint,'grch38_pos':hl.tstr,'REVEL': hl.tfloat64})
revel_information_grch38 = hl.read_table("gs://gnomad-wphu/revel_annotations_grch38.ht")
gnomad_vars_not_in_revel = hl.read_table("gs://gnomad-wphu/gnomad-vars-anti-revel.ht")
gnomad_vars_in_revel = hl.read_table("gs://gnomad-wphu/gnomad-vars-semi-revel.ht")

spliceAi_info_skip_invalid = hl.read_matrix_table("gs://gnomad-wphu/spliceai-scores-updated.ht")
spliceAi_snps_skip_invalid = hl.read_matrix_table("gs://gnomad-wphu/splice_snps_skip_invalid.ht")
spliceAi_indels_skip_invalid = hl.read_matrix_table("gs://gnomad-wphu/splice_indels_skip_invalid.ht")

full_annotations = hl.read_table("gs://gnomad/annotations/hail-0.2/ht/genomes_v3.1/gnomad_genomes_v3.1.analyst_annotations.ht")


Initializing Hail with default parameters...
Running on Apache Spark version 2.4.5
SparkUI available at http://wphu-m.c.broad-mpg-gnomad.internal:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.57-582b2e31b8bd
LOGGING: writing to /home/hail/hail-20201207-1818-0.2.57-582b2e31b8bd.log


In [50]:
#gnomad_vars_in_revel = gnomad_variants.semi_join(revel_information_grch38)
#gnomad_vars_not_in_revel = gnomad_variants.anti_join(revel_information_grch38)

#gnomad_vars_not_in_revel = gnomad_vars_not_in_revel.checkpoint("gs://gnomad-wphu/gnomad-vars-anti-revel.ht")
#gnomad_indels_in_revel = gnomad_vars_in_revel.checkpoint("gs://gnomad-wphu/gnomad-vars-semi-revel.ht")

#gnomad_indels = gnomad_indels.checkpoint("gs://gnomad-wphu/gnomad-indels-v3.1.ht")
#gnomad_indels_not_in_CADD = gnomad_indels.anti_join(CADD_indels)
#gnomad_indels_in_CADD = gnomad_indels.semi_join(CADD_indels)

#gnomad_indels_not_in_CADD = gnomad_indels_not_in_CADD.checkpoint("gs://gnomad-wphu/gnomad-indels-anti-CADD.ht")
#gnomad_indels_in_CADD = gnomad_indels_in_CADD.checkpoint("gs://gnomad-wphu/gnomad-indels-semi-CADD.ht")


#gnomad_indels = gnomad_indels.checkpoint("gs://gnomad-wphu/gnomad_indels.ht", overwrite = True)
#gnomad_indels.count()
#gnomad_indels.write("gs://gnomad-wphu/gnomad_indels.ht", overwrite = True)

#gnomad3_release = hl.read_table("gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht")
#gnomad3_release.describe()

#gnomad3_release_indels = gnomad3_release.filter(
                            #hl.is_indel(gnomad3_release.alleles[0], gnomad3_release.alleles[1]))

#gnomad3_release_indels.count()

In [17]:
def annotate_gnomad_v31(list_of_annotations, gnomad_v31_tbl):
    import re
    global_annotations = {}
    for annotation in list_of_annotations:
        annotation = re.sub(r'(_| |-)',"",annotation.lower())
        print(annotation)
        if annotation == "cadd":
            cadd = hl.read_table("gs://gnomad-wphu/complete-CADD-v1.6-annotations.ht")
            cadd = cadd.transmute(cadd = hl.struct(raw_score = cadd.RawScore,
                                                   phred = cadd.PHRED
                                                  ))
            #CADD = hl.read_table()
            #gnomad_v31_tbl = gnomad_v31_tbl.join(CADD.transmute(CADD = hl.struct(RawScore = CADD.RawScore, PHRED = CADD.PHRED)), how="left")
            gnomad_v31_tbl = gnomad_v31_tbl.join(cadd, how = "left")
            global_annotations["cadd"]="gs://gnomad-wphu/complete-CADD-v1.6-annotations.ht"
            
        elif annotation == "revel":
            revel = hl.read_table("gs://gnomad-wphu/revel_annotations_grch38.ht")
            revel = revel.transmute(revel = hl.struct(revel_score = revel.REVEL,
                                                      ref_aa = revel.aaref,
                                                      alt_aa = revel.aaalt
                                                     ))
            gnomad_v31_tbl = gnomad_v31_tbl.join(revel, how = "left")
            global_annotations["revel"] = "gs://gnomad-wphu/revel_annotations_grch38.ht"
            
        elif annotation == "spliceai":
            spliceai = hl.read_matrix_table("gs://gnomad-wphu/spliceai-scores-updated.ht")
            #spliceai = spliceai.annotate_rows(info = spliceai.info.annotate(
                                                #risd = spliceai.rsid,
                                                #qual = spliceai.qual,
                                                #filters = spliceai.filters
                                                #)
                                             #)
            spliceai = spliceai.rename({"info" : "splice_ai"})
            gnomad_v31_tbl = gnomad_v31_tbl.join(spliceai.make_table(), how = "left")
            global_annotations["splice_ai"] = "gs://gnomad-wphu/spliceai-scores-updated.ht"
            
        elif annotation == "primateai":
            primateai = hl.read_table("gs://gnomad-wphu/primate-ai-info.ht")
            primateai = primateai.transmute(primate_ai = hl.struct(primate_ai_score = primateai.primateDL_score))
            gnomad_v31_tbl = gnomad_v31_tbl.join(primateai, how = "left")
            global_annotations["primate_ai"] = "gs://gnomad-wphu/primate-ai-info.ht"
    gnomad_v31_tbl = gnomad_v31_tbl.select_globals()
    gnomad_v31_tbl = gnomad_v31_tbl.annotate_globals(annotation_file_path=global_annotations)
    gnomad_v31_tbl.describe()
    gnomad_v31_tbl = gnomad_v31_tbl.select("cadd", "revel", "splice_ai", "primate_ai")
    return gnomad_v31_tbl
        
result = annotate_gnomad_v31(["CADD", "REVEL", "SPLICE-AI", "PRIMATE-AI"], gnomad_variants)
result.describe()


cadd


2020-12-07 18:29:04 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'alleles' -> 'alleles_1'
    'locus' -> 'locus_1'


1570602305
revel


2020-12-07 18:38:54 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'alleles' -> 'alleles_1'
    'locus' -> 'locus_1'


1571019842
spliceai


2020-12-07 18:52:42 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


1584070311
primateai


2020-12-07 19:11:31 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'alleles' -> 'alleles_1'
    'locus' -> 'locus_1'


1584091437
----------------------------------------
Global fields:
    'annotation_file_path': dict<str, str> 
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'info': struct {
        ReadPosRankSum: float64, 
        MQRankSum: float64, 
        QUALapprox: int64, 
        VarDP: int32, 
        SB: array<int32>, 
        MQ: float64, 
        QD: float32, 
        FS: float64, 
        AS_ReadPosRankSum: float64, 
        AS_MQRankSum: float64, 
        AS_QUALapprox: int64, 
        AS_VarDP: int32, 
        AS_MQ: float64, 
        AS_QD: float32, 
        AS_SB_TABLE: array<int32>, 
        AS_FS: float64, 
        AC_raw: int32, 
        AC: int32, 
        AS_pab_max: float64
    } 
    'lowqual': bool 
    'AS_lowqual': bool 
    'a_index': int32 
    'was_split': bool 
    'old_locus': locus<GRCh38> 
    'old_alleles': array<str> 
    'cadd': struct {
        raw_score: float32, 
        phred: float32
    } 
    

In [18]:
#r = gnomad_variants.anti_join(full_annotations)
#r.show()

Unnamed: 0_level_0,Unnamed: 1_level_0,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0
locus,alleles,ReadPosRankSum,MQRankSum,QUALapprox,VarDP,SB,MQ,QD,FS,AS_ReadPosRankSum,AS_MQRankSum,AS_QUALapprox,AS_VarDP,AS_MQ,AS_QD,AS_SB_TABLE,AS_FS,AC_raw,AC,AS_pab_max,lowqual,AS_lowqual,a_index,was_split,old_locus,old_alleles
locus<GRCh38>,array<str>,float64,float64,int64,int32,array<int32>,float64,float32,float64,float64,float64,int64,int32,float64,float32,array<int32>,float64,int32,int32,float64,bool,bool,int32,bool,locus<GRCh38>,array<str>


In [15]:
#full_annotations.count()

1584091437

In [16]:
#gnomad_variants.count()

1570602305

In [46]:
#test = gnomad_variants
#print(test.count())
#revel = hl.read_table("gs://gnomad-wphu/revel_annotations_grch38.ht")
#revel = revel.transmute(revel = hl.struct(revel_score = revel.REVEL,
#                                                      ref_aa = revel.aaref,
#                                                      alt_aa = revel.aaalt
#                                                     ))
#test = test.join(revel, how = "left")
#test.count()

1570602305


2020-12-07 20:23:50 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'alleles' -> 'alleles_1'
    'locus' -> 'locus_1'


1571019842

In [47]:
#r = test.anti_join(gnomad_variants)
#r.count()

0

In [50]:
#test.distinct().count()

1570602305

In [77]:
#a = full_annotations.collect_by_key()
#b = a.filter(hl.len(a.values)>1)
#full_annotations.semi_join(b).show()

Unnamed: 0_level_0,Unnamed: 1_level_0,cadd,cadd,revel,revel,revel,splice_ai,splice_ai,splice_ai,primate_ai
locus,alleles,raw_score,phred,revel_score,ref_aa,alt_aa,splice_ai,max_ds,splice_consequence,primate_ai_score
locus<GRCh38>,array<str>,float32,float32,float64,str,str,array<float32>,float32,str,float32
chr1:925884,"[""C"",""T""]",0.599,7.5,,,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence""",
chr1:925884,"[""C"",""T""]",0.599,7.5,,,,"[4.00e-02,4.00e-02,0.00e+00,0.00e+00]",0.04,"""acceptor_gain""",
chr1:925885,"[""C"",""T""]",0.0301,1.53,,,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence""",
chr1:925885,"[""C"",""T""]",0.0301,1.53,,,,"[1.00e-02,2.00e-02,0.00e+00,0.00e+00]",0.02,"""acceptor_loss""",
chr1:925886,"[""G"",""T""]",0.378,5.18,,,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence""",
chr1:925886,"[""G"",""T""]",0.378,5.18,,,,"[1.10e-01,5.00e-02,1.00e-02,0.00e+00]",0.11,"""acceptor_gain""",
chr1:925887,"[""C"",""T""]",0.818,9.57,,,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence""",
chr1:925887,"[""C"",""T""]",0.818,9.57,,,,"[3.00e-02,2.00e-02,0.00e+00,0.00e+00]",0.03,"""acceptor_gain""",
chr1:925889,"[""C"",""G""]",0.928,10.7,,,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence""",
chr1:925889,"[""C"",""G""]",0.928,10.7,,,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence""",


In [89]:
#print(spliceAi_info_skip_invalid.make_table().count())
#print(spliceAi_info_skip_invalid.make_table().distinct().count())
#a = spliceAi_info_skip_invalid.make_table().collect_by_key()
#b = a.filter(hl.len(a.values)>1)
#spliceAi_info_skip_invalid.make_table().semi_join(b).show()

3696314616
3602698485


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info,info,info
locus,alleles,rsid,qual,filters,splice_ai,max_ds,splice_consequence
locus<GRCh38>,array<str>,str,float64,set<str>,array<float32>,float32,str
chr1:925884,"[""C"",""A""]",,-10.0,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence"""
chr1:925884,"[""C"",""A""]",,-10.0,,"[2.00e-02,1.00e-02,0.00e+00,0.00e+00]",0.02,"""acceptor_gain"""
chr1:925884,"[""C"",""G""]",,-10.0,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence"""
chr1:925884,"[""C"",""G""]",,-10.0,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence"""
chr1:925884,"[""C"",""T""]",,-10.0,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence"""
chr1:925884,"[""C"",""T""]",,-10.0,,"[4.00e-02,4.00e-02,0.00e+00,0.00e+00]",0.04,"""acceptor_gain"""
chr1:925885,"[""C"",""A""]",,-10.0,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence"""
chr1:925885,"[""C"",""A""]",,-10.0,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence"""
chr1:925885,"[""C"",""G""]",,-10.0,,"[0.00e+00,0.00e+00,0.00e+00,0.00e+00]",0.0,"""no_consequence"""
chr1:925885,"[""C"",""G""]",,-10.0,,"[0.00e+00,2.00e-02,0.00e+00,0.00e+00]",0.02,"""acceptor_loss"""


In [113]:
#a = spliceAi_snps_skip_invalid.make_table()
#a.filter(hl.is_defined(a.info.SpliceAI)).semi_join(b).show()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info
locus,alleles,rsid,qual,filters,SpliceAI
locus<GRCh38>,array<str>,str,float64,set<str>,array<str>
chr1:925884,"[""C"",""A""]",,-10.0,,"[""A|AL645608.1|0.00|0.00|0.00|0.00|2|27|12|1""]"
chr1:925884,"[""C"",""A""]",,-10.0,,"[""A|SAMD11|0.02|0.01|0.00|0.00|14|38|14|38""]"
chr1:925884,"[""C"",""G""]",,-10.0,,"[""G|AL645608.1|0.00|0.00|0.00|0.00|27|6|12|1""]"
chr1:925884,"[""C"",""G""]",,-10.0,,"[""G|SAMD11|0.00|0.00|0.00|0.00|38|14|38|14""]"
chr1:925884,"[""C"",""T""]",,-10.0,,"[""T|AL645608.1|0.00|0.00|0.00|0.00|-2|6|12|1""]"
chr1:925884,"[""C"",""T""]",,-10.0,,"[""T|SAMD11|0.04|0.04|0.00|0.00|14|38|14|38""]"
chr1:925885,"[""C"",""A""]",,-10.0,,"[""A|AL645608.1|0.00|0.00|0.00|0.00|1|26|6|11""]"
chr1:925885,"[""C"",""A""]",,-10.0,,"[""A|SAMD11|0.00|0.00|0.00|0.00|37|13|37|13""]"
chr1:925885,"[""C"",""G""]",,-10.0,,"[""G|AL645608.1|0.00|0.00|0.00|0.00|26|5|-25|11""]"
chr1:925885,"[""C"",""G""]",,-10.0,,"[""G|SAMD11|0.00|0.02|0.00|0.00|2|37|-5|13""]"


In [86]:
#print(revel_information_grch38.count())
#print(revel_information_grch38.distinct().count())
#a = revel_information_grch38.collect_by_key()
#b = a.filter(hl.len(a.values)>1)
#revel_information_grch38.semi_join(b).show()

82077491
77843668


locus,alleles,REVEL,aaref,aaalt
locus<GRCh38>,array<str>,float64,str,str
chr1:930312,"[""C"",""A""]",0.097,"""P""","""Q"""
chr1:930312,"[""C"",""A""]",0.033,"""R""","""S"""
chr1:930312,"[""C"",""G""]",0.072,"""P""","""R"""
chr1:930312,"[""C"",""G""]",0.038,"""R""","""G"""
chr1:930312,"[""C"",""T""]",0.071,"""P""","""L"""
chr1:930312,"[""C"",""T""]",0.044,"""R""","""C"""
chr1:930315,"[""A"",""C""]",0.26,"""H""","""P"""
chr1:930315,"[""A"",""C""]",0.004,"""T""","""P"""
chr1:930315,"[""A"",""G""]",0.034,"""H""","""R"""
chr1:930315,"[""A"",""G""]",0.018,"""T""","""A"""


In [82]:
#print(cadd.count())
#print(cadd.distinct().count())

9079588475
9079588475


In [84]:
#print(primate_ai_info.count())
#print(primate_ai_info.distinct().count())
#a = primate_ai_info.collect_by_key()
#b = a.filter(hl.len(a.values)>1)
#primate_ai_info.semi_join(b).show()

70116384
70073665


locus,alleles,primateDL_score,ExAC_coverage,strand_1pos_0neg,refAA,altAA,trinucleotide_context,UCSC_gene
locus<GRCh38>,array<str>,float32,float32,str,str,str,str,str
chr1:2385807,"[""T"",""C""]",0.233,79.7,"""0""","""Q""","""R""","""CAG""","""uc001ajb.1"""
chr1:2385807,"[""T"",""C""]",0.204,79.7,"""0""","""R""","""G""","""CAG""","""uc010nyy.2"""
chr1:2560633,"[""G"",""C""]",0.657,47.7,"""1""","""V""","""L""","""AGT""","""uc001ajt.1"""
chr1:2560633,"[""G"",""C""]",0.278,47.7,"""1""","""S""","""T""","""AGT""","""uc001ajr.3"""
chr1:12824946,"[""C"",""G""]",0.399,0.0,"""1""","""C""","""W""","""GCT""","""uc001ava.1"""
chr1:12824946,"[""C"",""G""]",0.403,88.8,"""0""","""C""","""S""","""TGT""","""uc001auk.2"""
chr1:12824950,"[""A"",""C""]",0.399,0.0,"""1""","""Y""","""S""","""TAC""","""uc001ava.1"""
chr1:12824950,"[""A"",""C""]",0.413,90.1,"""0""","""C""","""G""","""CTG""","""uc001auk.2"""
chr1:12824950,"[""A"",""G""]",0.399,0.0,"""1""","""Y""","""C""","""TAC""","""uc001ava.1"""
chr1:12824950,"[""A"",""G""]",0.449,90.1,"""0""","""C""","""R""","""CTG""","""uc001auk.2"""


# Splice AI

In [65]:
spliceAi_info_skip_invalid.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        splice_ai: array<float32>, 
        max_ds: float32, 
        splice_consequence: str
    }
----------------------------------------
Entry fields:
    None
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


In [63]:
spliceAi_info_skip_invalid.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        splice_ai: array<float32>, 
        max_DS: float32, 
        splice_consequence: str
    }
----------------------------------------
Entry fields:
    None
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


In [22]:
#splice_indels = hl.import_vcf("gs://gnomad-wphu/splice_ai_data/gnomAD_v3.1_SpliceAI_scores-selected/spliceai_scores.masked.gnomad_indel.hg38.vcf.gz",
                                   #force_bgz=True,
                                   #reference_genome='GRCh38', contig_recoding=recode,skip_invalid_loci=True,
                                   #min_partitions=1000)

#splice_snps = hl.import_vcf("gs://gnomad-wphu/splice_ai_data/splice_ai_data/genome_scores_v1.3_ds.20a701bc58ab45b59de2576db79ac8d0/spliceai_scores.raw.snv.hg38.vcf.gz",
                                #force_bgz= True,
                                #min_partitions=10000,
                                #reference_genome='GRCh38', contig_recoding=recode
                               #)
#splice_snps_withAnno = splice_snps.filter_rows(hl.len(splice_snps.info.SpliceAI)>0, keep=True)

#splice_indels = hl.import_vcf("gs://gnomad-wphu/splice_ai_data/gnomAD_v3.1_SpliceAI_scores-selected/spliceai_scores.masked.gnomad_indel.hg38.vcf.gz",
                                   #force_bgz=True,
                                   #reference_genome='GRCh38', contig_recoding=recode, skip_invalid_loci=True,
                                   #min_partitions=1000)

In [23]:
#print(spliceAi_indels_skip_invalid.count())
#print(spliceAi_snps_skip_invalid.count())
#print(spliceAi_info_skip_invalid.count())

In [24]:
#spliceAi_info_skipInvalid_tbl = spliceAi_info_skipInvalid.make_table()
#spliceAi_info_skipInvalid_tbl.show(5)
#spliceAi_info_skipInvalid_tbl.describe()
#print(gnomad_variants.anti_join(spliceAi_info_skipInvalid_tbl).count())
#gnomad_variants.show(5)
#spliceAi_snps_skipInvalid.show()
#print(gnomad_variants.count())

#numbers do not match because spliceAi only does intergenic variants; should pull up list of genes and compare missing variants against gene list.

In [25]:
#spliceAi_genes = hl.import_table("gs://gnomad-wphu/grch38.tsv", types = {"TX_START":hl.tint64, "TX_END":hl.tint64})
#spliceAi_genes = hl.import_table("gs://gnomad-wphu/grch38.tsv")
#spliceAi_genes = spliceAi_genes.transmute(CHROM = "chr" + spliceAi_genes.CHROM)
#spliceAi_genes = spliceAi_genes.annotate(INTERVAL = hl.parse_locus_interval(spliceAi_genes.CHROM + ":"+spliceAi_genes.TX_START + "-" +spliceAi_genes.TX_END, reference_genome="GRCh38"))

In [26]:
#gnomad_variants_in_genomic_regions = hl.filter_intervals(gnomad_variants, spliceAi_genes.INTERVAL.collect())
#gnomad_variants_in_genomic_regions.count()

In [27]:
#vars_in_genes_not_found = hl.filter_intervals(gnomad_variants.anti_join(spliceAi_info_skipInvalid_tbl),spliceAi_genes.INTERVAL.collect())

In [28]:
#gnomad_indels_not_in_CADD.aggregate(hl.agg.collect_as_set(gnomad_indels_not_in_CADD.locus.contig))


#gnomad_indels_not_in_CADD_mitoStripped.aggregate(
    #hl.agg.collect_as_set(gnomad_indels_not_in_CADD_mitoStripped.locus.contig))
#gnomad_indels_not_in_CADD_mitoStripped.count()


#export_indels_for_CADD_analysis(gnomad_indels_not_in_CADD_mitoStripped, "gs://gnomad-wphu/CADD_indels_for_upload_mitoStripped.vcf.bgz")

In [29]:
#test = gnomad_indels_not_in_CADD.head(150000)
#export_indels_for_CADD_analysis(test, "gs://gnomad-wphu/CADD_indels_for_upload_test_90000.vcf")


In [30]:
#CADD_snps_test = load_CADD(path = "gs://gnomad-wphu/whole_genome_SNVs.tsv", n_partitions=5000)
#CADD_snps_test.show()

#CADD_snps = CADD_snps.checkpoint("gs://gnomad-wphu/CADD-v1.6-SNVs.ht")

In [31]:
#cadd_ht = hl.import_table("gs://gnomad-wphu/whole_genome_SNVs.tsv", comment="#", no_header=True, min_partitions=5000)

# Primate AI

In [32]:
#primate_ai_info.count()

In [34]:
#primate_ai_info_new = create_primate_ai_info()

In [35]:
#primate_ai_info_new = primate_ai_info_new.repartition(500)
#primate_ai_info_new = primate_ai_info_new.checkpoint("gs://gnomad-wphu/primate-ai-info.ht", overwrite=False)

# Combined Annotations

In [36]:
#gnomad_variants.count()
#gnomad_variants.describe()
#gnomad_variants.show()

In [37]:
#seqr_annotations_tbl.show()
#seqr_annotations_tbl.describe()

In [11]:
#result = result.checkpoint("gs://gnomad-wphu/gnomad-3.1-all-variants-annotations.ht")
result.write("gs://gnomad/annotations/hail-0.2/ht/genomes_v3.1/gnomad_genomes_v3.1.analyst_annotations.ht", overwrite=True)


2020-10-28 18:07:02 Hail: INFO: wrote table with 1584091437 rows in 7505 partitions to gs://gnomad/annotations/hail-0.2/ht/genomes_v3.1/gnomad_genomes_v3.1.analyst_annotations.ht


In [73]:
result.show()

Unnamed: 0_level_0,Unnamed: 1_level_0,cadd,cadd,revel,revel,revel,splice_ai,splice_ai,splice_ai,primate_ai
locus,alleles,raw_score,phred,revel_score,ref_aa,alt_aa,splice_ai,max_ds,splice_consequence,primate_ai_score
locus<GRCh38>,array<str>,float32,float32,float64,str,str,array<float32>,float32,str,float32
chr1:10001,"[""T"",""A""]",0.703,8.48,,,,,,,
chr1:10001,"[""T"",""C""]",0.751,8.92,,,,,,,
chr1:10007,"[""T"",""C""]",0.756,8.97,,,,,,,
chr1:10009,"[""A"",""C""]",0.72,8.64,,,,,,,
chr1:10013,"[""T"",""A""]",0.702,8.47,,,,,,,
chr1:10015,"[""A"",""C""]",0.714,8.58,,,,,,,
chr1:10019,"[""T"",""C""]",0.751,8.92,,,,,,,
chr1:10020,"[""A"",""C""]",0.713,8.57,,,,,,,
chr1:10021,"[""A"",""C""]",0.713,8.58,,,,,,,
chr1:10022,"[""C"",""T""]",0.621,7.71,,,,,,,


# Other Things

In [None]:
#missing_cadd_scores = result.filter(hl.is_defined(result.cadd.phred),keep=False)
#missing_cadd_scores.count()

In [None]:
#missing_cadd_scores.filter(missing_cadd_scores.locus.contig=="chrM", keep=False).count()

In [None]:
#primate_ai_info.ExAC_coverage.summarize()

In [None]:
#print(gnomad_indels_in_CADD.count())
#print(gnomad_indels_not_in_CADD.count())
#print(gnomad_indels.count())

#misisng_cadd_scores = missing_cadd_scores.write("gs://gnomad-tmp/missing_cadd_scores.ht", overwrite=True)
#missing_cadd_scores = hl.read_table("gs://gnomad-tmp/missing_cadd_scores.ht")

In [None]:
#missing_cadd_scores_no_M = missing_cadd_scores.filter(missing_cadd_scores.locus.contig=="chrM",keep=False)
#print(missing_cadd_scores.count())
#print(missing_cadd_scores_no_M.count())

#print(missing_cadd_scores.filter(missing_cadd_scores.locus.contig=="chrM",keep=True).count())
#print(gnomad_variants.filter(gnomad_variants.locus.contig=="chrM", keep=True).count())
#missing_cadd_scores_no_M.alleles.collect()

In [None]:
#hl.len(missing_cadd_scores_no_M.old_alleles).summarize()
#missing_cadd_scores_no_M.count()
#hl.len(missing_cadd_scores_no_M.old_alleles).show()
#missing_cadd_scores_no_M.select(missing_cadd_scores_no_M.old_alleles).write("gs://gnomad-tmp/gnomad-31-complex-variants.ht")
#missing_cadd_scores_no_M.select(missing_cadd_scores_no_M.old_alleles).export("gs://gnomad-tmp/gnomad-31-complex-variants.tsv")

In [None]:
#missing_cadd_scores_diff_key = missing_cadd_scores_no_M.select(missing_cadd_scores_no_M.old_alleles)
#missing_cadd_scores_diff_key = missing_cadd_scores_diff_key.key_by(missing_cadd_scores_diff_key.old_alleles)
#missing_cadd_scores_diff_key = missing_cadd_scores_diff_key.collect_by_key()

In [None]:
#hl.len(missing_cadd_scores_diff_key.values)>1

In [None]:
#len(set([frozenset(x) for x in missing_cadd_scores_no_M.old_alleles.collect()]))

In [None]:
#missing_cadd_scores_diff_key.export("gs://gnomad-tmp/gnomad-variants-complex-by-old-alleles.tsv")

In [None]:
#missing_cadd_scores_diff_key.count()

In [None]:
#export_for_CADD_analysis(missing_cadd_scores_no_M, "gs://gnomad-wphu/gnomad-large-deletions-cadd.vcf")

In [None]:
#example = ['AGGCTGACCTCTGTCCGCGTGGGAGGGGCCGGTGTGAGGCAAGGGGCTCAGGCTGACCTCTGTCCGCGTGGGAGGGGCCGGTGTGAGGCAAGGGGCTCAGGCTGACCTCTGTCCGCGTGGGAGGGGCCGGGGTGAGGCAAGGGCTCACACTGACCTCTCTCAGCGTGGGAGGGGCCGGTGTGAGGCAAGGGGCTCGGGCTGACCTCTCTCAGCGTGGGAGGGGCCGGTGTGAGGCAAGGGGCTCGGGCTGACCTCTCTCAGCGTGGGAGGGGCCGGTGTGAGGCAAGGGGCTCG', 'G']
#(~(hl.is_snp(example[0], example[1]))).show()
#hl.is_indel(example[0], example[1]).show()

In [None]:
#not_snps = gnomad_variants.filter(hl.is_snp(gnomad_variants.alleles[0],gnomad_variants.alleles[1]),keep=False)
#not_snps.count()

In [None]:
#gnomad_indels.count()

In [None]:
#questionable = not_snps.anti_join(gnomad_indels)
#questionable.count()

In [None]:
#print(missing_cadd_scores_no_M.semi_join(questionable).count())
#print(missing_cadd_scores_no_M.filter(hl.is_complex(missing_cadd_scores_no_M.alleles[0], missing_cadd_scores_no_M.alleles[1])).count())


In [None]:
#hl.filter_intervals(questionable,spliceAi_genes.INTERVAL.collect()).select().export("gs://gnomad-wphu/missing-splice-ai-variants.tsv")


In [None]:
#gnomad_variants.join(CADD_snps.transmute(CADD = hl.struct(RawScore = CADD_snps.RawScore, PHRED = CADD_snps.PHRED)), how="left").show()
#CADD_snps.transmute(CADD = hl.struct(RawScore = CADD_snps.RawScore, PHRED = CADD_snps.PHRED))


In [None]:
#spliceAi_info_skip_invalid.describe()
#spliceAi_info_skip_invalid.rsid.show()
#spliceAi_info_skip_invalid.qual.show()
#spliceAi_info_skip_invalid.filters.show()
#spliceAi_info_skip_invalid.info.show()

#reannotate = spliceAi_info_skip_invalid.annotate_rows(info = spliceAi_info_skip_invalid.info.annotate(
#                                                risd = spliceAi_info_skip_invalid.rsid,
#                                                qual = spliceAi_info_skip_invalid.qual,
#                                                filters = spliceAi_info_skip_invalid.filters
#                                                )
#                                        )

#reannotate = reannotate.rename({"info" : "SPLICE_AI"})
#reannotate.describe()

In [None]:
#gnomadgnomad3_release.filter(hl.is_indel(gnomad3_release.alleles[0], gnomad3_release.alleles[1]))

In [None]:
#gnomad_31_CADD_indels = hl.import_table("gs://gnomad-julia/gnomad_v3.1/cadd_indel_output/CADD_gnomad3.1_scores_*.tsv.gz", comment="#",no_header=True, force_bgz=True)
#gnomad_31_CADD_indels = load_CADD("gs://gnomad-julia/gnomad_v3.1/cadd_indel_output/CADD_gnomad3.1_scores_*.tsv.gz", n_partitions=3000, force_bgz=True)



In [None]:
#gnomad_31_CADD_complex = load_CADD("gs://gnomad-julia/gnomad_v3.1/cadd_indel_output_extra/CADD_gnomad3.1_scores_*.tsv.gz",n_partitions=3000, force_bgz=True)
#gnomad_31_CADD_complex.describe()
#gnomad_31_CADD_complex.checkpoint("gs://gnomad-wphu/CADD-1.6-gnomad-complex-variants.ht")


In [None]:
#gnomad_31_CADD_complex.count()

In [None]:
#gnomad_31_CADD_indels.count()

In [None]:
#gnomad_indels_not_in_CADD.filter(gnomad_indels_not_in_CADD.locus.contig=="chrM",keep=False).count()

In [None]:
#gnomad_indels_not_in_CADD.count()

In [None]:
#gnomad_indels_not_in_CADD.filter(gnomad_indels_not_in_CADD.locus.contig=="chrM",keep=False).anti_join(gnomad_31_CADD_indels).count()

In [None]:
#gnomad_31_CADD_indels = gnomad_31_CADD_indels.checkpoint("gs://gnomad-wphu/CADD-indels-gnomad.3.1.ht")

In [None]:
#unified = make_unified_CADD()

In [None]:
#unified.describe()
#unified.show()

In [None]:
#unified.filePathSource["snvs"].show()

In [None]:
#unified = unified.checkpoint("gs://gnomad-wphu/complete-CADD-v1.6-annotations.ht")

In [None]:
#unified.count()

In [None]:
#import requests
#r = requests.get(url = "https://spliceailookup-api.broadinstitute.org/spliceai/?hg=38&variant=chr8-140300616-T-G")
#print(r.json())

In [None]:
#test = missing_cadd_scores_no_M.select().head(5).collect()

In [None]:
#print(test)

In [None]:
#test[0]

In [None]:
#primate_ai_info.show()
#primate_ai_info_old.show()

In [None]:
#spliceAi_info_skip_invalid.describe()

In [None]:
#test = hl.import_vcf("gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.vcf.gz", force_bgz=True)

In [None]:
#test.describe()

In [None]:
#test2 = hl.read('gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.vds')

In [None]:
#spliceAi_info_skip_invalid.qual.summarize()

In [7]:
dbNSFP = hl.import_table("gs://gnomad-wphu/dbNSFP/dbNSFP4.1/dbNSFP4.1a_variant.chr*.gz", force=True, missing='.')

2020-10-22 03:14:17 Hail: WARN: file 'gs://gnomad-wphu/dbNSFP/dbNSFP4.1/dbNSFP4.1a_variant.chr1.gz' is 2.8G
  It will be loaded serially (on one core) due to usage of the 'force' argument.
  If it is actually block-gzipped, either rename to .bgz or use the 'force_bgz'
  argument.
2020-10-22 03:14:18 Hail: WARN: file 'gs://gnomad-wphu/dbNSFP/dbNSFP4.1/dbNSFP4.1a_variant.chr10.gz' is 1.1G
  It will be loaded serially (on one core) due to usage of the 'force' argument.
  If it is actually block-gzipped, either rename to .bgz or use the 'force_bgz'
  argument.
2020-10-22 03:14:18 Hail: WARN: file 'gs://gnomad-wphu/dbNSFP/dbNSFP4.1/dbNSFP4.1a_variant.chr11.gz' is 1.7G
  It will be loaded serially (on one core) due to usage of the 'force' argument.
  If it is actually block-gzipped, either rename to .bgz or use the 'force_bgz'
  argument.
2020-10-22 03:14:18 Hail: WARN: file 'gs://gnomad-wphu/dbNSFP/dbNSFP4.1/dbNSFP4.1a_variant.chr12.gz' is 1.5G
  It will be loaded serially (on one core) due

In [22]:
dbNSFP.REVEL_score.summarize()

0,1
Non-missing,73967181 (88.04%)
Missing,10045912 (11.96%)
Min Size,5
Max Size,5
Mean Size,5.00
Sample Values,"['0.206', '0.253', '0.206', '0.046', '0.022']"


In [12]:
revel_information_grch38.show()

locus,alleles,REVEL,aaref,aaalt
locus<GRCh38>,array<str>,float64,str,str
chr1:35142,"[""G"",""A""]",0.027,"""T""","""M"""
chr1:35142,"[""G"",""C""]",0.035,"""T""","""R"""
chr1:35142,"[""G"",""T""]",0.043,"""T""","""K"""
chr1:35143,"[""T"",""A""]",0.018,"""T""","""S"""
chr1:35143,"[""T"",""C""]",0.034,"""T""","""A"""
chr1:35143,"[""T"",""G""]",0.039,"""T""","""P"""
chr1:35144,"[""A"",""C""]",0.012,"""C""","""W"""
chr1:35145,"[""C"",""A""]",0.023,"""C""","""F"""
chr1:35145,"[""C"",""G""]",0.029,"""C""","""S"""
chr1:35145,"[""C"",""T""]",0.016,"""C""","""Y"""


In [18]:
revel_information_grch38.count()

82077491

In [19]:
gnomad_variants.count()

1570602305

In [20]:
revel_information_grch38.anti_join(gnomad_variants).count()

72983629

In [8]:
dbNSFP = dbNSFP.repartition(500)
#dbNSFP.n_partitions()
dbNSFP = dbNSFP.checkpoint("gs://gnomad-wphu/dbNSFP.ht")

2020-10-22 04:13:41 Hail: INFO: wrote table with 84013093 rows in 500 partitions to gs://gnomad-wphu/dbNSFP.ht


In [25]:
82077491-84013093

-1935602

In [28]:
revel_information = hl.import_table("gs://gnomad-wphu/revel_grch38_all_chromosomes.csv", delimiter=",", types={'hg19_pos':hl.tint,'grch38_pos':hl.tstr,'REVEL': hl.tfloat64})
revel_information.count()

2020-10-22 02:58:51 Hail: INFO: Reading table without type imputation
  Loading field 'chr' as type str (not specified)
  Loading field 'hg19_pos' as type int32 (user-supplied)
  Loading field 'grch38_pos' as type str (user-supplied)
  Loading field 'ref' as type str (not specified)
  Loading field 'alt' as type str (not specified)
  Loading field 'aaref' as type str (not specified)
  Loading field 'aaalt' as type str (not specified)
  Loading field 'REVEL' as type float64 (user-supplied)


82100677

In [29]:
82100677-82077491

23186

In [12]:
dbNSFP.VEST4_score.summarize()

0,1
Non-missing,81566269 (97.09%)
Missing,2446824 (2.91%)
Min Size,1
Max Size,209
Mean Size,13.96
Sample Values,"['0.213;.;.;0.271;.;0.268;.;.;.;.', '0.193;.;0.174;0.176;.;.;.', '0.772;0.779;0.781', '0.295;0.477;0.545', '0.017;0.017;0.024']"


In [9]:
full_annotations.revel.summarize()

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)
Minimum,0.00
Maximum,1.00
Mean,0.23
Std Dev,0.23

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)
Min Size,1
Max Size,1
Mean Size,1.00
Sample Values,"['S', 'S', 'S', 'S', 'S']"

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)
Min Size,1
Max Size,1
Mean Size,1.00
Sample Values,"['T', 'P', 'A', 'Y', 'F']"


In [13]:
revel_information_grch38.anti_join(full_annotations).show()

locus,alleles,REVEL,aaref,aaalt
locus<GRCh38>,array<str>,float64,str,str
chr1:35142,"[""G"",""A""]",0.027,"""T""","""M"""
chr1:35142,"[""G"",""C""]",0.035,"""T""","""R"""
chr1:35142,"[""G"",""T""]",0.043,"""T""","""K"""
chr1:35143,"[""T"",""A""]",0.018,"""T""","""S"""
chr1:35143,"[""T"",""C""]",0.034,"""T""","""A"""
chr1:35143,"[""T"",""G""]",0.039,"""T""","""P"""
chr1:35144,"[""A"",""C""]",0.012,"""C""","""W"""
chr1:35145,"[""C"",""A""]",0.023,"""C""","""F"""
chr1:35145,"[""C"",""G""]",0.029,"""C""","""S"""
chr1:35145,"[""C"",""T""]",0.016,"""C""","""Y"""


In [14]:
full_annotations.filter(hl.is_snp(full_annotations.alleles[0], full_annotations.alleles[1])).count()

1317419242

In [15]:
revel_information_grch38.anti_join(full_annotations).count()

72983629

In [18]:
revel_information_grch38.aggregate(hl.agg.group_by(revel_information_grch38.locus.contig, hl.agg.count()))

{'chr11': 4850779,
 'chr5': 3738178,
 'chr22': 1783478,
 'chr8': 2880984,
 'chr19': 5156147,
 'chrY': 31551,
 'chr1': 8460922,
 'chr15': 2682896,
 'chr12': 4361562,
 'chr18': 1223623,
 'chr20': 1984314,
 'chr2': 6108302,
 'chr13': 1448194,
 'chr7': 3845947,
 'chr14': 2672346,
 'chr3': 4849223,
 'chr17': 4507899,
 'chr4': 3388218,
 'chr6': 4151134,
 'chr9': 3369753,
 'chrX': 3100887,
 'chr10': 3289465,
 'chr21': 841178,
 'chr16': 3350511}

In [34]:
unique_loci = revel_information_grch38.key_by().key_by("locus").distinct()
a = unique_loci.aggregate(hl.agg.group_by(unique_loci.locus.contig, hl.agg.count()))

In [35]:
unique_loci_gnomad = gnomad_variants.key_by().key_by("locus").distinct()
b = unique_loci_gnomad.aggregate(hl.agg.group_by(unique_loci_gnomad.locus.contig, hl.agg.count()))

In [36]:
type(a)
type(b)

dict

In [7]:
full_annotations.summarize()

0,1
Non-missing,1584091437 (100.00%)
Missing,0
Contig Counts,"{'chr11': 69489149, 'chr5': 93769047, 'chr22': 22076679, 'chr8': 77100610, 'chr19': 34268343, 'chrY': 14637245, 'chr1': 119509995, 'chr15': 43776955, 'chr12': 68722532, 'chr18': 38674164, 'chr20': 31905096, 'chr2': 124209991, 'chr13': 50807608, 'chr7': 83637854, 'chr14': 46794023, 'chr3': 101328983, 'chrM': 5746, 'chr17': 43009606, 'chr4': 99543980, 'chr6': 86614746, 'chr9': 66585932, 'chrX': 130280891, 'chr10': 69628678, 'chr21': 21073146, 'chr16': 46640438}"

0,1
Non-missing,1584091437 (100.00%)
Missing,0
Min Size,2
Max Size,2
Mean Size,2.00

0,1
Non-missing,3168182874 (100.00%)
Missing,0
Min Size,1
Max Size,1382
Mean Size,1.91
Sample Values,"['T', 'A', 'T', 'C', 'T']"

0,1
Non-missing,1584085691 (100.00%)
Missing,5746 (0.00%)

0,1
Non-missing,1584085691 (100.00%)
Missing,5746 (0.00%)
Minimum,-19.41
Maximum,22.12
Mean,0.18
Std Dev,0.49

0,1
Non-missing,1584085691 (100.00%)
Missing,5746 (0.00%)
Minimum,0.00
Maximum,96.00
Mean,3.51
Std Dev,3.89

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)
Minimum,0.00
Maximum,1.00
Mean,0.23
Std Dev,0.23

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)
Min Size,1
Max Size,1
Mean Size,1.00
Sample Values,"['S', 'S', 'S', 'S', 'S']"

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)
Min Size,1
Max Size,1
Mean Size,1.00
Sample Values,"['T', 'P', 'A', 'Y', 'F']"

0,1
Non-missing,745130422 (47.04%)
Missing,838961015 (52.96%)

0,1
Non-missing,583796996 (36.85%)
Missing,1000294441 (63.15%)
Min Size,4
Max Size,4
Mean Size,4.00

0,1
Non-missing,2335187984 (100.00%)
Missing,0
Minimum,-0.00
Maximum,1.00
Mean,0.00
Std Dev,0.02

0,1
Non-missing,583796996 (36.85%)
Missing,1000294441 (63.15%)
Minimum,0.00
Maximum,1.00
Mean,0.00
Std Dev,0.04

0,1
Non-missing,583796996 (36.85%)
Missing,1000294441 (63.15%)
Min Size,10
Max Size,14
Mean Size,13.85
Sample Values,"['Donor gain', 'Donor gain', 'Donor gain', 'Donor gain', 'Donor gain']"

0,1
Non-missing,8388957 (0.53%)
Missing,1575702480 (99.47%)

0,1
Non-missing,8388957 (0.53%)
Missing,1575702480 (99.47%)
Minimum,0.11
Maximum,0.99
Mean,0.54
Std Dev,0.20


In [8]:
spliceAi_info_skip_invalid.

0,1
Non-missing,0
Missing,0


0,1
Non-missing,3696314616 (100.00%)
Missing,0
Contig Counts,"{'chr11': 176389931, 'chr5': 212300234, 'chr22': 56180502, 'chr8': 165598617, 'chr19': 98755025, 'chrY': 7909364, 'chr1': 324696513, 'chr15': 122307725, 'chr12': 185933382, 'chr18': 87511547, 'chr20': 84077613, 'chr2': 307151672, 'chr13': 95599997, 'chr7': 218906720, 'chr14': 114442374, 'chr3': 262597980, 'chr17': 135095070, 'chr4': 202720275, 'chr6': 211688500, 'chr9': 150691302, 'chrX': 147792183, 'chr10': 182471028, 'chr21': 37698988, 'chr16': 107798074}"

0,1
Non-missing,3696314616 (100.00%)
Missing,0
Min Size,2
Max Size,2
Mean Size,2.00

0,1
Non-missing,7392629232 (100.00%)
Missing,0
Min Size,1
Max Size,1382
Mean Size,1.38
Sample Values,"['T', 'TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC', 'T', 'TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCA', 'T']"

0,1
Non-missing,0
Missing,3696314616 (100.00%)

0,1
Non-missing,3696314616 (100.00%)
Missing,0
Minimum,-10.00
Maximum,-10.00
Mean,-10.00
Std Dev,0.00

0,1
Non-missing,0
Missing,3696314616 (100.00%)

0,1
Non-missing,3696314616 (100.00%)
Missing,0

0,1
Non-missing,3534269705 (95.62%)
Missing,162044911 (4.38%)
Min Size,4
Max Size,4
Mean Size,4.00

0,1
Non-missing,14137078820 (100.00%)
Missing,0
Minimum,-0.00
Maximum,1.00
Mean,0.00
Std Dev,0.02

0,1
Non-missing,3534269705 (95.62%)
Missing,162044911 (4.38%)
Minimum,0.00
Maximum,1.00
Mean,0.00
Std Dev,0.04

0,1
Non-missing,3534269705 (95.62%)
Missing,162044911 (4.38%)
Min Size,10
Max Size,14
Mean Size,13.84
Sample Values,"['Acceptor gain', 'Donor gain', 'Donor gain', 'Donor gain', 'Donor gain']"


In [10]:
full_annotations.filter(hl.is_snp(full_annotations.alleles[0],full_annotations.alleles[1])).splice_ai.summarize()

0,1
Non-missing,482841462 (36.65%)
Missing,834577780 (63.35%)

0,1
Non-missing,482841462 (36.65%)
Missing,834577780 (63.35%)
Min Size,4
Max Size,4
Mean Size,4.00

0,1
Non-missing,1931365848 (100.00%)
Missing,0
Minimum,0.00
Maximum,1.00
Mean,0.00
Std Dev,0.02

0,1
Non-missing,482841462 (36.65%)
Missing,834577780 (63.35%)
Minimum,0.00
Maximum,1.00
Mean,0.00
Std Dev,0.03

0,1
Non-missing,482841462 (36.65%)
Missing,834577780 (63.35%)
Min Size,10
Max Size,14
Mean Size,13.85
Sample Values,"['Donor gain', 'Donor gain', 'Donor gain', 'Donor gain', 'Donor gain']"


In [11]:
full_annotations.filter(~hl.is_snp(full_annotations.alleles[0],full_annotations.alleles[1])).splice_ai.summarize()

0,1
Non-missing,262288960 (98.36%)
Missing,4383235 (1.64%)

0,1
Non-missing,100955534 (37.86%)
Missing,165716661 (62.14%)
Min Size,4
Max Size,4
Mean Size,4.00

0,1
Non-missing,403822136 (100.00%)
Missing,0
Minimum,-0.00
Maximum,1.00
Mean,0.00
Std Dev,0.03

0,1
Non-missing,100955534 (37.86%)
Missing,165716661 (62.14%)
Minimum,0.00
Maximum,1.00
Mean,0.01
Std Dev,0.05

0,1
Non-missing,100955534 (37.86%)
Missing,165716661 (62.14%)
Min Size,10
Max Size,14
Mean Size,13.84
Sample Values,"['Donor gain', 'Acceptor gain', 'Acceptor gain', 'Acceptor gain', 'Acceptor gain']"


In [12]:
full_annotations.filter(hl.is_indel(full_annotations.alleles[0],full_annotations.alleles[1])).splice_ai.summarize()

0,1
Non-missing,262288960 (98.36%)
Missing,4380928 (1.64%)

0,1
Non-missing,100955534 (37.86%)
Missing,165714354 (62.14%)
Min Size,4
Max Size,4
Mean Size,4.00

0,1
Non-missing,403822136 (100.00%)
Missing,0
Minimum,-0.00
Maximum,1.00
Mean,0.00
Std Dev,0.03

0,1
Non-missing,100955534 (37.86%)
Missing,165714354 (62.14%)
Minimum,0.00
Maximum,1.00
Mean,0.01
Std Dev,0.05

0,1
Non-missing,100955534 (37.86%)
Missing,165714354 (62.14%)
Min Size,10
Max Size,14
Mean Size,13.84
Sample Values,"['Donor gain', 'Acceptor gain', 'Acceptor gain', 'Acceptor gain', 'Acceptor gain']"


In [13]:
full_annotations.filter(hl.is_indel(full_annotations.alleles[0],full_annotations.alleles[1])).splice_ai.show()

Unnamed: 0_level_0,Unnamed: 1_level_0,splice_ai,splice_ai,splice_ai
locus,alleles,splice_ai,max_ds,splice_consequence
locus<GRCh38>,array<str>,array<float32>,float32,str
chr1:10055,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC""]",,,
chr1:10055,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCA""]",,,
chr1:10061,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC""]",,,
chr1:10064,"[""C"",""CCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAAA""]",,,
chr1:10067,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCC""]",,,
chr1:10067,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC""]",,,
chr1:10073,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCC""]",,,
chr1:10079,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCC""]",,,
chr1:10091,"[""T"",""TAACCCTAACCCTAACCC""]",,,
chr1:10108,"[""C"",""CA""]",,,


In [20]:
a = full_annotations.filter(hl.is_indel(full_annotations.alleles[0],full_annotations.alleles[1]))
b = a.filter(hl.is_missing(a.splice_ai))
b.show()
c = a.filter(hl.is_defined(a.splice_ai))
c.show()
b.splice_ai.summarize()
c.splice_ai.summarize()

Unnamed: 0_level_0,Unnamed: 1_level_0,cadd,cadd,revel,revel,revel,splice_ai,splice_ai,splice_ai,primate_ai
locus,alleles,raw_score,phred,revel_score,ref_aa,alt_aa,splice_ai,max_ds,splice_consequence,primate_ai_score
locus<GRCh38>,array<str>,float32,float32,float64,str,str,array<float32>,float32,str,float32
chr1:179816,"[""G"",""GA""]",-0.055,1.03,,,,,,,
chr1:179885,"[""A"",""AGCT""]",-0.0431,1.09,,,,,,,
chr1:180054,"[""TTTA"",""T""]",-0.0953,0.848,,,,,,,
chr1:180088,"[""TACCCTA"",""T""]",-0.103,0.816,,,,,,,
chr1:180088,"[""TACCCTAACCCTAACCCTAACCCCTAATCCTAACCCTA"",""T""]",-0.233,0.433,,,,,,,
chr1:180089,"[""ACCCTAACCCTAACCCTAACCCCTAATCCTAACCCTAATCCTAACCCTAACCCTAACCCTAACCCCTAAC"",""A""]",-0.283,0.339,,,,,,,
chr1:180094,"[""AACCCTAACCCTAACCCCTAATCCTAACCCTAATCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAAAACCCTAACCATAACCCTTACCCTTACCCTAATCCTAACCCTAATCCTTACCCTTACCCTTACCCTG"",""A""]",-0.273,0.356,,,,,,,
chr1:180095,"[""A"",""AC""]",-0.0315,1.15,,,,,,,
chr1:180095,"[""ACCCTAACCCTAACCCCTAATCCTAACCCTAATCCTAACCCTAACCCTAACCCTAAC"",""A""]",-0.284,0.336,,,,,,,
chr1:180096,"[""CCCTAACCCTAACCCCTAAT"",""C""]",-0.158,0.627,,,,,,,


Unnamed: 0_level_0,Unnamed: 1_level_0,cadd,cadd,revel,revel,revel,splice_ai,splice_ai,splice_ai,primate_ai
locus,alleles,raw_score,phred,revel_score,ref_aa,alt_aa,splice_ai,max_ds,splice_consequence,primate_ai_score
locus<GRCh38>,array<str>,float32,float32,float64,str,str,array<float32>,float32,str,float32
chr1:10055,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC""]",0.192,3.03,,,,,,,
chr1:10055,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCA""]",0.192,3.03,,,,,,,
chr1:10061,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC""]",0.2,3.12,,,,,,,
chr1:10064,"[""C"",""CCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAAA""]",0.19,3.02,,,,,,,
chr1:10067,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCC""]",0.251,3.71,,,,,,,
chr1:10067,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC""]",0.225,3.42,,,,,,,
chr1:10073,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCCTAACCC""]",0.251,3.71,,,,,,,
chr1:10079,"[""T"",""TAACCCTAACCCTAACCCTAACCCTAACCC""]",0.277,4.02,,,,,,,
chr1:10091,"[""T"",""TAACCCTAACCCTAACCC""]",0.328,4.61,,,,,,,
chr1:10108,"[""C"",""CA""]",0.391,5.32,,,,,,,


0,1
Non-missing,0
Missing,4380928 (100.00%)


0,1
Non-missing,262288960 (100.00%)
Missing,0

0,1
Non-missing,100955534 (38.49%)
Missing,161333426 (61.51%)
Min Size,4
Max Size,4
Mean Size,4.00

0,1
Non-missing,403822136 (100.00%)
Missing,0
Minimum,-0.00
Maximum,1.00
Mean,0.00
Std Dev,0.03

0,1
Non-missing,100955534 (38.49%)
Missing,161333426 (61.51%)
Minimum,0.00
Maximum,1.00
Mean,0.01
Std Dev,0.05

0,1
Non-missing,100955534 (38.49%)
Missing,161333426 (61.51%)
Min Size,10
Max Size,14
Mean Size,13.84
Sample Values,"['Donor gain', 'Acceptor gain', 'Acceptor gain', 'Acceptor gain', 'Acceptor gain']"


In [21]:
print(b.splice_ai.summarize())
print(c.splice_ai.summarize())

0,1
Non-missing,0
Missing,4380928 (100.00%)


None


0,1
Non-missing,262288960 (100.00%)
Missing,0

0,1
Non-missing,100955534 (38.49%)
Missing,161333426 (61.51%)
Min Size,4
Max Size,4
Mean Size,4.00

0,1
Non-missing,403822136 (100.00%)
Missing,0
Minimum,-0.00
Maximum,1.00
Mean,0.00
Std Dev,0.03

0,1
Non-missing,100955534 (38.49%)
Missing,161333426 (61.51%)
Minimum,0.00
Maximum,1.00
Mean,0.01
Std Dev,0.05

0,1
Non-missing,100955534 (38.49%)
Missing,161333426 (61.51%)
Min Size,10
Max Size,14
Mean Size,13.84
Sample Values,"['Donor gain', 'Acceptor gain', 'Acceptor gain', 'Acceptor gain', 'Acceptor gain']"


None


In [22]:
spliceAi_indels_skip_invalid.summarize()

0,1
Non-missing,0
Missing,0


0,1
Non-missing,263071868 (100.00%)
Missing,0
Contig Counts,"{'chr11': 11664173, 'chr5': 15405197, 'chr22': 3618580, 'chr8': 12618223, 'chr19': 6793978, 'chrY': 1624633, 'chr1': 20097560, 'chr15': 7537321, 'chr12': 12114513, 'chr18': 6583523, 'chr20': 5545355, 'chr2': 20940975, 'chr13': 8585290, 'chr7': 14492115, 'chr14': 8163417, 'chr3': 17000241, 'chr17': 7893956, 'chr4': 16917567, 'chr6': 14979613, 'chr9': 10722816, 'chrX': 16407215, 'chr10': 12017810, 'chr21': 3498821, 'chr16': 7848976}"

0,1
Non-missing,263071868 (100.00%)
Missing,0
Min Size,2
Max Size,2
Mean Size,2.00

0,1
Non-missing,526143736 (100.00%)
Missing,0
Min Size,1
Max Size,1382
Mean Size,6.37
Sample Values,"['T', 'TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC', 'T', 'TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCA', 'T']"

0,1
Non-missing,0
Missing,263071868 (100.00%)

0,1
Non-missing,263071868 (100.00%)
Missing,0
Minimum,-10.00
Maximum,-10.00
Mean,-10.00
Std Dev,0.00

0,1
Non-missing,0
Missing,263071868 (100.00%)

0,1
Non-missing,263071868 (100.00%)
Missing,0

0,1
Non-missing,101026957 (38.40%)
Missing,162044911 (61.60%)
Min Size,1
Max Size,21
Mean Size,1.03

0,1
Non-missing,103769092 (100.00%)
Missing,0
Min Size,32
Max Size,1318
Mean Size,47.80
Sample Values,"['GT|OR4F5|0.01|0.00|0.05|0.00|20|-26|6|-44', 'GTTTTTT|OR4F5|0.07|0.00|0.03|0.00|20|-26|-44|-24', 'CT|OR4F5|0.11|0.00|0.00|0.00|-25|34|33|18', 'C|OR4F5|0.09|0.00|0.00|0.00|-40|-2|18|3', 'AT|OR4F5|0.01|0.00|0.00|0.00|-17|-38|33|19']"


In [9]:
full_annotations.summarize()

0,1
Non-missing,1584091437 (100.00%)
Missing,0
Contig Counts,"{'chr11': 69489149, 'chr5': 93769047, 'chr22': 22076679, 'chr8': 77100610, 'chr19': 34268343, 'chrY': 14637245, 'chr1': 119509995, 'chr15': 43776955, 'chr12': 68722532, 'chr18': 38674164, 'chr20': 31905096, 'chr2': 124209991, 'chr13': 50807608, 'chr7': 83637854, 'chr14': 46794023, 'chr3': 101328983, 'chrM': 5746, 'chr17': 43009606, 'chr4': 99543980, 'chr6': 86614746, 'chr9': 66585932, 'chrX': 130280891, 'chr10': 69628678, 'chr21': 21073146, 'chr16': 46640438}"

0,1
Non-missing,1584091437 (100.00%)
Missing,0
Min Size,2
Max Size,2
Mean Size,2.00

0,1
Non-missing,3168182874 (100.00%)
Missing,0
Min Size,1
Max Size,1382
Mean Size,1.91
Sample Values,"['T', 'A', 'T', 'C', 'T']"

0,1
Non-missing,1584085691 (100.00%)
Missing,5746 (0.00%)

0,1
Non-missing,1584085691 (100.00%)
Missing,5746 (0.00%)
Minimum,-19.41
Maximum,22.12
Mean,0.18
Std Dev,0.49

0,1
Non-missing,1584085691 (100.00%)
Missing,5746 (0.00%)
Minimum,0.00
Maximum,96.00
Mean,3.51
Std Dev,3.89

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)
Minimum,0.00
Maximum,1.00
Mean,0.23
Std Dev,0.23

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)
Min Size,1
Max Size,1
Mean Size,1.00
Sample Values,"['S', 'S', 'S', 'S', 'S']"

0,1
Non-missing,9742241 (0.62%)
Missing,1574349196 (99.38%)
Min Size,1
Max Size,1
Mean Size,1.00
Sample Values,"['T', 'P', 'A', 'Y', 'F']"

0,1
Non-missing,745130422 (47.04%)
Missing,838961015 (52.96%)

0,1
Non-missing,583796996 (36.85%)
Missing,1000294441 (63.15%)
Min Size,4
Max Size,4
Mean Size,4.00

0,1
Non-missing,2335187984 (100.00%)
Missing,0
Minimum,-0.00
Maximum,1.00
Mean,0.00
Std Dev,0.02

0,1
Non-missing,583796996 (36.85%)
Missing,1000294441 (63.15%)
Minimum,0.00
Maximum,1.00
Mean,0.00
Std Dev,0.04

0,1
Non-missing,583796996 (36.85%)
Missing,1000294441 (63.15%)
Min Size,10
Max Size,14
Mean Size,13.85
Sample Values,"['Donor gain', 'Donor gain', 'Donor gain', 'Donor gain', 'Donor gain']"

0,1
Non-missing,8388957 (0.53%)
Missing,1575702480 (99.47%)

0,1
Non-missing,8388957 (0.53%)
Missing,1575702480 (99.47%)
Minimum,0.11
Maximum,0.99
Mean,0.54
Std Dev,0.20


In [8]:
primate_ai_test= hl.import_table("gs://gnomad-wphu/PrimateAI_scores_v0.2_hg38.tsv.gz",
                                          force_bgz=True, comment="#", skip_blank_lines=True,
                                          types={"pos":hl.tint32, 'primateDL_score':hl.tfloat32, 'ExAC_coverage':hl.tfloat32}
                                         )
primate_ai_test.show()

FatalError: ZipException: File does not conform to block gzip format.

Java stack trace:
java.util.zip.ZipException: File does not conform to block gzip format.
	at is.hail.io.compress.BGzipInputStream$BGzipHeader.<init>(BGzipInputStream.java:41)
	at is.hail.io.compress.BGzipInputStream.resetState(BGzipInputStream.java:256)
	at is.hail.io.compress.BGzipInputStream.<init>(BGzipInputStream.java:110)
	at is.hail.io.compress.BGzipInputStream.<init>(BGzipInputStream.java:122)
	at is.hail.io.fs.BGZipCompressionCodec$.makeInputStream(FS.scala:57)
	at is.hail.io.fs.FS$class.open(FS.scala:141)
	at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
	at is.hail.io.fs.FS$class.open(FS.scala:148)
	at is.hail.io.fs.HadoopFS.open(HadoopFS.scala:70)
	at is.hail.io.fs.FS$class.readLines(FS.scala:208)
	at is.hail.io.fs.HadoopFS.readLines(HadoopFS.scala:70)
	at is.hail.expr.ir.TextTableReader$.readMetadata(TextTableReader.scala:268)
	at is.hail.expr.ir.TextTableReader$.apply(TextTableReader.scala:306)
	at is.hail.expr.ir.TextTableReader$.fromJValue(TextTableReader.scala:313)
	at is.hail.expr.ir.TableReader$.fromJValue(TableIR.scala:101)
	at is.hail.expr.ir.IRParser$.table_ir_1(Parser.scala:1366)
	at is.hail.expr.ir.IRParser$.table_ir(Parser.scala:1342)
	at is.hail.expr.ir.IRParser$$anonfun$parse_table_ir$1.apply(Parser.scala:1813)
	at is.hail.expr.ir.IRParser$$anonfun$parse_table_ir$1.apply(Parser.scala:1813)
	at is.hail.expr.ir.IRParser$.parse(Parser.scala:1802)
	at is.hail.expr.ir.IRParser$.parse_table_ir(Parser.scala:1813)
	at is.hail.backend.spark.SparkBackend$$anonfun$parse_table_ir$1.apply(SparkBackend.scala:542)
	at is.hail.backend.spark.SparkBackend$$anonfun$parse_table_ir$1.apply(SparkBackend.scala:541)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:20)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:18)
	at is.hail.utils.package$.using(package.scala:609)
	at is.hail.annotations.Region$.scoped(Region.scala:18)
	at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:18)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:230)
	at is.hail.backend.spark.SparkBackend.parse_table_ir(SparkBackend.scala:541)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



Hail version: 0.2.57-582b2e31b8bd
Error summary: ZipException: File does not conform to block gzip format.