# Analysis for CIViC data

This notebook contains an analysis of CIViC variant data.

In [1]:
import logging
from enum import Enum
import re
import csv

from civicpy import civic as civicpy
from dotenv import load_dotenv

from variation.query import QueryHandler

# Only show WARNING and above from the root logger.
# `logging.getLogger()` (no name) always returns the root logger, whereas
# `logging.getLogger("root")` only does so on Python 3.9+; on older versions it
# creates a separate logger that merely happens to be named "root".
logging.getLogger().setLevel(logging.WARNING)



In [2]:
# Environment variables are set for gene-normalizer dynamodb instance and 
# UTA DB credentials
load_dotenv()

True

In [3]:
query_handler = QueryHandler()

In [4]:
# Get latest data
# civicpy.update_cache(from_remote_cache=False)

In [5]:
civicpy.load_cache(on_stale="ignore")

True

In [6]:
variants = civicpy.get_all_variants()
total_variants = len(variants)
f"Total Number of variants in CIViC: {total_variants}"

'Total Number of variants in CIViC: 3509'

In [7]:
class VariantCategory(str, Enum):
    """Categories used to bucket CIViC variants that are not sent to (or not handled by) the Variation Normalizer.

    Each member's value is the human-readable label for the category. Elsewhere in
    this notebook the members are used as keys of the ``not_supported`` term lookup,
    and their ``.name`` is used as the key when tallying per-category counts.
    """
    EXPRESSION = "Expression"
    EPIGENETIC_MODIFICATION = "Epigenetic Modification"
    FUSION = "Fusion"
    PROTEIN_CONS = "Protein Consequence"
    GENE_FUNC = "Gene Function"
    REARRANGEMENTS = "Rearrangements"
    COPY_NUMBER = "Copy Number"
    OTHER = "Other"  # also the fallback when a variant name matches more than one category
    GENOTYPES_EASY = "Genotypes Easy"
    GENOTYPES_COMPOUND = "Genotypes Compound"
    REGION_DEFINED_VAR = "Region Defined Variant"
    TRANSCRIPT_VAR = "Transcript Variant"  # no attempt to normalize these ones, since there is no query we could use

    

Below are terms found in CIViC variant names that we know the Variation Normalizer cannot support.

In [8]:
# Lookup of VariantCategory -> set of terms that mark a variant name as unsupported.
#
# All terms are lowercase because they are compared against the lowercased variant
# name. The comparison is a plain substring test (`term in name`), not a whole-word
# match, so short terms such as "fs", "dup", "gain", "null" or "exon" match anywhere
# inside a name. A name that hits terms from more than one category is later
# classified as OTHER.
not_supported = {
    VariantCategory.EXPRESSION: {
        "overexpression", "expression", "underexpression", "serum levels", 
        "transcription levels", "autocrine activation", "tnc-l", 
        "top2a/90", "low ratio of vegf165b/vegftotal", "lgr5fl"
    },
    VariantCategory.EPIGENETIC_MODIFICATION: {
        "methylation", "promoter hypermethylation", "promoter methylation", 
        "phosphorylation"
    },
    VariantCategory.FUSION: {
        "::", "fusion"
    },
    VariantCategory.PROTEIN_CONS: {
        "frameshift truncation", "frameshift", "frame shift", "fs",
        "truncating mutation", "1100delc", "deletion (p.k227_t233del)",
        "y646f, y646n, y646s, y646h, y646c, a682g, a692v"
    },
    VariantCategory.GENE_FUNC: {
        "gain of function", "gain-of-function", "loss of function", "loss-of-function",
        "activating mutation", "tkd mutation", "inactivation", "null", "viii"
    },
    VariantCategory.REARRANGEMENTS: {
        "translocation", "rearrangement", "double ph", "alu insertion", 
        "exon 20 insertion", "internal tandem duplications", "tandem repeat",
        "itd", "d842_h845deldimh", "k558np"
    },
    VariantCategory.COPY_NUMBER: {
        "copy number", "repeat", "dup", "non-amplification", "gain"
    }, 
    VariantCategory.OTHER: {
        "cytoplasmic mislocalization", "alternative transcript", "rare mutation",
        "splice", "splicing", "ceacam1-l", "ceacam1-s", "δ", # lowercase form of Δ, since names are lowercased before matching
        "delta", "beta", "ivs2+1g>a", "ivs20, a-g, -2",
        "deprecated", "point mutations", "conserved domain mut", "cis double mutants",
        "loss-of-modification", "gbrcam", "kras4a", "kras4b", "e151int", "delnvtap"
    },
    VariantCategory.GENOTYPES_EASY: {
        "diplotypes", "wild type", "wildtype", "p61braf(v600e)"
    },
    VariantCategory.GENOTYPES_COMPOUND: {
        "loss of heterozygosity", "biallelic inactivation", "bi-allelic inactivation",
        "homozygosity", "loh", "single allele deletion"
    },
    VariantCategory.REGION_DEFINED_VAR: {
        "deleterious mutation", "domain mutation", "polymorphism", 
        "non-p-loop mutation", "p-loop mutation", "3' utr mutation", "alteration",
        "t17 deletion", "exon", "ex19 del l858r", "promoter mutation", "non-v600"
    }
}

In [9]:


# Output files for the analysis. Every file is tab-delimited and starts with a
# header row. They are opened with newline="" because the csv module does its own
# line-ending handling; without it, platforms that translate "\n" on write (e.g.
# Windows) end up with blank rows between records.
# The handles stay open while the variants are processed and are closed afterwards.

# This file contains CIViC Variants where we did not attempt to normalize
# since we cannot find a free text or HGVS-like expression to use.
# One example would be a CIViC Variant whose name has "c." in it. In this case,
# we want the genomic representative. We look at the HGVS expressions to find a genomic
# expression. If there is no genomic HGVS expression, we try to build a gnomAD
# VCF-style query from the variant's coordinates. If that is not possible either,
# we do not even attempt to normalize
transcript_vars_wf = open("transcript_variants.csv", "w", newline="")
transcript_vars_wr = csv.writer(transcript_vars_wf, delimiter="\t")
transcript_vars_wr.writerow(["variant_id", "variant_name", "variant_accepted"])

# This file contains protein queries (gene + variant_name) we SHOULD be able to
# normalize
protein_variants_wf = open("all_protein_variant_queries.csv", "w", newline="")
protein_variants_wr = csv.writer(protein_variants_wf, delimiter="\t")
protein_variants_wr.writerow(["variant_id", "gene_name", "variant_name", "variant_accepted"])

# This file contains genomic queries (genomic HGVS expressions or gnomAD VCF-style
# strings) we SHOULD be able to normalize
genomic_variants_wf = open("all_genomic_variant_queries.csv", "w", newline="")
genomic_variants_wr = csv.writer(genomic_variants_wf, delimiter="\t")
genomic_variants_wr.writerow(["variant_id", "hgvs_g", "variant_accepted"])

# This file contains CIViC Variants we do not currently support in Variation Normalizer.
# In these cases, we do not even attempt to try to normalize
not_supported_wf = open("not_supported_variants.csv", "w", newline="")
not_supported_wr = csv.writer(not_supported_wf, delimiter="\t")
not_supported_wr.writerow(["variant_id", "gene_name", "variant_name", "category", "variant_accepted"])

# This file contains CIViC Variant queries that we were not able to normalize,
# including queries for which the normalizer raised an exception.
unable_to_normalize_wf = open("unable_to_normalize_queries.csv", "w", newline="")
unable_to_normalize_wr = csv.writer(unable_to_normalize_wf, delimiter="\t")
unable_to_normalize_wr.writerow(["variant_id", "query", "query_type", "variant_accepted",
                                 "exception_raised", "message", "warnings"])

# This file contains CIViC Variant queries that we were able to normalize.
able_to_normalize_wf = open("able_to_normalize_queries.csv", "w", newline="")
able_to_normalize_wr = csv.writer(able_to_normalize_wf, delimiter="\t")
able_to_normalize_wr.writerow(["variant_id", "query", "query_type", "variant_accepted"])

# Per-category tally of unsupported variants, keyed by the VariantCategory member
# name and starting at zero for every category
variant_category_counts = dict.fromkeys(VariantCategory.__members__, 0)


def _total_counts():
    """Return initial total counts for genomic and protein variants"""
    return {
        "protein": {"accepted": 0, "not_accepted": 0, "count": 0}, 
        "genomic": {"accepted": 0, "not_accepted": 0, "count": 0}
    }

# Running tallies, one independent zeroed dict per outcome bucket
(
    transcript_vars_total,
    should_be_able_to_normalize_total,
    can_normalize_total,
    unable_to_normalize_total,
    exception_total,
) = (_total_counts() for _ in range(5))

# Query string sent to the normalizer -> list of CIViC variant IDs that produced it
queries_found = {}

def is_accepted_variant(v) -> bool:
    """Tell whether any evidence item reachable from the variant is accepted.

    :param v: Object exposing ``molecular_profiles``; each profile exposes
        ``evidence_items``, and each evidence item exposes ``status``
    :return: ``True`` if at least one evidence item has status ``"accepted"``,
        otherwise ``False`` (including when there are no profiles or evidence items)
    """
    return any(
        evidence.status == "accepted"
        for profile in v.molecular_profiles
        for evidence in profile.evidence_items
    )

# Main pass over every CIViC variant. Each variant is routed to one of:
#   * "transcript variant" (no usable query could be built; skipped),
#   * a "not supported" category decided from its name (skipped), or
#   * a query sent to the Variation Normalizer, whose outcome is tallied as
#     normalized / unable to normalize / exception.
# Every outcome is written to the matching CSV file and counted in the matching
# totals dict, split by query type ("protein"/"genomic") and accepted status.
for variant in variants:
    v_name = None
    v_q_type = None

    # if a variant has at least one EID in an accepted status, it counts towards
    # "accepted", because that indicates review and approval of the variant as part
    # of the evidence review
    is_accepted = is_accepted_variant(variant)
    accepted_key = "accepted" if is_accepted else "not_accepted"  # used in total counts dicts
    
    # Names containing "c." are treated as transcript-level variants and queried by a
    # genomic representation; everything else is queried as "{gene} {name}" (protein)
    if "c." in variant.name:
        # Try getting genomic HGVS expression first (first expression containing "g.")
        v_name = ([expr for expr in variant.hgvs_expressions 
                         if "g." in expr] or [None])[0]

        # If there is no genomic HGVS expression, try using gnomad vcf
        if not v_name:
            chromosome = variant.coordinates.chromosome
            pos = variant.coordinates.start
            ref = variant.coordinates.reference_bases
            alt = variant.coordinates.variant_bases

            # Only build the query when all four parts are present (truthy)
            if all((chromosome, pos, ref, alt)):
                v_name = f"{chromosome}-{pos}-{ref}-{alt}"
        
        v_q_type = "genomic"
    else:
        v_name = variant.name.strip()
        v_q_type = "protein"

    # No query could be built: record as a transcript variant and move on
    if not v_name:
        variant_category_counts[VariantCategory.TRANSCRIPT_VAR.name] += 1
        transcript_vars_wr.writerow([variant.id, variant.name, is_accepted])
        transcript_vars_total[v_q_type]["count"] += 1
        transcript_vars_total[v_q_type][accepted_key] += 1
        continue
    
    gene_name = variant.gene.name.strip()
    v_name_lower = v_name.lower()
    
    # Collect every unsupported category the name matches. The first two branches
    # are exact-name shortcuts; the else branch applies all pattern/term checks.
    # Note that `re.match` anchors at the start of the string only.
    categories = set()
    if v_name_lower in {"loss", "deletion"}:
        categories.add(VariantCategory.GENE_FUNC)
    elif any((
        v_name_lower in {"mutation", "mutations", "snp"},
        v_name_lower == f"{variant.gene.name.lower()} mutation"
    )):
        categories.add(VariantCategory.REGION_DEFINED_VAR)
    else:
        # "<token> <token> deletion and mutation"-style names (exactly four
        # whitespace-separated tokens) whose first token the gene normalizer matches
        if v_name_lower.endswith("deletion and mutation"):
            v_name_split = v_name.split()
            if len(v_name_split) == 4:
                if query_handler.normalize_handler.gene_normalizer.normalize(v_name_split[0]).match_type > 0:
                    categories.add(VariantCategory.REGION_DEFINED_VAR) 
          
        if re.match(r".*e\d+-e\d+", v_name_lower):  # ex: e20-e20
            categories.add(VariantCategory.FUSION)

        # NOTE(review): in the patterns below, the "." in `(.\d+)?` is unescaped and
        # therefore matches any character, not only a literal dot — confirm intent.
        if any((
            "exon" in v_name_lower,
            re.match(r"\d+kb\sdeletion", v_name_lower),  # ex: 10kb Deletion
            re.match(r"partial\sdeletion\sof\s\d+(.\d+)?\skb", v_name_lower),  # ex: Partial deletion of 0.7 Kb
            re.match(r"del\s\d+-\d+", v_name_lower),  # ex: DEL 485-490
            re.match(r"\d+(p|q)\d+(.\d+)?-\d+(.\d+)?\s\d+mb del", v_name_lower),  # ex: 3p26.3-25.3 11Mb del
            re.match(r"intron\s\d+\smutation", v_name_lower)  # ex: Intron 6 Mutation
        )):
            categories.add(VariantCategory.REGION_DEFINED_VAR)
        
        if any((
            re.match(r"t\(.*\)\(.*\)", v_name_lower), # ex: t(1;3)(p36.3;p25)
            re.match(r".*ins$", v_name_lower),  # ex: P780INS, L78_Q79ins
            re.match(r"\w+_?\w+>\w+", v_name_lower),  # ex: 56_61QKQKVG>R, E746_T751>I, N771>GY
        )):  
            categories.add(VariantCategory.REARRANGEMENTS)

        if any((
            re.match(r"^rs\d+", v_name_lower),  # ex: RS11623866
            re.match(r"class\s\d+\smutation", v_name_lower),  # ex: Class 3 Mutation
            re.match(r"\d+\s\((c|a|g|t)+-(c|a|g|t)+\)", v_name_lower)  # ex: 235 (CAG-TAG)
        )):
            categories.add(VariantCategory.OTHER)

        # Names starting with "cd" + digits (optionally "v" + digits)
        if re.match(r"cd\d+v?\d+", v_name_lower):
            categories.add(VariantCategory.EXPRESSION)

        if any((
            re.match(r"\w+\d+$", v_name_lower),  # ex: V600
            re.match(r"\w+\d+\w+\/\w+$", v_name_lower),  # ex: S893A/T
            re.match(r"[a-z]+\d+[a-z]+\sand\s[a-z]+\d+[a-z|*]+", v_name_lower),  # ex: E2014K and E2419K, R849W and R1108*
            re.match(r"[a-z]+\d+\s&\s[a-z]+\d+", v_name_lower),  # ex: D835 & I836
            re.match(r"[a-z]+\d+[a-z]+\sor\s[a-z]+\d+[a-z]+", v_name_lower),  # ex: H1047L or H1047R
            re.match(r"\w+\d+\smutations", v_name_lower),  # ex: E1813 mutations
        )):
            categories.add(VariantCategory.PROTEIN_CONS)

        if any((
            re.match(r"^\w+\samplification", v_name_lower),  # ex: {gene} amplification
            re.match(r"grch3(7|8)\/hg\d+\s\w+.?\d*\(chr\w+:\d+-\d+\)x\d+", v_name_lower),  # ex: GRCh37/hg19 11q14.3(chr11:88960991-88961138)x160
        )):  
            categories.add(VariantCategory.COPY_NUMBER)

        # `[^fs]` rejects names where the character right before "*" is "f" or "s"
        # (presumably to keep frameshift-style "fs*N" names out of this category)
        if re.match(r"\w+[^fs]\*\d+$", v_name_lower):  # ex: UGT1A1*28
            categories.add(VariantCategory.GENOTYPES_EASY)

        # Substring check against every known unsupported term, per category
        for k, v in not_supported.items():
            if {x for x in v if x in v_name_lower}:
                categories.add(k)

    if len(categories) > 1:
        # Those with multiple categories will be classified as other
        categories = {VariantCategory.OTHER}

    if len(categories) == 1:
        # Not supported: tally under the category and record it, without querying
        variant_category_name = categories.pop()
        variant_category_counts[variant_category_name.name] += 1
        not_supported_wr.writerow([variant.id, gene_name, variant.name, variant_category_name, is_accepted])
    else:
        # We should support this, so we need to query the variation normalizer
        if v_q_type == "protein":
            q = f"{gene_name} {v_name}"
            protein_variants_wr.writerow([variant.id, gene_name, v_name, is_accepted])
        else:
            q = v_name
            genomic_variants_wr.writerow([variant.id, q, is_accepted])

        should_be_able_to_normalize_total[v_q_type]["count"] += 1
        should_be_able_to_normalize_total[v_q_type][accepted_key] += 1

        # Remember which variant IDs produced each query so duplicates can be reported
        if q in queries_found:
            queries_found[q].append(variant.id)
        else:
            queries_found[q] = [variant.id]
            
        try:
            # Top-level `await`: relies on the notebook kernel running cells in an
            # event loop (IPython autoawait); this is not valid in a plain script
            variation_norm_resp = await query_handler.normalize_handler.normalize(q)
            if not variation_norm_resp.variation_descriptor:
                # Normalization produced no descriptor. Before counting it as a
                # failure, check whether a single-token protein name is really just
                # gene name(s), in which case it is reclassified as not supported.
                is_incomplete = False
                if v_q_type == "protein" and len(v_name.split()) == 1:
                    if "-" in v_name:
                        # could be {gene}-{gene}
                        genes = v_name.split("-")
                        variant_category_name = VariantCategory.FUSION
                    else:
                        # Just a gene name
                        genes = [v_name]
                        variant_category_name = VariantCategory.OTHER

                    # Every part must be recognized by the gene normalizer
                    is_genes = True
                    for g in genes:
                        if query_handler.normalize_handler.gene_normalizer.normalize(g).match_type == 0:
                            # not a gene
                            is_genes = False
                            break

                    if is_genes:
                        # NOTE(review): this variant was already counted in
                        # should_be_able_to_normalize_total and written to the protein
                        # query file above, and is now also counted as not supported,
                        # so the two totals overlap — confirm this is intended.
                        variant_category_counts[variant_category_name.name] += 1
                        not_supported_wr.writerow([variant.id, gene_name, variant.name, variant_category_name, is_accepted])
                        is_incomplete = True

                if not is_incomplete:
                    unable_to_normalize_wr.writerow([variant.id, q, v_q_type, 
                                                    is_accepted, False,
                                                    "unable to normalize",
                                                    variation_norm_resp.warnings])
                    unable_to_normalize_total[v_q_type]["count"] += 1
                    unable_to_normalize_total[v_q_type][accepted_key] += 1
            else:
                can_normalize_total[v_q_type]["count"] += 1
                can_normalize_total[v_q_type][accepted_key] += 1
                able_to_normalize_wr.writerow([variant.id, q, v_q_type, is_accepted])
        except Exception as e:
            # Any exception is recorded in the same file as the normalization failures
            # (exception_raised=True, empty warnings) but tallied in exception_total
            unable_to_normalize_wr.writerow([variant.id, q, v_q_type, is_accepted,
                                             True, str(e), None])
            exception_total[v_q_type]["count"] += 1
            exception_total[v_q_type][accepted_key] += 1

# Close every output file (in the order they were opened) so buffered rows are
# flushed to disk before the files are read back
for output_file in (
    transcript_vars_wf,
    protein_variants_wf,
    genomic_variants_wf,
    not_supported_wf,
    unable_to_normalize_wf,
    able_to_normalize_wf,
):
    output_file.close()

## Variants we do not support

In [10]:
do_not_support_total_sum = sum(variant_category_counts.values())

Below is the total number of variants in each category that we do not support

In [11]:
sorted_variant_cat_counts = dict(sorted(variant_category_counts.items(), key=lambda x: x[1], reverse=True))
sorted_variant_cat_counts

{'TRANSCRIPT_VAR': 384,
 'FUSION': 301,
 'EXPRESSION': 291,
 'REGION_DEFINED_VAR': 190,
 'PROTEIN_CONS': 128,
 'OTHER': 92,
 'GENE_FUNC': 87,
 'REARRANGEMENTS': 50,
 'COPY_NUMBER': 34,
 'EPIGENETIC_MODIFICATION': 15,
 'GENOTYPES_EASY': 10,
 'GENOTYPES_COMPOUND': 6}

Below is the total percentage of variants for each category that we do not support

In [12]:
{k: f"{v / total_variants * 100:.2f}%" for k, v in sorted_variant_cat_counts.items()}

{'TRANSCRIPT_VAR': '10.94%',
 'FUSION': '8.58%',
 'EXPRESSION': '8.29%',
 'REGION_DEFINED_VAR': '5.41%',
 'PROTEIN_CONS': '3.65%',
 'OTHER': '2.62%',
 'GENE_FUNC': '2.48%',
 'REARRANGEMENTS': '1.42%',
 'COPY_NUMBER': '0.97%',
 'EPIGENETIC_MODIFICATION': '0.43%',
 'GENOTYPES_EASY': '0.28%',
 'GENOTYPES_COMPOUND': '0.17%'}

In [13]:
f"The Variation Normalizer does not support {do_not_support_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer does not support 45.26% of the total variants'

In [14]:
f"Total number of variants we do not support in the Variation Normalizer: "\
f"{do_not_support_total_sum}"

'Total number of variants we do not support in the Variation Normalizer: 1588'

### Transcript Variants we did not attempt to normalize due to no input query available 
These are CIViC Variants where we did not attempt to normalize since we cannot find a 
free text or HGVS-like expression to use. One example would be a CIViC Variant whose 
name has "c." in it. In this case, we want the genomic representative. We look at the 
HGVS expressions to find a genomic expression. If there is no genomic HGVS expression, 
we try to build a gnomAD VCF-style query (chromosome-position-ref-alt) from the variant's 
coordinates. If that is not possible either, we do not even attempt to normalize. 
These are under the Transcript Variant category.

In [15]:
transcript_vars_total

{'protein': {'accepted': 0, 'not_accepted': 0, 'count': 0},
 'genomic': {'accepted': 51, 'not_accepted': 333, 'count': 384}}

In [16]:
transcript_vars_total_sum = transcript_vars_total["protein"]["count"] + transcript_vars_total["genomic"]["count"]
transcript_vars_total_sum

384

In [17]:
transcript_vars_total_accepted = transcript_vars_total["protein"]["accepted"] + transcript_vars_total["genomic"]["accepted"]
f"{transcript_vars_total_accepted / transcript_vars_total_sum * 100:.2f}% of these are accepted variants"

'13.28% of these are accepted variants'

In [18]:
transcript_vars_total_not_accepted = transcript_vars_total["protein"]["not_accepted"] + transcript_vars_total["genomic"]["not_accepted"]
f"{transcript_vars_total_not_accepted / transcript_vars_total_sum * 100:.2f}% of these are NOT accepted variants"

'86.72% of these are NOT accepted variants'

In [19]:
f"{transcript_vars_total_sum / total_variants * 100:.2f}% of the total variants were not attempted to be normalized due to no input query available"

'10.94% of the total variants were not attempted to be normalized due to no input query available'

## Variants we should be able to normalize

In [20]:
should_be_able_to_normalize_total

{'protein': {'accepted': 631, 'not_accepted': 872, 'count': 1503},
 'genomic': {'accepted': 245, 'not_accepted': 179, 'count': 424}}

In [21]:
should_be_able_to_normalize_total_sum = should_be_able_to_normalize_total["protein"]["count"] + should_be_able_to_normalize_total["genomic"]["count"]
should_be_able_to_normalize_total_sum

1927

In [22]:
should_be_able_to_normalize_total_accepted = should_be_able_to_normalize_total["protein"]["accepted"] + should_be_able_to_normalize_total["genomic"]["accepted"]
f"{should_be_able_to_normalize_total_accepted / should_be_able_to_normalize_total_sum * 100:.2f}% of these are accepted variants"

'45.46% of these are accepted variants'

In [23]:
should_be_able_to_normalize_total_not_accepted = should_be_able_to_normalize_total["protein"]["not_accepted"] + should_be_able_to_normalize_total["genomic"]["not_accepted"]
f"{should_be_able_to_normalize_total_not_accepted / should_be_able_to_normalize_total_sum * 100:.2f}% of these are NOT accepted variants"

'54.54% of these are NOT accepted variants'

In [24]:
f"The Variation Normalizer SHOULD be able to normalize {should_be_able_to_normalize_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer SHOULD be able to normalize 54.92% of the total variants'

## Variants we were not able to normalize

Either due to a bug or an unsupported query type in Variation Normalizer

In [25]:
unable_to_normalize_total

{'protein': {'accepted': 8, 'not_accepted': 52, 'count': 60},
 'genomic': {'accepted': 1, 'not_accepted': 2, 'count': 3}}

In [26]:
unable_to_normalize_total_sum = unable_to_normalize_total["protein"]["count"] + unable_to_normalize_total["genomic"]["count"]
unable_to_normalize_total_sum

63

In [27]:
unable_to_normalize_total_accepted = unable_to_normalize_total["protein"]["accepted"] + unable_to_normalize_total["genomic"]["accepted"]
f"{unable_to_normalize_total_accepted / unable_to_normalize_total_sum * 100:.2f}% of these are accepted variants"

'14.29% of these are accepted variants'

In [28]:
unable_to_normalize_total_not_accepted = unable_to_normalize_total["protein"]["not_accepted"] + unable_to_normalize_total["genomic"]["not_accepted"]
f"{unable_to_normalize_total_not_accepted / unable_to_normalize_total_sum * 100:.2f}% of these are NOT accepted variants"

'85.71% of these are NOT accepted variants'

In [29]:
# Rounded to two decimals, consistent with the other percentage summaries
f"The Variation Normalizer was unable to normalize {unable_to_normalize_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer was unable to normalize 1.7953833000854944% of the total variants'

## Breakdown of the variants we weren't able to normalize

In this section, we break down the reasons why we weren't able to normalize variants.

In [30]:
# Bucket each row of the "unable to normalize" file by the text of its last
# column (the normalizer warnings). Rows written for raised exceptions have an
# empty warnings column and therefore land in `other`.
unable_to_tokenize = 0
unable_to_find_valid = 0
other = 0
# newline="" lets the csv module handle line endings itself, as its docs require
with open("unable_to_normalize_queries.csv", "r", newline="") as f:
    reader = csv.reader(f, delimiter="\t")
    next(reader, None)  # skip the header row; tolerate an empty file
    for row in reader:
        if not row:
            # blank line (e.g. from platform newline translation on write)
            continue
        warnings_text = row[-1]
        if "Unable to find valid result" in warnings_text:
            unable_to_find_valid += 1
        elif "Unable to tokenize" in warnings_text:
            unable_to_tokenize += 1
        else:
            other += 1

### Due to not passing validation checks

The Variation Normalizer performs validation checks on the input query (such as reference sequence). If these validation checks fail, then the input query will fail to normalize.

In [31]:
f"The Variation Normalizer found {unable_to_find_valid} invalid variants (This is {unable_to_find_valid / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer found 62 invalid variants (This is 1.77% of the total variants).'

### Due to tokenization 

The Variation Normalizer will tokenize the input query to determine the kind of token. It is limited in the kinds of tokens it accepts, so these tokens are not yet supported in the Variation Normalizer. 

In [32]:
f"The Variation Normalizer was unable to tokenize {unable_to_tokenize} variants ({unable_to_tokenize / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer was unable to tokenize 1 variants (0.03% of the total variants).'

In [33]:
f"The Variation Normalizer was unable to normalize {other} variants due to other issues (This is {other / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer was unable to normalize 0 variants due to other issues (This is 0.00% of the total variants).'

## Variant queries that raised an exception during normalization

In [34]:
exception_total

{'protein': {'accepted': 0, 'not_accepted': 0, 'count': 0},
 'genomic': {'accepted': 0, 'not_accepted': 0, 'count': 0}}

In [35]:
exception_total_sum = exception_total["protein"]["count"] + exception_total["genomic"]["count"]
exception_total_sum

0

In [36]:
f"The Variation Normalizer raised an exception for {exception_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer raised an exception for 0.00% of the total variants'

## Variants we were able to normalize

In [37]:
can_normalize_total

{'protein': {'accepted': 621, 'not_accepted': 816, 'count': 1437},
 'genomic': {'accepted': 244, 'not_accepted': 177, 'count': 421}}

In [38]:
can_normalize_total_sum = can_normalize_total["protein"]["count"] + can_normalize_total["genomic"]["count"]
can_normalize_total_sum

1858

In [39]:
can_normalize_total_accepted = can_normalize_total["protein"]["accepted"] + can_normalize_total["genomic"]["accepted"]
f"{can_normalize_total_accepted / can_normalize_total_sum * 100:.2f}% of these are accepted variants"

'46.56% of these are accepted variants'

In [40]:
can_normalize_total_not_accepted = can_normalize_total["protein"]["not_accepted"] + can_normalize_total["genomic"]["not_accepted"]
f"{can_normalize_total_not_accepted / can_normalize_total_sum * 100:.2f}% of these are NOT accepted variants"

'53.44% of these are NOT accepted variants'

In [41]:
f"The Variation Normalizer successfully normalized {can_normalize_total_sum / should_be_able_to_normalize_total_sum * 100:.2f}% of the variants we SHOULD be able to normalize"

'The Variation Normalizer successfully normalized 96.42% of the variants we SHOULD be able to normalize'

In [42]:
f"The Variation Normalizer successfully normalized {can_normalize_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer successfully normalized 52.95% of the total variants'

## Duplicate Queries

These are duplicate queries found in CIViC. The values are the associated variant IDs.

In [43]:
# Keep only the queries that more than one CIViC variant ID produced
{
    query: variant_ids
    for query, variant_ids in queries_found.items()
    if len(variant_ids) > 1
}

{'BRAF V600D': [11, 3452]}