# Analysis for CIViC data

This notebook analyzes CIViC variant data to measure how many CIViC variants the Variation Normalizer can normalize.

In [1]:
import logging
from enum import Enum
import re
import csv

from civicpy import civic as civicpy
from dotenv import load_dotenv

from variation.query import QueryHandler

# Only show warnings and above. Use the no-argument form to get the real root
# logger: before Python 3.9, getLogger("root") returned a separate logger that
# merely happened to be named "root", so setting its level had no effect.
logging.getLogger().setLevel(logging.WARNING)



In [2]:
# Load environment variables. These configure the gene-normalizer DynamoDB
# instance and the UTA DB credentials.
load_dotenv()

True

In [3]:
# Variation Normalizer entry point; its `normalize_handler` is used in the main
# analysis loop to normalize each query.
query_handler = QueryHandler()

In [4]:
# Get latest data: uncomment the next line before loading the cache
# civicpy.update_cache(from_remote_cache=False)

In [5]:
# Load the CIViC cache. NOTE(review): on_stale="ignore" presumably loads the
# cache as-is even when it is out of date -- confirm against the civicpy docs.
civicpy.load_cache(on_stale="ignore")

True

In [6]:
# Fetch every CIViC variant via civicpy. `total_variants` is the denominator for
# all percentages reported later in this notebook.
variants = civicpy.get_all_variants()
total_variants = len(variants)
f"Total Number of variants in CIViC: {total_variants}"

'Total Number of variants in CIViC: 3466'

In [7]:
class VariantCategory(str, Enum):
    """Categories of CIViC variants that the Variation Normalizer does not support.

    Each member's value is a human-readable label. Members are used as the keys
    of ``not_supported`` and member names are used as the keys of
    ``variant_category_counts``. Declaration order matters: it is the initial
    key order of ``variant_category_counts``.
    """
    EXPRESSION = "Expression"
    EPIGENETIC_MODIFICATION = "Epigenetic Modification"
    FUSION = "Fusion"
    FUNCTIONAL_PROTEIN_CONS = "Functional Protein Consequence"
    FUNCTIONAL_GENE_FUNC = "Functional Gene Function"
    REARRANGEMENTS = "Rearrangements"
    COPY_NUMBER = "Copy Number"
    # Also assigned to any variant whose name matches more than one category
    OTHER = "Other"
    GENOTYPES_EASY = "Genotypes Easy"
    GENOTYPES_COMPOUND = "Genotypes Compound"
    REGION_DEFINED_VAR = "Region Defined Variant"
    

Below are terms found in CIViC variant names that we know the Variation Normalizer cannot support.

In [8]:
# Maps each unsupported category to the lowercase terms that identify it.
# A term matches when it occurs anywhere inside the lowercased variant name
# (substring match), so short terms such as "dup" or "repeat" match broadly.
# NOTE(review): some terms contain others from a different category (e.g.
# "tandem repeat" vs. "repeat", "biallelic inactivation" vs. "inactivation");
# names hitting more than one category are classified as OTHER -- confirm that
# this is intended.
not_supported = {
    VariantCategory.EXPRESSION: {
        "overexpression",
        "expression",
        "underexpression",
        "serum levels",
        "transcription levels",
    },
    VariantCategory.EPIGENETIC_MODIFICATION: {
        "methylation",
        "promoter hypermethylation",
        "promoter methylation",
        "phosphorylation",
    },
    VariantCategory.FUSION: {
        "::",
        "fusion",
    },
    VariantCategory.FUNCTIONAL_PROTEIN_CONS: {
        "frameshift truncation",
        "frameshift",
        "frame shift",
    },
    VariantCategory.FUNCTIONAL_GENE_FUNC: {
        "gain of function",
        "gain-of-function",
        "loss of function",
        "loss-of-function",
        "activating mutation",
        "tkd mutation",
        "inactivation",
    },
    VariantCategory.REARRANGEMENTS: {
        "translocation",
        "rearrangement",
        "double ph",
        "alu insertion",
        "exon 20 insertion",
        "internal tandem duplications",
        "tandem repeat",
    },
    VariantCategory.COPY_NUMBER: {
        "copy number",
        "repeat",
        "dup",
    },
    VariantCategory.OTHER: {
        "cytoplasmic mislocalization",
        "alternative transcript",
    },
    VariantCategory.GENOTYPES_EASY: {
        "diplotypes",
        "wild type",
        "wildtype",
    },
    VariantCategory.GENOTYPES_COMPOUND: {
        "loss of heterozygosity",
        "biallelic inactivation",
        "bi-allelic inactivation",
        "homozygosity",
    },
    VariantCategory.REGION_DEFINED_VAR: {
        "deleterious mutation",
        "exon deletion",
        "domain mutation",
        "polymorphism",
        "non-p-loop mutation",
        "p-loop mutation",
    },
}

In [9]:

# This file contains CIViC Variants where there are no queries available.
# One example would be a CIViC Variant whose name has "c." in it. In this case,
# we want the genomic representative. We look at the HGVS expressions to find a genomic
# expression. If there is no genomic HGVS expression, we have no name for a query
no_query_wf = open("no_query.csv", "w+")
no_query_wr = csv.writer(no_query_wf, delimiter="\t")
no_query_wr.writerow(["variant_id", "variant_name"])

# This file contains protein queries (gene + variant_name) we SHOULD be able to
# normalize
protein_variants_wf = open("all_protein_variant_queries.csv", "w+")
protein_variants_wr = csv.writer(protein_variants_wf, delimiter="\t")
protein_variants_wr.writerow(["variant_id", "gene_name", "variant_name"])

# This file contains genomic queries (genomic HGVS expressions) we SHOULD be able to
# normalize
genomic_variants_wf = open("all_genomic_variant_queries.csv", "w+")
genomic_variants_wr = csv.writer(genomic_variants_wf, delimiter="\t")
genomic_variants_wr.writerow(["variant_id", "hgvs_g"])

# This file contains CIViC Variants we do not currently support in Variation Normalizer.
# In these cases, we do not even attempt to try to normalize
not_supported_wf = open("not_supported_variants.csv", "w+")
not_supported_wr = csv.writer(not_supported_wf, delimiter="\t")
not_supported_wr.writerow(["variant_id", "gene_name", "variant_name", "category"])

# This file contains CIViC Variant queries that we were not able to normalize.
unable_to_normalize_wf = open("unable_to_normalize_queries.csv", "w+")
unable_to_normalize_wr = csv.writer(unable_to_normalize_wf, delimiter="\t")
unable_to_normalize_wr.writerow(["variant_id", "query", "query_type",
                                 "exception_raised", "message", "warnings"])

# Category name for variants we do not support: number of variants we found
variant_category_counts = {c: 0 for c in VariantCategory.__members__}

# Keep track of total counts
no_query_total = {"protein": 0, "genomic": 0}
should_be_able_to_normalize_total = {"protein": 0, "genomic": 0}
can_normalize_total = {"protein": 0, "genomic": 0}
unable_to_normalize_total = {"protein": 0, "genomic": 0}
exception_total = {"protein": 0, "genomic": 0}

queries_found = dict()

for variant in variants:
    v_name = None
    v_q_type = None
    if "c." in variant.name:
        v_name = ([expr for expr in variant.hgvs_expressions 
                         if "g." in expr] or [None])[0]
        v_q_type = "genomic"
    else:
        v_name = variant.name.strip()
        v_q_type = "protein"

    if not v_name:
        no_query_wr.writerow([variant.id, variant.name])
        no_query_total[v_q_type] += 1
        continue
    
    gene_name = variant.gene.name.strip()
    v_name_lower = v_name.lower()
    
    categories = set()
    if v_name_lower in {"loss", "deletion"}:
        categories.add(VariantCategory.FUNCTIONAL_GENE_FUNC)
    elif v_name_lower == "mutation":
        categories.add(VariantCategory.REGION_DEFINED_VAR)
    else:
        if re.match(r".*e\d+-e\d+", v_name_lower):  # ex: e20-e20
            categories.add(VariantCategory.FUSION)
        
        if "exon" in v_name_lower:
            if {x for x in {"deletion", "mutation"}}:
                categories.add(VariantCategory.REGION_DEFINED_VAR)
        
        if v_name_lower.endswith("fs"):
            categories.add(VariantCategory.FUNCTIONAL_PROTEIN_CONS)

        for k, v in not_supported.items():
            if {x for x in v if x in v_name_lower}:
                categories.add(k)

    if len(categories) > 1:
        # Those with multiple categories will be classified as other
        categories = {VariantCategory.OTHER}

    if len(categories) == 1:
        variant_category_name = categories.pop()
        variant_category_counts[variant_category_name.name] += 1
        not_supported_wr.writerow([variant.id, gene_name, variant.name, variant_category_name])
    else:
        # We should support this, so we need to query the variation normalizer
        if v_q_type == "protein":
            q = f"{gene_name} {v_name}"
            protein_variants_wr.writerow([variant.id, gene_name, v_name])
        else:
            q = v_name
            genomic_variants_wr.writerow([variant.id, q])

        should_be_able_to_normalize_total[v_q_type] += 1

        
        if q in queries_found:
            queries_found[q].append(variant.id)
        else:
            queries_found[q] = [variant.id]
            
        try:
            variation_norm_resp = await query_handler.normalize_handler.normalize(q)
            if not variation_norm_resp.variation_descriptor:
                unable_to_normalize_wr.writerow([variant.id, q, v_q_type, False,
                                                 "unable to normalize",
                                                 variation_norm_resp.warnings])
                unable_to_normalize_total[v_q_type] += 1
            else:
                can_normalize_total[v_q_type] += 1
        except Exception as e:
            unable_to_normalize_wr.writerow([variant.id, q, v_q_type, True, str(e),
                                             None])
            exception_total[v_q_type] += 1

# Close all files
no_query_wf.close()
protein_variants_wf.close()
genomic_variants_wf.close()
not_supported_wf.close()

## Variants that we could not find queries for

In [10]:
# Variants skipped because no query string could be built, per query type
no_query_total

{'protein': 0, 'genomic': 406}

In [11]:
# Combined count across both query types
no_query_total_sum = sum(no_query_total.values())
no_query_total_sum

406

In [12]:
# Share of all CIViC variants for which no query was available
f"{no_query_total_sum / total_variants * 100}% of the total variants had no queries"

'11.713791113675708% of the total variants had no queries'

## Variants we do not support

In [13]:
# Total across all unsupported categories
do_not_support_total_sum = sum(variant_category_counts.values())

In [14]:
# Adjacent f-strings are joined into the single message displayed by this cell
(
    f"Total number of variants we do not support in the Variation Normalizer: "
    f"{do_not_support_total_sum}"
)

'Total number of variants we do not support in the Variation Normalizer: 939'

In [15]:
# Unsupported-category counts, largest first
dict(
    sorted(
        variant_category_counts.items(),
        key=lambda name_and_count: name_and_count[1],
        reverse=True,
    )
)

{'FUSION': 311,
 'EXPRESSION': 280,
 'REGION_DEFINED_VAR': 154,
 'FUNCTIONAL_GENE_FUNC': 82,
 'FUNCTIONAL_PROTEIN_CONS': 30,
 'COPY_NUMBER': 27,
 'EPIGENETIC_MODIFICATION': 15,
 'REARRANGEMENTS': 15,
 'OTHER': 14,
 'GENOTYPES_EASY': 7,
 'GENOTYPES_COMPOUND': 4}

In [16]:
# Share of all CIViC variants that fall into an unsupported category
f"The Variation Normalizer does not support {do_not_support_total_sum / total_variants * 100}% of the total variants"

'The Variation Normalizer does not support 27.091748413156374% of the total variants'

## Variants we should be able to normalize

In [17]:
# Variants that were sent to the Variation Normalizer, per query type
should_be_able_to_normalize_total

{'protein': 1718, 'genomic': 403}

In [18]:
# Combined count across both query types
should_be_able_to_normalize_total_sum = sum(should_be_able_to_normalize_total.values())
should_be_able_to_normalize_total_sum

2121

In [19]:
# Share of all CIViC variants for which normalization was attempted
f"The Variation Normalizer SHOULD be able to normalize {should_be_able_to_normalize_total_sum / total_variants * 100}% of the total variants"

'The Variation Normalizer SHOULD be able to normalize 61.19446047316792% of the total variants'

## Variants we were not able to normalize

These failures are due to either a bug or an unsupported query type in the Variation Normalizer.

In [20]:
# Queries that returned no variation descriptor, per query type
unable_to_normalize_total

{'protein': 325, 'genomic': 0}

In [21]:
# Combined count across both query types
unable_to_normalize_total_sum = sum(unable_to_normalize_total.values())
unable_to_normalize_total_sum

325

In [22]:
# Expressed against all CIViC variants, not only those that were attempted
f"The Variation Normalizer was unable to normalize {unable_to_normalize_total_sum / total_variants * 100}% of the total variants"

'The Variation Normalizer was unable to normalize 9.376803231390653% of the total variants'

## Variant queries that raised an exception during normalization

In [23]:
# Queries for which the normalize call raised an exception, per query type
exception_total

{'protein': 0, 'genomic': 0}

In [24]:
# Combined count across both query types
exception_total_sum = sum(exception_total.values())
exception_total_sum

0

In [25]:
# Expressed against all CIViC variants, not only those that were attempted
f"The Variation Normalizer raised an exception for {exception_total_sum / total_variants * 100}% of the total variants"

'The Variation Normalizer raised an exception for 0.0% of the total variants'

## Variants we were able to normalize

In [26]:
# Queries that returned a variation descriptor, per query type
can_normalize_total

{'protein': 1393, 'genomic': 403}

In [27]:
# Combined count across both query types
can_normalize_total_sum = sum(can_normalize_total.values())
can_normalize_total_sum

1796

In [28]:
# Expressed against all CIViC variants, not only those that were attempted
f"The Variation Normalizer successfully normalized {can_normalize_total_sum / total_variants * 100}% of the total variants"

'The Variation Normalizer successfully normalized 51.81765724177726% of the total variants'

## Duplicate Queries

These are duplicate queries found in CIViC. The keys are the queries and the values are the IDs of the CIViC variants that share each query.

In [29]:
# Keep only the queries that were produced by more than one CIViC variant
{
    query: variant_ids
    for query, variant_ids in queries_found.items()
    if len(variant_ids) > 1
}

{'NC_000003.11:g.10188243T>C': [2034, 2508],
 'NC_000003.11:g.10183725C>G': [1751, 1787]}