# Analysis for CIViC data

This notebook analyzes CIViC variant data to determine which variants the Variation Normalizer can and cannot normalize.

In [1]:
import logging
from enum import Enum
import re
from typing import Tuple, Optional
import csv

from civicpy import civic as civicpy
from dotenv import load_dotenv

from variation.query import QueryHandler

# Raise the root logger's threshold so only WARNING and above are emitted.
# getLogger() with no name always returns the root logger; before Python 3.9,
# getLogger("root") returned a separate logger that was merely named "root".
logging.getLogger().setLevel(logging.WARNING)



In [2]:
# Load settings from a local .env file into the process environment.
# Environment variables are set for the gene-normalizer DynamoDB instance and
# the UTA DB credentials.
load_dotenv()

True

In [3]:
# Variation Normalizer entry point; its normalize_handler is awaited once per
# query in the classification cell further down.
query_handler = QueryHandler()

In [4]:
# Optional: uncomment the next line to get the latest CIViC data
# civicpy.update_cache(from_remote_cache=False)

In [5]:
# Load the local CIViC cache.
# NOTE(review): on_stale="ignore" presumably loads the cache even when it is
# out of date -- confirm against the civicpy documentation.
civicpy.load_cache(on_stale="ignore")

True

In [6]:
# Fetch every CIViC variant record and display how many there are.
variants = civicpy.get_all_variants()
total_variants = len(variants)
f"Total Number of variants in CIViC: {total_variants}"

'Total Number of variants in CIViC: 3466'

In [7]:
class Category(str, Enum):
    """Create enum for the kind of variants that are in CIViC.

    Each member's value is a human-readable label. Members are used as the keys
    of ``not_supported``, and member *names* are used as the keys of
    ``category_counts``, so the declaration order below is the initial order of
    that counts dict.
    """
    # Expression / level changes
    EXPRESSION = "Expression"
    EPIGENETIC_MODIFICATION = "Epigenetic Modification"
    FUSION = "Fusion"
    # Functional consequences at the protein and gene level
    FUNCTIONAL_PROTEIN_CONS = "Functional Protein Consequence"
    FUNCTIONAL_GENE_FUNC = "Functional Gene Function"
    REARRANGEMENTS = "Rearrangements"
    COPY_NUMBER = "Copy Number"
    # Also used as the fallback when a variant name matches several categories
    OTHER = "Other"
    GENOTYPES_EASY = "Genotypes Easy"
    GENOTYPES_COMPOUND = "Genotypes Compound"
    REGION_DEFINED_VAR = "Region Defined Variant"
    

Below are terms found in CIViC variant names that we know the Variation Normalizer cannot support.

In [8]:
# Category -> lowercase terms. A variant is assigned a category when any of the
# category's terms occurs as a substring of its lowercased name.
# NOTE(review): some terms are substrings of terms in other categories
# ("dup"/"repeat" vs. "internal tandem duplications"/"tandem repeat",
# "inactivation" vs. "biallelic inactivation"), so those names match more than
# one category and end up classified as OTHER by the classification cell.
not_supported = {
    Category.EXPRESSION: {
        "overexpression",
        "expression",
        "underexpression",
        "serum levels",
        "transcription levels",
    },
    Category.EPIGENETIC_MODIFICATION: {
        "methylation",
        "promoter hypermethylation",
        "promoter methylation",
        "phosphorylation",
    },
    Category.FUSION: {
        "::",
        "fusion",
    },
    Category.FUNCTIONAL_PROTEIN_CONS: {
        "frameshift truncation",
        "frameshift",
        "frame shift",
    },
    Category.FUNCTIONAL_GENE_FUNC: {
        "gain of function",
        "gain-of-function",
        "loss of function",
        "loss-of-function",
        "activating mutation",
        "tkd mutation",
        "inactivation",
    },
    Category.REARRANGEMENTS: {
        "translocation",
        "rearrangement",
        "double ph",
        "alu insertion",
        "exon 20 insertion",
        "internal tandem duplications",
        "tandem repeat",
    },
    Category.COPY_NUMBER: {
        "copy number",
        "repeat",
        "dup",
    },
    Category.OTHER: {
        "cytoplasmic mislocalization",
        "alternative transcript",
    },
    Category.GENOTYPES_EASY: {
        "diplotypes",
        "wild type",
        "wildtype",
    },
    Category.GENOTYPES_COMPOUND: {
        "loss of heterozygosity",
        "biallelic inactivation",
        "bi-allelic inactivation",
        "homozygosity",
    },
    Category.REGION_DEFINED_VAR: {
        "deleterious mutation",
        "exon deletion",
        "domain mutation",
        "polymorphism",
        "non-p-loop mutation",
        "p-loop mutation",
    },
}

In [9]:
def get_variant_name(variant: civicpy.Variant) -> Tuple[Optional[str], str]:
    """Choose the query string to use for a CIViC Variant.

    A name containing "c." is replaced by the first HGVS expression on the
    record that contains "g."; any other name is used as-is (whitespace
    stripped) and treated as a protein query.

    :param civicpy.Variant variant: CIViC Variant record
    :return: (query, query type). The query is None when the name contains "c."
        but no "g." HGVS expression exists. The query type is "genomic" or
        "protein".
    """
    if "c." not in variant.name:
        return variant.name.strip(), "protein"

    # First genomic HGVS expression, or None when the record has none
    genomic_expr = next(
        (expr for expr in variant.hgvs_expressions if "g." in expr), None
    )
    return genomic_expr, "genomic"


In [10]:

# This file contains CIViC Variants where there are no queries available.
# One example would be a CIViC Variant whose name has "c." in it. In this case,
# we want the genomic representative. We look at the HGVS expressions to find a genomic
# expression. If there is no genomic HGVS expression, we have no name for a query
no_query_wf = open("no_query.csv", "w+")
no_query_wr = csv.writer(no_query_wf, delimiter="\t")
no_query_wr.writerow(["variant_id", "variant_name"])

# This file contains protein queries (gene + variant_name) we SHOULD be able to
# normalize
protein_variants_wf = open("all_protein_variant_queries.csv", "w+")
protein_variants_wr = csv.writer(protein_variants_wf, delimiter="\t")
protein_variants_wr.writerow(["variant_id", "gene_name", "variant_name"])

# This file contains genomic queries (genomic HGVS expressions) we SHOULD be able to
# normalize
genomic_variants_wf = open("all_genomic_variant_queries.csv", "w+")
genomic_variants_wr = csv.writer(genomic_variants_wf, delimiter="\t")
genomic_variants_wr.writerow(["variant_id", "hgvs_g"])

# This file contains CIViC Variants we do not currently support in Variation Normalizer.
# In these cases, we do not even attempt to try to normalize
not_supported_wf = open("not_supported_variants.csv", "w+")
not_supported_wr = csv.writer(not_supported_wf, delimiter="\t")
not_supported_wr.writerow(["variant_id", "gene_name", "variant_name", "category"])

# This file contains CIViC Variant queries that we were not able to normalize.
unable_to_normalize_wf = open("unable_to_normalize_queries.csv", "w+")
unable_to_normalize_wr = csv.writer(unable_to_normalize_wf, delimiter="\t")
unable_to_normalize_wr.writerow(["variant_id", "query", "query_type",
                                 "exception_raised", "message", "warnings"])

# Category name for variants we do not support: number of variants we found
category_counts = {c: 0 for c in Category.__members__}

i = 0
for variant in variants:
    v_name, v_q_type = get_variant_name(variant)
    if not v_name:
        no_query_wr.writerow([variant.id, variant.name])
        continue
    
    gene_name = variant.gene.name.strip()
    v_name_lower = v_name.lower()
    
    categories = set()
    if v_name_lower in {"loss", "deletion"}:
        categories.add(Category.FUNCTIONAL_GENE_FUNC)
    elif v_name_lower == "mutation":
        categories.add(Category.REGION_DEFINED_VAR)
    else:
        if re.match(r".*e\d+-e\d+", v_name_lower):  # ex: e20-e20
            categories.add(Category.FUSION)
        
        if "exon" in v_name_lower:
            if {x for x in {"deletion", "mutation"}}:
                categories.add(Category.REGION_DEFINED_VAR)
        
        if v_name_lower.endswith("fs"):
            categories.add(Category.FUNCTIONAL_PROTEIN_CONS)

        for k, v in not_supported.items():
            if {x for x in v if x in v_name_lower}:
                categories.add(k)

    if len(categories) > 1:
        # Those with multiple categories will be classified as other
        categories = {Category.OTHER}

    if len(categories) == 1:
        category_name = categories.pop()
        category_counts[category_name.name] += 1
        not_supported_wr.writerow([variant.id, gene_name, variant.name, category_name])
    else:
        # We should support this, so we need to query the variation normalizer
        if v_q_type == "protein":
            q = f"{gene_name} {v_name}"
            protein_variants_wr.writerow([variant.id, gene_name, v_name])
        else:
            q = v_name
            genomic_variants_wr.writerow([variant.id, q])

        try:
            variation_norm_resp = await query_handler.normalize_handler.normalize(q)
            if not variation_norm_resp.variation_descriptor:
                unable_to_normalize_wr.writerow([variant.id, q, v_q_type, False,
                                                 "unable to normalize",
                                                 variation_norm_resp.warnings])
        except Exception as e:
            unable_to_normalize_wr.writerow([variant.id, q, v_q_type, True, str(e),
                                             None])

# Close all files
no_query_wf.close()
protein_variants_wf.close()
genomic_variants_wf.close()
not_supported_wf.close()

## Variants we do not support

In [11]:
# Sum of the per-category counts gathered by the classification cell
(
    "Total number of variants we do not support in the Variation Normalizer: "
    f"{sum(category_counts.values())}"
)

'Total number of variants we do not support in the Variation Normalizer: 939'

In [12]:
# Per-category counts, largest first (ties keep their original order)
{
    name: count
    for name, count in sorted(
        category_counts.items(), key=lambda item: item[1], reverse=True
    )
}

{'FUSION': 311,
 'EXPRESSION': 280,
 'REGION_DEFINED_VAR': 154,
 'FUNCTIONAL_GENE_FUNC': 82,
 'FUNCTIONAL_PROTEIN_CONS': 30,
 'COPY_NUMBER': 27,
 'EPIGENETIC_MODIFICATION': 15,
 'REARRANGEMENTS': 15,
 'OTHER': 14,
 'GENOTYPES_EASY': 7,
 'GENOTYPES_COMPOUND': 4}