# Analysis for CIViC data

This notebook analyzes how many CIViC variants the Variation Normalizer can normalize.

In [1]:
import logging
from enum import Enum
import re
import csv

from civicpy import civic as civicpy
from dotenv import load_dotenv

from variation.query import QueryHandler

# Quiet everything below WARNING. getLogger() with no name always returns the
# root logger; getLogger("root") only does so on Python >= 3.9 (earlier versions
# create an unrelated logger that happens to be named "root").
logging.getLogger().setLevel(logging.WARNING)



In [2]:
# Load environment variables from a .env file. They are expected to provide the
# gene-normalizer DynamoDB instance settings and the UTA DB credentials.
load_dotenv()

True

In [3]:
# Variation Normalizer entry point; its normalize_handler is used for every
# normalization query issued later in this notebook
query_handler = QueryHandler()

In [4]:
# Uncomment the next line to refresh the local CIViC cache with the latest data
# civicpy.update_cache(from_remote_cache=False)

In [5]:
# Load the local CIViC cache.
# NOTE(review): on_stale="ignore" presumably means a stale cache is used as-is
# rather than refreshed - confirm against the civicpy documentation.
civicpy.load_cache(on_stale="ignore")

True

In [6]:
# Fetch every CIViC variant. total_variants is the denominator for all of the
# percentages reported later in this notebook.
variants = civicpy.get_all_variants()
total_variants = len(variants)
f"Total Number of variants in CIViC: {total_variants}"

'Total Number of variants in CIViC: 3466'

In [7]:
class VariantCategory(str, Enum):
    """Categories used to bucket CIViC variants in this analysis.

    Each member is also a ``str``, so it compares equal to its human-readable
    label. Declaration order is preserved by ``__members__``.
    """

    # Categories assigned from the variant name before any normalization attempt
    EXPRESSION = "Expression"
    EPIGENETIC_MODIFICATION = "Epigenetic Modification"
    FUSION = "Fusion"
    PROTEIN_CONS = "Protein Consequence"
    GENE_FUNC = "Gene Function"
    REARRANGEMENTS = "Rearrangements"
    COPY_NUMBER = "Copy Number"
    OTHER = "Other"
    GENOTYPES_EASY = "Genotypes Easy"
    GENOTYPES_COMPOUND = "Genotypes Compound"
    REGION_DEFINED_VAR = "Region Defined Variant"

    # Assigned only after a normalization attempt returned no result
    INCOMPLETE_VAR = "Incomplete Variants"
    

Below are terms that appear in CIViC variant names and that we know the Variation Normalizer cannot support.

In [8]:
# Maps each unsupported VariantCategory to the terms that identify it.
#
# Terms are matched as substrings of the LOWER-CASED variant name, so every term
# here must itself be lower case; a term containing an upper-case character can
# never match.
not_supported = {
    VariantCategory.EXPRESSION: {
        "overexpression", "expression", "underexpression", "serum levels",
        "transcription levels", "autocrine activation", "tnc-l", "promoter mutation",
        "top2a/90", "low ratio of vegf165b/vegftotal"
    },
    VariantCategory.EPIGENETIC_MODIFICATION: {
        "methylation", "promoter hypermethylation", "promoter methylation",
        "phosphorylation"
    },
    VariantCategory.FUSION: {
        "::", "fusion"
    },
    VariantCategory.PROTEIN_CONS: {
        "frameshift truncation", "frameshift", "frame shift", "fs",
        "truncating mutation", "1100delc"
    },
    VariantCategory.GENE_FUNC: {
        "gain of function", "gain-of-function", "loss of function", "loss-of-function",
        "activating mutation", "tkd mutation", "inactivation", "null", "viii"
    },
    VariantCategory.REARRANGEMENTS: {
        "translocation", "rearrangement", "double ph", "alu insertion",
        "exon 20 insertion", "internal tandem duplications", "tandem repeat",
        "itd"
    },
    VariantCategory.COPY_NUMBER: {
        "copy number", "repeat", "dup"
    },
    VariantCategory.OTHER: {
        "cytoplasmic mislocalization", "alternative transcript", "rare mutation",
        # "δ" is the lower-case form of "Δ" (str.lower() maps U+0394 to U+03B4)
        "splice", "splicing", "ceacam1-l", "ceacam1-s", "δ", "delta", "beta",
        "deprecated", "point mutations", "conserved domain mut", "cis double mutants",
        # lower-cased from "gBRCAm" so that it can actually match
        "loss-of-modification", "gbrcam"
    },
    VariantCategory.GENOTYPES_EASY: {
        "diplotypes", "wild type", "wildtype"
    },
    VariantCategory.GENOTYPES_COMPOUND: {
        "loss of heterozygosity", "biallelic inactivation", "bi-allelic inactivation",
        "homozygosity", "loh", "single allele deletion"
    },
    VariantCategory.REGION_DEFINED_VAR: {
        "deleterious mutation", "domain mutation", "polymorphism",
        "non-p-loop mutation", "p-loop mutation", "3' utr mutation", "alteration",
        "t17 deletion", "exon"
    }
}

In [9]:

# This file contains CIViC Variants where there are no queries available.
# One example would be a CIViC Variant whose name has "c." in it. In this case,
# we want the genomic representative. We look at the HGVS expressions to find a genomic
# expression. If there is no genomic HGVS expression, we have no name for a query
no_query_wf = open("incomplete_variants.csv", "w")
no_query_wr = csv.writer(no_query_wf, delimiter="\t")
no_query_wr.writerow(["variant_id", "variant_name"])

# This file contains protein queries (gene + variant_name) we SHOULD be able to
# normalize
protein_variants_wf = open("all_protein_variant_queries.csv", "w")
protein_variants_wr = csv.writer(protein_variants_wf, delimiter="\t")
protein_variants_wr.writerow(["variant_id", "gene_name", "variant_name"])

# This file contains genomic queries (genomic HGVS expressions) we SHOULD be able to
# normalize
genomic_variants_wf = open("all_genomic_variant_queries.csv", "w")
genomic_variants_wr = csv.writer(genomic_variants_wf, delimiter="\t")
genomic_variants_wr.writerow(["variant_id", "hgvs_g"])

# This file contains CIViC Variants we do not currently support in Variation Normalizer.
# In these cases, we do not even attempt to try to normalize
not_supported_wf = open("not_supported_variants.csv", "w")
not_supported_wr = csv.writer(not_supported_wf, delimiter="\t")
not_supported_wr.writerow(["variant_id", "gene_name", "variant_name", "category"])

# This file contains CIViC Variant queries that we were not able to normalize.
unable_to_normalize_wf = open("unable_to_normalize_queries.csv", "w")
unable_to_normalize_wr = csv.writer(unable_to_normalize_wf, delimiter="\t")
unable_to_normalize_wr.writerow(["variant_id", "query", "query_type",
                                 "exception_raised", "message", "warnings"])

# Category name for variants we do not support: number of variants we found
variant_category_counts = {c: 0 for c in VariantCategory.__members__}

# Keep track of total counts
no_query_total = {"protein": 0, "genomic": 0}
should_be_able_to_normalize_total = {"protein": 0, "genomic": 0}
can_normalize_total = {"protein": 0, "genomic": 0}
unable_to_normalize_total = {"protein": 0, "genomic": 0}
exception_total = {"protein": 0, "genomic": 0}

queries_found = dict()

for variant in variants:
    v_name = None
    v_q_type = None
    if "c." in variant.name:
        # Try getting genomic HGVS expression first
        v_name = ([expr for expr in variant.hgvs_expressions 
                         if "g." in expr] or [None])[0]

        # If there is no genomic HGVS expression, try using gnomad vcf
        if not v_name:
            chromosome = variant.coordinates.chromosome
            pos = variant.coordinates.start
            ref = variant.coordinates.reference_bases
            alt = variant.coordinates.variant_bases

            if all((chromosome, pos, ref, alt)):
                v_name = f"{chromosome}-{pos}-{ref}-{alt}"
        
        v_q_type = "genomic"
    else:
        v_name = variant.name.strip()
        v_q_type = "protein"

    if not v_name:
        no_query_wr.writerow([variant.id, variant.name])
        no_query_total[v_q_type] += 1
        continue
    
    gene_name = variant.gene.name.strip()
    v_name_lower = v_name.lower()
    
    categories = set()
    if v_name_lower in {"loss", "deletion"}:
        categories.add(VariantCategory.GENE_FUNC)
    elif v_name_lower == "mutation":
        categories.add(VariantCategory.REGION_DEFINED_VAR)
    else:
        if re.match(r".*e\d+-e\d+", v_name_lower):  # ex: e20-e20
            categories.add(VariantCategory.FUSION)

        if any((
            "exon" in v_name_lower,
            re.match(r"\d+kb\sdeletion", v_name_lower),
            re.match(r"partial\sdeletion\sof\s\d+(.\d+)?\skb", v_name_lower),
            re.match(r"del\s\d+-\d+", v_name_lower),
            re.match(r"\d+(p|q)\d+(.\d+)?-\d+(.\d+)?\s\d+mb del", v_name_lower),
            re.match(r"intron\s\d+\smutation", v_name_lower)
        )):
            categories.add(VariantCategory.REGION_DEFINED_VAR)
        
        if any((
            re.match(r"t\(.*\)\(.*\)", v_name_lower), # ex: t(1;3)(p36.3;p25)
            re.match(r"\w\d+ins$", v_name_lower)  # ex: P780INS
        )):  
            categories.add(VariantCategory.REARRANGEMENTS)

        if any((
            re.match(r"^rs\d+", v_name_lower),
            re.match(r"class\s\d+\smutation", v_name_lower),
            re.match(r"\d+\s\((c|a|g|t)+-(c|a|g|t)+\)", v_name_lower)
        )):
            categories.add(VariantCategory.OTHER)

        if re.match(r"cd\d+v\d+", v_name_lower):
            categories.add(VariantCategory.EXPRESSION)

        if any((
            re.match(r"\w+\d+$", v_name_lower),  # ex: V600
            re.match(r"\w+\d+\w+\/\w+$", v_name_lower),  # ex: S893A/T
            re.match(r"[a-z]+\d+[a-z]+\sand\s[a-z]+\d+[a-z]+", v_name_lower),  # ex: E2014K and E2419K
        )):
            categories.add(VariantCategory.PROTEIN_CONS)

        for k, v in not_supported.items():
            if {x for x in v if x in v_name_lower}:
                categories.add(k)

    if len(categories) > 1:
        # Those with multiple categories will be classified as other
        categories = {VariantCategory.OTHER}

    if len(categories) == 1:
        variant_category_name = categories.pop()
        variant_category_counts[variant_category_name.name] += 1
        not_supported_wr.writerow([variant.id, gene_name, variant.name, variant_category_name])
    else:
        # We should support this, so we need to query the variation normalizer
        if v_q_type == "protein":
            q = f"{gene_name} {v_name}"
            protein_variants_wr.writerow([variant.id, gene_name, v_name])
        else:
            q = v_name
            genomic_variants_wr.writerow([variant.id, q])

        should_be_able_to_normalize_total[v_q_type] += 1

        
        if q in queries_found:
            queries_found[q].append(variant.id)
        else:
            queries_found[q] = [variant.id]
            
        try:
            variation_norm_resp = await query_handler.normalize_handler.normalize(q)
            if not variation_norm_resp.variation_descriptor:

                is_incomplete = False
                if v_q_type == "protein" and len(v_name.split()) == 1:
                    if "-" in v_name:
                        # could be {gene}-{gene}
                        genes = v_name.split("-")
                    else:
                        genes = [v_name]

                    is_genes = True
                    for g in genes:
                        if query_handler.normalize_handler.gene_normalizer.normalize(v_name).match_type == 0:
                            # not a gene 
                            is_genes = False
                            break

                    if is_genes:
                        variant_category_name = VariantCategory.INCOMPLETE_VAR
                        variant_category_counts[variant_category_name.name] += 1
                        not_supported_wr.writerow([variant.id, gene_name, variant.name, variant_category_name])
                        is_incomplete = True

                if not is_incomplete:
                    unable_to_normalize_wr.writerow([variant.id, q, v_q_type, False,
                                                    "unable to normalize",
                                                    variation_norm_resp.warnings])
                    unable_to_normalize_total[v_q_type] += 1
            else:
                can_normalize_total[v_q_type] += 1
        except Exception as e:
            unable_to_normalize_wr.writerow([variant.id, q, v_q_type, True, str(e),
                                             None])
            exception_total[v_q_type] += 1

# Close all files
no_query_wf.close()
protein_variants_wf.close()
genomic_variants_wf.close()
not_supported_wf.close()
unable_to_normalize_wf.close()

## Variants that we could not find queries for

In [10]:
no_query_total

{'protein': 0, 'genomic': 384}

In [11]:
# Combine the protein and genomic counts of variants that had no query
no_query_total_sum = sum(no_query_total.values())
no_query_total_sum

384

In [12]:
# Expressed as a share of ALL CIViC variants
f"{no_query_total_sum / total_variants * 100:.2f}% of the total variants had no queries"

'11.08% of the total variants had no queries'

## Variants we do not support

In [13]:
# Total unsupported variants = sum of the per-category counts
do_not_support_total_sum = sum(variant_category_counts.values())

In [14]:
# Adjacent string literals inside parentheses are concatenated by the parser
(
    "Total number of variants we do not support in the Variation Normalizer: "
    f"{do_not_support_total_sum}"
)

'Total number of variants we do not support in the Variation Normalizer: 1151'

Below is the total number of variants in each category that we do not support

In [15]:
# Order the categories from most to least frequent
sorted_variant_cat_counts = dict(
    sorted(
        variant_category_counts.items(),
        key=lambda name_and_count: name_and_count[1],
        reverse=True,
    )
)
sorted_variant_cat_counts

{'FUSION': 310,
 'EXPRESSION': 288,
 'REGION_DEFINED_VAR': 174,
 'PROTEIN_CONS': 124,
 'GENE_FUNC': 89,
 'OTHER': 71,
 'REARRANGEMENTS': 37,
 'COPY_NUMBER': 26,
 'EPIGENETIC_MODIFICATION': 15,
 'GENOTYPES_EASY': 7,
 'GENOTYPES_COMPOUND': 6,
 'INCOMPLETE_VAR': 4}

Below is the total percentage of variants for each category that we do not support

In [16]:
# Each category's count as a percentage of ALL CIViC variants
{
    category: f"{count / total_variants * 100:.2f}%"
    for category, count in sorted_variant_cat_counts.items()
}

{'FUSION': '8.94%',
 'EXPRESSION': '8.31%',
 'REGION_DEFINED_VAR': '5.02%',
 'PROTEIN_CONS': '3.58%',
 'GENE_FUNC': '2.57%',
 'OTHER': '2.05%',
 'REARRANGEMENTS': '1.07%',
 'COPY_NUMBER': '0.75%',
 'EPIGENETIC_MODIFICATION': '0.43%',
 'GENOTYPES_EASY': '0.20%',
 'GENOTYPES_COMPOUND': '0.17%',
 'INCOMPLETE_VAR': '0.12%'}

In [17]:
# Unsupported variants as a share of ALL CIViC variants
f"The Variation Normalizer does not support {do_not_support_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer does not support 33.21% of the total variants'

## Variants we should be able to normalize

In [18]:
should_be_able_to_normalize_total

{'protein': 1510, 'genomic': 425}

In [19]:
# Combine the protein and genomic counts of queries we attempted to normalize
should_be_able_to_normalize_total_sum = sum(should_be_able_to_normalize_total.values())
should_be_able_to_normalize_total_sum

1935

In [20]:
# Attempted queries as a share of ALL CIViC variants
f"The Variation Normalizer SHOULD be able to normalize {should_be_able_to_normalize_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer SHOULD be able to normalize 55.83% of the total variants'

## Variants we were not able to normalize

Either due to a bug or an unsupported query type in Variation Normalizer

In [21]:
unable_to_normalize_total

{'protein': 113, 'genomic': 3}

In [22]:
# Combine the protein and genomic counts of queries that returned no result
# (queries that raised an exception are counted separately in exception_total)
unable_to_normalize_total_sum = sum(unable_to_normalize_total.values())
unable_to_normalize_total_sum

116

In [23]:
# Failed queries as a share of ALL CIViC variants, rounded to two decimals like
# every other percentage in this notebook
f"The Variation Normalizer was unable to normalize {unable_to_normalize_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer was unable to normalize 3.346797461050202% of the total variants'

## Breakdown of the variants we weren't able to normalize

In this section, we break down the reasons why we weren't able to normalize variants.

In [24]:
# Tally why each row of unable_to_normalize_queries.csv failed, based on the
# text of its last column ("warnings").
unable_to_tokenize = 0
unable_to_find_valid = 0
other = 0
# newline="" is what the csv module documentation requires for files handed to
# csv.reader, so the reader handles line endings itself
with open("unable_to_normalize_queries.csv", "r", newline="") as f:
    reader = csv.reader(f, delimiter="\t")
    next(reader, None)  # skip the header row; tolerate an empty file
    for row in reader:
        if not row:
            # Blank line: row[-1] would raise IndexError, and there is nothing
            # to classify
            continue
        if "Unable to find valid result" in row[-1]:
            unable_to_find_valid += 1
        elif "Unable to tokenize" in row[-1]:
            unable_to_tokenize += 1
        else:
            other += 1

### Due to not passing validation checks

The Variation Normalizer performs validation checks on the input query (such as reference sequence). If these validation checks fail, then the input query will fail to normalize.

In [25]:
# Percentage is relative to ALL CIViC variants, not only the attempted queries
f"The Variation Normalizer found {unable_to_find_valid} invalid variants (This is {unable_to_find_valid / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer found 62 invalid variants (This is 1.79% of the total variants).'

### Due to tokenization 

The Variation Normalizer will tokenize the input query to determine the kind of token. It is limited in the kinds of tokens it accepts, so these tokens are not yet supported in the Variation Normalizer. 

In [26]:
# Percentage is relative to ALL CIViC variants, not only the attempted queries
f"The Variation Normalizer was unable to tokenize {unable_to_tokenize} variants ({unable_to_tokenize / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer was unable to tokenize 54 variants (1.56% of the total variants).'

In [27]:
# "other" counts rows whose warnings column matched neither known message
f"The Variation Normalizer was unable to normalize {other} variants due to other issues (This is {other / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer was unable to normalize 0 variants due to other issues (This is 0.00% of the total variants).'

## Variant queries that raised an exception during normalization

In [28]:
exception_total

{'protein': 0, 'genomic': 0}

In [29]:
# Combine the protein and genomic counts of queries that raised an exception
exception_total_sum = sum(exception_total.values())
exception_total_sum

0

In [30]:
# Exceptions as a share of ALL CIViC variants
f"The Variation Normalizer raised an exception for {exception_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer raised an exception for 0.00% of the total variants'

## Variants we were able to normalize

In [31]:
can_normalize_total

{'protein': 1393, 'genomic': 422}

In [32]:
# Combine the protein and genomic counts of successfully normalized queries
can_normalize_total_sum = sum(can_normalize_total.values())
can_normalize_total_sum

1815

In [33]:
# Successes as a share of ALL CIViC variants (not only the attempted queries)
f"The Variation Normalizer successfully normalized {can_normalize_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer successfully normalized 52.37% of the total variants'

## Duplicate Queries

These are queries that more than one CIViC variant produced. Each value is the list of variant IDs that share that query.

In [34]:
# Keep only the queries that were generated by more than one CIViC variant
{
    query: variant_ids
    for query, variant_ids in queries_found.items()
    if len(variant_ids) > 1
}

{'NC_000003.11:g.10188243T>C': [2034, 2508],
 'NC_000003.11:g.10183725C>G': [1751, 1787]}