In [29]:
from enum import Enum
import re
import csv

from dotenv import load_dotenv
import requests

from variation.query import QueryHandler

In [30]:
# Load environment variables before any handler is created. They are expected to
# provide the settings for the gene-normalizer DynamoDB instance and the UTA DB
# credentials.
load_dotenv()

True

In [31]:
# One handler shared by the whole notebook; the main loop below calls
# query_handler.normalize_handler.normalize() for every supported query.
query_handler = QueryHandler()

In [32]:
# Get all variants (features) from the MOAlmanac API.
# - timeout: without one, requests waits indefinitely on an unresponsive server
#   and the notebook hangs.
# - raise_for_status(): surfaces HTTP errors here instead of as a confusing
#   JSON-decoding or KeyError failure in a later cell.
r = requests.get("https://moalmanac.org/api/features", timeout=60)
r.raise_for_status()
variants_resp = r.json()
variants = list()  # NOTE(review): not used by any cell visible in this notebook

In [33]:
class VariantCategory(str, Enum):
    """Categories used in this notebook to group the variants found in MOA.

    The ``str`` mixin makes every member a string equal to its value (the
    human-readable label). Member *names* (e.g. ``PROTEIN_CONS``) are used as the
    keys of ``not_supported_feature_counts``, so the declaration order below is
    also the order in which zero-count categories are reported.
    """
    EXPRESSION = "Expression"
    EPIGENETIC_MODIFICATION = "Epigenetic Modification"
    FUSION = "Fusion"
    PROTEIN_CONS = "Protein Consequence"
    GENE_FUNC = "Gene Function"
    REARRANGEMENTS = "Rearrangements"
    COPY_NUMBER = "Copy Number"
    OTHER = "Other"
    GENOTYPES_EASY = "Genotypes Easy"
    GENOTYPES_COMPOUND = "Genotypes Compound"
    REGION_DEFINED_VAR = "Region Defined Variant"
    INCOMPLETE_VAR = "Incomplete Variants"
    

In [34]:
# Map each MOA feature type (the "feature_type" attribute of an API record) to
# the VariantCategory it is reported under. Every feature type handled by the
# main loop must have an entry here, because the loop indexes this dict directly.
# NOTE(review): some groupings look approximate (e.g. microsatellite_stability ->
# REARRANGEMENTS, neoantigen_burden -> EXPRESSION) — confirm they are intended.
moa_ft_to_variant_categories = {
    "rearrangement": VariantCategory.REARRANGEMENTS,
    "somatic_variant": VariantCategory.PROTEIN_CONS,
    "germline_variant": VariantCategory.PROTEIN_CONS,
    "copy_number": VariantCategory.COPY_NUMBER,
    "microsatellite_stability": VariantCategory.REARRANGEMENTS,
    "mutational_signature": VariantCategory.OTHER,
    "mutational_burden": VariantCategory.OTHER,
    "neoantigen_burden": VariantCategory.EXPRESSION,
    "knockdown": VariantCategory.EXPRESSION,
    "silencing": VariantCategory.EXPRESSION,
    "aneuploidy": VariantCategory.COPY_NUMBER
}

In [35]:

# This file contains MOA Variants where there are no queries available. 
no_query_wf = open("no_query.csv", "w")
no_query_wr = csv.writer(no_query_wf, delimiter="\t")
no_query_wr.writerow(["variant_id", "feature"])

# This file contains all queries we SHOULD be able to normalize
all_queries_wf = open("all_queries.csv", "w")
all_queries_wr = csv.writer(all_queries_wf, delimiter="\t")
all_queries_wr.writerow(["variant_id", "query", "moa_feature_type", "category"])

# This file contains MOAlmanac Variants we do not currently support in Variation Normalizer.
# In these cases, we do not even attempt to try to normalize
not_supported_wf = open("not_supported_variants.csv", "w")
not_supported_wr = csv.writer(not_supported_wf, delimiter="\t")
not_supported_wr.writerow(["variant_id", "query", "moa_feature_type", "category"])

# This file contains MOAlmanac Variant queries that we were not able to normalize.
unable_to_normalize_wf = open("unable_to_normalize_queries.csv", "w")
unable_to_normalize_wr = csv.writer(unable_to_normalize_wf, delimiter="\t")
unable_to_normalize_wr.writerow(["variant_id", "query", "moa_feature_type", "category",
                                 "exception_raised", "message", "warnings"])

# This file contains MOAlmanac Variant queries that we were able to normalize.
able_to_normalize_wf = open("able_to_normalize_queries.csv", "w")
able_to_normalize_wr = csv.writer(able_to_normalize_wf, delimiter="\t")
able_to_normalize_wr.writerow(["variant_id", "query", "moa_feature_type", "category",
                               "vrs_id"])

# Category name for variants we do not support: number of variants we found
not_supported_feature_counts = {c: 0 for c in VariantCategory.__members__}

# Keep track of total counts
total_variants = 0
no_query_total = 0
should_be_able_to_normalize_total = 0
can_normalize_total = 0
unable_to_normalize_total = 0
exception_total = 0

queries_found = dict()

for v in variants_resp:
    total_variants += 1
    variant_id = v["feature_id"]
    variant_record = {
        "id": variant_id
    }
    variant_record.update(v["attributes"][0])

    feature_type = variant_record["feature_type"]
    supported = False

    feature = None
    if feature_type == "rearrangement":
        feature = "{}{}{}{}".format(variant_record["gene1"] if variant_record.get("gene1") else "",
                                      f"--{variant_record['gene2']}" if variant_record.get("gene2") else "",
                                      f" {variant_record['locus']}" if variant_record.get("locus") else "",
                                      f" {variant_record['rearrangement_type']}"
                                      if variant_record.get("rearrangement_type") else "")
    elif feature_type == "somatic_variant":
        feature = "{}{}".format(variant_record["gene"] if variant_record.get("gene") else "",
                                    f" {variant_record['protein_change']}"
                                    if variant_record.get("protein_change") else "")
        supported = True
    elif feature_type == "germline_variant":
        feature = "{}{}".format(variant_record["gene"] if variant_record.get("gene") else "",
                                    f" {variant_record['protein_change']}"
                                    if variant_record.get("protein_change") else "")
        supported = True
    elif feature_type == "copy_number":
        # These are all {gene} Amplification|Deletion
        feature = "{} {}".format(variant_record["gene"], variant_record["direction"])
        if variant_record["direction"] == "Amplification":
            supported = True
    elif feature_type == "microsatellite_stability":
        # The only one is MSI-High
        feature = "{}".format(variant_record.get("status"))
    elif feature_type == "mutational_signature":
        csn = variant_record["cosmic_signature_number"]
        version = variant_record["cosmic_signature_version"]
        feature = "COSMIC Signature (version {}) {}".format(version, csn)
    elif feature_type == "mutational_burden":
        clss = variant_record["classification"]
        min_mut = variant_record["minimum_mutations"]
        mut_per_mb = variant_record["mutations_per_mb"]
        feature = "{}{}".format(clss,
                                f" (>= {min_mut} mutations)" if min_mut
                                else (f" (>= {mut_per_mb} mutations/Mb)"
                                        if mut_per_mb else ""))
    elif feature_type == "neoantigen_burden":
        # Doesn't seem like there are any of these
        feature = "{}".format(variant_record["classification"])
    elif feature_type == "knockdown" or feature_type == "silencing":
        feature = "{}{}".format(variant_record["gene"], f" ({variant_record['technique']})"
                                if variant_record["technique"] else "")
    elif feature_type == "aneuploidy":
        # The only one is Whole genome doubling
        feature = "{}".format(variant_record["event"])
        
    if not feature:
        no_query_total += 1
        no_query_wr.writerow([variant_id, v])
        continue

    category_name = None
    if any((
        re.match(r".+fs\*\d+$", feature, re.IGNORECASE),
        re.match(r".+\d+$", feature, re.IGNORECASE)
    )):
        supported = False
        category_name = VariantCategory.PROTEIN_CONS

    if not category_name:
        category_name = moa_ft_to_variant_categories[feature_type]
        
    if not supported or len(feature.split()) == 1:
        not_supported_feature_counts[category_name.name] += 1
        not_supported_wr.writerow([variant_id, feature, feature_type, category_name])
        continue

    should_be_able_to_normalize_total += 1
    all_queries_wr.writerow([variant_id, feature, feature_type, category_name])
    
    if feature in queries_found:
        queries_found[feature].append(variant_id)
    else:
        queries_found[feature] = [variant_id]

    try:
        variation_norm_resp = await query_handler.normalize_handler.normalize(feature)
        if not variation_norm_resp.variation_descriptor:
            unable_to_normalize_wr.writerow([variant_id, feature, feature_type,
                                             category_name, False, "unable to normalize", 
                                             variation_norm_resp.warnings])
            unable_to_normalize_total += 1
        else:
            can_normalize_total += 1
            vrs_id = variation_norm_resp.variation_descriptor.variation.id
            able_to_normalize_wr.writerow([variant_id, feature, feature_type, 
                                           category_name, vrs_id])
    except Exception as e:
        unable_to_normalize_wr.writerow([variant_id, feature, feature_type, 
                                         category_name, True, str(e), None])
        exception_total += 1

# Close files
no_query_wf.close()
all_queries_wf.close()
not_supported_wf.close()
unable_to_normalize_wf.close()
able_to_normalize_wf.close()

In [36]:
f"Total number of variants in MOAlmanac: {total_variants}"

'Total number of variants in MOAlmanac: 423'

# Variants that we could not find queries for

In [37]:
no_query_total

0

In [38]:
f"{no_query_total / total_variants * 100:.2f}% of the total features had no queries"

'0.00% of the total features had no queries'

# Variants we do not support

In [39]:
do_not_support_total_sum = sum(not_supported_feature_counts.values())
f"Total number of variants we do not support in the Variation Normalizer: {do_not_support_total_sum}"

'Total number of variants we do not support in the Variation Normalizer: 244'

In [40]:
f"The Variation Normalizer does not support {do_not_support_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer does not support 57.68% of the total variants'

Below is the total number of variants we do not support, broken down by variant category

In [41]:
sorted_not_sup_counts = dict(sorted(not_supported_feature_counts.items(), key=lambda x: x[1], reverse=True))
sorted_not_sup_counts

{'PROTEIN_CONS': 176,
 'REARRANGEMENTS': 36,
 'COPY_NUMBER': 17,
 'EXPRESSION': 11,
 'OTHER': 4,
 'EPIGENETIC_MODIFICATION': 0,
 'FUSION': 0,
 'GENE_FUNC': 0,
 'GENOTYPES_EASY': 0,
 'GENOTYPES_COMPOUND': 0,
 'REGION_DEFINED_VAR': 0,
 'INCOMPLETE_VAR': 0}

Below is the percentage of the total variants that we do not support, broken down by variant category

In [42]:
{k: f"{v / total_variants * 100:.2f}%" for k, v in sorted_not_sup_counts.items()}

{'PROTEIN_CONS': '41.61%',
 'REARRANGEMENTS': '8.51%',
 'COPY_NUMBER': '4.02%',
 'EXPRESSION': '2.60%',
 'OTHER': '0.95%',
 'EPIGENETIC_MODIFICATION': '0.00%',
 'FUSION': '0.00%',
 'GENE_FUNC': '0.00%',
 'GENOTYPES_EASY': '0.00%',
 'GENOTYPES_COMPOUND': '0.00%',
 'REGION_DEFINED_VAR': '0.00%',
 'INCOMPLETE_VAR': '0.00%'}

In [43]:
f"The Variation Normalizer does not support {do_not_support_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer does not support 57.68% of the total variants'

# Variants we should be able to normalize

In [44]:
should_be_able_to_normalize_total

179

In [45]:
f"The Variation Normalizer SHOULD be able to normalize {should_be_able_to_normalize_total / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer SHOULD be able to normalize 42.32% of the total variants'

# Variants we were not able to normalize

Either due to a bug or an unsupported query type in Variation Normalizer

In [46]:
unable_to_normalize_total

0

In [47]:
f"The Variation Normalizer was unable to normalize {unable_to_normalize_total / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer was unable to normalize 0.0% of the total variants'

## Breakdown of the variants we weren't able to normalize

In this section, we break down the reasons why we weren't able to normalize variants.

In [48]:
# Classify every recorded failure by the text of its last column ("warnings").
unable_to_tokenize = 0
unable_to_find_valid = 0
other = 0
# newline="" is what the csv module expects for files handed to csv.reader.
with open("unable_to_normalize_queries.csv", "r", newline="") as f:
    reader = csv.reader(f, delimiter="\t")
    next(reader, None)  # skip the header row; tolerate a completely empty file
    for row in reader:
        if not row:
            # Blank line (e.g. doubled line endings written on Windows):
            # nothing to classify, and row[-1] would raise IndexError.
            continue
        warnings_text = row[-1]
        if "Unable to find valid result" in warnings_text:
            unable_to_find_valid += 1
        elif "Unable to tokenize" in warnings_text:
            unable_to_tokenize += 1
        else:
            # Also covers rows written for raised exceptions, whose warnings
            # column is empty.
            other += 1

### Due to not passing validation checks

The Variation Normalizer performs validation checks on the input query (such as reference sequence). If these validation checks fail, then the input query will fail to normalize.

In [49]:
f"The Variation Normalizer found {unable_to_find_valid} invalid variants (This is {unable_to_find_valid / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer found 0 invalid variants (This is 0.00% of the total variants).'

### Due to tokenization 

The Variation Normalizer tokenizes the input query to determine the kinds of tokens it contains. It accepts only a limited set of token kinds, so the queries counted here contain tokens that are not yet supported in the Variation Normalizer.

In [50]:
f"The Variation Normalizer was unable to tokenize {unable_to_tokenize} variants ({unable_to_tokenize / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer was unable to tokenize 0 variants (0.00% of the total variants).'

In [51]:
f"The Variation Normalizer was unable to normalize {other} variants due to other issues (This is {other / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer was unable to normalize 0 variants due to other issues (This is 0.00% of the total variants).'

## Variant queries that raised an exception during normalization

In [52]:
f"The Variation Normalizer raised an exception for {exception_total / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer raised an exception for 0.00% of the total variants'

# Variants we were able to normalize

In [53]:
can_normalize_total

179

In [54]:
f"The Variation Normalizer successfully normalized {can_normalize_total / should_be_able_to_normalize_total * 100:.2f}% of the variants we SHOULD be able to normalize"

'The Variation Normalizer successfully normalized 100.00% of the variants we SHOULD be able to normalize'

In [55]:
f"The Variation Normalizer successfully normalized {can_normalize_total / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer successfully normalized 42.32% of the total variants'

## Duplicate Queries

These are duplicate queries found in MOA. The values are the associated variant IDs. 

In [56]:
# Keep only the queries that were produced by two or more MOA variant ids
{query: variant_ids for query, variant_ids in queries_found.items() if len(variant_ids) >= 2}

{'ABL1 p.T315I': [71, 108],
 'EGFR p.L858R': [267, 284, 286],
 'IDH2 p.R140Q': [329, 334],
 'CCND1 Amplification': [707, 708],
 'MYC Amplification': [765, 766]}