# Analysis of Normalized Queries

This notebook contains an analysis on CIViC evidence data

In [1]:
import csv
import pandas as pd
import numpy as np
from civicpy import civic

In [2]:
# civic.load_cache(on_stale="ignore")

## Total Variants in CIViC

In [67]:
civic_variant_ids = civic.get_all_variants()

total_number_variants = len(civic_variant_ids)
f"Total Number of variants in CIViC: {total_number_variants}"

'Total Number of variants in CIViC: 3525'

## Total Evidence items in CIViC

In [68]:
civic_evidence_ids = civic.get_all_evidence()

total_number_evidences = len(civic_evidence_ids)
f"Total Number of evidence items in CIViC: {total_number_evidences}"

'Total Number of evidence items in CIViC: 10044'

## List of Normalized Variants ID's

In [5]:
normalized_queries_df = pd.read_csv("./able_to_normalize_queries.csv", sep= "\t")
normalized_queries_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True
1,1988,NC_000003.11:g.10191649A>T,genomic,True
2,2488,3-10191647-T-G,genomic,True
3,1986,NC_000003.11:g.10191648G>T,genomic,True
4,1987,NC_000003.11:g.10191649A>G,genomic,True


In [6]:
normalized_variant_id_list = list(normalized_queries_df["variant_id"])

## Variant analysis

In [7]:
total_number_normalized_variants = len(set(normalized_variant_id_list))
percentage_normalized_variants = "{:.2f}".format(total_number_normalized_variants/total_number_variants*100)
f"Percentage of Normalized Variants in CIViC: {percentage_normalized_variants}%"

'Percentage of Normalized Variants in CIViC: 52.79%'

In [8]:
normalized_queries_df.variant_accepted.value_counts()

variant_accepted
False    995
True     866
Name: count, dtype: int64

In [9]:
number_not_accepted_normalized_variants = len(normalized_queries_df) - normalized_queries_df.variant_accepted.sum()
percentage_not_accepted_normalized_variants = "{:.2f}".format(number_not_accepted_normalized_variants/total_number_normalized_variants*100)
f"Percentage of not accepted Normalized Variants: {percentage_not_accepted_normalized_variants}%"

'Percentage of not accepted Normalized Variants: 53.47%'

In [10]:
number_accepted_normalized_variants = normalized_queries_df.variant_accepted.sum()
percentage_accepted_normalized_variants = "{:.2f}".format(number_accepted_normalized_variants/total_number_normalized_variants*100)
f"Percentage of accepted Normalized Variants: {percentage_accepted_normalized_variants}%"

'Percentage of accepted Normalized Variants: 46.53%'

## Import evidence ID's associated with the Normalized Variants using variant ID

Evidence items are linked to the molecular profiles associated with variant items. To pull evidence IDs, need to use molecular profiles. For more information on the structure of a variant item

In [11]:
normalized_variants_evidence_ids = []

for v in normalized_variant_id_list:
    variant_evidence_id_list = []

    for variant in civic_variant_ids:
        if int(v) == variant.id:

            for mp in variant.molecular_profiles:
                for e in mp.evidence_items:
                    if e.id not in variant_evidence_id_list:
                        variant_evidence_id_list.append(e.id)

            variant_evidence_id_list = variant_evidence_id_list or ""

    normalized_variants_evidence_ids.append(variant_evidence_id_list)
    
normalized_queries_df["evidence_ids"] = normalized_variants_evidence_ids
normalized_queries_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_ids
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,"[9347, 6724]"
1,1988,NC_000003.11:g.10191649A>T,genomic,True,[5336]
2,2488,3-10191647-T-G,genomic,True,"[10779, 6723, 8258]"
3,1986,NC_000003.11:g.10191648G>T,genomic,True,[5334]
4,1987,NC_000003.11:g.10191649A>G,genomic,True,[5335]


## List of Evidence ID's of Normalized Variants

In [12]:
normalized_queries_df = normalized_queries_df.explode(column="evidence_ids")
normalized_queries_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_ids
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,9347
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,6724
1,1988,NC_000003.11:g.10191649A>T,genomic,True,5336
2,2488,3-10191647-T-G,genomic,True,10779
2,2488,3-10191647-T-G,genomic,True,6723


In [13]:
normalized_variant_evidence_id_list = list(normalized_queries_df["evidence_ids"])

## Import evidence status, rating, and level associated with a specific evidence ID
    please see the CIViC documentation for evidence item attribute decriptions (https://civic.readthedocs.io/en/latest/model/evidence.html)

In [14]:
normalized_variants_evidence_statuses = []
normalized_variants_evidence_ratings = []
normalized_variants_evidence_levels = []

for e in normalized_variant_evidence_id_list:
    variant_evidence_status_list = []
    variant_evidence_rating_list = []
    variant_evidence_level_list = []

    for evidence in civic_evidence_ids: 
        if int(e) == evidence.id:

            if evidence.status not in variant_evidence_status_list:
                variant_evidence_status_list.append(evidence.status)
            variant_evidence_status_list = variant_evidence_status_list or ""

            if evidence.rating not in variant_evidence_rating_list:
                variant_evidence_rating_list.append(evidence.rating)
            variant_evidence_rating_list = variant_evidence_rating_list or ""

            if evidence.evidence_level not in variant_evidence_level_list:
                variant_evidence_level_list.append(evidence.evidence_level)
            variant_evidence_level_list = variant_evidence_level_list or ""

    normalized_variants_evidence_statuses.append(variant_evidence_status_list)
    normalized_variants_evidence_ratings.append(variant_evidence_rating_list)
    normalized_variants_evidence_levels.append(variant_evidence_level_list)

normalized_queries_df["evidence_status"] = normalized_variants_evidence_statuses
normalized_queries_df["evidence_rating"] = normalized_variants_evidence_ratings
normalized_queries_df["evidence_level"] = normalized_variants_evidence_levels
normalized_queries_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,9347,[submitted],[3],[C]
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,6724,[accepted],[2],[C]
1,1988,NC_000003.11:g.10191649A>T,genomic,True,5336,[accepted],[2],[C]
2,2488,3-10191647-T-G,genomic,True,10779,[submitted],[3],[C]
2,2488,3-10191647-T-G,genomic,True,6723,[accepted],[2],[C]


## Evidence analysis

In [15]:
total_number_normalized_variant_unique_evidence_items = len(set(normalized_queries_df.evidence_ids))
normalized_percentage_evidence_items = "{:.2f}".format(total_number_normalized_variant_unique_evidence_items/total_number_evidences*100)
f"Percentage of Normalized Variant Evidence items in CIViC: {normalized_percentage_evidence_items}%"

'Percentage of Normalized Variant Evidence items in CIViC: 58.71%'

In [16]:
normalized_queries_df['evidence_status'] = normalized_queries_df['evidence_status'].str.join(', ')

In [17]:
normalized_queries_df['evidence_accepted'] = normalized_queries_df.evidence_status.map({'accepted':True ,'submitted':False, 'rejected':False})
normalized_queries_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level,evidence_accepted
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,9347,submitted,[3],[C],False
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,6724,accepted,[2],[C],True
1,1988,NC_000003.11:g.10191649A>T,genomic,True,5336,accepted,[2],[C],True
2,2488,3-10191647-T-G,genomic,True,10779,submitted,[3],[C],False
2,2488,3-10191647-T-G,genomic,True,6723,accepted,[2],[C],True


In [18]:
normalized_queries_df.evidence_accepted.value_counts()

evidence_accepted
False    3920
True     2049
Name: count, dtype: int64

In [19]:
number_not_accepted_evidences_normalized_variants = len(normalized_queries_df) - normalized_queries_df.evidence_accepted.sum()
percentage_not_accepted_evidences_normalized_variants = "{:.2f}".format(number_not_accepted_evidences_normalized_variants/total_number_normalized_variant_unique_evidence_items*100)
f"Percentage of not accepted Normalized Variant Evidence items: {percentage_not_accepted_evidences_normalized_variants}%"

'Percentage of not accepted Normalized Variant Evidence items: 66.47%'

In [20]:
number_accepted_evidences_normalized_variants = normalized_queries_df.evidence_accepted.sum()
percentage_accepted_evidences_normalized_variants = "{:.2f}".format(number_accepted_evidences_normalized_variants/total_number_normalized_variant_unique_evidence_items*100)
f"Percentage of accepted Normalized Variant Evidence items: {percentage_accepted_evidences_normalized_variants}%"

'Percentage of accepted Normalized Variant Evidence items: 34.75%'

# Analysis of Unable to Normalize Queries

## List of Unable to Normalize Variant ID's

In [21]:
not_normalized_quer_df = pd.read_csv("./unable_to_normalize_queries.csv", sep= "\t")
not_normalized_quer_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati..."
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati..."
4,4004,TERT C250T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."


In [22]:
not_normalized_variant_id_list = list(not_normalized_quer_df["variant_id"])

## Variant analysis

In [23]:
total_number_not_normalized_variants = len(not_normalized_variant_id_list)
percentage_not_normalized_variants = "{:.2f}".format(total_number_not_normalized_variants/total_number_variants*100)
f"Percentage of Unable to Normalize Variants in CIViC: {percentage_not_normalized_variants}%"

'Percentage of Unable to Normalize Variants in CIViC: 1.76%'

In [24]:
not_normalized_quer_df.variant_accepted.value_counts()

variant_accepted
False    53
True      9
Name: count, dtype: int64

In [25]:
number_not_accepted_not_normalized_variants = len(not_normalized_quer_df) - not_normalized_quer_df.variant_accepted.sum()
percentage_not_accepted_not_normalized_variants = "{:.2f}".format(number_not_accepted_not_normalized_variants/total_number_not_normalized_variants*100)
f"Percentage of not accepted Unable to Normalize Variants: {percentage_not_accepted_not_normalized_variants}%"

'Percentage of not accepted Unable to Normalize Variants: 85.48%'

In [26]:
number_accepted_not_normalized_variants = not_normalized_quer_df.variant_accepted.sum()
percentage_accepted_not_normalized_variants = "{:.2f}".format(number_accepted_not_normalized_variants/total_number_not_normalized_variants*100)
f"Percentage of accepted Unable to Normalize Variants: {percentage_accepted_not_normalized_variants}%"

'Percentage of accepted Unable to Normalize Variants: 14.52%'

## Import evidence ID's associated with the Unable to Normalize Variants using variant ID

Evidence items are linked to the molecular profiles associated with variant items. To pull evidence IDs, need to use molecular profiles. For more information on the structure of a variant item

In [27]:
not_normalized_variants_evidence_ids = []

for v in not_normalized_variant_id_list:
    not_normalized_variant_evidence_id_list = []

    for variant in civic_variant_ids: 
        if int(v) == variant.id:

            for mp in variant.molecular_profiles:
                for e in mp.evidence_items:
                    if e.id not in not_normalized_variant_evidence_id_list:
                        not_normalized_variant_evidence_id_list.append(e.id)

            not_normalized_variant_evidence_id_list = not_normalized_variant_evidence_id_list or ""

    not_normalized_variants_evidence_ids.append(not_normalized_variant_evidence_id_list)

not_normalized_quer_df["evidence_ids"] = not_normalized_variants_evidence_ids
not_normalized_quer_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_ids
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",[1812]
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10128]
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10135]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...","[655, 1646, 6934, 6935]"
4,4004,TERT C250T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10331]


## List of Evidence ID's of Unable to Normalize Variants

In [28]:
not_normalized_quer_df = not_normalized_quer_df.explode(column="evidence_ids")
not_normalized_quer_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_ids
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1812
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10128
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10135
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",655
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1646


In [29]:
not_normalized_evidence_id_list = list(not_normalized_quer_df["evidence_ids"])


## Import evidence status, rating, and level associated with a specific evidence ID

In [30]:
not_normalized_variants_evidence_statuses = []
not_normalized_variants_evidence_ratings = []
not_normalized_variants_evidence_levels = []

for e in not_normalized_evidence_id_list:
    not_normalized_variant_evidence_status_list = []
    not_normalized_variant_evidence_rating_list = []
    not_normalized_variant_evidence_level_list = []

    for evidence in civic_evidence_ids: 
        if int(e) == evidence.id:

            if evidence.status not in not_normalized_variant_evidence_status_list:
                not_normalized_variant_evidence_status_list.append(evidence.status)
            not_normalized_variant_evidence_status_list = not_normalized_variant_evidence_status_list or ""

            if evidence.rating not in not_normalized_variant_evidence_rating_list:
                not_normalized_variant_evidence_rating_list.append(evidence.rating)
            not_normalized_variant_evidence_rating_list = not_normalized_variant_evidence_rating_list or ""

            if evidence.evidence_level not in not_normalized_variant_evidence_level_list:
                not_normalized_variant_evidence_level_list.append(evidence.evidence_level)
            not_normalized_variant_evidence_level_list = not_normalized_variant_evidence_level_list or ""

    not_normalized_variants_evidence_statuses.append(not_normalized_variant_evidence_status_list)
    not_normalized_variants_evidence_ratings.append(not_normalized_variant_evidence_rating_list)
    not_normalized_variants_evidence_levels.append(not_normalized_variant_evidence_level_list)

not_normalized_quer_df["evidence_status"] = not_normalized_variants_evidence_statuses
not_normalized_quer_df["evidence_rating"] = not_normalized_variants_evidence_ratings
not_normalized_quer_df["evidence_level"] = not_normalized_variants_evidence_levels
not_normalized_quer_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_ids,evidence_status,evidence_rating,evidence_level
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1812,[accepted],[1],[C]
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10128,[submitted],[3],[D]
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10135,[submitted],[3],[D]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",655,[accepted],[5],[B]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1646,[accepted],[3],[B]


## Evidence analysis

In [31]:
total_number_not_normalized_variant_unique_evidence_items = len(not_normalized_evidence_id_list)
not_normalized_percentage_evidence_items = "{:.2f}".format(total_number_not_normalized_variant_unique_evidence_items/total_number_evidences*100)
f"Percentage of Unable to Normalize Variant Evidence items in CIViC: {not_normalized_percentage_evidence_items}%"

'Percentage of Unable to Normalize Variant Evidence items in CIViC: 0.79%'

In [32]:
not_normalized_quer_df['evidence_status'] = not_normalized_quer_df['evidence_status'].str.join(', ')


In [33]:
not_normalized_quer_df['evidence_accepted'] = not_normalized_quer_df.evidence_status.map({'accepted':True ,'submitted':False, 'rejected':False})
not_normalized_quer_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_ids,evidence_status,evidence_rating,evidence_level,evidence_accepted
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1812,accepted,[1],[C],True
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10128,submitted,[3],[D],False
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10135,submitted,[3],[D],False
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",655,accepted,[5],[B],True
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1646,accepted,[3],[B],True


In [34]:
not_normalized_quer_df.evidence_accepted.value_counts()

evidence_accepted
False    65
True     14
Name: count, dtype: int64

In [35]:
number_not_accepted_evidences_not_normalized_variants = len(not_normalized_quer_df) - not_normalized_quer_df.evidence_accepted.sum()
percentage_not_accepted_evidences_not_normalized_variants = "{:.2f}".format(number_not_accepted_evidences_not_normalized_variants/total_number_not_normalized_variant_unique_evidence_items*100)
f"Percentage of not accepted Unable to Normalize Variant Evidence items: {percentage_not_accepted_evidences_not_normalized_variants}%"

'Percentage of not accepted Unable to Normalize Variant Evidence items: 82.28%'

In [36]:
number_accepted_evidences_not_normalized_variants = not_normalized_quer_df.evidence_accepted.sum()
percentage_accepted_evidences_not_normalized_variants = "{:.2f}".format(number_accepted_evidences_not_normalized_variants/total_number_not_normalized_variant_unique_evidence_items*100)
f"Percentage of accepted Unable to Normalize Variant Evidence items: {percentage_accepted_evidences_not_normalized_variants}%"

'Percentage of accepted Unable to Normalize Variant Evidence items: 17.72%'

# Analysis of Not Supported Variants

### List of Not Supported Variant ID's

In [37]:
not_supported_variants_df = pd.read_csv("./not_supported_variants.csv", sep= "\t")
not_supported_variants_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted
0,4417,ALK,FBXO11::ALK,Fusion,False
1,4188,VHL,10kb Deletion,Region Defined Variant,False
2,785,CHEK2,1100DELC,Protein Consequence,True
3,2438,VHL,235 (CAG-TAG),Other,False
4,4186,VHL,2kb Deletion,Region Defined Variant,False


In [38]:
not_supported_variant_id_list = list(not_supported_variants_df["variant_id"])

## Variant Analysis

In [39]:
total_number_unique_not_supported_variants = len(set(not_supported_variants_df.variant_id))
percentage_not_supported_variants = "{:.2f}".format(total_number_unique_not_supported_variants/total_number_variants*100)
f"Percentage of Not Supported Variants in CIViC: {percentage_not_supported_variants}%"

'Percentage of Not Supported Variants in CIViC: 34.24%'

In [40]:
not_supported_variants_df.variant_accepted.value_counts()

variant_accepted
True     736
False    471
Name: count, dtype: int64

In [41]:
number_not_accepted_not_supported_variants = len(not_supported_variants_df) - not_supported_variants_df.variant_accepted.sum()
percentage_not_accepted_not_supported_variants = "{:.2f}".format(number_not_accepted_not_supported_variants/total_number_unique_not_supported_variants*100)
f"Percentage of not accepted Not Supported Variants: {percentage_not_accepted_not_supported_variants}%"

'Percentage of not accepted Not Supported Variants: 39.02%'

In [42]:
number_accepted_not_supported_variants = not_supported_variants_df.variant_accepted.sum()
percentage_accepted_not_supported_variants = "{:.2f}".format(number_accepted_not_supported_variants/total_number_unique_not_supported_variants*100)
f"Percentage of accepted Not Supported Variants: {percentage_accepted_not_supported_variants}%"

'Percentage of accepted Not Supported Variants: 60.98%'

### Not Supported Variant Analysis by Subcategory 

In [43]:
not_supported_variant_categories_summary_data = dict()
for category in ["Expression", "Epigenetic Modification", "Fusion", "Protein Consequence", "Gene Function", "Rearrangements", "Copy Number", "Other", "Genotypes Easy", "Genotypes Compound", "Region Defined Variant"]:
    print(category)
    not_supported_variant_categories_summary_data[category] = {}
    category_df = not_supported_variants_df[not_supported_variants_df.category == category]
    
    number_unique_not_supported_category_variants = len(set(category_df.variant_id))
    percent_not_supported_category_variant_of_civic = "{:.2f}".format(number_unique_not_supported_category_variants/total_number_variants*100)
    not_supported_variant_categories_summary_data[category]["percent_not_supported_category_variant_of_civic"] = percent_not_supported_category_variant_of_civic
    print(f"Percent of {category} Variants in CIViC: {percent_not_supported_category_variant_of_civic}%")

    percent_not_supported_category_variant_of_total_not_supported = "{:.2f}".format(number_unique_not_supported_category_variants/total_number_unique_not_supported_variants*100)
    not_supported_variant_categories_summary_data[category]["percent_not_supported_category_variant_of_total_not_supported"] = percent_not_supported_category_variant_of_total_not_supported
    print(f"Percent of {category} Variants in Not Supported Variants: {percent_not_supported_category_variant_of_total_not_supported}%")

    number_accepted_not_supported_category_variants = category_df.variant_accepted.sum()
    percentage_accepted_not_supported_category_variants =  "{:.2f}".format(number_accepted_not_supported_category_variants/number_unique_not_supported_category_variants*100)
    not_supported_variant_categories_summary_data[category]["percentage_accepted_not_supported_category_variants"] = percentage_accepted_not_supported_category_variants
    print(f"Percent of Accepted {category} Variants: {percentage_accepted_not_supported_category_variants}%")
  
    number_not_accepted_not_supported_category_variants =  len(category_df) - category_df.variant_accepted.sum()
    percentage_not_accepted_not_supported_category_variants =  "{:.2f}".format(number_not_accepted_not_supported_category_variants/number_unique_not_supported_category_variants*100)
    not_supported_variant_categories_summary_data[category]["percentage_not_accepted_not_supported_category_variants"] = percentage_not_accepted_not_supported_category_variants
    print(f"Percent of Not Accepted {category} Variants: {percentage_not_accepted_not_supported_category_variants}%")
    
    print("--------------------")

Expression
Percent of Expression Variants in CIViC: 8.26%
Percent of Expression Variants in Not Supported Variants: 24.11%
Percent of Accepted Expression Variants: 61.86%
Percent of Not Accepted Expression Variants: 38.14%
--------------------
Epigenetic Modification
Percent of Epigenetic Modification Variants in CIViC: 0.43%
Percent of Epigenetic Modification Variants in Not Supported Variants: 1.24%
Percent of Accepted Epigenetic Modification Variants: 93.33%
Percent of Not Accepted Epigenetic Modification Variants: 6.67%
--------------------
Fusion
Percent of Fusion Variants in CIViC: 8.48%
Percent of Fusion Variants in Not Supported Variants: 24.77%
Percent of Accepted Fusion Variants: 67.89%
Percent of Not Accepted Fusion Variants: 32.11%
--------------------
Protein Consequence
Percent of Protein Consequence Variants in CIViC: 3.63%
Percent of Protein Consequence Variants in Not Supported Variants: 10.60%
Percent of Accepted Protein Consequence Variants: 54.69%
Percent of Not Acc

## Import evidence ID's associated with the Not Supported Variants using variant ID

In [44]:
not_supported_variants_add_evidence_df = not_supported_variants_df.copy()
not_supported_variants_evidence_ids = []

for v in not_supported_variant_id_list:
    not_supported_variant_evidence_id_list = []

    for variant in civic_variant_ids:
        if int(v) == variant.id:

            for mp in variant.molecular_profiles:
                for e in mp.evidence_items:
                    if e.id not in not_supported_variant_evidence_id_list:
                        not_supported_variant_evidence_id_list.append(e.id)

            not_supported_variant_evidence_id_list = not_supported_variant_evidence_id_list or ""

    not_supported_variants_evidence_ids.append(not_supported_variant_evidence_id_list)

not_supported_variants_add_evidence_df["evidence_id"] = not_supported_variants_evidence_ids
not_supported_variants_add_evidence_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id
0,4417,ALK,FBXO11::ALK,Fusion,False,[7428]
1,4188,VHL,10kb Deletion,Region Defined Variant,False,[10678]
2,785,CHEK2,1100DELC,Protein Consequence,True,"[7235, 1849, 1850]"
3,2438,VHL,235 (CAG-TAG),Other,False,[6524]
4,4186,VHL,2kb Deletion,Region Defined Variant,False,"[10791, 10676]"


## List of Evidence ID's on Not Supported Variants

In [45]:
not_supported_variants_add_evidence_df = not_supported_variants_add_evidence_df.explode(column="evidence_id")
not_supported_variants_add_evidence_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id
0,4417,ALK,FBXO11::ALK,Fusion,False,7428
1,4188,VHL,10kb Deletion,Region Defined Variant,False,10678
2,785,CHEK2,1100DELC,Protein Consequence,True,7235
2,785,CHEK2,1100DELC,Protein Consequence,True,1849
2,785,CHEK2,1100DELC,Protein Consequence,True,1850


In [46]:
not_supported_evidence_id_list = list(not_supported_variants_add_evidence_df["evidence_id"])

## Import evidence status, rating, and level associated with a specific evidence ID

In [47]:
not_supported_variants_evidence_statuses = []
not_supported_variants_evidence_ratings = []
not_supported_variants_evidence_levels = []

for e in not_supported_evidence_id_list:
    not_supported_variant_evidence_status_list = []
    not_supported_variant_evidence_rating_list = []
    not_supported_variant_evidence_level_list = []

    for evidence in civic_evidence_ids:
        if int(e) == evidence.id:

            if evidence.status not in not_supported_variant_evidence_status_list:
                not_supported_variant_evidence_status_list.append(evidence.status)
            not_supported_variant_evidence_status_list = not_supported_variant_evidence_status_list or ""

            if evidence.rating not in not_supported_variant_evidence_rating_list:
                not_supported_variant_evidence_rating_list.append(evidence.rating)
            not_supported_variant_evidence_rating_list = not_supported_variant_evidence_rating_list or ""

            if evidence.evidence_level not in not_supported_variant_evidence_level_list:
                not_supported_variant_evidence_level_list.append(evidence.evidence_level)
            not_supported_variant_evidence_level_list = not_supported_variant_evidence_level_list or ""

    not_supported_variants_evidence_statuses.append(not_supported_variant_evidence_status_list)
    not_supported_variants_evidence_ratings.append(not_supported_variant_evidence_rating_list)
    not_supported_variants_evidence_levels.append(not_supported_variant_evidence_level_list)

not_supported_variants_add_evidence_df["evidence_status"] = not_supported_variants_evidence_statuses
not_supported_variants_add_evidence_df["evidence_rating"] = not_supported_variants_evidence_ratings
not_supported_variants_add_evidence_df["evidence_level"] = not_supported_variants_evidence_levels
not_supported_variants_add_evidence_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level
0,4417,ALK,FBXO11::ALK,Fusion,False,7428,[submitted],[3],[C]
1,4188,VHL,10kb Deletion,Region Defined Variant,False,10678,[submitted],[3],[C]
2,785,CHEK2,1100DELC,Protein Consequence,True,7235,[submitted],[4],[B]
2,785,CHEK2,1100DELC,Protein Consequence,True,1849,[rejected],[3],[B]
2,785,CHEK2,1100DELC,Protein Consequence,True,1850,[accepted],[3],[B]


## Evidence analysis

In [48]:
total_number_not_supported_variant_unique_evidence_items = len(set(not_supported_variants_add_evidence_df.evidence_id))
not_supported_variant_percentage_evidence_items = "{:.2f}".format(total_number_not_supported_variant_unique_evidence_items/total_number_evidences*100)
f"Percentage of Not Supported Variant Evidence items in CIViC: {not_supported_variant_percentage_evidence_items}%"

'Percentage of Not Supported Variant Evidence items in CIViC: 37.74%'

In [49]:
not_supported_variants_add_evidence_df['evidence_status'] = not_supported_variants_add_evidence_df['evidence_status'].str.join(', ')

In [50]:
not_supported_variants_add_evidence_df['evidence_accepted'] = not_supported_variants_add_evidence_df.evidence_status.map({'accepted':True ,'submitted':False, 'rejected':False})
not_supported_variants_add_evidence_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level,evidence_accepted
0,4417,ALK,FBXO11::ALK,Fusion,False,7428,submitted,[3],[C],False
1,4188,VHL,10kb Deletion,Region Defined Variant,False,10678,submitted,[3],[C],False
2,785,CHEK2,1100DELC,Protein Consequence,True,7235,submitted,[4],[B],False
2,785,CHEK2,1100DELC,Protein Consequence,True,1849,rejected,[3],[B],False
2,785,CHEK2,1100DELC,Protein Consequence,True,1850,accepted,[3],[B],True


In [51]:
not_supported_variants_add_evidence_df.evidence_accepted.value_counts()

evidence_accepted
True     2145
False    1687
Name: count, dtype: int64

In [52]:
number_not_accepted_evidences_not_supported_variants = len(not_supported_variants_add_evidence_df) - not_supported_variants_add_evidence_df.evidence_accepted.sum()
percentage_not_accepted_evidences_not_supported_variants = "{:.2f}".format(number_not_accepted_evidences_not_supported_variants/total_number_not_supported_variant_unique_evidence_items*100)
f"Percentage of not accepted Not Supported Variant Evidence items: {percentage_not_accepted_evidences_not_supported_variants}%"

'Percentage of not accepted Not Supported Variant Evidence items: 44.50%'

In [53]:
number_accepted_evidences_not_supported_variants = not_supported_variants_add_evidence_df.evidence_accepted.sum()
percentage_accepted_evidences_not_supported_variants = "{:.2f}".format(number_accepted_evidences_not_supported_variants/total_number_not_supported_variant_unique_evidence_items*100)
f"Percentage of accepted Not Supported Variant Evidence items: {percentage_accepted_evidences_not_supported_variants}%"

'Percentage of accepted Not Supported Variant Evidence items: 56.58%'

### Not Supported Variant Evidence Analysis by Subcategory 

 List all the possible variant categories

In [54]:
not_supported_variant_categories = not_supported_variants_add_evidence_df.category.unique()
not_supported_variant_categories 

array(['Fusion', 'Region Defined Variant', 'Protein Consequence', 'Other',
       'Rearrangements', 'Copy Number', 'Expression', 'Gene Function',
       'Genotypes Compound', 'Epigenetic Modification', 'Genotypes Easy'],
      dtype=object)

Evidence ID's can qualify for more than one kind of variant category

In [55]:
duplicate = not_supported_variants_add_evidence_df[not_supported_variants_add_evidence_df.duplicated('evidence_id', keep=False)]
duplicate.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level,evidence_accepted
0,4417,ALK,FBXO11::ALK,Fusion,False,7428,submitted,[3],[C],False
160,437,FLT3,D835,Protein Consequence,True,11260,submitted,[4],[A],False
160,437,FLT3,D835,Protein Consequence,True,11261,submitted,[4],[A],False
179,200,IKZF1,Deletion,Gene Function,True,7786,submitted,[5],[B],False
183,696,SMARCB1,Deletion,Gene Function,True,11177,submitted,[4],[B],False


In [56]:
not_supported_variant_categories_evidence_summary_data = dict()
for category in ["Expression", "Epigenetic Modification", "Fusion", "Protein Consequence", "Gene Function", "Rearrangements", "Copy Number", "Other", "Genotypes Easy", "Genotypes Compound", "Region Defined Variant"]:
    print(category)
    not_supported_variant_categories_evidence_summary_data[category] = {}
    evidence_category_df = not_supported_variants_add_evidence_df[not_supported_variants_add_evidence_df.category == category]
    
    number_unique_not_supported_category_evidence = len(set(evidence_category_df.evidence_id))
    percent_not_supported_category_variant_evidence_of_civic = "{:.2f}".format(number_unique_not_supported_category_evidence/total_number_evidences*100)
    not_supported_variant_categories_evidence_summary_data[category]["percent_not_supported_category_variant_evidence_of_civic"] = percent_not_supported_category_variant_evidence_of_civic
    print(f"Percent of {category} Variant Evidence items in CIViC: {percent_not_supported_category_variant_evidence_of_civic}%")
    

    percent_not_supported_category_variant_evidence_of_total_not_supported = "{:.2f}".format(number_unique_not_supported_category_evidence/total_number_not_supported_variant_unique_evidence_items*100)
    not_supported_variant_categories_evidence_summary_data[category]["percent_not_supported_category_variant_evidence_of_total_not_supported"] = percent_not_supported_category_variant_evidence_of_total_not_supported
    print(f"Percent of {category} Variant Evidence items in total Not Supported Variant Evidence items: {percent_not_supported_category_variant_evidence_of_total_not_supported}%")

    number_accepted_not_supported_category_variant_evidence = evidence_category_df.evidence_accepted.sum()
    percentage_accepted_evidence_not_supported_category_variants =  "{:.2f}".format(number_accepted_not_supported_category_variant_evidence/number_unique_not_supported_category_evidence*100)
    not_supported_variant_categories_evidence_summary_data[category]["percentage_accepted_evidence_not_supported_category_variants"] = percentage_accepted_evidence_not_supported_category_variants
    print(f"Percent of Accepted {category} Variant Evidence items: {percentage_accepted_evidence_not_supported_category_variants}%")
  
    number_not_accepted_not_supported_category_variant_evidence = number_unique_not_supported_category_evidence - evidence_category_df.evidence_accepted.sum()
    percentage_not_accepted_evidence_not_supported_category_variants =  "{:.2f}".format(number_not_accepted_not_supported_category_variant_evidence/number_unique_not_supported_category_evidence*100)
    not_supported_variant_categories_evidence_summary_data[category]["percentage_not_accepted_evidence_not_supported_category_variants"] = percentage_not_accepted_evidence_not_supported_category_variants
    print(f"Percent of Not Accepted {category} Variant Evidence items: {percentage_not_accepted_evidence_not_supported_category_variants}%")
    
    print("--------------------")

Expression
Percent of Expression Variant Evidence items in CIViC: 6.14%
Percent of Expression Variant Evidence items in total Not Supported Variant Evidence items: 16.28%
Percent of Accepted Expression Variant Evidence items: 55.43%
Percent of Not Accepted Expression Variant Evidence items: 44.57%
--------------------
Epigenetic Modification
Percent of Epigenetic Modification Variant Evidence items in CIViC: 0.24%
Percent of Epigenetic Modification Variant Evidence items in total Not Supported Variant Evidence items: 0.63%
Percent of Accepted Epigenetic Modification Variant Evidence items: 91.67%
Percent of Not Accepted Epigenetic Modification Variant Evidence items: 8.33%
--------------------
Fusion
Percent of Fusion Variant Evidence items in CIViC: 12.06%
Percent of Fusion Variant Evidence items in total Not Supported Variant Evidence items: 31.94%
Percent of Accepted Fusion Variant Evidence items: 59.70%
Percent of Not Accepted Fusion Variant Evidence items: 40.30%
-----------------

# Summary

## Variant Analysis

In [57]:
all_variant_dict = {'Variant Category':['Normalized', 'Unable to Normalize', 'Not Supported'],
        'Percentage of all CIViC Variant Items': [percentage_normalized_variants, percentage_not_normalized_variants, percentage_not_supported_variants],
        'Percentage of Accepted Variant Items':[ percentage_accepted_normalized_variants, percentage_accepted_not_normalized_variants, percentage_accepted_not_supported_variants],
        'Percentage of Not Accepted Variant Items': [ percentage_not_accepted_normalized_variants, percentage_not_accepted_not_normalized_variants, percentage_not_accepted_not_supported_variants]}

In [58]:
all_variant_df = pd.DataFrame(all_variant_dict)
all_variant_df

Unnamed: 0,Variant Category,Percentage of all CIViC Variant Items,Percentage of Accepted Variant Items,Percentage of Not Accepted Variant Items
0,Normalized,52.79,46.53,53.47
1,Unable to Normalize,1.76,14.52,85.48
2,Not Supported,34.24,60.98,39.02


In [59]:
not_supported_variant_categories = ['Expression','Epigenetic Modification', 'Fusion', 'Protein Consequence', 'Gene Function', 'Rearrangements', 'Copy Number', 'Other', 'Genotypes Easy', 'Genotypes Compound', 'Region Defined Variant' ]
not_supported_percent_of_civic_variant_items = [v["percent_not_supported_category_variant_of_civic"] for k, v in not_supported_variant_categories_summary_data.items()]
not_supported_percent_not_supported_variant_items = [v["percent_not_supported_category_variant_of_total_not_supported"] for k, v in not_supported_variant_categories_summary_data.items()]
not_supported_percent_of_accepted_variant_items = [v["percentage_accepted_not_supported_category_variants"] for k, v in not_supported_variant_categories_summary_data.items()]
not_supported_percent_of_not_accepted_variant_items = [v["percentage_not_accepted_not_supported_category_variants"] for k, v in not_supported_variant_categories_summary_data.items()]

In [60]:
not_supported_variant_dict = {'Category': not_supported_variant_categories,
                                'Percent of Not Supported Variant Items':not_supported_percent_not_supported_variant_items,
                                'Percent of all CIViC Variant Items':not_supported_percent_of_civic_variant_items,
                                'Percent of Accepted Variant Items':not_supported_percent_of_accepted_variant_items,
                                'Percent of Not Accepted Variant Items':not_supported_percent_of_not_accepted_variant_items}

In [61]:
not_supported_variant_df = pd.DataFrame(not_supported_variant_dict)
not_supported_variant_df

Unnamed: 0,Category,Percent of Not Supported Variant Items,Percent of all CIViC Variant Items,Percent of Accepted Variant Items,Percent of Not Accepted Variant Items
0,Expression,24.11,8.26,61.86,38.14
1,Epigenetic Modification,1.24,0.43,93.33,6.67
2,Fusion,24.77,8.48,67.89,32.11
3,Protein Consequence,10.6,3.63,54.69,45.31
4,Gene Function,7.29,2.5,55.68,44.32
5,Rearrangements,4.14,1.42,24.0,76.0
6,Copy Number,2.82,0.96,55.88,44.12
7,Other,7.62,2.61,46.74,53.26
8,Genotypes Easy,0.91,0.31,72.73,27.27
9,Genotypes Compound,0.5,0.17,66.67,33.33


## Evidence Analysis

In [62]:
all_variant_evidence_dict = {'Variant Category':['Normalized', 'Unable to Normalize', 'Not Supported'],
        'Percentage of all CIViC Evidence Items': [normalized_percentage_evidence_items, not_normalized_percentage_evidence_items, not_supported_variant_percentage_evidence_items],
        'Percentage of Accepted Evidence Items': [percentage_accepted_evidences_normalized_variants, percentage_accepted_evidences_not_normalized_variants, percentage_accepted_evidences_not_supported_variants],
        'Percentage of Not Accepted Evidence Items':[percentage_not_accepted_evidences_normalized_variants, percentage_not_accepted_evidences_not_normalized_variants, percentage_not_accepted_evidences_not_supported_variants],}

In [63]:
all_variant_evidence_df = pd.DataFrame(all_variant_evidence_dict)
all_variant_evidence_df

Unnamed: 0,Variant Category,Percentage of all CIViC Evidence Items,Percentage of Accepted Evidence Items,Percentage of Not Accepted Evidence Items
0,Normalized,58.71,34.75,66.47
1,Unable to Normalize,0.79,17.72,82.28
2,Not Supported,37.74,56.58,44.5


In [64]:
not_supported_variant_categories = ['Expression','Epigenetic Modification', 'Fusion', 'Protein Consequence', 'Gene Function', 'Rearrangements', 'Copy Number', 'Other', 'Genotypes Easy', 'Genotypes Compound', 'Region Defined Variant' ]
not_supported_percent_of_civic_evidence_items = [v["percent_not_supported_category_variant_evidence_of_civic"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_percent_not_supported_variant_evidence_items = [v["percent_not_supported_category_variant_evidence_of_total_not_supported"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_percent_of_accepted_evidence_items = [v["percentage_accepted_evidence_not_supported_category_variants"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_percent_of_not_accepted_evidence_items = [v["percentage_not_accepted_evidence_not_supported_category_variants"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]

In [65]:
not_supported_variant_evidence_dict = {'Category': not_supported_variant_categories,
                                'Percent of CIViC Evidence Items': not_supported_percent_of_civic_evidence_items,
                                'Percent of Not Supported Variant Evidence Items': not_supported_percent_not_supported_variant_evidence_items,
                                'Percent of Accepted Evidence Items':not_supported_percent_of_accepted_evidence_items,
                                'Percent of Not Accepted Evidence Items':not_supported_percent_of_not_accepted_evidence_items}

In [66]:
not_supported_variant_evidence_df = pd.DataFrame(not_supported_variant_evidence_dict)
not_supported_variant_evidence_df

Unnamed: 0,Category,Percent of CIViC Evidence Items,Percent of Not Supported Variant Evidence Items,Percent of Accepted Evidence Items,Percent of Not Accepted Evidence Items
0,Expression,6.14,16.28,55.43,44.57
1,Epigenetic Modification,0.24,0.63,91.67,8.33
2,Fusion,12.06,31.94,59.7,40.3
3,Protein Consequence,3.0,7.94,63.46,36.54
4,Gene Function,3.42,9.07,43.9,56.1
5,Rearrangements,1.37,3.64,38.41,61.59
6,Copy Number,0.69,1.82,44.93,55.07
7,Other,1.61,4.27,37.04,62.96
8,Genotypes Easy,0.23,0.61,56.52,43.48
9,Genotypes Compound,0.07,0.18,57.14,42.86


#### Notes: 
    Kori's analysis has variants under the transcript ariant category, while I do not
    Kori's analysis yields 45.26% of not supported variants, mine yeilds 34.4%
    other categories in not supported variants match in variant percentages b/w Kori's and mine