# Analysis of Normalized Queries

This notebook contains an analysis on CIViC evidence data

In [235]:
import csv
import pandas as pd
import numpy as np
from civicpy import civic
from pathlib import Path 
import zipfile 
import plotly.express as px
import seaborn as sb

In [236]:
# Use latest cache that has been pushed to the repo
latest_cache_zip_path = sorted(Path().glob("cache-*.pkl.zip"))[-1]

with zipfile.ZipFile(latest_cache_zip_path, "r") as zip_ref:
    zip_ref.extractall()

civic.load_cache(local_cache_path=Path("cache.pkl"), on_stale="ignore")

True

In [237]:
# civic.load_cache(on_stale="ignore")

## Total Variants in CIViC

In [238]:
civic_variant_ids = civic.get_all_variants()
total_number_variants = len(civic_variant_ids)
f"Total Number of variants in CIViC: {total_number_variants}"

'Total Number of variants in CIViC: 3553'

## Total Evidence items in CIViC

Need to remove all rejected evidence items

In [239]:
civic_evidence_ids = civic.get_all_evidence(include_status=['accepted', 'submitted'])

In [240]:
total_number_evidences = len(civic_evidence_ids)
f"Total Number of evidence items in CIViC: {total_number_evidences}"

'Total Number of evidence items in CIViC: 9766'

## Total Molecular Profiles in CIViC

In [241]:
civic_molprof_ids = civic.get_all_molecular_profiles(include_status=['accepted', 'submitted', 'rejected'])

## List of Normalized Variants ID's

In [242]:
normalized_queries_df = pd.read_csv("./able_to_normalize_queries.csv", sep= "\t")
normalized_queries_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True
1,1988,NC_000003.11:g.10191649A>T,genomic,True
2,2488,3-10191647-T-G,genomic,True
3,1986,NC_000003.11:g.10191648G>T,genomic,True
4,1987,NC_000003.11:g.10191649A>G,genomic,True


In [243]:
normalized_queries_df = normalized_queries_df.drop_duplicates(subset=['variant_id'])

In [244]:
normalized_variant_id_list = list(normalized_queries_df["variant_id"])

## Variant analysis

In [245]:
total_number_normalized_variants = len(set(normalized_variant_id_list))
fraction_normalized_variants = f"{total_number_normalized_variants} / {total_number_variants}"
f"Number of Normalized Variants in CIViC: {fraction_normalized_variants}"

'Number of Normalized Variants in CIViC: 1869 / 3553'

In [246]:
percentage_normalized_variants = "{:.2f}".format(total_number_normalized_variants/total_number_variants*100) + "%"
f"Percentage of Normalized Variants in CIViC: {percentage_normalized_variants}"

'Percentage of Normalized Variants in CIViC: 52.60%'

In [247]:
normalized_queries_df.variant_accepted.value_counts()

variant_accepted
False    1003
True      866
Name: count, dtype: int64

In [248]:
number_not_accepted_normalized_variants = len(normalized_queries_df) - normalized_queries_df.variant_accepted.sum()
fraction_not_accepted_normalized_variants = f'{number_not_accepted_normalized_variants} / {total_number_normalized_variants}'
f"Number of not accepted Normalized Variants: {fraction_not_accepted_normalized_variants}"

'Number of not accepted Normalized Variants: 1003 / 1869'

In [249]:
percentage_not_accepted_normalized_variants = "{:.2f}".format(number_not_accepted_normalized_variants/total_number_normalized_variants*100) + "%"
f"Percentage of not accepted Normalized Variants: {percentage_not_accepted_normalized_variants}"

'Percentage of not accepted Normalized Variants: 53.67%'

In [250]:
number_accepted_normalized_variants = normalized_queries_df.variant_accepted.sum()
fraction_accepted_normalized_variants= f"{number_accepted_normalized_variants} /{total_number_normalized_variants}"
f"Number of accepted Normalized Variants: {fraction_accepted_normalized_variants}"

'Number of accepted Normalized Variants: 866 /1869'

In [251]:
percentage_accepted_normalized_variants = "{:.2f}".format(number_accepted_normalized_variants/total_number_normalized_variants*100) + "%"
f"Percentage of accepted Normalized Variants: {percentage_accepted_normalized_variants}"

'Percentage of accepted Normalized Variants: 46.33%'

## Import evidence ID's associated with the Normalized Variants using variant ID

Evidence items are linked to the molecular profiles associated with variant items. To pull evidence IDs, need to use molecular profiles. For more information on the structure of a variant item

In [252]:
normalized_queries_add_evidence_df = normalized_queries_df.copy()
normalized_variants_evidence_id = []

for v in normalized_variant_id_list:
    variant_evidence_id_list = []

    for variant in civic_variant_ids:
        if int(v) == variant.id:

            for mp in variant.molecular_profiles:
                for e in mp.evidence_items:
                    if e.id not in variant_evidence_id_list:
                        variant_evidence_id_list.append(e.id)

            variant_evidence_id_list = variant_evidence_id_list or ""

    normalized_variants_evidence_id.append(variant_evidence_id_list)
    
normalized_queries_add_evidence_df["evidence_id"] = normalized_variants_evidence_id
normalized_queries_add_evidence_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_id
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,"[9347, 6724]"
1,1988,NC_000003.11:g.10191649A>T,genomic,True,[5336]
2,2488,3-10191647-T-G,genomic,True,"[10779, 6723, 8258]"
3,1986,NC_000003.11:g.10191648G>T,genomic,True,[5334]
4,1987,NC_000003.11:g.10191649A>G,genomic,True,[5335]


## List of Evidence ID's of Normalized Variants

In [253]:
normalized_queries_add_evidence_df = normalized_queries_add_evidence_df.explode(column="evidence_id")
normalized_queries_add_evidence_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_id
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,9347
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,6724
1,1988,NC_000003.11:g.10191649A>T,genomic,True,5336
2,2488,3-10191647-T-G,genomic,True,10779
2,2488,3-10191647-T-G,genomic,True,6723


In [254]:
normalized_variant_evidence_id_list = list(normalized_queries_add_evidence_df["evidence_id"])

## Import evidence status, rating, and level associated with a specific evidence ID
    please see the CIViC documentation for evidence item attribute decriptions (https://civic.readthedocs.io/en/latest/model/evidence.html)

In [255]:
normalized_variants_evidence_statuses = []
normalized_variants_evidence_ratings = []
normalized_variants_evidence_levels = []

for e in normalized_variant_evidence_id_list:
    variant_evidence_status_list = []
    variant_evidence_rating_list = []
    variant_evidence_level_list = []

    for evidence in civic_evidence_ids: 
        if int(e) == evidence.id:

            if evidence.status not in variant_evidence_status_list:
                variant_evidence_status_list.append(evidence.status)
            variant_evidence_status_list = variant_evidence_status_list or ""

            if evidence.rating not in variant_evidence_rating_list:
                variant_evidence_rating_list.append(evidence.rating)
            variant_evidence_rating_list = variant_evidence_rating_list or ""

            if evidence.evidence_level not in variant_evidence_level_list:
                variant_evidence_level_list.append(evidence.evidence_level)
            variant_evidence_level_list = variant_evidence_level_list or ""

    normalized_variants_evidence_statuses.append(variant_evidence_status_list)
    normalized_variants_evidence_ratings.append(variant_evidence_rating_list)
    normalized_variants_evidence_levels.append(variant_evidence_level_list)

normalized_queries_add_evidence_df["evidence_status"] = normalized_variants_evidence_statuses
normalized_queries_add_evidence_df["evidence_rating"] = normalized_variants_evidence_ratings
normalized_queries_add_evidence_df["evidence_level"] = normalized_variants_evidence_levels
normalized_queries_add_evidence_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,9347,[submitted],[3],[C]
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,6724,[accepted],[2],[C]
1,1988,NC_000003.11:g.10191649A>T,genomic,True,5336,[accepted],[2],[C]
2,2488,3-10191647-T-G,genomic,True,10779,[submitted],[3],[C]
2,2488,3-10191647-T-G,genomic,True,6723,[accepted],[2],[C]


In [256]:
normalized_queries_add_evidence_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,9347,[submitted],[3],[C]
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,6724,[accepted],[2],[C]
1,1988,NC_000003.11:g.10191649A>T,genomic,True,5336,[accepted],[2],[C]
2,2488,3-10191647-T-G,genomic,True,10779,[submitted],[3],[C]
2,2488,3-10191647-T-G,genomic,True,6723,[accepted],[2],[C]
...,...,...,...,...,...,...,...,...
1864,877,NC_000020.11:g.58903752C>T,genomic,True,1995,[accepted],[2],[B]
1865,731,NC_000003.11:g.37056036G>A,genomic,True,1794,[accepted],[1],[C]
1866,3045,VHL p.F76del,protein,False,8240,[submitted],[2],[C]
1867,4475,MAP2K1 p.K57_G61del,protein,False,11301,[submitted],[4],[C]


In [257]:
normalized_queries_add_evidence_df['evidence_status'] = normalized_queries_add_evidence_df['evidence_status'].str.join(', ')

In [258]:
normalized_queries_add_evidence_df = normalized_queries_add_evidence_df.drop(normalized_queries_add_evidence_df[normalized_queries_add_evidence_df.evidence_status == "rejected"].index)
normalized_queries_add_evidence_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,9347,submitted,[3],[C]
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,6724,accepted,[2],[C]
1,1988,NC_000003.11:g.10191649A>T,genomic,True,5336,accepted,[2],[C]
2,2488,3-10191647-T-G,genomic,True,10779,submitted,[3],[C]
2,2488,3-10191647-T-G,genomic,True,6723,accepted,[2],[C]
...,...,...,...,...,...,...,...,...
1864,877,NC_000020.11:g.58903752C>T,genomic,True,1995,accepted,[2],[B]
1865,731,NC_000003.11:g.37056036G>A,genomic,True,1794,accepted,[1],[C]
1866,3045,VHL p.F76del,protein,False,8240,submitted,[2],[C]
1867,4475,MAP2K1 p.K57_G61del,protein,False,11301,submitted,[4],[C]


In [259]:
normalized_variant_evidence_id_list = list(normalized_queries_add_evidence_df["evidence_id"])

## Evidence analysis

In [260]:
total_number_normalized_variant_unique_evidence_items = len(set(normalized_queries_add_evidence_df.evidence_id))
normalized_fraction_evidence_items = f'{total_number_normalized_variant_unique_evidence_items} / {total_number_evidences}'
f"Number of Normalized Variant Evidence items in CIViC: {normalized_fraction_evidence_items}"

'Number of Normalized Variant Evidence items in CIViC: 5916 / 9766'

In [261]:
normalized_percentage_evidence_items = "{:.2f}".format(total_number_normalized_variant_unique_evidence_items/total_number_evidences*100) + '%'
f"Percentage of Normalized Variant Evidence items in CIViC: {normalized_percentage_evidence_items}"

'Percentage of Normalized Variant Evidence items in CIViC: 60.58%'

In [262]:
normalized_queries_add_evidence_df['evidence_accepted'] = normalized_queries_add_evidence_df.evidence_status.map({'accepted':True ,'submitted':False})
normalized_queries_add_evidence_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level,evidence_accepted
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,9347,submitted,[3],[C],False
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,6724,accepted,[2],[C],True
1,1988,NC_000003.11:g.10191649A>T,genomic,True,5336,accepted,[2],[C],True
2,2488,3-10191647-T-G,genomic,True,10779,submitted,[3],[C],False
2,2488,3-10191647-T-G,genomic,True,6723,accepted,[2],[C],True


In [263]:
normalized_queries_add_evidence_df = normalized_queries_add_evidence_df.drop_duplicates(subset= ['evidence_id'])

In [264]:
normalized_queries_add_evidence_df.evidence_accepted.value_counts()

evidence_accepted
False    3743
True     2032
Name: count, dtype: int64

In [265]:
number_submitted_evidences_normalized_variants = len(normalized_queries_add_evidence_df) - normalized_queries_add_evidence_df.evidence_accepted.sum()
fraction_submitted_evidences_normalized_variants = f'{number_submitted_evidences_normalized_variants} / {total_number_normalized_variant_unique_evidence_items}'
f"Number of submitted Normalized Variant Evidence items: {fraction_submitted_evidences_normalized_variants}"

'Number of submitted Normalized Variant Evidence items: 3884 / 5916'

In [266]:
percentage_submitted_evidences_normalized_variants = "{:.2f}".format(number_submitted_evidences_normalized_variants/total_number_normalized_variant_unique_evidence_items*100) + '%'
f"Percentage of not submitted Normalized Variant Evidence items: {percentage_submitted_evidences_normalized_variants}"

'Percentage of not submitted Normalized Variant Evidence items: 65.65%'

In [267]:
number_accepted_evidences_normalized_variants = normalized_queries_add_evidence_df.evidence_accepted.sum()
fraction_accepted_evidences_normalized_variants = f'{number_accepted_evidences_normalized_variants} / {total_number_normalized_variant_unique_evidence_items}'
f"Number of accepted Normalized Variant Evidence items: {fraction_accepted_evidences_normalized_variants}"

'Number of accepted Normalized Variant Evidence items: 2032 / 5916'

In [268]:
percentage_accepted_evidences_normalized_variants = "{:.2f}".format(number_accepted_evidences_normalized_variants/total_number_normalized_variant_unique_evidence_items*100) + '%'
f"Percentage of accepted Normalized Variant Evidence items: {percentage_accepted_evidences_normalized_variants}"

'Percentage of accepted Normalized Variant Evidence items: 34.35%'

## Impact
molecular profile score

### Import molecular profile id

In [269]:
normalized_queries_add_molprof_df = normalized_queries_df.copy()
normalized_variants_molprof_id = []

for v in normalized_variant_id_list:
    variant_molprof_id_list = []

    for variant in civic_variant_ids:
        if int(v) == variant.id:

            for mp in variant.molecular_profiles:
                    if mp.id not in variant_molprof_id_list:
                        variant_molprof_id_list.append(mp.id)

            variant_molprof_id_list = variant_molprof_id_list or ""

    normalized_variants_molprof_id.append(variant_molprof_id_list)
    
normalized_queries_add_molprof_df["molecular_profile_id"] = normalized_variants_molprof_id
normalized_queries_add_molprof_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,molecular_profile_id
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,[2362]
1,1988,NC_000003.11:g.10191649A>T,genomic,True,[1864]
2,2488,3-10191647-T-G,genomic,True,[2361]
3,1986,NC_000003.11:g.10191648G>T,genomic,True,[1862]
4,1987,NC_000003.11:g.10191649A>G,genomic,True,[1863]


In [270]:
normalized_queries_add_molprof_df.loc[normalized_queries_add_molprof_df['variant_id'] == 190 ]

Unnamed: 0,variant_id,query,query_type,variant_accepted,molecular_profile_id
82,190,EGFR Amplification,protein,True,"[190, 4175, 4346]"


In [271]:
normalized_variant_molprof_id_list = list(normalized_queries_add_molprof_df["molecular_profile_id"])

### Import molecular profile scores

In [272]:
normalized_variants_molprof_score = []

for mp_list in normalized_variant_molprof_id_list:
    variant_molprof_score_list = []
    for mp in mp_list:
        
        for molprof in civic_molprof_ids: 
            if int(mp) == molprof.id:

                if molprof.molecular_profile_score not in variant_molprof_score_list:
                    variant_molprof_score_list.append(molprof.molecular_profile_score)
                variant_molprof_score_list = variant_molprof_score_list or ""

    normalized_variants_molprof_score.append(variant_molprof_score_list)

normalized_queries_add_molprof_df["molecular_profile_score"] = normalized_variants_molprof_score

normalized_queries_add_molprof_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,molecular_profile_id,molecular_profile_score
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,[2362],[5.0]
1,1988,NC_000003.11:g.10191649A>T,genomic,True,[1864],[5.0]
2,2488,3-10191647-T-G,genomic,True,[2361],[5.0]
3,1986,NC_000003.11:g.10191648G>T,genomic,True,[1862],[10.0]
4,1987,NC_000003.11:g.10191649A>G,genomic,True,[1863],[5.0]


In [273]:
normalized_queries_add_molprof_df.loc[normalized_queries_add_molprof_df['variant_id'] == 190 ]

Unnamed: 0,variant_id,query,query_type,variant_accepted,molecular_profile_id,molecular_profile_score
82,190,EGFR Amplification,protein,True,"[190, 4175, 4346]","[173.0, 5.0, 0.0]"


In [274]:
normalized_queries_add_molprof_df['molecular_profile_score_sum'] = normalized_queries_add_molprof_df['molecular_profile_score'].apply(lambda x: sum(x))
normalized_queries_add_molprof_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,molecular_profile_id,molecular_profile_score,molecular_profile_score_sum
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,[2362],[5.0],5.0
1,1988,NC_000003.11:g.10191649A>T,genomic,True,[1864],[5.0],5.0
2,2488,3-10191647-T-G,genomic,True,[2361],[5.0],5.0
3,1986,NC_000003.11:g.10191648G>T,genomic,True,[1862],[10.0],10.0
4,1987,NC_000003.11:g.10191649A>G,genomic,True,[1863],[5.0],5.0


In [275]:
normalized_queries_add_molprof_df.loc[normalized_queries_add_molprof_df['variant_id'] == 190 ]

Unnamed: 0,variant_id,query,query_type,variant_accepted,molecular_profile_id,molecular_profile_score,molecular_profile_score_sum
82,190,EGFR Amplification,protein,True,"[190, 4175, 4346]","[173.0, 5.0, 0.0]",178.0


# Analysis of Unable to Normalize Queries

## List of Unable to Normalize Variant ID's

In [276]:
not_normalized_quer_df = pd.read_csv("./unable_to_normalize_queries.csv", sep= "\t")
not_normalized_quer_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati..."
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati..."
4,4004,TERT C250T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."


In [277]:
not_normalized_quer_df = not_normalized_quer_df.drop_duplicates(subset=['variant_id'])

In [278]:
not_normalized_variant_id_list = list(not_normalized_quer_df["variant_id"])

## Variant analysis

In [279]:
total_number_not_normalized_variants = len(not_normalized_variant_id_list)
fraction_not_normalized_variants = f"{total_number_not_normalized_variants} / {total_number_variants}"
f"Number of Unable to Normalize Variants in CIViC: {fraction_not_normalized_variants}"

'Number of Unable to Normalize Variants in CIViC: 63 / 3553'

In [280]:
percentage_not_normalized_variants = "{:.2f}".format(total_number_not_normalized_variants/total_number_variants*100) +"%"
f"Percentage of Unable to Normalize Variants in CIViC: {percentage_not_normalized_variants}"

'Percentage of Unable to Normalize Variants in CIViC: 1.77%'

In [281]:
not_normalized_quer_df.variant_accepted.value_counts()

variant_accepted
False    54
True      9
Name: count, dtype: int64

In [282]:
number_not_accepted_not_normalized_variants = len(not_normalized_quer_df) - not_normalized_quer_df.variant_accepted.sum()
fraction_not_accepted_not_normalized_variants = f"{number_not_accepted_not_normalized_variants} / {total_number_not_normalized_variants}"
f"Number of not accepted Unable to Normalize Variants: {fraction_not_accepted_not_normalized_variants}"

'Number of not accepted Unable to Normalize Variants: 54 / 63'

In [283]:
percentage_not_accepted_not_normalized_variants = "{:.2f}".format(number_not_accepted_not_normalized_variants/total_number_not_normalized_variants*100) +"%"
f"Percentage of not accepted Unable to Normalize Variants: {percentage_not_accepted_not_normalized_variants}"

'Percentage of not accepted Unable to Normalize Variants: 85.71%'

In [284]:
number_accepted_not_normalized_variants = not_normalized_quer_df.variant_accepted.sum()
fraction_accepted_not_normalized_variants = f"{number_accepted_not_normalized_variants} /{total_number_not_normalized_variants}"
f"Number of accepted Unable to Normalize Variants: {fraction_accepted_not_normalized_variants}"

'Number of accepted Unable to Normalize Variants: 9 /63'

In [285]:
percentage_accepted_not_normalized_variants = "{:.2f}".format(number_accepted_not_normalized_variants/total_number_not_normalized_variants*100) +"%"
f"Percentage of accepted Unable to Normalize Variants: {percentage_accepted_not_normalized_variants}"

'Percentage of accepted Unable to Normalize Variants: 14.29%'

## Import evidence ID's associated with the Unable to Normalize Variants using variant ID

Evidence items are linked to the molecular profiles associated with variant items. To pull evidence IDs, need to use molecular profiles. For more information on the structure of a variant item

In [286]:
not_normalized_quer_add_evidence_df = not_normalized_quer_df.copy()
not_normalized_variants_evidence_id = []

for v in not_normalized_variant_id_list:
    not_normalized_variant_evidence_id_list = []

    for variant in civic_variant_ids: 
        if int(v) == variant.id:

            for mp in variant.molecular_profiles:
                for e in mp.evidence_items:
                    if e.id not in not_normalized_variant_evidence_id_list:
                        not_normalized_variant_evidence_id_list.append(e.id)

            not_normalized_variant_evidence_id_list = not_normalized_variant_evidence_id_list or ""

    not_normalized_variants_evidence_id.append(not_normalized_variant_evidence_id_list)

not_normalized_quer_add_evidence_df["evidence_id"] = not_normalized_variants_evidence_id
not_normalized_quer_add_evidence_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_id
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",[1812]
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10128]
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10135]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...","[655, 1646, 6934, 6935]"
4,4004,TERT C250T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10331]


## List of Evidence ID's of Unable to Normalize Variants

In [287]:
not_normalized_quer_add_evidence_df = not_normalized_quer_add_evidence_df.explode(column="evidence_id")
not_normalized_quer_add_evidence_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_id
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1812
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10128
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10135
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",655
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1646


In [288]:
not_normalized_evidence_id_list = list(not_normalized_quer_add_evidence_df["evidence_id"])


## Import evidence status, rating, and level associated with a specific evidence ID

In [289]:
not_normalized_variants_evidence_statuses = []
not_normalized_variants_evidence_ratings = []
not_normalized_variants_evidence_levels = []

for e in not_normalized_evidence_id_list:
    not_normalized_variant_evidence_status_list = []
    not_normalized_variant_evidence_rating_list = []
    not_normalized_variant_evidence_level_list = []

    for evidence in civic_evidence_ids: 
        if int(e) == evidence.id:

            if evidence.status not in not_normalized_variant_evidence_status_list:
                not_normalized_variant_evidence_status_list.append(evidence.status)
            not_normalized_variant_evidence_status_list = not_normalized_variant_evidence_status_list or ""

            if evidence.rating not in not_normalized_variant_evidence_rating_list:
                not_normalized_variant_evidence_rating_list.append(evidence.rating)
            not_normalized_variant_evidence_rating_list = not_normalized_variant_evidence_rating_list or ""

            if evidence.evidence_level not in not_normalized_variant_evidence_level_list:
                not_normalized_variant_evidence_level_list.append(evidence.evidence_level)
            not_normalized_variant_evidence_level_list = not_normalized_variant_evidence_level_list or ""

    not_normalized_variants_evidence_statuses.append(not_normalized_variant_evidence_status_list)
    not_normalized_variants_evidence_ratings.append(not_normalized_variant_evidence_rating_list)
    not_normalized_variants_evidence_levels.append(not_normalized_variant_evidence_level_list)

not_normalized_quer_add_evidence_df["evidence_status"] = not_normalized_variants_evidence_statuses
not_normalized_quer_add_evidence_df["evidence_rating"] = not_normalized_variants_evidence_ratings
not_normalized_quer_add_evidence_df["evidence_level"] = not_normalized_variants_evidence_levels
not_normalized_quer_add_evidence_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_id,evidence_status,evidence_rating,evidence_level
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1812,[accepted],[1],[C]
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10128,[submitted],[3],[D]
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10135,[submitted],[3],[D]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",655,[accepted],[5],[B]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1646,[accepted],[3],[B]


In [290]:
not_normalized_quer_add_evidence_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_id,evidence_status,evidence_rating,evidence_level
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1812,[accepted],[1],[C]
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10128,[submitted],[3],[D]
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10135,[submitted],[3],[D]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",655,[accepted],[5],[B]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1646,[accepted],[3],[B]
...,...,...,...,...,...,...,...,...,...,...,...
58,2825,BRAF V601E,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",7627,[submitted],[3],[C]
59,3721,AR V757A,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10131,[submitted],[3],[D]
60,3722,AR V757I,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10132,[submitted],[3],[D]
61,3724,AR Y763C,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10134,[submitted],[3],[D]


In [291]:
not_normalized_quer_add_evidence_df['evidence_status'] = not_normalized_quer_add_evidence_df['evidence_status'].str.join(', ')

In [292]:
not_normalized_quer_add_evidence_df = not_normalized_quer_add_evidence_df.drop(not_normalized_quer_add_evidence_df[not_normalized_quer_add_evidence_df.evidence_status == "rejected"].index)
not_normalized_quer_add_evidence_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_id,evidence_status,evidence_rating,evidence_level
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1812,accepted,[1],[C]
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10128,submitted,[3],[D]
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10135,submitted,[3],[D]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",655,accepted,[5],[B]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1646,accepted,[3],[B]
...,...,...,...,...,...,...,...,...,...,...,...
58,2825,BRAF V601E,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",7627,submitted,[3],[C]
59,3721,AR V757A,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10131,submitted,[3],[D]
60,3722,AR V757I,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10132,submitted,[3],[D]
61,3724,AR Y763C,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10134,submitted,[3],[D]


In [293]:
not_normalized_evidence_id_list = list(not_normalized_quer_add_evidence_df["evidence_id"])

## Evidence analysis

In [294]:
total_number_not_normalized_variant_unique_evidence_items = len(not_normalized_evidence_id_list)
not_normalized_fraction_evidence_items = f"{total_number_not_normalized_variant_unique_evidence_items} / {total_number_evidences}"
f"Number of Unable to Normalize Variant Evidence items in CIViC: {not_normalized_fraction_evidence_items}"

'Number of Unable to Normalize Variant Evidence items in CIViC: 80 / 9766'

In [295]:
not_normalized_percentage_evidence_items = "{:.2f}".format(total_number_not_normalized_variant_unique_evidence_items/total_number_evidences*100) + "%"
f"Percentage of Unable to Normalize Variant Evidence items in CIViC: {not_normalized_percentage_evidence_items}"

'Percentage of Unable to Normalize Variant Evidence items in CIViC: 0.82%'

In [296]:
not_normalized_quer_add_evidence_df['evidence_accepted'] = not_normalized_quer_add_evidence_df.evidence_status.map({'accepted':True ,'submitted':False})
not_normalized_quer_add_evidence_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_id,evidence_status,evidence_rating,evidence_level,evidence_accepted
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1812,accepted,[1],[C],True
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10128,submitted,[3],[D],False
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10135,submitted,[3],[D],False
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",655,accepted,[5],[B],True
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1646,accepted,[3],[B],True


In [297]:
not_normalized_quer_add_evidence_df = not_normalized_quer_add_evidence_df.drop_duplicates(subset= ['evidence_id'])

In [298]:
not_normalized_quer_add_evidence_df.evidence_accepted.value_counts()

evidence_accepted
False    63
True     14
Name: count, dtype: int64

In [299]:
number_submitted_evidences_not_normalized_variants = len(not_normalized_quer_add_evidence_df) - not_normalized_quer_add_evidence_df.evidence_accepted.sum()
fraction_submitted_evidences_not_normalized_variants = f"{number_submitted_evidences_not_normalized_variants} / {total_number_not_normalized_variant_unique_evidence_items}"
f"Number of submitted Unable to Normalize Variant Evidence items: {fraction_submitted_evidences_not_normalized_variants}"

'Number of submitted Unable to Normalize Variant Evidence items: 66 / 80'

In [300]:
percentage_submitted_evidences_not_normalized_variants = "{:.2f}".format(number_submitted_evidences_not_normalized_variants/total_number_not_normalized_variant_unique_evidence_items*100) + "%"
f"Percentage of not submitted Unable to Normalize Variant Evidence items: {percentage_submitted_evidences_not_normalized_variants}"

'Percentage of not submitted Unable to Normalize Variant Evidence items: 82.50%'

In [301]:
number_accepted_evidences_not_normalized_variants = not_normalized_quer_add_evidence_df.evidence_accepted.sum()
fraction_accepted_evidences_not_normalized_variants = f"{number_accepted_evidences_not_normalized_variants} / {total_number_not_normalized_variant_unique_evidence_items}"
f"Number of accepted Unable to Normalize Variant Evidence items: {fraction_accepted_evidences_not_normalized_variants}"

'Number of accepted Unable to Normalize Variant Evidence items: 14 / 80'

In [302]:
percentage_accepted_evidences_not_normalized_variants = "{:.2f}".format(number_accepted_evidences_not_normalized_variants/total_number_not_normalized_variant_unique_evidence_items*100)+ "%" 
f"Percentage of accepted Unable to Normalize Variant Evidence items: {percentage_accepted_evidences_not_normalized_variants}"

'Percentage of accepted Unable to Normalize Variant Evidence items: 17.50%'

## Impact
molecular profile score

### Import molecular profile id

In [303]:
not_normalized_queries_add_molprof_df = not_normalized_quer_df.copy()
not_normalized_variants_molprof_id = []

for v in not_normalized_variant_id_list:
    not_normalized_variant_molprof_id_list = []

    for variant in civic_variant_ids:
        if int(v) == variant.id:

            for mp in variant.molecular_profiles:
                    if mp.id not in not_normalized_variant_molprof_id_list:
                        not_normalized_variant_molprof_id_list.append(mp.id)

            not_normalized_variant_molprof_id_list = not_normalized_variant_molprof_id_list or ""

    not_normalized_variants_molprof_id.append(not_normalized_variant_molprof_id_list)
    
not_normalized_queries_add_molprof_df["molecular_profile_id"] = not_normalized_variants_molprof_id
not_normalized_queries_add_molprof_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,molecular_profile_id
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",[729]
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[3586]
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[3593]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",[244]
4,4004,TERT C250T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[3872]


In [304]:
not_normalized_variant_molprof_id_list = list(not_normalized_queries_add_molprof_df["molecular_profile_id"])

### Import molecular profile scores

In [305]:
not_normalized_variants_molprof_score = []

for mp_list in not_normalized_variant_molprof_id_list:
    not_normalized_variant_molprof_score_list = []
    for mp in mp_list:
        
        for molprof in civic_molprof_ids: 
            if int(mp) == molprof.id:

                if molprof.molecular_profile_score not in not_normalized_variant_molprof_score_list:
                    not_normalized_variant_molprof_score_list.append(molprof.molecular_profile_score)
                not_normalized_variant_molprof_score_list = not_normalized_variant_molprof_score_list or ""

    not_normalized_variants_molprof_score.append(not_normalized_variant_molprof_score_list)

not_normalized_queries_add_molprof_df["molecular_profile_score"] = not_normalized_variants_molprof_score

not_normalized_queries_add_molprof_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,molecular_profile_id,molecular_profile_score
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",[729],[2.5]
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[3586],[0.0]
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[3593],[0.0]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",[244],[40.0]
4,4004,TERT C250T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[3872],[0.0]


In [306]:
not_normalized_queries_add_molprof_df['molecular_profile_score_sum'] = not_normalized_queries_add_molprof_df['molecular_profile_score'].apply(lambda x: sum(x))
not_normalized_queries_add_molprof_df.head()

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,molecular_profile_id,molecular_profile_score,molecular_profile_score_sum
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",[729],[2.5],2.5
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[3586],[0.0],0.0
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[3593],[0.0],0.0
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",[244],[40.0],40.0
4,4004,TERT C250T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[3872],[0.0],0.0


# Analysis of Not Supported Variants

### List of Not Supported Variant ID's

In [307]:
not_supported_variants_df = pd.read_csv("./not_supported_variants.csv", sep= "\t")
not_supported_variants_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted
0,4170,VHL,,Transcript Variant,False
1,4417,ALK,FBXO11::ALK,Fusion,False
2,4214,VHL,,Transcript Variant,False
3,4216,VHL,,Transcript Variant,False
4,4278,VHL,,Transcript Variant,False


In [308]:
not_supported_variants_df = not_supported_variants_df.drop_duplicates(subset=['variant_id'])

In [309]:
not_supported_variant_id_list = list(not_supported_variants_df["variant_id"])

## Variant Analysis

In [310]:
total_number_unique_not_supported_variants = len(set(not_supported_variants_df.variant_id))
fraction_not_supported_variants = f" {total_number_unique_not_supported_variants} / {total_number_variants}"
f"Number of Not Supported Variants in CIViC: {fraction_not_supported_variants}"

'Number of Not Supported Variants in CIViC:  1621 / 3553'

In [311]:
percentage_not_supported_variants = "{:.2f}".format(total_number_unique_not_supported_variants/total_number_variants*100) + "%"
f"Percentage of Not Supported Variants in CIViC: {percentage_not_supported_variants}"

'Percentage of Not Supported Variants in CIViC: 45.62%'

In [312]:
not_supported_variants_df.variant_accepted.value_counts()

variant_accepted
False    833
True     788
Name: count, dtype: int64

In [313]:
number_not_accepted_not_supported_variants = len(not_supported_variants_df) - not_supported_variants_df.variant_accepted.sum()
fraction_not_accepted_not_supported_variants = f"{number_not_accepted_not_supported_variants} / {total_number_unique_not_supported_variants}"
f"Number of not accepted Not Supported Variants: {fraction_not_accepted_not_supported_variants}"

'Number of not accepted Not Supported Variants: 833 / 1621'

In [314]:
percentage_not_accepted_not_supported_variants = "{:.2f}".format(number_not_accepted_not_supported_variants/total_number_unique_not_supported_variants*100) + "%"
f"Percentage of not accepted Not Supported Variants: {percentage_not_accepted_not_supported_variants}"

'Percentage of not accepted Not Supported Variants: 51.39%'

In [315]:
number_accepted_not_supported_variants = not_supported_variants_df.variant_accepted.sum()
fraction_accepted_not_supported_variants = f"{number_accepted_not_supported_variants} /{total_number_unique_not_supported_variants}"
f"Number of accepted Not Supported Variants: {fraction_accepted_not_supported_variants}"

'Number of accepted Not Supported Variants: 788 /1621'

In [316]:
percentage_accepted_not_supported_variants = "{:.2f}".format(number_accepted_not_supported_variants/total_number_unique_not_supported_variants*100) + "%"
f"Percentage of accepted Not Supported Variants: {percentage_accepted_not_supported_variants}"

'Percentage of accepted Not Supported Variants: 48.61%'

### Not Supported Variant Analysis by Subcategory 

In [317]:
not_supported_variant_categories_summary_data = dict()
for category in ["Expression", "Epigenetic Modification", "Fusion", "Protein Consequence", "Gene Function", "Rearrangements", "Copy Number", "Other", "Genotypes Easy", "Genotypes Compound", "Region Defined Variant", "Transcript Variant"]:
    print(category)
    not_supported_variant_categories_summary_data[category] = {}
    category_df = not_supported_variants_df[not_supported_variants_df.category == category]
    
    number_unique_not_supported_category_variants = len(set(category_df.variant_id))
    not_supported_variant_categories_summary_data[category]["number_unique_not_supported_category_variants"] = number_unique_not_supported_category_variants

    fraction_not_supported_category_variant_of_civic = f" {number_unique_not_supported_category_variants} / {total_number_variants}"
    not_supported_variant_categories_summary_data[category]["fraction_not_supported_category_variant_of_civic"] = fraction_not_supported_category_variant_of_civic
    print(f"Number of {category} Variants in CIViC: {fraction_not_supported_category_variant_of_civic}")
    
    percent_not_supported_category_variant_of_civic = "{:.2f}".format(number_unique_not_supported_category_variants/total_number_variants*100) + "%"
    not_supported_variant_categories_summary_data[category]["percent_not_supported_category_variant_of_civic"] = percent_not_supported_category_variant_of_civic
    print(f"Percent of {category} Variants in CIViC: {percent_not_supported_category_variant_of_civic}")

    fraction_not_supported_category_variant_of_total_not_supported = f" {number_unique_not_supported_category_variants} / {total_number_unique_not_supported_variants}"
    not_supported_variant_categories_summary_data[category]["fraction_not_supported_category_variant_of_total_not_supported"] = fraction_not_supported_category_variant_of_total_not_supported
    print(f"Number of {category} Variants in Not Supported Variants: {fraction_not_supported_category_variant_of_total_not_supported}")
    
    percent_not_supported_category_variant_of_total_not_supported = "{:.2f}".format(number_unique_not_supported_category_variants/total_number_unique_not_supported_variants*100) + "%"
    not_supported_variant_categories_summary_data[category]["percent_not_supported_category_variant_of_total_not_supported"] = percent_not_supported_category_variant_of_total_not_supported
    print(f"Percent of {category} Variants in Not Supported Variants: {percent_not_supported_category_variant_of_total_not_supported}")

    number_accepted_not_supported_category_variants = category_df.variant_accepted.sum()
    fraction_accepted_not_supported_category_variants = f" {number_accepted_not_supported_category_variants} / {number_unique_not_supported_category_variants}"
    not_supported_variant_categories_summary_data[category]["fraction_accepted_not_supported_category_variants"] = fraction_accepted_not_supported_category_variants
    print(f"Number of Accepted {category} Variants: {fraction_accepted_not_supported_category_variants}")
   
    percentage_accepted_not_supported_category_variants =  "{:.2f}".format(number_accepted_not_supported_category_variants/number_unique_not_supported_category_variants*100) + "%"
    not_supported_variant_categories_summary_data[category]["percentage_accepted_not_supported_category_variants"] = percentage_accepted_not_supported_category_variants
    print(f"Percent of Accepted {category} Variants: {percentage_accepted_not_supported_category_variants}")
  
    number_not_accepted_not_supported_category_variants =  len(category_df) - category_df.variant_accepted.sum()
    fraction_not_accepted_not_supported_category_variants = f" {number_not_accepted_not_supported_category_variants} / {number_unique_not_supported_category_variants}"
    not_supported_variant_categories_summary_data[category]["fraction_not_accepted_not_supported_category_variants"] = fraction_not_accepted_not_supported_category_variants
    print(f"Number of Not Accepted {category} Variants: {fraction_not_accepted_not_supported_category_variants}")

    percentage_not_accepted_not_supported_category_variants =  "{:.2f}".format(number_not_accepted_not_supported_category_variants/number_unique_not_supported_category_variants*100) + "%"
    not_supported_variant_categories_summary_data[category]["percentage_not_accepted_not_supported_category_variants"] = percentage_not_accepted_not_supported_category_variants
    print(f"Percent of Not Accepted {category} Variants: {percentage_not_accepted_not_supported_category_variants}")
    
    print("--------------------")

Expression
Number of Expression Variants in CIViC:  291 / 3553
Percent of Expression Variants in CIViC: 8.19%
Number of Expression Variants in Not Supported Variants:  291 / 1621
Percent of Expression Variants in Not Supported Variants: 17.95%
Number of Accepted Expression Variants:  180 / 291
Percent of Accepted Expression Variants: 61.86%
Number of Not Accepted Expression Variants:  111 / 291
Percent of Not Accepted Expression Variants: 38.14%
--------------------
Epigenetic Modification
Number of Epigenetic Modification Variants in CIViC:  15 / 3553
Percent of Epigenetic Modification Variants in CIViC: 0.42%
Number of Epigenetic Modification Variants in Not Supported Variants:  15 / 1621
Percent of Epigenetic Modification Variants in Not Supported Variants: 0.93%
Number of Accepted Epigenetic Modification Variants:  14 / 15
Percent of Accepted Epigenetic Modification Variants: 93.33%
Number of Not Accepted Epigenetic Modification Variants:  1 / 15
Percent of Not Accepted Epigenetic 

## Import evidence ID's associated with the Not Supported Variants using variant ID

In [318]:
not_supported_variants_add_evidence_df = not_supported_variants_df.copy()
not_supported_variants_evidence_ids = []

for v in not_supported_variant_id_list:
    not_supported_variant_evidence_id_list = []

    for variant in civic_variant_ids:
        if int(v) == variant.id:

            for mp in variant.molecular_profiles:
                for e in mp.evidence_items:
                    if e.id not in not_supported_variant_evidence_id_list:
                        not_supported_variant_evidence_id_list.append(e.id)

            not_supported_variant_evidence_id_list = not_supported_variant_evidence_id_list or ""

    not_supported_variants_evidence_ids.append(not_supported_variant_evidence_id_list)

not_supported_variants_add_evidence_df["evidence_id"] = not_supported_variants_evidence_ids
not_supported_variants_add_evidence_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id
0,4170,VHL,,Transcript Variant,False,[10647]
1,4417,ALK,FBXO11::ALK,Fusion,False,[7428]
2,4214,VHL,,Transcript Variant,False,[10752]
3,4216,VHL,,Transcript Variant,False,[10754]
4,4278,VHL,,Transcript Variant,False,[10958]


## List of Evidence ID's on Not Supported Variants

In [319]:
not_supported_variants_add_evidence_df = not_supported_variants_add_evidence_df.explode(column="evidence_id")
not_supported_variants_add_evidence_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id
0,4170,VHL,,Transcript Variant,False,10647
1,4417,ALK,FBXO11::ALK,Fusion,False,7428
2,4214,VHL,,Transcript Variant,False,10752
3,4216,VHL,,Transcript Variant,False,10754
4,4278,VHL,,Transcript Variant,False,10958


In [320]:
not_supported_evidence_id_list = list(not_supported_variants_add_evidence_df["evidence_id"])

## Import evidence status, rating, and level associated with a specific evidence ID

In [321]:
not_supported_variants_evidence_statuses = []
not_supported_variants_evidence_ratings = []
not_supported_variants_evidence_levels = []

for e in not_supported_evidence_id_list:
    not_supported_variant_evidence_status_list = []
    not_supported_variant_evidence_rating_list = []
    not_supported_variant_evidence_level_list = []

    for evidence in civic_evidence_ids:
        if int(e) == evidence.id:

            if evidence.status not in not_supported_variant_evidence_status_list:
                not_supported_variant_evidence_status_list.append(evidence.status)
            not_supported_variant_evidence_status_list = not_supported_variant_evidence_status_list or ""

            if evidence.rating not in not_supported_variant_evidence_rating_list:
                not_supported_variant_evidence_rating_list.append(evidence.rating)
            not_supported_variant_evidence_rating_list = not_supported_variant_evidence_rating_list or ""

            if evidence.evidence_level not in not_supported_variant_evidence_level_list:
                not_supported_variant_evidence_level_list.append(evidence.evidence_level)
            not_supported_variant_evidence_level_list = not_supported_variant_evidence_level_list or ""

    not_supported_variants_evidence_statuses.append(not_supported_variant_evidence_status_list)
    not_supported_variants_evidence_ratings.append(not_supported_variant_evidence_rating_list)
    not_supported_variants_evidence_levels.append(not_supported_variant_evidence_level_list)

not_supported_variants_add_evidence_df["evidence_status"] = not_supported_variants_evidence_statuses
not_supported_variants_add_evidence_df["evidence_rating"] = not_supported_variants_evidence_ratings
not_supported_variants_add_evidence_df["evidence_level"] = not_supported_variants_evidence_levels
not_supported_variants_add_evidence_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level
0,4170,VHL,,Transcript Variant,False,10647,[submitted],[2],[C]
1,4417,ALK,FBXO11::ALK,Fusion,False,7428,[submitted],[3],[C]
2,4214,VHL,,Transcript Variant,False,10752,[submitted],[3],[C]
3,4216,VHL,,Transcript Variant,False,10754,[submitted],[3],[C]
4,4278,VHL,,Transcript Variant,False,10958,[submitted],[3],[C]


Remove all evidence items that have been rejected

In [322]:
not_supported_variants_add_evidence_df

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level
0,4170,VHL,,Transcript Variant,False,10647,[submitted],[2],[C]
1,4417,ALK,FBXO11::ALK,Fusion,False,7428,[submitted],[3],[C]
2,4214,VHL,,Transcript Variant,False,10752,[submitted],[3],[C]
3,4216,VHL,,Transcript Variant,False,10754,[submitted],[3],[C]
4,4278,VHL,,Transcript Variant,False,10958,[submitted],[3],[C]
...,...,...,...,...,...,...,...,...,...
1618,3478,ESR2,underexpression beta-1,Other,False,9618,[submitted],[4],[B]
1618,3478,ESR2,underexpression beta-1,Other,False,9619,[submitted],[4],[B]
1619,3508,CD274,v242,Protein Consequence,False,9695,[submitted],[4],[E]
1620,2422,NTRK3,~DEPRECATED-ETV6-NTRK3,Other,False,10692,[submitted],[3],[C]


In [323]:
not_supported_variants_add_evidence_df['evidence_status'] = not_supported_variants_add_evidence_df['evidence_status'].str.join(', ')

In [324]:
not_supported_variants_add_evidence_df = not_supported_variants_add_evidence_df.drop(not_supported_variants_add_evidence_df[not_supported_variants_add_evidence_df.evidence_status == "rejected"].index)
not_supported_variants_add_evidence_df

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level
0,4170,VHL,,Transcript Variant,False,10647,submitted,[2],[C]
1,4417,ALK,FBXO11::ALK,Fusion,False,7428,submitted,[3],[C]
2,4214,VHL,,Transcript Variant,False,10752,submitted,[3],[C]
3,4216,VHL,,Transcript Variant,False,10754,submitted,[3],[C]
4,4278,VHL,,Transcript Variant,False,10958,submitted,[3],[C]
...,...,...,...,...,...,...,...,...,...
1618,3478,ESR2,underexpression beta-1,Other,False,9618,submitted,[4],[B]
1618,3478,ESR2,underexpression beta-1,Other,False,9619,submitted,[4],[B]
1619,3508,CD274,v242,Protein Consequence,False,9695,submitted,[4],[E]
1620,2422,NTRK3,~DEPRECATED-ETV6-NTRK3,Other,False,10692,submitted,[3],[C]


In [325]:
not_supported_evidence_id_list = list(not_supported_variants_add_evidence_df["evidence_id"])

## Evidence analysis

In [326]:
total_number_not_supported_variant_unique_evidence_items = len(set(not_supported_variants_add_evidence_df.evidence_id))
not_supported_variant_fraction_evidence_items = f"{total_number_not_supported_variant_unique_evidence_items} / {total_number_evidences}"
f"Number of Not Supported Variant Evidence items in CIViC: {not_supported_variant_fraction_evidence_items}"

'Number of Not Supported Variant Evidence items in CIViC: 4286 / 9766'

In [327]:
not_supported_variant_percentage_evidence_items = "{:.2f}".format(total_number_not_supported_variant_unique_evidence_items/total_number_evidences*100) + "%"
f"Percentage of Not Supported Variant Evidence items in CIViC: {not_supported_variant_percentage_evidence_items}"

'Percentage of Not Supported Variant Evidence items in CIViC: 43.89%'

In [328]:
not_supported_variants_add_evidence_df['evidence_accepted'] = not_supported_variants_add_evidence_df.evidence_status.map({'accepted':True ,'submitted':False})
not_supported_variants_add_evidence_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level,evidence_accepted
0,4170,VHL,,Transcript Variant,False,10647,submitted,[2],[C],False
1,4417,ALK,FBXO11::ALK,Fusion,False,7428,submitted,[3],[C],False
2,4214,VHL,,Transcript Variant,False,10752,submitted,[3],[C],False
3,4216,VHL,,Transcript Variant,False,10754,submitted,[3],[C],False
4,4278,VHL,,Transcript Variant,False,10958,submitted,[3],[C],False


In [329]:
not_supported_variants_add_evidence_unique_df = not_supported_variants_add_evidence_df.drop_duplicates(subset= ['evidence_id'])

In [330]:
not_supported_variants_add_evidence_unique_df.evidence_accepted.value_counts()

evidence_accepted
True     2198
False    1930
Name: count, dtype: int64

In [331]:
number_submitted_evidences_not_supported_variants = len(not_supported_variants_add_evidence_unique_df) - not_supported_variants_add_evidence_unique_df.evidence_accepted.sum()
fraction_submitted_evidences_not_supported_variants = f"{number_submitted_evidences_not_supported_variants} / {total_number_not_supported_variant_unique_evidence_items}"
f"Number of submitted Not Supported Variant Evidence items: {fraction_submitted_evidences_not_supported_variants}"

'Number of submitted Not Supported Variant Evidence items: 2088 / 4286'

In [332]:
percentage_submitted_evidences_not_supported_variants = "{:.2f}".format(number_submitted_evidences_not_supported_variants/total_number_not_supported_variant_unique_evidence_items*100) +"%"
f"Percentage of submitted Not Supported Variant Evidence items: {percentage_submitted_evidences_not_supported_variants}"

'Percentage of submitted Not Supported Variant Evidence items: 48.72%'

In [333]:
number_accepted_evidences_not_supported_variants = not_supported_variants_add_evidence_unique_df.evidence_accepted.sum()
fraction_accepted_evidences_not_supported_variants = f"{number_accepted_evidences_not_supported_variants} / {total_number_not_supported_variant_unique_evidence_items}"
f"Number of accepted Not Supported Variant Evidence items: {fraction_accepted_evidences_not_supported_variants}"

'Number of accepted Not Supported Variant Evidence items: 2198 / 4286'

In [334]:
percentage_accepted_evidences_not_supported_variants = "{:.2f}".format(number_accepted_evidences_not_supported_variants/total_number_not_supported_variant_unique_evidence_items*100)
f"Percentage of accepted Not Supported Variant Evidence items: {percentage_accepted_evidences_not_supported_variants}%"

'Percentage of accepted Not Supported Variant Evidence items: 51.28%'

### Not Supported Variant Evidence Analysis by Subcategory 

 List all the possible variant categories, have to use non unique file since evidence items are used more than once across groups


In [335]:
not_supported_variant_categories = not_supported_variants_add_evidence_df.category.unique()
not_supported_variant_categories 

array(['Transcript Variant', 'Fusion', 'Region Defined Variant',
       'Protein Consequence', 'Other', 'Rearrangements', 'Copy Number',
       'Expression', 'Gene Function', 'Genotypes Compound',
       'Epigenetic Modification', 'Genotypes Easy'], dtype=object)

Evidence ID's can qualify for more than one kind of variant category

In [336]:
duplicate = not_supported_variants_add_evidence_df[not_supported_variants_add_evidence_df.duplicated('evidence_id', keep=False)]
duplicate

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_id,evidence_status,evidence_rating,evidence_level,evidence_accepted
1,4417,ALK,FBXO11::ALK,Fusion,False,7428,submitted,[3],[C],False
58,1,ABL1,BCR::ABL,Fusion,True,11289,submitted,[4],[A],False
126,4467,ABL1,BCR::ABL mutations,Fusion,False,11289,submitted,[4],[A],False
127,4497,FGFR1,BCR::FGFR1,Fusion,False,11324,submitted,[3],[B],False
128,2860,JAK2,BCR::JAK2,Fusion,True,11325,submitted,[2],[B],False
...,...,...,...,...,...,...,...,...,...,...
1421,4500,FGFR1,Translocation,Rearrangements,False,11324,submitted,[3],[B],False
1536,4496,FGFR1,ZMYM2::FGFR1,Fusion,False,11324,submitted,[3],[B],False
1542,4466,TERT,,Transcript Variant,False,11278,submitted,[2],[C],False
1594,4463,TSC1,mutation,Region Defined Variant,False,11269,submitted,[4],[A],False


In [337]:
not_supported_variant_categories_evidence_summary_data = dict()
for category in ["Expression", "Epigenetic Modification", "Fusion", "Protein Consequence", "Gene Function", "Rearrangements", "Copy Number", "Other", "Genotypes Easy", "Genotypes Compound", "Region Defined Variant", "Transcript Variant"]:
    print(category)
    not_supported_variant_categories_evidence_summary_data[category] = {}
    evidence_category_df = not_supported_variants_add_evidence_df[not_supported_variants_add_evidence_df.category == category]
    evidence_category_df = evidence_category_df.drop_duplicates(subset=['evidence_id'])

    number_unique_not_supported_category_evidence = len(set(evidence_category_df.evidence_id))
    not_supported_variant_categories_evidence_summary_data[category]["number_unique_not_supported_category_evidence"]=  number_unique_not_supported_category_evidence
    
    fraction_not_supported_category_variant_evidence_of_civic = f"{number_unique_not_supported_category_evidence} / {total_number_evidences}"
    not_supported_variant_categories_evidence_summary_data[category]["fraction_not_supported_category_variant_evidence_of_civic"]=  fraction_not_supported_category_variant_evidence_of_civic
    print(f"Number of {category} Variant Evidence items in CIViC: {fraction_not_supported_category_variant_evidence_of_civic}")

    percent_not_supported_category_variant_evidence_of_civic = "{:.2f}".format(number_unique_not_supported_category_evidence/total_number_evidences*100)  + '%'
    not_supported_variant_categories_evidence_summary_data[category]["percent_not_supported_category_variant_evidence_of_civic"] = percent_not_supported_category_variant_evidence_of_civic
    print(f"Percent of {category} Variant Evidence items in CIViC: {percent_not_supported_category_variant_evidence_of_civic}")
    
    fraction_not_supported_category_variant_evidence_of_total_not_supported = f"{number_unique_not_supported_category_evidence} / {total_number_not_supported_variant_unique_evidence_items}"
    not_supported_variant_categories_evidence_summary_data[category]["fraction_not_supported_category_variant_evidence_of_total_not_supported"]= fraction_not_supported_category_variant_evidence_of_total_not_supported
    print(f"Number of {category} Variant Evidence items in total Not Supported Variant Evidence items: {fraction_not_supported_category_variant_evidence_of_total_not_supported}")

    percent_not_supported_category_variant_evidence_of_total_not_supported = "{:.2f}".format(number_unique_not_supported_category_evidence/total_number_not_supported_variant_unique_evidence_items*100) + '%'
    not_supported_variant_categories_evidence_summary_data[category]["percent_not_supported_category_variant_evidence_of_total_not_supported"] = percent_not_supported_category_variant_evidence_of_total_not_supported
    print(f"Percent of {category} Variant Evidence items in total Not Supported Variant Evidence items: {percent_not_supported_category_variant_evidence_of_total_not_supported}")

    number_accepted_not_supported_category_variant_evidence = evidence_category_df.evidence_accepted.sum()
    fraction_accepted_evidence_not_supported_category_variants = f"{number_accepted_not_supported_category_variant_evidence} / {number_unique_not_supported_category_evidence}"
    not_supported_variant_categories_evidence_summary_data[category]["fraction_accepted_evidence_not_supported_category_variants"] = fraction_accepted_evidence_not_supported_category_variants
    print(f"Number of Accepted {category} Variant Evidence items: {fraction_accepted_evidence_not_supported_category_variants}")

    percentage_accepted_evidence_not_supported_category_variants =  "{:.2f}".format(number_accepted_not_supported_category_variant_evidence/number_unique_not_supported_category_evidence*100) + '%'
    not_supported_variant_categories_evidence_summary_data[category]["percentage_accepted_evidence_not_supported_category_variants"] = percentage_accepted_evidence_not_supported_category_variants
    print(f"Percent of Accepted {category} Variant Evidence items: {percentage_accepted_evidence_not_supported_category_variants}")
  
    number_submitted_not_supported_category_variant_evidence = number_unique_not_supported_category_evidence - evidence_category_df.evidence_accepted.sum()
    fraction_submitted_evidence_not_supported_category_variants = f"{number_submitted_not_supported_category_variant_evidence} / {number_unique_not_supported_category_evidence}"
    not_supported_variant_categories_evidence_summary_data[category]["fraction_submitted_evidence_not_supported_category_variants"] = fraction_submitted_evidence_not_supported_category_variants
    print(f"Number of submitted {category} Variant Evidence items: {fraction_submitted_evidence_not_supported_category_variants}")

    percentage_submitted_evidence_not_supported_category_variants =  "{:.2f}".format(number_submitted_not_supported_category_variant_evidence/number_unique_not_supported_category_evidence*100) + '%'
    not_supported_variant_categories_evidence_summary_data[category]["percentage_submitted_evidence_not_supported_category_variants"] = percentage_submitted_evidence_not_supported_category_variants
    print(f"Percent of submitted {category} Variant Evidence items: {percentage_submitted_evidence_not_supported_category_variants}")
    
    print("--------------------")


Expression
Number of Expression Variant Evidence items in CIViC: 617 / 9766
Percent of Expression Variant Evidence items in CIViC: 6.32%
Number of Expression Variant Evidence items in total Not Supported Variant Evidence items: 617 / 4286
Percent of Expression Variant Evidence items in total Not Supported Variant Evidence items: 14.40%
Number of Accepted Expression Variant Evidence items: 342 / 617
Percent of Accepted Expression Variant Evidence items: 55.43%
Number of submitted Expression Variant Evidence items: 275 / 617
Percent of submitted Expression Variant Evidence items: 44.57%
--------------------
Epigenetic Modification
Number of Epigenetic Modification Variant Evidence items in CIViC: 24 / 9766
Percent of Epigenetic Modification Variant Evidence items in CIViC: 0.25%
Number of Epigenetic Modification Variant Evidence items in total Not Supported Variant Evidence items: 24 / 4286
Percent of Epigenetic Modification Variant Evidence items in total Not Supported Variant Evidence 

## Impact
molecular profile score

### Import molecular profile id

In [338]:
not_supported_variants_add_molprof_df = not_supported_variants_df.copy()
not_supported_variants_molprof_id = []

for v in not_supported_variant_id_list:
    variant_molprof_id_list = []

    for variant in civic_variant_ids:
        if int(v) == variant.id:

            for mp in variant.molecular_profiles:
                    if mp.id not in variant_molprof_id_list:
                        variant_molprof_id_list.append(mp.id)

            variant_molprof_id_list = variant_molprof_id_list or ""

    not_supported_variants_molprof_id.append(variant_molprof_id_list)
    
not_supported_variants_add_molprof_df["molecular_profile_id"] = not_supported_variants_molprof_id
not_supported_variants_add_molprof_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,molecular_profile_id
0,4170,VHL,,Transcript Variant,False,[4038]
1,4417,ALK,FBXO11::ALK,Fusion,False,[4350]
2,4214,VHL,,Transcript Variant,False,[4082]
3,4216,VHL,,Transcript Variant,False,[4084]
4,4278,VHL,,Transcript Variant,False,[4146]


In [339]:
not_supported_variants_molprof_id_list = list(not_supported_variants_add_molprof_df["molecular_profile_id"])

### Import molecular profile scores

In [340]:
not_supported_variants_molprof_score = []

for mp_list in not_supported_variants_molprof_id_list:
    variant_molprof_score_list = []
    for mp in mp_list:
        
        for molprof in civic_molprof_ids: 
            if int(mp) == molprof.id:

                if molprof.molecular_profile_score not in variant_molprof_score_list:
                    variant_molprof_score_list.append(molprof.molecular_profile_score)
                variant_molprof_score_list = variant_molprof_score_list or ""

    not_supported_variants_molprof_score.append(variant_molprof_score_list)

not_supported_variants_add_molprof_df["molecular_profile_score"] = not_supported_variants_molprof_score

not_supported_variants_add_molprof_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,molecular_profile_id,molecular_profile_score
0,4170,VHL,,Transcript Variant,False,[4038],[0.0]
1,4417,ALK,FBXO11::ALK,Fusion,False,[4350],[0.0]
2,4214,VHL,,Transcript Variant,False,[4082],[0.0]
3,4216,VHL,,Transcript Variant,False,[4084],[0.0]
4,4278,VHL,,Transcript Variant,False,[4146],[0.0]


In [341]:
not_supported_variants_add_molprof_df['molecular_profile_score_sum'] = not_supported_variants_add_molprof_df['molecular_profile_score'].apply(lambda x: sum(x))
not_supported_variants_add_molprof_df.head()

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,molecular_profile_id,molecular_profile_score,molecular_profile_score_sum
0,4170,VHL,,Transcript Variant,False,[4038],[0.0],0.0
1,4417,ALK,FBXO11::ALK,Fusion,False,[4350],[0.0],0.0
2,4214,VHL,,Transcript Variant,False,[4082],[0.0],0.0
3,4216,VHL,,Transcript Variant,False,[4084],[0.0],0.0
4,4278,VHL,,Transcript Variant,False,[4146],[0.0],0.0


### Impact by Subcategory

In [342]:
not_supported_variant_categories_impact_data = dict()
for category in ["Expression", "Epigenetic Modification", "Fusion", "Protein Consequence", "Gene Function", "Rearrangements", "Copy Number", "Other", "Genotypes Easy", "Genotypes Compound", "Region Defined Variant", "Transcript Variant"]:
    print(category)
    not_supported_variant_categories_impact_data[category] = {}
    impact_category_df = not_supported_variants_add_molprof_df[not_supported_variants_add_molprof_df.category == category]

    total_sum_not_supported_category_impact = impact_category_df['molecular_profile_score_sum'].sum()
    not_supported_variant_categories_impact_data[category]["total_sum_not_supported_category_impact"]=  total_sum_not_supported_category_impact
    print(total_sum_not_supported_category_impact)

    print("--------------------")

Expression
3618.0
--------------------
Epigenetic Modification
285.5
--------------------
Fusion
5689.25
--------------------
Protein Consequence
2683.75
--------------------
Gene Function
1792.5
--------------------
Rearrangements
653.0
--------------------
Copy Number
207.0
--------------------
Other
673.5
--------------------
Genotypes Easy
195.0
--------------------
Genotypes Compound
117.5
--------------------
Region Defined Variant
8311.5
--------------------
Transcript Variant
346.5
--------------------


In [343]:
not_supported_variant_categories = ['Expression','Epigenetic Modification', 'Fusion', 'Protein Consequence', 'Gene Function', 'Rearrangements', 'Copy Number', 'Other', 'Genotypes Easy', 'Genotypes Compound', 'Region Defined Variant','Transcript Variant' ]
not_supported_total_sum_impact_score =[v["total_sum_not_supported_category_impact"] for k, v in not_supported_variant_categories_impact_data.items()]
not_supported_total_number_evidence_items = [v["number_unique_not_supported_category_evidence"]for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_percent_evidence_items = [v["percentage_accepted_evidence_not_supported_category_variants"]for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_total_number_variants = [v["number_unique_not_supported_category_variants"]for k, v in not_supported_variant_categories_summary_data.items()]

In [344]:
not_supported_impact_dict = {'Category': not_supported_variant_categories,
                            'Total Sum Impact Score':not_supported_total_sum_impact_score,
                            'Total Number Evidence Items':not_supported_total_number_evidence_items,
                            '% Accepted Evidence Items': not_supported_percent_evidence_items,
                            'Total Number Variants':not_supported_total_number_variants}

In [345]:
not_supported_variant_impact_df = pd.DataFrame(not_supported_impact_dict)

# Summary

## Variant Analysis

In [347]:
all_variant_dict = {'Variant Category':['Normalized', 'Unable to Normalize', 'Not Supported'],
        'Fraction of all CIViC Variant Items': [fraction_normalized_variants, fraction_not_normalized_variants, fraction_not_supported_variants],
        'Percentage of all CIViC Variant Items': [percentage_normalized_variants, percentage_not_normalized_variants, percentage_not_supported_variants],
        'Fraction of Accepted Variant Items': [fraction_accepted_normalized_variants, fraction_accepted_not_normalized_variants, fraction_accepted_not_supported_variants],
        'Percentage of Accepted Variant Items':[ percentage_accepted_normalized_variants, percentage_accepted_not_normalized_variants, percentage_accepted_not_supported_variants],
        'Fraction of Not Accepted Variant Items': [fraction_not_accepted_normalized_variants,fraction_not_accepted_not_normalized_variants, fraction_not_accepted_not_supported_variants],
        'Percentage of Not Accepted Variant Items': [ percentage_not_accepted_normalized_variants, percentage_not_accepted_not_normalized_variants, percentage_not_accepted_not_supported_variants]}

In [348]:
all_variant_df = pd.DataFrame(all_variant_dict)

In [349]:
all_variant_df["Percentage of all CIViC Variant Items"] = all_variant_df['Fraction of all CIViC Variant Items'].astype(str) + "  (" + all_variant_df["Percentage of all CIViC Variant Items"]+ ")"
all_variant_df["Percentage of Accepted Variant Items"] = all_variant_df['Fraction of Accepted Variant Items'].astype(str) + "  (" + all_variant_df["Percentage of Accepted Variant Items"]+ ")"
all_variant_df["Percentage of Not Accepted Variant Items"] = all_variant_df['Fraction of Not Accepted Variant Items'].astype(str) + "  (" + all_variant_df["Percentage of Not Accepted Variant Items"]+ ")"

In [350]:
all_variant_df = all_variant_df.drop(['Fraction of all CIViC Variant Items', 'Fraction of Accepted Variant Items', 'Fraction of Not Accepted Variant Items'], axis=1)

In [351]:
all_variant_percent_of_civic_df = all_variant_df.drop(['Percentage of Accepted Variant Items', 'Percentage of Not Accepted Variant Items'], axis=1)
all_variant_percent_evidence_df = all_variant_df.drop(['Percentage of all CIViC Variant Items'], axis=1)

Summary Table 1: The table below shows the 3 categories that CIViC variants were divided into after normalization and what percentage they make up of all variants in CIViC data. 

In [352]:
all_variant_percent_of_civic_df = all_variant_percent_of_civic_df.set_index('Variant Category')
all_variant_percent_of_civic_df

Unnamed: 0_level_0,Percentage of all CIViC Variant Items
Variant Category,Unnamed: 1_level_1
Normalized,1869 / 3553 (52.60%)
Unable to Normalize,63 / 3553 (1.77%)
Not Supported,1621 / 3553 (45.62%)


In [381]:
summary_table_1 = all_variant_percent_of_civic_df

Summary Table 2: The table below shows the 3 categories that CIViC variants were divided into after normalization and what percentage of the variants in each category are accepted (have at least one evidence item that is accepted) or not.

In [353]:
all_variant_percent_evidence_df = all_variant_percent_evidence_df.set_index('Variant Category')
all_variant_percent_evidence_df

Unnamed: 0_level_0,Percentage of Accepted Variant Items,Percentage of Not Accepted Variant Items
Variant Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Normalized,866 /1869 (46.33%),1003 / 1869 (53.67%)
Unable to Normalize,9 /63 (14.29%),54 / 63 (85.71%)
Not Supported,788 /1621 (48.61%),833 / 1621 (51.39%)


In [None]:
summary_table_2 = all_variant_percent_evidence_df

In [354]:
not_supported_variant_categories = ['Expression','Epigenetic Modification', 'Fusion', 'Protein Consequence', 'Gene Function', 'Rearrangements', 'Copy Number', 'Other', 'Genotypes Easy', 'Genotypes Compound', 'Region Defined Variant','Transcript Variant' ]
not_supported_fraction_of_civic_variant_items =[v["fraction_not_supported_category_variant_of_civic"] for k, v in not_supported_variant_categories_summary_data.items()]
not_supported_percent_of_civic_variant_items = [v["percent_not_supported_category_variant_of_civic"] for k, v in not_supported_variant_categories_summary_data.items()]
not_supported_fraction_not_supported_variant_items =[v["fraction_not_supported_category_variant_of_total_not_supported"] for k, v in not_supported_variant_categories_summary_data.items()]
not_supported_percent_not_supported_variant_items = [v["percent_not_supported_category_variant_of_total_not_supported"] for k, v in not_supported_variant_categories_summary_data.items()]
not_supported_fraction_of_accepted_variant_items =[v["fraction_accepted_not_supported_category_variants"] for k, v in not_supported_variant_categories_summary_data.items()]
not_supported_percent_of_accepted_variant_items = [v["percentage_accepted_not_supported_category_variants"] for k, v in not_supported_variant_categories_summary_data.items()]
not_supported_fraction_of_not_accepted_variant_items =[v["fraction_not_accepted_not_supported_category_variants"] for k, v in not_supported_variant_categories_summary_data.items()]
not_supported_percent_of_not_accepted_variant_items = [v["percentage_not_accepted_not_supported_category_variants"] for k, v in not_supported_variant_categories_summary_data.items()]

In [355]:
not_supported_variant_dict = {'Category': not_supported_variant_categories,
                                'Fraction of Not Supported Variant Items':not_supported_fraction_not_supported_variant_items ,
                                'Percent of Not Supported Variant Items':not_supported_percent_not_supported_variant_items,
                                'Fraction of all CIViC Variant Items': not_supported_fraction_of_civic_variant_items,
                                'Percent of all CIViC Variant Items':not_supported_percent_of_civic_variant_items,
                                'Fraction of Accepted Variant Items': not_supported_fraction_of_accepted_variant_items,
                                'Percent of Accepted Variant Items':not_supported_percent_of_accepted_variant_items,
                                'Fraction of Not Accepted Variant Items': not_supported_fraction_of_not_accepted_variant_items,
                                'Percent of Not Accepted Variant Items':not_supported_percent_of_not_accepted_variant_items}

In [356]:
not_supported_variant_df = pd.DataFrame(not_supported_variant_dict)

In [357]:
not_supported_variant_df['Percent of Not Supported Variant Items'] = not_supported_variant_df['Fraction of Not Supported Variant Items'].astype(str) + "  (" +  not_supported_variant_df['Percent of Not Supported Variant Items']+ ")"
not_supported_variant_df['Percent of all CIViC Variant Items'] = not_supported_variant_df['Fraction of all CIViC Variant Items'].astype(str) + "  (" +  not_supported_variant_df['Percent of all CIViC Variant Items']+ ")"
not_supported_variant_df['Percent of Accepted Variant Items'] = not_supported_variant_df['Fraction of Accepted Variant Items'].astype(str) + "  (" +  not_supported_variant_df['Percent of Accepted Variant Items']+ ")"
not_supported_variant_df['Percent of Not Accepted Variant Items'] = not_supported_variant_df['Fraction of Not Accepted Variant Items'].astype(str) + "  (" +  not_supported_variant_df['Percent of Not Accepted Variant Items']+ ")"

In [358]:
not_supported_variant_df = not_supported_variant_df.drop(['Fraction of Not Supported Variant Items','Fraction of all CIViC Variant Items','Fraction of Accepted Variant Items','Fraction of Not Accepted Variant Items'], axis=1)
not_supported_variant_df

Unnamed: 0,Category,Percent of Not Supported Variant Items,Percent of all CIViC Variant Items,Percent of Accepted Variant Items,Percent of Not Accepted Variant Items
0,Expression,291 / 1621 (17.95%),291 / 3553 (8.19%),180 / 291 (61.86%),111 / 291 (38.14%)
1,Epigenetic Modification,15 / 1621 (0.93%),15 / 3553 (0.42%),14 / 15 (93.33%),1 / 15 (6.67%)
2,Fusion,307 / 1621 (18.94%),307 / 3553 (8.64%),204 / 307 (66.45%),103 / 307 (33.55%)
3,Protein Consequence,128 / 1621 (7.90%),128 / 3553 (3.60%),70 / 128 (54.69%),58 / 128 (45.31%)
4,Gene Function,90 / 1621 (5.55%),90 / 3553 (2.53%),49 / 90 (54.44%),41 / 90 (45.56%)
5,Rearrangements,52 / 1621 (3.21%),52 / 3553 (1.46%),12 / 52 (23.08%),40 / 52 (76.92%)
6,Copy Number,34 / 1621 (2.10%),34 / 3553 (0.96%),19 / 34 (55.88%),15 / 34 (44.12%)
7,Other,93 / 1621 (5.74%),93 / 3553 (2.62%),43 / 93 (46.24%),50 / 93 (53.76%)
8,Genotypes Easy,11 / 1621 (0.68%),11 / 3553 (0.31%),8 / 11 (72.73%),3 / 11 (27.27%)
9,Genotypes Compound,6 / 1621 (0.37%),6 / 3553 (0.17%),4 / 6 (66.67%),2 / 6 (33.33%)


In [359]:
not_supported_variant_percent_of_civic_df =not_supported_variant_df.drop(['Percent of Not Supported Variant Items','Percent of Accepted Variant Items','Percent of Not Accepted Variant Items'], axis=1)
not_supported_variant_percent_of_not_supported_df = not_supported_variant_df.drop(['Percent of all CIViC Variant Items','Percent of Accepted Variant Items','Percent of Not Accepted Variant Items'], axis=1)
not_supported_variant_percent_evidence_df = not_supported_variant_df.drop(['Percent of all CIViC Variant Items','Percent of Not Supported Variant Items'], axis=1)

Summary Table 3: The table below shows the categories that the Not Supported variants were broken into and what percentage of all CIViC variants they make up. These percentages will not add up to 100% because Not Supported variants make up 45.62% of all CIViC variants. 

In [360]:
not_supported_variant_percent_of_civic_df = not_supported_variant_percent_of_civic_df.set_index('Category')
not_supported_variant_percent_of_civic_df

Unnamed: 0_level_0,Percent of all CIViC Variant Items
Category,Unnamed: 1_level_1
Expression,291 / 3553 (8.19%)
Epigenetic Modification,15 / 3553 (0.42%)
Fusion,307 / 3553 (8.64%)
Protein Consequence,128 / 3553 (3.60%)
Gene Function,90 / 3553 (2.53%)
Rearrangements,52 / 3553 (1.46%)
Copy Number,34 / 3553 (0.96%)
Other,93 / 3553 (2.62%)
Genotypes Easy,11 / 3553 (0.31%)
Genotypes Compound,6 / 3553 (0.17%)


In [None]:
summary_table_3 = not_supported_variant_percent_of_civic_df

Summary Table 4: The table below shows the Not Supported variants broken up into 12 sub categories and what percent each sub category take up in Not Supported variant group.

In [361]:
not_supported_variant_percent_of_not_supported_df = not_supported_variant_percent_of_not_supported_df.set_index('Category')
not_supported_variant_percent_of_not_supported_df

Unnamed: 0_level_0,Percent of Not Supported Variant Items
Category,Unnamed: 1_level_1
Expression,291 / 1621 (17.95%)
Epigenetic Modification,15 / 1621 (0.93%)
Fusion,307 / 1621 (18.94%)
Protein Consequence,128 / 1621 (7.90%)
Gene Function,90 / 1621 (5.55%)
Rearrangements,52 / 1621 (3.21%)
Copy Number,34 / 1621 (2.10%)
Other,93 / 1621 (5.74%)
Genotypes Easy,11 / 1621 (0.68%)
Genotypes Compound,6 / 1621 (0.37%)


In [None]:
summary_table_4: not_supported_variant_percent_of_not_supported_df

Summary Table 5: The table below shows the percent of variant items in each Not Supported variant sub category that are accepted(have at least one evidence item that is accepted) or not, per category.

In [362]:
not_supported_variant_percent_evidence_df = not_supported_variant_percent_evidence_df.set_index('Category')
not_supported_variant_percent_evidence_df

Unnamed: 0_level_0,Percent of Accepted Variant Items,Percent of Not Accepted Variant Items
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Expression,180 / 291 (61.86%),111 / 291 (38.14%)
Epigenetic Modification,14 / 15 (93.33%),1 / 15 (6.67%)
Fusion,204 / 307 (66.45%),103 / 307 (33.55%)
Protein Consequence,70 / 128 (54.69%),58 / 128 (45.31%)
Gene Function,49 / 90 (54.44%),41 / 90 (45.56%)
Rearrangements,12 / 52 (23.08%),40 / 52 (76.92%)
Copy Number,19 / 34 (55.88%),15 / 34 (44.12%)
Other,43 / 93 (46.24%),50 / 93 (53.76%)
Genotypes Easy,8 / 11 (72.73%),3 / 11 (27.27%)
Genotypes Compound,4 / 6 (66.67%),2 / 6 (33.33%)


In [None]:
summary_table_5= not_supported_variant_percent_evidence_df

## Evidence Analysis

In [363]:
all_variant_evidence_dict = {'Variant Category':['Normalized', 'Unable to Normalize', 'Not Supported'],
        'Fraction of all CIViC Evidence Items': [normalized_fraction_evidence_items, not_normalized_fraction_evidence_items, not_supported_variant_fraction_evidence_items],
        'Percentage of all CIViC Evidence Items': [normalized_percentage_evidence_items, not_normalized_percentage_evidence_items, not_supported_variant_percentage_evidence_items],
        'Fraction of Accepted Evidence Items': [fraction_accepted_evidences_normalized_variants, fraction_accepted_evidences_not_normalized_variants, fraction_accepted_evidences_not_supported_variants],
        'Percentage of Accepted Evidence Items': [percentage_accepted_evidences_normalized_variants, percentage_accepted_evidences_not_normalized_variants, percentage_accepted_evidences_not_supported_variants],
        'Fraction of Submitted Evidence Items': [fraction_submitted_evidences_normalized_variants, fraction_submitted_evidences_not_normalized_variants, fraction_submitted_evidences_not_supported_variants],
        'Percentage of Submitted Evidence Items':[percentage_submitted_evidences_normalized_variants, percentage_submitted_evidences_not_normalized_variants, percentage_submitted_evidences_not_supported_variants]}

In [364]:
all_variant_evidence_df = pd.DataFrame(all_variant_evidence_dict)

In [365]:
all_variant_evidence_df["Percentage of all CIViC Evidence Items"] = all_variant_evidence_df['Fraction of all CIViC Evidence Items'].astype(str) + "  (" + all_variant_evidence_df["Percentage of all CIViC Evidence Items"]+ ")"
all_variant_evidence_df["Percentage of Accepted Evidence Items"] = all_variant_evidence_df['Fraction of Accepted Evidence Items'].astype(str) + "  (" + all_variant_evidence_df["Percentage of Accepted Evidence Items"]+ ")"
all_variant_evidence_df["Percentage of Submitted Evidence Items"] = all_variant_evidence_df['Fraction of Submitted Evidence Items'].astype(str) + "  (" + all_variant_evidence_df["Percentage of Submitted Evidence Items"]+ ")"

In [366]:
all_variant_evidence_df = all_variant_evidence_df.drop(['Fraction of all CIViC Evidence Items', 'Fraction of Accepted Evidence Items', 'Fraction of Submitted Evidence Items'], axis=1)

In [367]:
all_variant_evidence_percent_of_civic_df = all_variant_evidence_df.drop(['Percentage of Accepted Evidence Items', 'Percentage of Submitted Evidence Items'], axis=1)	
all_variant_evidence_percent_evidence_df = all_variant_evidence_df.drop(['Percentage of all CIViC Evidence Items'], axis=1)

Summary Table 6: The table below shows what percentage of all evidence items in CIViC are associated with Normalized, Unable to Normalize, and Not Supported variants. This will not add up to 100% because evidence itmes may be used across multiple variants.

In [368]:
all_variant_evidence_percent_of_civic_df = all_variant_evidence_percent_of_civic_df.set_index('Variant Category')
all_variant_evidence_percent_of_civic_df

Unnamed: 0_level_0,Percentage of all CIViC Evidence Items
Variant Category,Unnamed: 1_level_1
Normalized,5916 / 9766 (60.58%)
Unable to Normalize,80 / 9766 (0.82%)
Not Supported,4286 / 9766 (43.89%)


In [None]:
summary_table_6 = all_variant_evidence_percent_of_civic_df

Summmary Table 7: The table below shows the percentage of accepted and sumbitted evidence items per category of variants. 

In [369]:
all_variant_evidence_percent_evidence_df = all_variant_evidence_percent_evidence_df.set_index('Variant Category')
all_variant_evidence_percent_evidence_df

Unnamed: 0_level_0,Percentage of Accepted Evidence Items,Percentage of Submitted Evidence Items
Variant Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Normalized,2032 / 5916 (34.35%),3884 / 5916 (65.65%)
Unable to Normalize,14 / 80 (17.50%),66 / 80 (82.50%)
Not Supported,2198 / 4286 (51.28),2088 / 4286 (48.72%)


In [None]:
summary_table_7 = all_variant_evidence_percent_evidence_df

In [370]:
not_supported_variant_categories = ['Expression','Epigenetic Modification', 'Fusion', 'Protein Consequence', 'Gene Function', 'Rearrangements', 'Copy Number', 'Other', 'Genotypes Easy', 'Genotypes Compound', 'Region Defined Variant', 'Transcript Variant' ]
not_supported_fraction_of_civic_evidence_items = [v["fraction_not_supported_category_variant_evidence_of_civic"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_percent_of_civic_evidence_items = [v["percent_not_supported_category_variant_evidence_of_civic"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_fraction_not_supported_variant_evidence_items = [v["fraction_not_supported_category_variant_evidence_of_total_not_supported"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_percent_not_supported_variant_evidence_items = [v["percent_not_supported_category_variant_evidence_of_total_not_supported"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_fraction_of_accepted_evidence_items = [v["fraction_accepted_evidence_not_supported_category_variants"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_percent_of_accepted_evidence_items = [v["percentage_accepted_evidence_not_supported_category_variants"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_fraction_of_submitted_evidence_items = [v["fraction_submitted_evidence_not_supported_category_variants"]for k, v in not_supported_variant_categories_evidence_summary_data.items()]
not_supported_percent_of_submitted_evidence_items = [v["percentage_submitted_evidence_not_supported_category_variants"] for k, v in not_supported_variant_categories_evidence_summary_data.items()]

In [371]:
not_supported_variant_evidence_dict = {'Category': not_supported_variant_categories,
                                'Fraction of Not Supported Variant Evidence Items': not_supported_fraction_not_supported_variant_evidence_items,
                                'Percent of Not Supported Variant Evidence Items': not_supported_percent_not_supported_variant_evidence_items,
                                'Fraction of CIViC Evidence Items': not_supported_fraction_of_civic_evidence_items,
                                'Percent of CIViC Evidence Items': not_supported_percent_of_civic_evidence_items,
                                'Fraction of Accepted Evidence Items': not_supported_fraction_of_accepted_evidence_items,
                                'Percent of Accepted Evidence Items': not_supported_percent_of_accepted_evidence_items,
                                'Fraction of Submitted Evidence Items': not_supported_fraction_of_submitted_evidence_items,
                                'Percent of Submitted Evidence Items':not_supported_percent_of_submitted_evidence_items}

In [372]:
not_supported_variant_evidence_df = pd.DataFrame(not_supported_variant_evidence_dict)

In [373]:
not_supported_variant_evidence_df["Percent of CIViC Evidence Items"] = not_supported_variant_evidence_df['Fraction of CIViC Evidence Items'].astype(str) + "  (" + not_supported_variant_evidence_df["Percent of CIViC Evidence Items"]+ ")"
not_supported_variant_evidence_df["Percent of Not Supported Variant Evidence Items"] = not_supported_variant_evidence_df['Fraction of Not Supported Variant Evidence Items'].astype(str) + "  ("+ not_supported_variant_evidence_df["Percent of Not Supported Variant Evidence Items"]+ ")"
not_supported_variant_evidence_df["Percent of Accepted Evidence Items"] = not_supported_variant_evidence_df['Fraction of Accepted Evidence Items'].astype(str) + "  ("+ not_supported_variant_evidence_df["Percent of Accepted Evidence Items"]+ ")"
not_supported_variant_evidence_df["Percent of Submitted Evidence Items"] = not_supported_variant_evidence_df['Fraction of Submitted Evidence Items'].astype(str) + "  ("+ not_supported_variant_evidence_df["Percent of Submitted Evidence Items"]+ ")"

In [374]:
not_supported_variant_evidence_df = not_supported_variant_evidence_df.drop(['Fraction of CIViC Evidence Items', 'Fraction of Not Supported Variant Evidence Items', 'Fraction of Accepted Evidence Items', 'Fraction of Submitted Evidence Items'], axis=1)

In [375]:
not_supported_variant_evidence_percent_of_civic_df = not_supported_variant_evidence_df.drop(['Percent of Not Supported Variant Evidence Items', 'Percent of Accepted Evidence Items', 'Percent of Submitted Evidence Items'], axis=1) 
not_supported_variant_evidence_percent_of_not_supported_df = not_supported_variant_evidence_df.drop(['Percent of CIViC Evidence Items',  'Percent of Accepted Evidence Items', 'Percent of Submitted Evidence Items'], axis=1) 
not_supported_variant_evidence_percent_evidence_df = not_supported_variant_evidence_df.drop(['Percent of Not Supported Variant Evidence Items', 'Percent of CIViC Evidence Items'], axis=1) 

Summary Table 8: The table below shows the percentage of all CIViC evidenec items that are associated with a Not Supported variant sub category. This will not add up to 100% since the evidence items can be associated with multiple variants.

In [376]:
not_supported_variant_evidence_percent_of_civic_df = not_supported_variant_evidence_percent_of_civic_df.set_index('Category')
not_supported_variant_evidence_percent_of_civic_df

Unnamed: 0_level_0,Percent of CIViC Evidence Items
Category,Unnamed: 1_level_1
Expression,617 / 9766 (6.32%)
Epigenetic Modification,24 / 9766 (0.25%)
Fusion,1220 / 9766 (12.49%)
Protein Consequence,301 / 9766 (3.08%)
Gene Function,346 / 9766 (3.54%)
Rearrangements,140 / 9766 (1.43%)
Copy Number,69 / 9766 (0.71%)
Other,163 / 9766 (1.67%)
Genotypes Easy,23 / 9766 (0.24%)
Genotypes Compound,7 / 9766 (0.07%)


In [None]:
summary_table_8 = not_supported_variant_evidence_percent_of_civic_df

Summary Table 9: The table below shows the percentage of all evidenec items associated with Not Supported variants that are associated with a variant sub category. 

In [377]:
not_supported_variant_evidence_percent_of_not_supported_df = not_supported_variant_evidence_percent_of_not_supported_df.set_index('Category')
not_supported_variant_evidence_percent_of_not_supported_df

Unnamed: 0_level_0,Percent of Not Supported Variant Evidence Items
Category,Unnamed: 1_level_1
Expression,617 / 4286 (14.40%)
Epigenetic Modification,24 / 4286 (0.56%)
Fusion,1220 / 4286 (28.46%)
Protein Consequence,301 / 4286 (7.02%)
Gene Function,346 / 4286 (8.07%)
Rearrangements,140 / 4286 (3.27%)
Copy Number,69 / 4286 (1.61%)
Other,163 / 4286 (3.80%)
Genotypes Easy,23 / 4286 (0.54%)
Genotypes Compound,7 / 4286 (0.16%)


In [None]:
summary_table_9 = not_supported_variant_evidence_percent_of_not_supported_df

Summary Table 10: The table below shows the percentage of evidence items associated with Not Supported variant sub categories that are accepted or submitted.

In [378]:
not_supported_variant_evidence_percent_evidence_df= not_supported_variant_evidence_percent_evidence_df.set_index('Category')
not_supported_variant_evidence_percent_evidence_df

Unnamed: 0_level_0,Percent of Accepted Evidence Items,Percent of Submitted Evidence Items
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Expression,342 / 617 (55.43%),275 / 617 (44.57%)
Epigenetic Modification,22 / 24 (91.67%),2 / 24 (8.33%)
Fusion,724 / 1220 (59.34%),496 / 1220 (40.66%)
Protein Consequence,191 / 301 (63.46%),110 / 301 (36.54%)
Gene Function,151 / 346 (43.64%),195 / 346 (56.36%)
Rearrangements,53 / 140 (37.86%),87 / 140 (62.14%)
Copy Number,31 / 69 (44.93%),38 / 69 (55.07%)
Other,60 / 163 (36.81%),103 / 163 (63.19%)
Genotypes Easy,13 / 23 (56.52%),10 / 23 (43.48%)
Genotypes Compound,4 / 7 (57.14%),3 / 7 (42.86%)


In [None]:
summary_table_10 = not_supported_variant_evidence_percent_evidence_df

## Impact

The bar graph below shows the relationship between the Not Suported variant sub category impact score and the sub category. Additionally, the colors illustrate the number of evidence items associated each sub category.

In [380]:
fig = px.bar(not_supported_variant_impact_df, x='Category', 
             y='Total Sum Impact Score',
             hover_data=['Total Number Evidence Items', 
             not_supported_variant_impact_df["% Accepted Evidence Items"]], 
             color='Total Number Evidence Items',
             labels={'Total Sum Impact Score':'Total Sum Impact Score'},
             text_auto= '.1f',
             color_continuous_scale= 'geyser')
fig.update_traces(width=1)
fig.show()

In [None]:
#fig.write_html("../../../../../../civic_ns_categories_impact_redgreen.html")

The scatterplot below shows the relationship between the Not Suported variant sub category impact score and the number of evidence items associated with variants in each sub category. Additionally, the sizes of the data point represent the number of variants in each sub category. 

In [None]:
fig2= px.scatter(data_frame= not_supported_variant_impact_df,
                x= 'Total Number Evidence Items',
                y= 'Total Sum Impact Score',
                size= 'Total Number Variants',
                size_max= 40,
                text= 'Total Number Variants',
                # color_discrete_sequence= Bold,
                color= 'Category',
                hover_data= '% Accepted Evidence Items')
fig2.show()

In [None]:
#fig2.write_html("../../../../../../civic_ns_categories_impact_scatterplot.html")