# Analysis of Normalized Queries

This notebook contains an analysis of CIViC evidence data.

In [1]:
import csv
import pandas as pd
from civicpy import civic

In [2]:
# civic.load_cache(on_stale="ignore")

## Import a list of all the variant ID's in CIViC

In [3]:
# Fetch all variants from CIViC through civicpy. Despite the "_ids" suffix, later
# cells read `.id` and `.molecular_profiles` from each element, so this holds
# variant records rather than bare integer IDs.
civic_variant_ids = civic.get_all_variants()

In [4]:
# Number of variant records fetched from CIViC; used as the denominator for the
# variant percentages computed in later cells.
total_number_variants = len(civic_variant_ids)
f"Total Number of variants in CIViC: {total_number_variants}"

'Total Number of variants in CIViC: 3509'

## Import a list of all the evidence ID's from CIViC

In [5]:
# Fetch all evidence items from CIViC through civicpy. Despite the "_ids" suffix,
# later cells read `.id`, `.status`, `.rating` and `.evidence_level` from each
# element, so this holds evidence records rather than bare integer IDs.
civic_evidence_ids = civic.get_all_evidence()

In [6]:
# Number of evidence records fetched from CIViC; used as the denominator for the
# evidence-item percentages computed in later cells.
total_number_evidences = len(civic_evidence_ids)
f"Total Number of evidence items in CIViC: {total_number_evidences}"

'Total Number of evidence items in CIViC: 9956'

## Import queries that were normalized

In [7]:
# Load the queries that normalized successfully. The file has a .csv extension
# but is read as tab-separated.
normalized_queries_df = pd.read_csv("./able_to_normalize_queries.csv", sep= "\t")
normalized_queries_df

Unnamed: 0,variant_id,query,query_type,variant_accepted
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True
1,1988,NC_000003.11:g.10191649A>T,genomic,True
2,2488,3-10191647-T-G,genomic,True
3,1986,NC_000003.11:g.10191648G>T,genomic,True
4,1987,NC_000003.11:g.10191649A>G,genomic,True
...,...,...,...,...
1853,3161,3-10183878-G-A,genomic,False
1854,877,NC_000020.11:g.58903752C>T,genomic,True
1855,731,NC_000003.11:g.37056036G>A,genomic,True
1856,3045,VHL p.F76del,protein,False


## Create a list of the variant ID's of the normalized variants

In [8]:
# One entry per row of normalized_queries_df, in row order (no de-duplication).
normalized_variant_id_list = list(normalized_queries_df["variant_id"])

In [9]:
# Counts rows of the normalized-queries table. This equals the number of distinct
# variants only if each variant_id appears once in the file — TODO confirm.
total_number_normalized_variants = len(normalized_variant_id_list)
f"Total Number of variants that were normalized: {total_number_normalized_variants}"

'Total Number of variants that were normalized: 1858'

In [10]:
# Share of all CIViC variants that appear in the normalized-queries table, in percent.
percentage_normalized_variants = total_number_normalized_variants/total_number_variants*100
f"Percentage of variants that were normalized: {percentage_normalized_variants}%"

'Percentage of variants that were normalized: 52.94955827871188%'

## Import evidence ID's associated with the normalized variants using variant ID

Evidence items are linked to the molecular profiles associated with variant items, so evidence IDs have to be pulled through each variant's molecular profiles. For more information on the structure of a variant item, see the CIViC documentation (https://civic.readthedocs.io/en/latest/).

In [11]:
# Attach to every normalized query the evidence IDs reachable through the
# molecular profiles of its CIViC variant.
#
# Index the CIViC variants by id once, so each row below costs one dictionary
# lookup instead of a scan over every variant in civic_variant_ids.
# setdefault keeps the first record seen for an id (ids are presumably unique
# in CIViC — TODO confirm).
civic_variants_by_id = {}
for variant in civic_variant_ids:
    civic_variants_by_id.setdefault(variant.id, variant)

normalized_variants_evidence_ids = []
for v in normalized_variant_id_list:
    variant = civic_variants_by_id.get(int(v))
    if variant is None:
        # Variant id not present in CIViC: keep an empty list for this row.
        variant_evidence_id_list = []
    else:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        variant_evidence_id_list = list(
            dict.fromkeys(
                e.id
                for mp in variant.molecular_profiles
                for e in mp.evidence_items
            )
        )
        # A matched variant without evidence is stored as "" (not []), so a
        # later explode() keeps an empty-string marker for that row.
        variant_evidence_id_list = variant_evidence_id_list or ""
    normalized_variants_evidence_ids.append(variant_evidence_id_list)

# One list (or "") per row, aligned positionally with normalized_queries_df.
normalized_queries_df["evidence_ids"] = normalized_variants_evidence_ids
normalized_queries_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_ids
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,"[9347, 6724]"
1,1988,NC_000003.11:g.10191649A>T,genomic,True,[5336]
2,2488,3-10191647-T-G,genomic,True,"[10779, 6723, 8258]"
3,1986,NC_000003.11:g.10191648G>T,genomic,True,[5334]
4,1987,NC_000003.11:g.10191649A>G,genomic,True,[5335]
...,...,...,...,...,...
1853,3161,3-10183878-G-A,genomic,False,[8659]
1854,877,NC_000020.11:g.58903752C>T,genomic,True,"[1997, 2892, 2893, 2895, 2896, 2897, 2898, 199..."
1855,731,NC_000003.11:g.37056036G>A,genomic,True,[1794]
1856,3045,VHL p.F76del,protein,False,[8240]


## Split the list of evidence ID's of each variant to be one per line

In [12]:
# Expand the per-variant evidence-ID lists so each row holds a single evidence ID.
# Rows that came from the same query keep the same index label afterwards.
normalized_queries_df = normalized_queries_df.explode(column="evidence_ids")
normalized_queries_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_ids
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,9347
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,6724
1,1988,NC_000003.11:g.10191649A>T,genomic,True,5336
2,2488,3-10191647-T-G,genomic,True,10779
2,2488,3-10191647-T-G,genomic,True,6723
...,...,...,...,...,...
1854,877,NC_000020.11:g.58903752C>T,genomic,True,1996
1854,877,NC_000020.11:g.58903752C>T,genomic,True,1995
1855,731,NC_000003.11:g.37056036G>A,genomic,True,1794
1856,3045,VHL p.F76del,protein,False,8240


## Create a list of the evidence ID's of the normalized variants

In [13]:
# One entry per (query, evidence item) row; an evidence ID shared by several
# variants appears once per variant.
normalized_variant_evidence_id_list = list(normalized_queries_df["evidence_ids"])

In [14]:
# Number of (query, evidence item) pairings, counting repeated evidence IDs each time.
total_number_normalized_variant_evidence_items = len(normalized_variant_evidence_id_list)
f"Total Number of times evidence items were associated with a normalized variant: {total_number_normalized_variant_evidence_items}"

'Total Number of times evidence items were associated with a normalized variant: 5896'

In [15]:
# Number of distinct values in the evidence_ids column (set() collapses repeats).
total_number_normalized_variant_unique_evidence_items = len(set(normalized_queries_df.evidence_ids))
f"Total Number of unique evidence items associated with normalized variants: {total_number_normalized_variant_unique_evidence_items}"

'Total Number of unique evidence items associated with normalized variants: 5845'

In [16]:
# Distinct evidence items tied to normalized variants as a percentage of all CIViC evidence items.
normalized_percentage_evidence_items = total_number_normalized_variant_unique_evidence_items/total_number_evidences*100
f"Percentage of all CIViC evidence items in Normalized variants: {normalized_percentage_evidence_items}%"

'Percentage of all CIViC evidence items in Normalized variants: 58.708316593009236%'

## Import evidence status, rating, and level associated with a specific evidence ID
Please see the CIViC documentation for evidence item attribute descriptions (https://civic.readthedocs.io/en/latest/model/evidence.html).

In [17]:
# Look up status, rating and evidence level for every evidence ID in the
# exploded normalized-queries table.
#
# Index the CIViC evidence records by id once, so each row below costs one
# dictionary lookup instead of a scan over every record in civic_evidence_ids.
# setdefault keeps the first record seen for an id (ids are presumably unique
# in CIViC — TODO confirm).
civic_evidence_by_id = {}
for evidence in civic_evidence_ids:
    civic_evidence_by_id.setdefault(evidence.id, evidence)

normalized_variants_evidence_statuses = []
normalized_variants_evidence_ratings = []
normalized_variants_evidence_levels = []
for e in normalized_variant_evidence_id_list:
    # int(e) raises ValueError for a missing/blank evidence ID, as it did before.
    evidence = civic_evidence_by_id.get(int(e))
    if evidence is None:
        # Evidence ID not present in CIViC: keep empty lists for this row.
        variant_evidence_status_list = []
        variant_evidence_rating_list = []
        variant_evidence_level_list = []
    else:
        # Each value is kept wrapped in a one-element list, matching the
        # shape of the columns shown in the displayed table.
        variant_evidence_status_list = [evidence.status]
        variant_evidence_rating_list = [evidence.rating]
        variant_evidence_level_list = [evidence.evidence_level]
    normalized_variants_evidence_statuses.append(variant_evidence_status_list)
    normalized_variants_evidence_ratings.append(variant_evidence_rating_list)
    normalized_variants_evidence_levels.append(variant_evidence_level_list)

# Positional assignment: one entry per exploded row.
normalized_queries_df["evidence_status"] = normalized_variants_evidence_statuses
normalized_queries_df["evidence_rating"] = normalized_variants_evidence_ratings
normalized_queries_df["evidence_level"] = normalized_variants_evidence_levels
normalized_queries_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,9347,[submitted],[3],[C]
0,2489,NC_000003.11:g.10191648_10191649insC,genomic,True,6724,[accepted],[2],[C]
1,1988,NC_000003.11:g.10191649A>T,genomic,True,5336,[accepted],[2],[C]
2,2488,3-10191647-T-G,genomic,True,10779,[submitted],[3],[C]
2,2488,3-10191647-T-G,genomic,True,6723,[accepted],[2],[C]
...,...,...,...,...,...,...,...,...
1854,877,NC_000020.11:g.58903752C>T,genomic,True,1996,[accepted],[3],[B]
1854,877,NC_000020.11:g.58903752C>T,genomic,True,1995,[accepted],[2],[B]
1855,731,NC_000003.11:g.37056036G>A,genomic,True,1794,[accepted],[1],[C]
1856,3045,VHL p.F76del,protein,False,8240,[submitted],[2],[C]


# Analysis of Unable to Normalize Queries

## Import queries that were unable to normalize

In [18]:
# Load the queries that could not be normalized. The file has a .csv extension
# but is read as tab-separated.
not_normalized_quer_df = pd.read_csv("./unable_to_normalize_queries.csv", sep= "\t")
not_normalized_quer_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati..."
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati..."
4,4004,TERT C250T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."
...,...,...,...,...,...,...,...
58,3454,BRAF V599R,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."
59,2825,BRAF V601E,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."
60,3721,AR V757A,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."
61,3722,AR V757I,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati..."


## Create a list of the variant ID's of the variants unable to normalize

In [19]:
# One entry per row of not_normalized_quer_df, in row order (no de-duplication).
not_normalized_variant_id_list = list(not_normalized_quer_df["variant_id"])

In [20]:
# Counts rows of the unable-to-normalize table. This equals the number of distinct
# variants only if each variant_id appears once in the file — TODO confirm.
total_number_not_normalized_variants = len(not_normalized_variant_id_list)
f"Total Number of variants that were not normalized: {total_number_not_normalized_variants}"

'Total Number of variants that were not normalized: 63'

In [21]:
# Share of all CIViC variants that appear in the unable-to-normalize table, in percent.
percentage_not_normalized_variants = total_number_not_normalized_variants/total_number_variants*100
f"Percentage of variants that were not normalized: {percentage_not_normalized_variants}%"

'Percentage of variants that were not normalized: 1.7953833000854944%'

## Import evidence ID's associated with the unable to normalize variants using variant ID

Evidence items are linked to the molecular profiles associated with variant items, so evidence IDs have to be pulled through each variant's molecular profiles. For more information on the structure of a variant item, see the CIViC documentation (https://civic.readthedocs.io/en/latest/).

In [22]:
# Attach to every unable-to-normalize query the evidence IDs reachable through
# the molecular profiles of its CIViC variant.
#
# Index the CIViC variants by id once, so each row below costs one dictionary
# lookup instead of a scan over every variant in civic_variant_ids.
# setdefault keeps the first record seen for an id (ids are presumably unique
# in CIViC — TODO confirm).
civic_variants_by_id = {}
for variant in civic_variant_ids:
    civic_variants_by_id.setdefault(variant.id, variant)

not_normalized_variants_evidence_ids = []
for v in not_normalized_variant_id_list:
    variant = civic_variants_by_id.get(int(v))
    if variant is None:
        # Variant id not present in CIViC: keep an empty list for this row.
        not_normalized_variant_evidence_id_list = []
    else:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        not_normalized_variant_evidence_id_list = list(
            dict.fromkeys(
                e.id
                for mp in variant.molecular_profiles
                for e in mp.evidence_items
            )
        )
        # A matched variant without evidence is stored as "" (not []), so a
        # later explode() keeps an empty-string marker for that row.
        not_normalized_variant_evidence_id_list = not_normalized_variant_evidence_id_list or ""
    not_normalized_variants_evidence_ids.append(not_normalized_variant_evidence_id_list)

# One list (or "") per row, aligned positionally with not_normalized_quer_df.
not_normalized_quer_df["evidence_ids"] = not_normalized_variants_evidence_ids
not_normalized_quer_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_ids
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",[1812]
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10128]
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10135]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...","[655, 1646, 6934, 6935]"
4,4004,TERT C250T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10331]
...,...,...,...,...,...,...,...,...
58,3454,BRAF V599R,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[9571]
59,2825,BRAF V601E,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...","[7633, 7627]"
60,3721,AR V757A,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10131]
61,3722,AR V757I,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",[10132]


## Split the list of evidence ID's of each variant to be one per line

In [23]:
# Expand the per-variant evidence-ID lists so each row holds a single evidence ID.
# Rows that came from the same query keep the same index label afterwards.
not_normalized_quer_df = not_normalized_quer_df.explode(column="evidence_ids")
not_normalized_quer_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_ids
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1812
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10128
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10135
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",655
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1646
...,...,...,...,...,...,...,...,...
59,2825,BRAF V601E,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",7633
59,2825,BRAF V601E,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",7627
60,3721,AR V757A,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10131
61,3722,AR V757I,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10132


## Create a list of the evidence ID's of the unable to normalize variants

In [24]:
# One entry per (query, evidence item) row; an evidence ID shared by several
# variants appears once per variant.
not_normalized_evidence_id_list = list(not_normalized_quer_df["evidence_ids"])


In [25]:
# Number of (query, evidence item) pairings, counting repeated evidence IDs each time.
total_number_not_normalized_variant_evidence_items = len(not_normalized_evidence_id_list)
f"Total Number of times evidence items were associated with a not normalized variant: {total_number_not_normalized_variant_evidence_items}"

'Total Number of times evidence items were associated with a not normalized variant: 80'

In [26]:
# Number of DISTINCT evidence IDs tied to unable-to-normalize variants.
# set() collapses evidence IDs shared by several variants; a plain len() of the
# list would report the pairing count instead and only matches when no
# evidence ID repeats.
total_number_not_normalized_variant_unique_evidence_items = len(set(not_normalized_evidence_id_list))
f"Total Number of unique evidence items associated with a not normalized variant: {total_number_not_normalized_variant_unique_evidence_items}"

'Total Number of unique evidence items associated with a not normalized variant: 80'

In [27]:
# Evidence items tied to unable-to-normalize variants as a percentage of all CIViC evidence items.
not_normalized_percentage_evidence_items = total_number_not_normalized_variant_unique_evidence_items/total_number_evidences*100
f"Percentage of all CIViC evidence items in not Normalized variants: {not_normalized_percentage_evidence_items}%"

'Percentage of all CIViC evidence items in not Normalized variants: 0.8035355564483728%'

## Import evidence status, rating, and level associated with a specific evidence ID

In [28]:
# Look up status, rating and evidence level for every evidence ID in the
# exploded unable-to-normalize table.
#
# Index the CIViC evidence records by id once, so each row below costs one
# dictionary lookup instead of a scan over every record in civic_evidence_ids.
# setdefault keeps the first record seen for an id (ids are presumably unique
# in CIViC — TODO confirm).
civic_evidence_by_id = {}
for evidence in civic_evidence_ids:
    civic_evidence_by_id.setdefault(evidence.id, evidence)

not_normalized_variants_evidence_statuses = []
not_normalized_variants_evidence_ratings = []
not_normalized_variants_evidence_levels = []
for e in not_normalized_evidence_id_list:
    # int(e) raises ValueError for a missing/blank evidence ID, as it did before.
    evidence = civic_evidence_by_id.get(int(e))
    if evidence is None:
        # Evidence ID not present in CIViC: keep empty lists for this row.
        not_normalized_variant_evidence_status_list = []
        not_normalized_variant_evidence_rating_list = []
        not_normalized_variant_evidence_level_list = []
    else:
        # Each value is kept wrapped in a one-element list, matching the
        # shape of the columns shown in the displayed table.
        not_normalized_variant_evidence_status_list = [evidence.status]
        not_normalized_variant_evidence_rating_list = [evidence.rating]
        not_normalized_variant_evidence_level_list = [evidence.evidence_level]
    not_normalized_variants_evidence_statuses.append(not_normalized_variant_evidence_status_list)
    not_normalized_variants_evidence_ratings.append(not_normalized_variant_evidence_rating_list)
    not_normalized_variants_evidence_levels.append(not_normalized_variant_evidence_level_list)

# Positional assignment: one entry per exploded row.
not_normalized_quer_df["evidence_status"] = not_normalized_variants_evidence_statuses
not_normalized_quer_df["evidence_rating"] = not_normalized_variants_evidence_ratings
not_normalized_quer_df["evidence_level"] = not_normalized_variants_evidence_levels
not_normalized_quer_df

Unnamed: 0,variant_id,query,query_type,variant_accepted,exception_raised,message,warnings,evidence_ids,evidence_status,evidence_rating,evidence_level
0,748,MLH1 *757L,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1812,[accepted],[1],[C]
1,3718,AR A748V,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10128,[submitted],[3],[D]
2,3725,AR A765T,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10135,[submitted],[3],[D]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",655,[accepted],[5],[B]
3,248,TERT C228T,protein,True,False,unable to normalize,"[""Unable to find valid result for classificati...",1646,[accepted],[3],[B]
...,...,...,...,...,...,...,...,...,...,...,...
59,2825,BRAF V601E,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",7633,[submitted],[3],[B]
59,2825,BRAF V601E,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",7627,[submitted],[3],[C]
60,3721,AR V757A,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10131,[submitted],[3],[D]
61,3722,AR V757I,protein,False,False,unable to normalize,"[""Unable to find valid result for classificati...",10132,[submitted],[3],[D]


In [29]:
# Distinct evidence IDs in the unable-to-normalize table (set() collapses repeats).
len(set(not_normalized_quer_df.evidence_ids))

80

# Analysis of Not Supported Variants

## Import variants that were not supported

In [30]:
# Load the variants that are not supported. The file has a .csv extension but is
# read as tab-separated.
not_supported_variants_df = pd.read_csv("./not_supported_variants.csv", sep= "\t")
not_supported_variants_df

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted
0,4417,ALK,FBXO11::ALK,Fusion,False
1,4188,VHL,10kb Deletion,Region Defined Variant,False
2,785,CHEK2,1100DELC,Protein Consequence,True
3,2438,VHL,235 (CAG-TAG),Other,False
4,4186,VHL,2kb Deletion,Region Defined Variant,False
...,...,...,...,...,...
1199,4008,VHL,t(3;6)(p13;q25.1),Rearrangements,False
1200,3351,VHL,t(3;8)(p13;q24.1),Rearrangements,False
1201,3478,ESR2,underexpression beta-1,Other,False
1202,3508,CD274,v242,Protein Consequence,False


## Create a list of the variant ID's of the not supported variants

In [31]:
# One entry per row of not_supported_variants_df, in row order (no de-duplication).
not_supported_variant_id_list = list(not_supported_variants_df["variant_id"])

In [32]:
# Counts rows of the not-supported table (taken from the frame itself here, not
# from not_supported_variant_id_list; both have one entry per row at this point).
total_number_not_supported_variants = len(not_supported_variants_df)
f"Total Number of variants that were not supported: {total_number_not_supported_variants}"

'Total Number of variants that were not supported: 1204'

In [33]:
# Share of all CIViC variants that appear in the not-supported table, in percent.
percentage_not_supported_variants = total_number_not_supported_variants/total_number_variants*100
f"Percentage of variants that were not supported: {percentage_not_supported_variants}%"

'Percentage of variants that were not supported: 34.31176973496723%'

## Import evidence ID's associated with the not supported variants using variant ID

In [34]:
# Attach to every not-supported variant the evidence IDs reachable through the
# molecular profiles of its CIViC variant record.
#
# Index the CIViC variants by id once, so each row below costs one dictionary
# lookup instead of a scan over every variant in civic_variant_ids.
# setdefault keeps the first record seen for an id (ids are presumably unique
# in CIViC — TODO confirm).
civic_variants_by_id = {}
for variant in civic_variant_ids:
    civic_variants_by_id.setdefault(variant.id, variant)

not_supported_variants_evidence_ids = []
for v in not_supported_variant_id_list:
    variant = civic_variants_by_id.get(int(v))
    if variant is None:
        # Variant id not present in CIViC: keep an empty list for this row.
        not_supported_variant_evidence_id_list = []
    else:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        not_supported_variant_evidence_id_list = list(
            dict.fromkeys(
                e.id
                for mp in variant.molecular_profiles
                for e in mp.evidence_items
            )
        )
        # A matched variant without evidence is stored as "" (not []), so a
        # later explode() keeps an empty-string marker for that row.
        not_supported_variant_evidence_id_list = not_supported_variant_evidence_id_list or ""
    not_supported_variants_evidence_ids.append(not_supported_variant_evidence_id_list)

# One list (or "") per row, aligned positionally with not_supported_variants_df.
not_supported_variants_df["evidence_ids"] = not_supported_variants_evidence_ids
not_supported_variants_df

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids
0,4417,ALK,FBXO11::ALK,Fusion,False,[7428]
1,4188,VHL,10kb Deletion,Region Defined Variant,False,[10678]
2,785,CHEK2,1100DELC,Protein Consequence,True,"[7235, 1849, 1850]"
3,2438,VHL,235 (CAG-TAG),Other,False,[6524]
4,4186,VHL,2kb Deletion,Region Defined Variant,False,"[10791, 10676]"
...,...,...,...,...,...,...
1199,4008,VHL,t(3;6)(p13;q25.1),Rearrangements,False,"[10357, 10350]"
1200,3351,VHL,t(3;8)(p13;q24.1),Rearrangements,False,[9340]
1201,3478,ESR2,underexpression beta-1,Other,False,"[9612, 9613, 9618, 9619]"
1202,3508,CD274,v242,Protein Consequence,False,[9695]


## Split the list of evidence ID's of each variant to be one per line

In [35]:
# Expand the per-variant evidence-ID lists so each row holds a single evidence ID.
# Rows that came from the same variant keep the same index label afterwards.
not_supported_variants_df = not_supported_variants_df.explode(column="evidence_ids")
not_supported_variants_df

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids
0,4417,ALK,FBXO11::ALK,Fusion,False,7428
1,4188,VHL,10kb Deletion,Region Defined Variant,False,10678
2,785,CHEK2,1100DELC,Protein Consequence,True,7235
2,785,CHEK2,1100DELC,Protein Consequence,True,1849
2,785,CHEK2,1100DELC,Protein Consequence,True,1850
...,...,...,...,...,...,...
1201,3478,ESR2,underexpression beta-1,Other,False,9618
1201,3478,ESR2,underexpression beta-1,Other,False,9619
1202,3508,CD274,v242,Protein Consequence,False,9695
1203,2422,NTRK3,~DEPRECATED-ETV6-NTRK3,Other,False,10692


## Create a list of the evidence ID's of the not supported variants

In [36]:
# One entry per (variant, evidence item) row; an evidence ID shared by several
# variants appears once per variant.
not_supported_evidence_id_list = list(not_supported_variants_df["evidence_ids"])

In [37]:
# Number of (variant, evidence item) pairings, counting repeated evidence IDs each time.
total_number_not_supported_variant_evidence_items = len(not_supported_evidence_id_list)
f"Total Number of times evidence items were associated with a not supported variant: {total_number_not_supported_variant_evidence_items}"

'Total Number of times evidence items were associated with a not supported variant: 3766'

In [38]:
# Number of distinct values in the evidence_ids column (set() collapses repeats).
total_number_not_supported_variant_unique_evidence_items = len(set(not_supported_variants_df.evidence_ids))
f"Total Number of unique evidence items were associated with a not supported variant: {total_number_not_supported_variant_unique_evidence_items}"

'Total Number of unique evidence items were associated with a not supported variant: 3738'

In [39]:
# Distinct evidence items tied to not-supported variants as a percentage of all CIViC evidence items.
not_supported_variant_percentage_evidence_items = total_number_not_supported_variant_unique_evidence_items/total_number_evidences*100
f"Percentage of all CIViC evidence items in Not Supported variants: {not_supported_variant_percentage_evidence_items}%"

'Percentage of all CIViC evidence items in Not Supported variants: 37.545198875050225%'

## Import evidence status, rating, and level associated with a specific evidence ID

In [40]:
# Look up status, rating and evidence level for every evidence ID in the
# exploded not-supported table.
#
# Index the CIViC evidence records by id once, so each row below costs one
# dictionary lookup instead of a scan over every record in civic_evidence_ids.
# setdefault keeps the first record seen for an id (ids are presumably unique
# in CIViC — TODO confirm).
civic_evidence_by_id = {}
for evidence in civic_evidence_ids:
    civic_evidence_by_id.setdefault(evidence.id, evidence)

not_supported_variants_evidence_statuses = []
not_supported_variants_evidence_ratings = []
not_supported_variants_evidence_levels = []
for e in not_supported_evidence_id_list:
    # int(e) raises ValueError for a missing/blank evidence ID, as it did before.
    evidence = civic_evidence_by_id.get(int(e))
    if evidence is None:
        # Evidence ID not present in CIViC: keep empty lists for this row.
        not_supported_variant_evidence_status_list = []
        not_supported_variant_evidence_rating_list = []
        not_supported_variant_evidence_level_list = []
    else:
        # Each value is kept wrapped in a one-element list, matching the
        # shape of the columns shown in the displayed table.
        not_supported_variant_evidence_status_list = [evidence.status]
        not_supported_variant_evidence_rating_list = [evidence.rating]
        not_supported_variant_evidence_level_list = [evidence.evidence_level]
    not_supported_variants_evidence_statuses.append(not_supported_variant_evidence_status_list)
    not_supported_variants_evidence_ratings.append(not_supported_variant_evidence_rating_list)
    not_supported_variants_evidence_levels.append(not_supported_variant_evidence_level_list)

# Positional assignment: one entry per exploded row.
not_supported_variants_df["evidence_status"] = not_supported_variants_evidence_statuses
not_supported_variants_df["evidence_rating"] = not_supported_variants_evidence_ratings
not_supported_variants_df["evidence_level"] = not_supported_variants_evidence_levels
not_supported_variants_df

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
0,4417,ALK,FBXO11::ALK,Fusion,False,7428,[submitted],[3],[C]
1,4188,VHL,10kb Deletion,Region Defined Variant,False,10678,[submitted],[3],[C]
2,785,CHEK2,1100DELC,Protein Consequence,True,7235,[submitted],[4],[B]
2,785,CHEK2,1100DELC,Protein Consequence,True,1849,[rejected],[3],[B]
2,785,CHEK2,1100DELC,Protein Consequence,True,1850,[accepted],[3],[B]
...,...,...,...,...,...,...,...,...,...
1201,3478,ESR2,underexpression beta-1,Other,False,9618,[submitted],[4],[B]
1201,3478,ESR2,underexpression beta-1,Other,False,9619,[submitted],[4],[B]
1202,3508,CD274,v242,Protein Consequence,False,9695,[submitted],[4],[E]
1203,2422,NTRK3,~DEPRECATED-ETV6-NTRK3,Other,False,10692,[submitted],[3],[C]


## List all the possible variant categories

In [41]:
# Distinct values of the category column, in order of first appearance.
not_supported_variant_categories = not_supported_variants_df.category.unique()
not_supported_variant_categories

array(['Fusion', 'Region Defined Variant', 'Protein Consequence', 'Other',
       'Rearrangements', 'Copy Number', 'Expression', 'Gene Function',
       'Genotypes Compound', 'Epigenetic Modification', 'Genotypes Easy'],
      dtype=object)

## Find evidence ID's that are associated with more than one not supported variant (within or across variant categories)

In [42]:
# Keep every row whose evidence ID occurs on more than one row (keep=False marks
# all occurrences, not just the later ones). The check is on evidence_ids only;
# the category column is not part of the comparison.
duplicate = not_supported_variants_df[not_supported_variants_df.duplicated('evidence_ids', keep=False)]
duplicate

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
0,4417,ALK,FBXO11::ALK,Fusion,False,7428,[submitted],[3],[C]
181,200,IKZF1,Deletion,Gene Function,True,7786,[submitted],[5],[B]
189,2654,CDKN2A,Deletion,Gene Function,True,7786,[submitted],[5],[B]
192,4314,CDKN2B,Deletion,Gene Function,False,7786,[submitted],[5],[B]
193,4316,PAX5,Deletion,Gene Function,False,7786,[submitted],[5],[B]
194,4317,ERG,Deletion,Gene Function,False,7786,[submitted],[5],[B]
218,4241,EGFR,EGFR::SEPT14,Fusion,True,11152,[submitted],[2],[B]
218,4241,EGFR,EGFR::SEPT14,Fusion,True,11154,[submitted],[3],[B]
224,1516,EGFR,EGFRVIII,Gene Function,False,11152,[submitted],[2],[B]
224,1516,EGFR,EGFRVIII,Gene Function,False,11154,[submitted],[3],[B]


### How many unique evidence ID's there are

In [43]:
# Distinct evidence IDs in the not-supported table (set() collapses repeats).
len(set(not_supported_variants_df.evidence_ids))

3738

### How many evidence ID's are paired with a not supported variant

In [44]:
# Total rows in the exploded table, i.e. (variant, evidence item) pairings including repeats.
len(not_supported_variants_df.evidence_ids)

3766

## Create individual tables separating variants of different categories

In [45]:
def _rows_in_category(category_name):
    """Return the rows of not_supported_variants_df whose category equals category_name."""
    in_category = not_supported_variants_df.category == category_name
    return not_supported_variants_df[in_category]


# One table per variant category; a category with no matching rows yields an empty table.
expression_non_supp_var_df = _rows_in_category('Expression')
epigenetic_modification_non_supp_var_df = _rows_in_category('Epigenetic Modification')
fusion_non_supp_var_df = _rows_in_category('Fusion')
protein_consequence_non_supp_var_df = _rows_in_category('Protein Consequence')
gene_function_non_supp_var_df = _rows_in_category('Gene Function')
rearrangements_non_supp_var_df = _rows_in_category('Rearrangements')
copy_number_non_supp_var_df = _rows_in_category('Copy Number')
other_non_supp_var_df = _rows_in_category('Other')
genotypes_easy_non_supp_var_df = _rows_in_category('Genotypes Easy')
genotypes_compound_non_supp_var_df = _rows_in_category('Genotypes Compound')
region_defined_variant_non_supp_var_df = _rows_in_category('Region Defined Variant')
transcript_variant_non_supp_var_df = _rows_in_category('Transcript Variant')

### Expression

In [46]:
# Display the rows whose category is 'Expression'.
expression_non_supp_var_df

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
33,309,EGFR,AUTOCRINE ACTIVATION,Expression,False,769,[rejected],[2],[B]
33,309,EGFR,AUTOCRINE ACTIVATION,Expression,False,768,[rejected],[3],[D]
126,3385,CD44,CD44s,Expression,False,9426,[submitted],[4],[B]
127,341,CD44,CD44s Expression,Expression,True,825,[accepted],[3],[D]
128,3386,CD44,CD44v5 and CD44v6,Expression,False,9432,[submitted],[4],[B]
...,...,...,...,...,...,...,...,...,...
1183,272,CDKN2A,p16 Expression,Expression,True,753,[accepted],[3],[B]
1183,272,CDKN2A,p16 Expression,Expression,True,804,[accepted],[3],[B]
1183,272,CDKN2A,p16 Expression,Expression,True,1155,[accepted],[2],[B]
1183,272,CDKN2A,p16 Expression,Expression,True,1314,[accepted],[2],[B]


In [47]:
# Distinct variant IDs in the Expression subset (the exploded table repeats a
# variant once per evidence item, so set() is needed here).
unique_expression_variant_ids = len(set(expression_non_supp_var_df.variant_id))
f"Number of unique variant ID's in Expression Variant category: {unique_expression_variant_ids}"

"Number of unique variant ID's in Expression Variant category: 291"

In [48]:
# Expression-category variants as a percentage of all CIViC variants.
expression_percentage_civic_variant_items = unique_expression_variant_ids/total_number_variants*100
f"Percentage of all CIViC Variant Items in Expression category: {expression_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Expression category: 8.29296095753776%'

In [49]:
# Distinct evidence IDs in the Expression subset (set() collapses repeats).
unique_expression_evidence_ids = len(set(expression_non_supp_var_df.evidence_ids))
f"Number of unique evidence ID's in Expression category: {unique_expression_evidence_ids}"

"Number of unique evidence ID's in Expression category: 617"

In [50]:
# Row count of the Expression subset, i.e. (variant, evidence item) pairings including repeats.
expression_evidence_ids = len(expression_non_supp_var_df)
f"Number of evidence ID's in Expression category: {expression_evidence_ids}"

"Number of evidence ID's in Expression category: 619"

In [51]:
# Distinct Expression-category evidence items as a percentage of all CIViC evidence items.
expression_percentage_civic_evidence_items = unique_expression_evidence_ids/total_number_evidences*100
f"Percentage of all CIViC Evidence Items in Expression category:{expression_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Expression category:6.197267979108076%'

In [52]:
# Distinct Expression-category evidence items as a percentage of the distinct
# evidence items tied to any not-supported variant.
expression_percentage_not_supported_evidence_items = unique_expression_evidence_ids/total_number_not_supported_variant_unique_evidence_items*100
f"Percentage of all Not Supported Evidence Items in Expression category:{expression_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Expression category:16.506153023006956%'

### Epigenetic Modification

In [53]:
# Display the rows whose category is 'Epigenetic Modification'.
epigenetic_modification_non_supp_var_df

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
613,538,MLH1,METHYLATION,Epigenetic Modification,True,1315,[accepted],[2],[B]
614,784,CDKN2B,METHYLATION,Epigenetic Modification,True,1848,[accepted],[4],[B]
908,387,EIF4EBP1,PHOSPHORYLATION,Epigenetic Modification,True,918,[accepted],[2],[D]
908,387,EIF4EBP1,PHOSPHORYLATION,Epigenetic Modification,True,932,[accepted],[2],[D]
909,395,RPS6,PHOSPHORYLATION,Epigenetic Modification,True,930,[accepted],[2],[D]
910,632,RB1,PHOSPHORYLATION,Epigenetic Modification,True,1609,[accepted],[3],[B]
919,22,CCND2,PROMOTER DEMETHYLATION,Epigenetic Modification,True,219,[accepted],[3],[D]
920,2254,DBI,PROMOTER DEMETHYLATION,Epigenetic Modification,False,6033,[rejected],[5],[B]
921,711,KLLN,PROMOTER METHYLATION,Epigenetic Modification,True,1761,[accepted],[3],[B]
921,711,KLLN,PROMOTER METHYLATION,Epigenetic Modification,True,1762,[accepted],[3],[B]


In [54]:
# Distinct variant IDs in the Epigenetic Modification rows (a variant_id can span several rows).
unique_epigenetic_modification_variant_ids = len(set(epigenetic_modification_non_supp_var_df["variant_id"]))
f"Number of unique variant ID's in Epigenetic Modification Variant category: {unique_epigenetic_modification_variant_ids}"

"Number of unique variant ID's in Epigenetic Modification Variant category: 15"

In [55]:
# Unique Epigenetic Modification variants as a percentage of all CIViC variants.
epigenetic_modification_percentage_civic_variant_items = (
    unique_epigenetic_modification_variant_ids / total_number_variants
) * 100
f"Percentage of all CIViC Variant Items in Epigenetic Modification category: {epigenetic_modification_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Epigenetic Modification category: 0.42747221430607013%'

In [56]:
# Distinct evidence IDs in the Epigenetic Modification rows; the set collapses repeats.
unique_epigenetic_evidence_ids = len(set(epigenetic_modification_non_supp_var_df["evidence_ids"]))
f"Number of unique evidence ID's in Epigenetic Modification category: {unique_epigenetic_evidence_ids}"

"Number of unique evidence ID's in Epigenetic Modification category: 24"

In [57]:
# Row count of the Epigenetic Modification table, repeated evidence IDs included.
epigenetic_evidence_ids = len(epigenetic_modification_non_supp_var_df.index)
f"Number of evidence ID's in Epigenetic Modification category: {epigenetic_evidence_ids}"

"Number of evidence ID's in Epigenetic Modification category: 24"

In [58]:
# Unique Epigenetic Modification evidence IDs as a percentage of all CIViC evidence items.
epigenetic_percentage_civic_evidence_items = (unique_epigenetic_evidence_ids / total_number_evidences) * 100
f"Percentage of all CIViC Evidence Items in Epigenetic Modification category:{epigenetic_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Epigenetic Modification category:0.24106066693451184%'

In [59]:
# Unique Epigenetic Modification evidence IDs as a percentage of
# total_number_not_supported_variant_unique_evidence_items (set in an earlier cell).
epigenetic_percentage_not_supported_evidence_items = (
    unique_epigenetic_evidence_ids / total_number_not_supported_variant_unique_evidence_items
) * 100
f"Percentage of all Not Supported Evidence Items in Epigenetic Modification category:{epigenetic_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Epigenetic Modification category:0.6420545746388443%'

### Fusion

In [60]:
# Display the DataFrame holding the Fusion category rows.
fusion_non_supp_var_df

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
0,4417,ALK,FBXO11::ALK,Fusion,False,7428,[submitted],[3],[C]
18,780,GLI1,ACTB::GLI1,Fusion,True,1844,[accepted],[3],[B]
19,3702,NTRK2,AFAP1::NTRK2,Fusion,True,10098,[accepted],[3],[D]
20,2577,PDGFRB,AGGF1::PDGFRB C843G,Fusion,False,6999,[submitted],[3],[D]
21,285,BRAF,AGK::BRAF,Fusion,True,723,[accepted],[2],[C]
...,...,...,...,...,...,...,...,...,...
1161,466,FGFR1,ZNF198::FGFR1,Fusion,True,1104,[accepted],[4],[C]
1162,2864,JAK2,ZNF274::JAK2,Fusion,True,7706,[accepted],[3],[C]
1163,2977,JAK2,ZNF430::JAK2,Fusion,True,7999,[accepted],[3],[C]
1172,4406,EGFR,fusion,Fusion,False,11143,[submitted],[1],[C]


In [61]:
# Distinct variant IDs in the Fusion rows; the set collapses repeated variant_id values.
unique_fusion_variant_ids = len(set(fusion_non_supp_var_df["variant_id"]))
f"Number of unique variant ID's in Fusion Variant category: {unique_fusion_variant_ids}"

"Number of unique variant ID's in Fusion Variant category: 301"

In [62]:
# Unique Fusion variants as a percentage of all CIViC variants.
fusion_percentage_civic_variant_items = (unique_fusion_variant_ids / total_number_variants) * 100
f"Percentage of all CIViC Variant Items in Fusion category: {fusion_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Fusion category: 8.577942433741807%'

In [63]:
# Distinct evidence IDs in the Fusion rows; the set collapses repeats.
unique_fusion_evidence_ids = len(set(fusion_non_supp_var_df["evidence_ids"]))
f"Number of unique evidence ID's in Fusion category: {unique_fusion_evidence_ids}"

"Number of unique evidence ID's in Fusion category: 1186"

In [64]:
# Length of the Fusion evidence_ids column, repeated IDs included.
fusion_evidence_ids = len(fusion_non_supp_var_df["evidence_ids"])
f"Number of evidence ID's in Fusion category: {fusion_evidence_ids}"


"Number of evidence ID's in Fusion category: 1187"

In [65]:
# Unique Fusion evidence IDs as a percentage of all CIViC evidence items.
fusion_percentage_civic_evidence_items = (unique_fusion_evidence_ids / total_number_evidences) * 100
f"Percentage of all CIViC Evidence Items in Fusion category:{fusion_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Fusion category:11.912414624347127%'

In [66]:
# Unique Fusion evidence IDs as a percentage of
# total_number_not_supported_variant_unique_evidence_items (set in an earlier cell).
fusion_percentage_not_supported_evidence_items = (
    unique_fusion_evidence_ids / total_number_not_supported_variant_unique_evidence_items
) * 100
f"Percentage of all Not Supported Evidence Items in Fusion category:{fusion_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Fusion category:31.728196896736222%'

### Protein Consequence

In [67]:
# Display the DataFrame holding the Protein Consequence category rows.
protein_consequence_non_supp_var_df 

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
2,785,CHEK2,1100DELC,Protein Consequence,True,7235,[submitted],[4],[B]
2,785,CHEK2,1100DELC,Protein Consequence,True,1849,[rejected],[3],[B]
2,785,CHEK2,1100DELC,Protein Consequence,True,1850,[accepted],[3],[B]
12,2727,ERBB2,A129,Protein Consequence,False,7346,[rejected],[4],[D]
13,2729,TP53,A129,Protein Consequence,False,7348,[submitted],[4],[D]
...,...,...,...,...,...,...,...,...,...
1154,3055,FLT3,Y693,Protein Consequence,False,8285,[submitted],[3],[D]
1155,1672,EGFR,Y69FS*11,Protein Consequence,False,4808,[submitted],[None],[D]
1157,3410,TEK,Y897C and R915C,Protein Consequence,False,9476,[submitted],[2],[C]
1182,3057,GATA1,p.Ser51Alafs*86,Protein Consequence,False,8302,[submitted],[2],[C]


In [68]:
# Distinct variant IDs in the Protein Consequence rows (a variant_id can span several rows).
unique_protein_consequence_variant_ids = len(set(protein_consequence_non_supp_var_df["variant_id"]))
f"Number of unique variant ID's in Protein Consequence Variant category: {unique_protein_consequence_variant_ids}"

"Number of unique variant ID's in Protein Consequence Variant category: 128"

In [69]:
# Unique Protein Consequence variants as a percentage of all CIViC variants.
protein_consequence_percentage_civic_variant_items = (
    unique_protein_consequence_variant_ids / total_number_variants
) * 100
f"Percentage of all CIViC Variant Items in Protein Consequence category: {protein_consequence_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Protein Consequence category: 3.647762895411798%'

In [70]:
# Distinct evidence IDs in the Protein Consequence rows; the set collapses repeats.
unique_protein_evidence_ids = len(set(protein_consequence_non_supp_var_df["evidence_ids"]))
f"Number of unique evidence ID's in Protein Consequence category: {unique_protein_evidence_ids}"

"Number of unique evidence ID's in Protein Consequence category: 295"

In [71]:
# Length of the Protein Consequence evidence_ids column, repeated IDs included.
# NOTE: the variable name keeps its original spelling ("evidece").
protein_evidece_ids = len(protein_consequence_non_supp_var_df["evidence_ids"])
f"Number of evidence ID's in Protein Consequence category: {protein_evidece_ids}"

"Number of evidence ID's in Protein Consequence category: 299"

In [72]:
# Unique Protein Consequence evidence IDs as a percentage of all CIViC evidence items.
# The label now names the category "Protein Consequence" (it previously read
# "Protein Evidence", which is not one of the categories). The variable keeps its
# original spelling because the summary-table cell refers to it by this name.
protein_evidece_percentage_civic_evidence_items = (unique_protein_evidence_ids / total_number_evidences) * 100
f"Percentage of all CIViC Evidence Items in Protein Consequence category:{protein_evidece_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Protein Evidence category:2.963037364403375%'

In [73]:
# Unique Protein Consequence evidence IDs as a percentage of
# total_number_not_supported_variant_unique_evidence_items (set in an earlier cell).
# The label now names the category "Protein Consequence" (it previously read
# "Protein Evidence", which is not one of the categories). The variable keeps its
# original spelling because the summary-table cell refers to it by this name.
protein_evidece_percentage_not_supported_evidence_items = (
    unique_protein_evidence_ids / total_number_not_supported_variant_unique_evidence_items
) * 100
f"Percentage of all Not Supported Evidence Items in Protein Consequence category:{protein_evidece_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Protein Evidence category:7.891920813269128%'

### Gene Function

In [74]:
# Display the DataFrame holding the Gene Function category rows.
gene_function_non_supp_var_df 

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
34,1296,CTNNB1,Activating Mutation,Gene Function,True,2988,[accepted],[4],[D]
35,2649,KRAS,Activating Mutation,Gene Function,False,7180,[submitted],[3],[B]
35,2649,KRAS,Activating Mutation,Gene Function,False,7182,[submitted],[2],[D]
35,2649,KRAS,Activating Mutation,Gene Function,False,7184,[submitted],[1],[D]
36,2657,ERBB2,Activating Mutation,Gene Function,False,7201,[submitted],[3],[B]
...,...,...,...,...,...,...,...,...,...
1133,312,EGFR,VIII,Gene Function,True,773,[accepted],[2],[C]
1133,312,EGFR,VIII,Gene Function,True,8191,[submitted],[4],[D]
1133,312,EGFR,VIII,Gene Function,True,4500,[accepted],[3],[D]
1133,312,EGFR,VIII,Gene Function,True,8192,[submitted],[3],[D]


In [75]:
# Distinct variant IDs in the Gene Function rows (a variant_id can span several rows).
unique_gene_function_variant_ids = len(set(gene_function_non_supp_var_df["variant_id"]))
f"Number of unique variant ID's in Gene Function Variant category: {unique_gene_function_variant_ids}"

"Number of unique variant ID's in Gene Function Variant category: 87"

In [76]:
# Unique Gene Function variants as a percentage of all CIViC variants.
gene_function_percentage_civic_variant_items = (unique_gene_function_variant_ids / total_number_variants) * 100
f"Percentage of all CIViC Variant Items in Gene Function category: {gene_function_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Gene Function category: 2.479338842975207%'

In [77]:
# Distinct evidence IDs in the Gene Function rows; the set collapses repeats.
unique_gene_function_evidence_ids = len(set(gene_function_non_supp_var_df["evidence_ids"]))
f"Number of unique evidence ID's in Gene Function category: {unique_gene_function_evidence_ids}"

"Number of unique evidence ID's in Gene Function category: 343"

In [78]:
# Length of the Gene Function evidence_ids column, repeated IDs included.
gene_function_evidence_ids = len(gene_function_non_supp_var_df["evidence_ids"])
f"Number of evidence ID's in Gene Function category: {gene_function_evidence_ids}"

"Number of evidence ID's in Gene Function category: 347"

In [79]:
# Unique Gene Function evidence IDs as a percentage of all CIViC evidence items.
gene_function_percentage_civic_evidence_items = (unique_gene_function_evidence_ids / total_number_evidences) * 100
f"Percentage of all CIViC Evidence Items in Gene Function category:{gene_function_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Gene Function category:3.4451586982723983%'

In [80]:
# Unique Gene Function evidence IDs as a percentage of
# total_number_not_supported_variant_unique_evidence_items (set in an earlier cell).
gene_function_percentage_not_supported_evidence_items = (
    unique_gene_function_evidence_ids / total_number_not_supported_variant_unique_evidence_items
) * 100
f"Percentage of all Not Supported Evidence Items in Gene Function category:{gene_function_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Gene Function category:9.176029962546817%'

### Rearrangements

In [81]:
# Display the DataFrame holding the Rearrangements category rows.
rearrangements_non_supp_var_df 

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
10,2390,MAP2K1,56_61QKQKVG>R,Rearrangements,False,6384,[submitted],[3],[D]
40,709,BRCA1,Alu insertion,Rearrangements,True,1759,[accepted],[3],[C]
165,943,PDGFRA,D842_H845DELDIMH,Rearrangements,True,2460,[accepted],[2],[D]
196,3009,ABL1,Double Ph,Rearrangements,True,8093,[accepted],[4],[C]
207,1577,EGFR,E709_T710>D,Rearrangements,False,4663,[submitted],[None],[C]
...,...,...,...,...,...,...,...,...,...
1197,3524,MECOM,t(3;21)(q26.2;q22),Rearrangements,False,9719,[submitted],[2],[C]
1198,3264,VHL,t(3;6)(p12.3;q24.3),Rearrangements,False,9063,[submitted],[2],[C]
1199,4008,VHL,t(3;6)(p13;q25.1),Rearrangements,False,10357,[rejected],[3],[C]
1199,4008,VHL,t(3;6)(p13;q25.1),Rearrangements,False,10350,[rejected],[2],[C]


In [82]:
# Distinct variant IDs in the Rearrangements rows (a variant_id can span several rows).
unique_rearrangements_variant_ids = len(set(rearrangements_non_supp_var_df["variant_id"]))
f"Number of unique variant ID's in Rearrangements Variant category: {unique_rearrangements_variant_ids}"

"Number of unique variant ID's in Rearrangements Variant category: 50"

In [83]:
# Unique Rearrangements variants as a percentage of all CIViC variants.
rearrangements_percentage_civic_variant_items = (unique_rearrangements_variant_ids / total_number_variants) * 100
f"Percentage of all CIViC Variant Items in Rearrangements category: {rearrangements_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Rearrangements category: 1.4249073810202337%'

In [84]:
# Distinct evidence IDs in the Rearrangements rows; the set collapses repeats.
unique_rearrangements_evidence_ids = len(set(rearrangements_non_supp_var_df["evidence_ids"]))
f"Number of unique evidence ID's in Rearrangements category: {unique_rearrangements_evidence_ids}"

"Number of unique evidence ID's in Rearrangements category: 136"

In [85]:
# Length of the Rearrangements evidence_ids column, repeated IDs included.
rearrangements_evidence_ids = len(rearrangements_non_supp_var_df["evidence_ids"])
f"Number of evidence ID's in Rearrangements category: {rearrangements_evidence_ids}"

"Number of evidence ID's in Rearrangements category: 138"

In [86]:
# Unique Rearrangements evidence IDs as a percentage of all CIViC evidence items.
rearrangements_percentage_civic_evidence_items = (unique_rearrangements_evidence_ids / total_number_evidences) * 100
f"Percentage of all CIViC Evidence Items in Rearrangements category:{rearrangements_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Rearrangements category:1.3660104459622338%'

In [87]:
# Unique Rearrangements evidence IDs as a percentage of
# total_number_not_supported_variant_unique_evidence_items (set in an earlier cell).
rearrangements_percentage_not_supported_evidence_items = (
    unique_rearrangements_evidence_ids / total_number_not_supported_variant_unique_evidence_items
) * 100
f"Percentage of all Not Supported Evidence Items in Rearrangements category:{rearrangements_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Rearrangements category:3.6383092562867843%'

### Copy Number

In [88]:
# Display the DataFrame holding the Copy Number category rows.
copy_number_non_supp_var_df 

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
15,3195,VHL,A149fs (c.444dup),Copy Number,False,9982,[submitted],[3],[C]
15,3195,VHL,A149fs (c.444dup),Copy Number,False,8822,[submitted],[2],[C]
15,3195,VHL,A149fs (c.444dup),Copy Number,False,6156,[submitted],[1],[C]
16,2144,VHL,A56fs (c.166dup),Copy Number,True,5742,[accepted],[3],[C]
17,1579,EGFR,A767_V769dupASV,Copy Number,True,4665,[accepted],[None],[C]
...,...,...,...,...,...,...,...,...,...
1156,414,ERBB2,Y772_A775DUP,Copy Number,True,4428,[submitted],[None],[D]
1156,414,ERBB2,Y772_A775DUP,Copy Number,True,1046,[rejected],[4],[D]
1156,414,ERBB2,Y772_A775DUP,Copy Number,True,1047,[accepted],[4],[D]
1156,414,ERBB2,Y772_A775DUP,Copy Number,True,960,[accepted],[3],[D]


In [89]:
# Distinct variant IDs in the Copy Number rows (a variant_id can span several rows).
unique_copy_number_variant_ids = len(set(copy_number_non_supp_var_df["variant_id"]))
f"Number of unique variant ID's in Copy Number Variant category: {unique_copy_number_variant_ids}"

"Number of unique variant ID's in Copy Number Variant category: 34"

In [90]:
# Unique Copy Number variants as a percentage of all CIViC variants.
copy_number_percentage_civic_variant_items = (unique_copy_number_variant_ids / total_number_variants) * 100
f"Percentage of all CIViC Variant Items in Copy Number category: {copy_number_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Copy Number category: 0.968937019093759%'

In [91]:
# Distinct evidence IDs in the Copy Number rows; the set collapses repeats.
unique_copy_number_evidence_ids = len(set(copy_number_non_supp_var_df["evidence_ids"]))
f"Number of unique evidence ID's in Copy Number category: {unique_copy_number_evidence_ids}"

"Number of unique evidence ID's in Copy Number category: 69"

In [92]:
# Length of the Copy Number evidence_ids column, repeated IDs included.
copy_number_evidence_ids = len(copy_number_non_supp_var_df["evidence_ids"])
f"Number of evidence ID's in Copy Number category: {copy_number_evidence_ids}"

"Number of evidence ID's in Copy Number category: 69"

In [93]:
# Unique Copy Number evidence IDs as a percentage of all CIViC evidence items.
copy_number_percentage_civic_evidence_items = (unique_copy_number_evidence_ids / total_number_evidences) * 100
f"Percentage of all CIViC Evidence Items in Copy Number category:{copy_number_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Copy Number category:0.6930494174367215%'

In [94]:
# Unique Copy Number evidence IDs as a percentage of
# total_number_not_supported_variant_unique_evidence_items (set in an earlier cell).
copy_number_percentage_not_supported_evidence_items = (
    unique_copy_number_evidence_ids / total_number_not_supported_variant_unique_evidence_items
) * 100
f"Percentage of all Not Supported Evidence Items in Copy Number category:{copy_number_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Copy Number category:1.8459069020866776%'

### Other

In [95]:
# Display the DataFrame holding the Other category rows.
other_non_supp_var_df

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
3,2438,VHL,235 (CAG-TAG),Other,False,6524,[rejected],[3],[C]
9,265,TYMS,5' TANDEM REPEAT,Other,True,678,[accepted],[3],[B]
24,2211,BAP1,ALTERNATIVE TRANSCRIPT (ATI),Other,True,5929,[accepted],[3],[D]
25,842,BRAF,APC,Other,False,1941,[rejected],[3],[B]
27,3460,AR,AR alternative transcripts,Other,False,9578,[submitted],[4],[B]
...,...,...,...,...,...,...,...,...,...
1201,3478,ESR2,underexpression beta-1,Other,False,9613,[submitted],[4],[B]
1201,3478,ESR2,underexpression beta-1,Other,False,9618,[submitted],[4],[B]
1201,3478,ESR2,underexpression beta-1,Other,False,9619,[submitted],[4],[B]
1203,2422,NTRK3,~DEPRECATED-ETV6-NTRK3,Other,False,10692,[submitted],[3],[C]


In [96]:
# Distinct variant IDs in the Other rows (a variant_id can span several rows).
unique_other_variant_ids = len(set(other_non_supp_var_df["variant_id"]))
f"Number of unique variant ID's in Other Variant category: {unique_other_variant_ids}"

"Number of unique variant ID's in Other Variant category: 92"

In [97]:
# Unique Other-category variants as a percentage of all CIViC variants.
other_percentage_civic_variant_items = (unique_other_variant_ids / total_number_variants) * 100
f"Percentage of all CIViC Variant Items in Other category: {other_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Other category: 2.62182958107723%'

In [98]:
# Distinct evidence IDs in the Other rows; the set collapses repeats.
unique_other_evidence_ids = len(set(other_non_supp_var_df["evidence_ids"]))
f"Number of unique evidence IDs in Other category:{unique_other_evidence_ids}"

'Number of unique evidence IDs in Other category:160'

In [99]:
# Length of the Other evidence_ids column, repeated IDs included.
other_evidence_ids = len(other_non_supp_var_df["evidence_ids"])
f"Number of evidence IDs in Other category:{other_evidence_ids}"

'Number of evidence IDs in Other category:160'

In [100]:
# Unique Other-category evidence IDs as a percentage of all CIViC evidence items.
other_percentage_civic_evidence_items = (unique_other_evidence_ids / total_number_evidences) * 100
f"Percentage of all CIViC Evidence Items in Other category:{other_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Other category:1.6070711128967456%'

In [101]:
# Unique Other-category evidence IDs as a percentage of
# total_number_not_supported_variant_unique_evidence_items (set in an earlier cell).
other_percentage_not_supported_evidence_items = (
    unique_other_evidence_ids / total_number_not_supported_variant_unique_evidence_items
) * 100
f"Percentage of all Not Supported Evidence Items in Other category:{other_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Other category:4.280363830925628%'

### Genotypes Easy

In [102]:
# Display the DataFrame holding the Genotypes Easy category rows.
genotypes_easy_non_supp_var_df 

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
1083,729,UGT1A1,UGT1A1*28,Genotypes Easy,True,1792,[accepted],[5],[A]
1084,732,UGT1A1,UGT1A1*60,Genotypes Easy,True,1795,[accepted],[4],[B]
1141,426,BRAF,WILD TYPE,Genotypes Easy,True,995,[accepted],[2],[B]
1142,2366,PTEN,WILD TYPE,Genotypes Easy,False,6272,[rejected],[2],[D]
1143,2651,KIT,WILDTYPE,Genotypes Easy,True,4144,[accepted],[2],[B]
1145,369,TP53,Wildtype,Genotypes Easy,True,875,[accepted],[3],[B]
1145,369,TP53,Wildtype,Genotypes Easy,True,906,[accepted],[3],[B]
1145,369,TP53,Wildtype,Genotypes Easy,True,1149,[accepted],[3],[B]
1145,369,TP53,Wildtype,Genotypes Easy,True,2965,[accepted],[3],[B]
1145,369,TP53,Wildtype,Genotypes Easy,True,2963,[accepted],[4],[D]


In [103]:
# Distinct variant IDs in the Genotypes Easy rows (a variant_id can span several rows).
unique_genotypes_easy_variant_ids = len(set(genotypes_easy_non_supp_var_df["variant_id"]))
f"Number of unique variant ID's in Genotypes Easy Variant category: {unique_genotypes_easy_variant_ids}"

"Number of unique variant ID's in Genotypes Easy Variant category: 10"

In [104]:
# Unique Genotypes Easy variants as a percentage of all CIViC variants.
genotypes_easy_percentage_civic_variant_items = (unique_genotypes_easy_variant_ids / total_number_variants) * 100
f"Percentage of all CIViC Variant Items in Genotypes Easy category: {genotypes_easy_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Genotypes Easy category: 0.28498147620404674%'

In [105]:
# Distinct evidence IDs in the Genotypes Easy rows; the set collapses repeats.
unique_genotypes_easy_evidence_ids = len(set(genotypes_easy_non_supp_var_df["evidence_ids"]))
f"Number of unique evidence ID's in Genotypes Easy category:{unique_genotypes_easy_evidence_ids}"

"Number of unique evidence ID's in Genotypes Easy category:20"

In [106]:
# Length of the Genotypes Easy evidence_ids column, repeated IDs included.
genotypes_easy_evidence_ids = len(genotypes_easy_non_supp_var_df["evidence_ids"])
f"Number of evidence ID's in Genotypes Easy category:{genotypes_easy_evidence_ids}"

"Number of evidence ID's in Genotypes Easy category:20"

In [107]:
# Unique Genotypes Easy evidence IDs as a percentage of all CIViC evidence items.
genotypes_easy_percentage_civic_evidence_items = (unique_genotypes_easy_evidence_ids / total_number_evidences) * 100
f"Percentage of all CIViC Evidence Items in Genotypes Easy category:{genotypes_easy_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Genotypes Easy category:0.2008838891120932%'

In [108]:
# Unique Genotypes Easy evidence IDs as a percentage of
# total_number_not_supported_variant_unique_evidence_items (set in an earlier cell).
genotypes_easy_percentage_not_supported_evidence_items = (
    unique_genotypes_easy_evidence_ids / total_number_not_supported_variant_unique_evidence_items
) * 100
f"Percentage of all Not Supported Evidence Items in Genotypes Easy category:{genotypes_easy_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Genotypes Easy category:0.5350454788657035%'

### Genotypes Compound

In [109]:
# Display the DataFrame holding the Genotypes Compound category rows.
genotypes_compound_non_supp_var_df 

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
149,821,HLA-C,COPY-NEUTRAL LOSS OF HETEROZYGOSITY,Genotypes Compound,True,1899,[accepted],[3],[C]
175,738,DPYD,DPYD*13 HOMOZYGOSITY,Genotypes Compound,True,1801,[accepted],[5],[A]
176,737,DPYD,DPYD*2A HOMOZYGOSITY,Genotypes Compound,True,1800,[accepted],[5],[A]
548,302,PTCH1,LOH,Genotypes Compound,True,749,[accepted],[2],[B]
918,3235,PRKAR1A,"PRKAR1A LOH, allelic imbalance 17q",Genotypes Compound,False,8936,[submitted],[3],[B]
1044,3108,VHL,Single Allele Deletion,Genotypes Compound,False,8499,[submitted],[3],[C]
1044,3108,VHL,Single Allele Deletion,Genotypes Compound,False,8580,[submitted],[3],[C]


In [110]:
# Distinct variant IDs in the Genotypes Compound rows (a variant_id can span several rows).
unique_genotypes_compound_variant_ids = len(set(genotypes_compound_non_supp_var_df["variant_id"]))
f"Number of unique variant ID's in Genotypes Compound Variant category: {unique_genotypes_compound_variant_ids}"

"Number of unique variant ID's in Genotypes Compound Variant category: 6"

In [111]:
# Unique Genotypes Compound variants as a percentage of all CIViC variants.
genotypes_compound_percentage_civic_variant_items = (
    unique_genotypes_compound_variant_ids / total_number_variants
) * 100
f"Percentage of all CIViC Variant Items in Genotypes Compound category: {genotypes_compound_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Genotypes Compound category: 0.17098888572242804%'

In [112]:
# Distinct evidence IDs in the Genotypes Compound rows; the set collapses repeats.
# The label spells the category "Genotypes" (it previously read "Genptypes").
unique_genotypes_compound_evidence_ids = len(set(genotypes_compound_non_supp_var_df["evidence_ids"]))
f"Number of unique evidence ID's in Genotypes Compound category: {unique_genotypes_compound_evidence_ids}"

"Number of unique evidence ID's in Genptypes Compound category: 7"

In [113]:
# Length of the Genotypes Compound evidence_ids column, repeated IDs included.
# The label spells the category "Genotypes" (it previously read "Genptypes").
genotypes_compound_evidence_ids = len(genotypes_compound_non_supp_var_df["evidence_ids"])
f"Number of evidence ID's in Genotypes Compound category: {genotypes_compound_evidence_ids}"

"Number of evidence ID's in Genptypes Compound category: 7"

In [114]:
# Unique Genotypes Compound evidence IDs as a percentage of all CIViC evidence items.
genotypes_compound_percentage_civic_evidence_items = (
    unique_genotypes_compound_evidence_ids / total_number_evidences
) * 100
f"Percentage of all CIViC Evidence Items in Genotypes Compound category:{genotypes_compound_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Genotypes Compound category:0.07030936118923262%'

In [115]:
# Unique Genotypes Compound evidence IDs as a percentage of
# total_number_not_supported_variant_unique_evidence_items (set in an earlier cell).
genotypes_compound_percentage_not_supported_evidence_items = (
    unique_genotypes_compound_evidence_ids / total_number_not_supported_variant_unique_evidence_items
) * 100
f"Percentage of all Not Supported Evidence Items in Genotypes Compound category:{genotypes_compound_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Genotypes Compound category:0.18726591760299627%'

### Region Defined Variant

In [116]:
# Display the DataFrame holding the Region Defined Variant category rows.
region_defined_variant_non_supp_var_df 

Unnamed: 0,variant_id,gene_name,variant_name,category,variant_accepted,evidence_ids,evidence_status,evidence_rating,evidence_level
1,4188,VHL,10kb Deletion,Region Defined Variant,False,10678,[submitted],[3],[C]
4,4186,VHL,2kb Deletion,Region Defined Variant,False,10791,[submitted],[3],[C]
4,4186,VHL,2kb Deletion,Region Defined Variant,False,10676,[submitted],[2],[C]
5,823,EPCAM,3' Exon Deletion,Region Defined Variant,True,1901,[accepted],[4],[B]
6,253,EGFR,3' UTR MUTATION,Region Defined Variant,False,663,[rejected],[4],[B]
...,...,...,...,...,...,...,...,...,...
1030,536,SULT1E1,SNP,Region Defined Variant,False,1312,[rejected],[2],[B]
1048,491,HSPH1,T17 DELETION,Region Defined Variant,True,1164,[accepted],[2],[B]
1169,2349,IL7R,exon 6 mutations,Region Defined Variant,False,6228,[submitted],[4],[C]
1178,2845,HOXD8,mutation,Region Defined Variant,False,7670,[submitted],[2],[C]


In [117]:
# Distinct variant IDs in the Region Defined Variant rows (a variant_id can span several rows).
unique_region_defined_variant_ids = len(set(region_defined_variant_non_supp_var_df["variant_id"]))
f"Number of unique variant ID's in Region Defined Variant category: {unique_region_defined_variant_ids}"

"Number of unique variant ID's in Region Defined Variant category: 190"

In [118]:
# Unique Region Defined variants as a percentage of all CIViC variants.
region_defined_percentage_civic_variant_items = (unique_region_defined_variant_ids / total_number_variants) * 100
f"Percentage of all CIViC Variant Items in Region Defined category: {region_defined_percentage_civic_variant_items}%"

'Percentage of all CIViC Variant Items in Region Defined category: 5.414648047876888%'

In [119]:
# Distinct evidence IDs in the Region Defined Variant rows; the set collapses repeats.
unique_region_defined_evidence_ids = len(set(region_defined_variant_non_supp_var_df["evidence_ids"]))
f"Number of unique evidence ID's in Region Defined Variant category: {unique_region_defined_evidence_ids}"

"Number of unique evidence ID's in Region Defined Variant category: 894"

In [120]:
# Length of the Region Defined Variant evidence_ids column, repeated IDs included.
region_defined_evidence_ids = len(region_defined_variant_non_supp_var_df["evidence_ids"])
f"Number of evidence ID's in Region Defined Variant category: {region_defined_evidence_ids}"

"Number of evidence ID's in Region Defined Variant category: 896"

In [121]:
# Unique Region Defined evidence IDs as a percentage of all CIViC evidence items.
region_defined_percentage_civic_evidence_items = (unique_region_defined_evidence_ids / total_number_evidences) * 100
f"Percentage of all CIViC Evidence Items in Region Defined category:{region_defined_percentage_civic_evidence_items}%"

'Percentage of all CIViC Evidence Items in Region Defined category:8.979509843310566%'

In [122]:
# Unique Region Defined evidence IDs as a percentage of
# total_number_not_supported_variant_unique_evidence_items (set in an earlier cell).
region_defined_percentage_not_supported_evidence_items = (
    unique_region_defined_evidence_ids / total_number_not_supported_variant_unique_evidence_items
) * 100
f"Percentage of all Not Supported Evidence Items in Region Defined category:{region_defined_percentage_not_supported_evidence_items}%"

'Percentage of all Not Supported Evidence Items in Region Defined category:23.91653290529695%'

# Statistical Analysis

In [123]:
# Column data for the top-level summary table. Every list is ordered
# Normalized, Unable to Normalize, Not Supported.
all_variant_dict = {
    "Variant Category": [
        "Normalized",
        "Unable to Normalize",
        "Not Supported",
    ],
    # Disabled column, kept for reference:
    # 'Total Number of Evidence Items': [total_number_not_normalized_variant_unique_evidence_items, total_number_not_normalized_variant_unique_evidence_items, total_number_not_supported_variant_unique_evidence_items],
    # NOTE(review): the first two entries of the disabled list are the same name; the
    # first was presumably meant to be the normalized count. Confirm before re-enabling.
    "Percentage of all CIViC Evidence Items": [
        normalized_percentage_evidence_items,
        not_normalized_percentage_evidence_items,
        not_supported_variant_percentage_evidence_items,
    ],
    "Percentage of all CIViC Variant Items": [
        percentage_normalized_variants,
        percentage_not_normalized_variants,
        percentage_not_supported_variants,
    ],
}

In [124]:
# Build the summary table from the column dictionary and display it.
variant_statistics_df = pd.DataFrame(data=all_variant_dict)
variant_statistics_df

Unnamed: 0,Variant Category,Percentage of all CIViC Evidence Items,Percentage of all CIViC Variant Items
0,Normalized,58.708317,52.949558
1,Unable to Normalize,0.803536,1.795383
2,Not Supported,37.545199,34.31177


In [125]:
# Column data for the per-category breakdown of not-supported variants.
# Every list follows the order of the 'Category' list, one entry per category.
# The last key is spelled 'Percentage of all CIViC Variant Items' (it previously
# read 'Varinat'), matching the same column name used in all_variant_dict.
not_supported_variant_dict = {
    'Category': [
        'Expression',
        'Epigenetic Modification',
        'Fusion',
        'Protein Consequence',
        'Gene Function',
        'Rearrangements',
        'Copy Number',
        'Other',
        'Genotypes Easy',
        'Genotypes Compound',
        'Region Defined Variant',
    ],
    # Disabled column, kept for reference:
    # 'Total Number of Evidence Items': [expression_evidence_ids, epigenetic_evidence_ids, fusion_evidence_ids, protein_evidece_ids, gene_function_evidence_ids, rearrangements_evidence_ids, copy_number_evidence_ids, other_evidence_ids, genotypes_easy_evidence_ids, genotypes_compound_evidence_ids, region_defined_evidence_ids],
    'Percentage of all Not Supported Evidence Items': [
        expression_percentage_not_supported_evidence_items,
        epigenetic_percentage_not_supported_evidence_items,
        fusion_percentage_not_supported_evidence_items,
        protein_evidece_percentage_not_supported_evidence_items,
        gene_function_percentage_not_supported_evidence_items,
        rearrangements_percentage_not_supported_evidence_items,
        copy_number_percentage_not_supported_evidence_items,
        other_percentage_not_supported_evidence_items,
        genotypes_easy_percentage_not_supported_evidence_items,
        genotypes_compound_percentage_not_supported_evidence_items,
        region_defined_percentage_not_supported_evidence_items,
    ],
    'Percentage of all CIViC Evidence Items': [
        expression_percentage_civic_evidence_items,
        epigenetic_percentage_civic_evidence_items,
        fusion_percentage_civic_evidence_items,
        protein_evidece_percentage_civic_evidence_items,
        gene_function_percentage_civic_evidence_items,
        rearrangements_percentage_civic_evidence_items,
        copy_number_percentage_civic_evidence_items,
        other_percentage_civic_evidence_items,
        genotypes_easy_percentage_civic_evidence_items,
        genotypes_compound_percentage_civic_evidence_items,
        region_defined_percentage_civic_evidence_items,
    ],
    'Percentage of all CIViC Variant Items': [
        expression_percentage_civic_variant_items,
        epigenetic_modification_percentage_civic_variant_items,
        fusion_percentage_civic_variant_items,
        protein_consequence_percentage_civic_variant_items,
        gene_function_percentage_civic_variant_items,
        rearrangements_percentage_civic_variant_items,
        copy_number_percentage_civic_variant_items,
        other_percentage_civic_variant_items,
        genotypes_easy_percentage_civic_variant_items,
        genotypes_compound_percentage_civic_variant_items,
        region_defined_percentage_civic_variant_items,
    ],
}

In [126]:
# Build the per-category table from the column dictionary and display it.
not_supported_variant_statistics_df = pd.DataFrame(data=not_supported_variant_dict)
not_supported_variant_statistics_df

Unnamed: 0,Category,Percentage of all Not Supported Evidence Items,Percentage of all CIViC Evidence Items,Percentage of all CIViC Varinat Items
0,Expression,16.506153,6.197268,8.292961
1,Epigenetic Modification,0.642055,0.241061,0.427472
2,Fusion,31.728197,11.912415,8.577942
3,Protein Consequence,7.891921,2.963037,3.647763
4,Gene Function,9.17603,3.445159,2.479339
5,Rearrangements,3.638309,1.36601,1.424907
6,Copy Number,1.845907,0.693049,0.968937
7,Other,4.280364,1.607071,2.62183
8,Genotypes Easy,0.535045,0.200884,0.284981
9,Genotypes Compound,0.187266,0.070309,0.170989


#### Notes: 
    Kori's analysis has variants under the transcript variant category, while I do not
    Kori's analysis yields 45.26% of not supported variants, mine yields 34.4%
    other categories in not supported variants match in variant percentages b/w Kori's and mine