# Analysis of Normalized Queries

This notebook contains an analysis of CIViC evidence data for variant queries that were normalized, could not be normalized, or are not supported.

In [1]:
import csv
import pandas as pd
from civicpy import civic

In [2]:
# Refresh the local civicpy cache before any data is read.
# NOTE(review): from_remote_cache=False presumably rebuilds the cache from the
# live CIViC API instead of downloading the prebuilt remote cache, which can be
# slow — confirm against the civicpy documentation.
# The saved output of this cell is a KeyboardInterrupt, i.e. the last recorded
# run was interrupted before this call returned.
civic.update_cache(from_remote_cache=False)



KeyboardInterrupt: 

In [None]:
# Load the civicpy cache into memory.
# NOTE(review): on_stale="ignore" presumably makes civicpy use the cache as-is
# even when it is considered stale — confirm against the civicpy documentation.
civic.load_cache(on_stale="ignore")

## Import a list of all the variants in CIViC

In [None]:
# Fetch every variant known to civicpy.
# Despite the name, later cells treat the elements as variant records: they
# read `.id` and `.molecular_profiles` from each one.
civic_variant_ids = civic.get_all_variants()

In [None]:
# Count the variant records returned by civicpy and report the total.
total_number_variants = len(civic_variant_ids)
# The bare f-string is the cell's last expression, so the notebook displays it.
f"Total Number of variants in CIViC: {total_number_variants}"

## Import a list of all the evidence items from CIViC

In [None]:
# Fetch every evidence item known to civicpy.
# Despite the name, later cells treat the elements as evidence records: they
# read `.id`, `.status`, `.rating` and `.evidence_level` from each one.
civic_evidence_ids = civic.get_all_evidence()

In [None]:
# Count the evidence records returned by civicpy and report the total.
total_number_evidences = len(civic_evidence_ids)
# The bare f-string is the cell's last expression, so the notebook displays it.
f"Total Number of evidence items in CIViC: {total_number_evidences}"

## Import queries that were normalized

In [None]:
# Read the queries that could be normalized. The file carries a .csv extension
# but its fields are tab-delimited, hence the explicit delimiter.
normalized_queries_df = pd.read_csv(
    "./able_to_normalize_queries.csv",
    delimiter="\t",
)
normalized_queries_df

## Create a list of the variant IDs of the normalized variants

In [None]:
# Pull the variant_id column out of the table as a plain Python list.
normalized_variant_id_list = list(normalized_queries_df.variant_id)

In [None]:
# Count the normalized variant IDs (one per row of the input table) and report it.
total_number_normalized_variants = len(normalized_variant_id_list)
# The bare f-string is the cell's last expression, so the notebook displays it.
f"Total Number of variants that were normalized: {total_number_normalized_variants}"

## Import evidence IDs associated with the normalized variants using variant ID

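Evidence items are linked to the molecular profiles associated with variant items, so evidence IDs have to be pulled through each variant's molecular profiles. For more information on the structure of a variant item, see the CIViC documentation (https://civic.readthedocs.io/en/latest/model/variants.html).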

In [None]:
# For every normalized variant, collect the IDs of the evidence items reachable
# through its molecular profiles and store them in a new "evidence_ids" column.
#
# Placeholder conventions (unchanged from the original implementation):
#   * variant found in CIViC but without evidence -> ""  (stays "" after explode)
#   * variant ID not present in CIViC             -> []  (becomes NaN after explode)

# Index the CIViC variants by ID once, so each query is a dictionary lookup
# instead of a scan over every CIViC variant.
civic_variants_by_id = {}
for variant in civic_variant_ids:
    # setdefault keeps the first record should an ID ever appear twice.
    civic_variants_by_id.setdefault(variant.id, variant)

normalized_variants_evidence_ids = []
for v in normalized_variant_id_list:
    variant = civic_variants_by_id.get(int(v))
    if variant is None:
        variant_evidence_id_list = []
    else:
        # dict.fromkeys drops duplicate IDs while preserving first-seen order;
        # an empty result is replaced by the "" placeholder.
        variant_evidence_id_list = list(dict.fromkeys(
            e.id
            for mp in variant.molecular_profiles
            for e in mp.evidence_items
        )) or ""
    normalized_variants_evidence_ids.append(variant_evidence_id_list)

normalized_queries_df["evidence_ids"] = normalized_variants_evidence_ids
normalized_queries_df

## Split the list of evidence IDs of each variant to be one per line

In [None]:
# Expand the list-valued evidence_ids column so that every evidence ID gets its
# own row; the other columns are repeated for each expanded row.
normalized_queries_df = normalized_queries_df.explode("evidence_ids")
normalized_queries_df

## Create a list of the evidence IDs of the normalized variants

In [None]:
# Pull the (now one-per-row) evidence_ids column out as a plain Python list.
normalized_variant_evidence_id_list = list(normalized_queries_df.evidence_ids)

In [None]:
# Count the rows of the exploded table, i.e. every (variant, evidence ID)
# pairing, including repeats of the same evidence ID across variants.
total_number_normalized_variant_evidence_items = len(normalized_variant_evidence_id_list)
# The bare f-string is the cell's last expression, so the notebook displays it.
f"Total Number of times evidence items were associated with a normalized variant: {total_number_normalized_variant_evidence_items}"

In [None]:
# Collapse the evidence_ids column to its distinct values and count them.
# Any placeholder value present in the column counts as one distinct entry.
total_number_normalized_variant_unique_evidence_items = len(
    set(normalized_queries_df["evidence_ids"])
)
f"Total Number of unique evidence items associated with normalized variants: {total_number_normalized_variant_unique_evidence_items}"

## Import evidence status, rating, and level associated with a specific evidence ID
Please see the CIViC documentation for evidence item attribute descriptions (https://civic.readthedocs.io/en/latest/model/evidence.html).

In [None]:
# Look up the status, rating and level of every evidence ID attached to a
# normalized variant and add them as three new columns.
#
# Each cell holds a one-element list (e.g. [status]) when the evidence ID is
# found in CIViC and an empty list otherwise, matching the original output.

# Index the CIViC evidence items by ID once, so each lookup is a dictionary
# access instead of a scan over every evidence item.
civic_evidence_by_id = {}
for evidence in civic_evidence_ids:
    # setdefault keeps the first record should an ID ever appear twice.
    civic_evidence_by_id.setdefault(evidence.id, evidence)

normalized_variants_evidence_statuses = []
normalized_variants_evidence_ratings = []
normalized_variants_evidence_levels = []
for e in normalized_variant_evidence_id_list:
    try:
        evidence = civic_evidence_by_id.get(int(e))
    except (TypeError, ValueError):
        # Rows for variants without evidence carry a "" or NaN placeholder
        # after explode(); int() cannot convert those, so treat them as
        # "no evidence item" instead of aborting the whole cell.
        evidence = None

    if evidence is None:
        normalized_variants_evidence_statuses.append([])
        normalized_variants_evidence_ratings.append([])
        normalized_variants_evidence_levels.append([])
    else:
        normalized_variants_evidence_statuses.append([evidence.status])
        normalized_variants_evidence_ratings.append([evidence.rating])
        normalized_variants_evidence_levels.append([evidence.evidence_level])

normalized_queries_df["evidence_status"] = normalized_variants_evidence_statuses
normalized_queries_df["evidence_rating"] = normalized_variants_evidence_ratings
normalized_queries_df["evidence_level"] = normalized_variants_evidence_levels
normalized_queries_df

# Unable to Normalize Queries

In [None]:
# Read the queries that could not be normalized. The file carries a .csv
# extension but its fields are tab-delimited, hence the explicit delimiter.
non_norm_quer_df = pd.read_csv(
    "./unable_to_normalize_queries.csv",
    delimiter="\t",
)
non_norm_quer_df

In [None]:
# Pull the variant_id column out of the table as a plain Python list.
nn_variant_id_list = list(non_norm_quer_df.variant_id)

In [None]:
# For every variant that could not be normalized, collect the IDs of the
# evidence items reachable through its molecular profiles and store them in a
# new "evidence_ids" column.
#
# Placeholder conventions (unchanged from the original implementation):
#   * variant found in CIViC but without evidence -> ""  (stays "" after explode)
#   * variant ID not present in CIViC             -> []  (becomes NaN after explode)

# Index the CIViC variants by ID once, so each query is a dictionary lookup
# instead of a scan over every CIViC variant.
civic_variants_by_id = {}
for variant in civic_variant_ids:
    # setdefault keeps the first record should an ID ever appear twice.
    civic_variants_by_id.setdefault(variant.id, variant)

nn_e_ids = []
for v in nn_variant_id_list:
    variant = civic_variants_by_id.get(int(v))
    if variant is None:
        nn_civic_e_ids = []
    else:
        # dict.fromkeys drops duplicate IDs while preserving first-seen order;
        # an empty result is replaced by the "" placeholder.
        nn_civic_e_ids = list(dict.fromkeys(
            e.id
            for mp in variant.molecular_profiles
            for e in mp.evidence_items
        )) or ""
    nn_e_ids.append(nn_civic_e_ids)

non_norm_quer_df["evidence_ids"] = nn_e_ids
non_norm_quer_df

In [None]:
# Expand the list-valued evidence_ids column so that every evidence ID gets its
# own row; the other columns are repeated for each expanded row.
non_norm_quer_df = non_norm_quer_df.explode("evidence_ids")
non_norm_quer_df

In [None]:
# Pull the (now one-per-row) evidence_ids column out as a plain Python list.
nn_evidence_id_list = list(non_norm_quer_df.evidence_ids)

In [None]:
# Look up the status, rating and level of every evidence ID attached to a
# variant that could not be normalized and add them as three new columns.
#
# Each cell holds a one-element list (e.g. [status]) when the evidence ID is
# found in CIViC and an empty list otherwise, matching the original output.

# Index the CIViC evidence items by ID once, so each lookup is a dictionary
# access instead of a scan over every evidence item.
civic_evidence_by_id = {}
for evidence in civic_evidence_ids:
    # setdefault keeps the first record should an ID ever appear twice.
    civic_evidence_by_id.setdefault(evidence.id, evidence)

nn_e_status = []
nn_e_rating = []
nn_e_level = []
for e in nn_evidence_id_list:
    try:
        evidence = civic_evidence_by_id.get(int(e))
    except (TypeError, ValueError):
        # Rows for variants without evidence carry a "" or NaN placeholder
        # after explode(); int() cannot convert those, so treat them as
        # "no evidence item" instead of aborting the whole cell.
        evidence = None

    if evidence is None:
        nn_e_status.append([])
        nn_e_rating.append([])
        nn_e_level.append([])
    else:
        nn_e_status.append([evidence.status])
        nn_e_rating.append([evidence.rating])
        nn_e_level.append([evidence.evidence_level])

non_norm_quer_df["evidence_status"] = nn_e_status
non_norm_quer_df["evidence_rating"] = nn_e_rating
non_norm_quer_df["evidence_level"] = nn_e_level
non_norm_quer_df

In [None]:
# Number of distinct values in the evidence_ids column of the
# unable-to-normalize table (placeholder values count as one entry each).
len(set(non_norm_quer_df["evidence_ids"]))

# Not Supported Variants

In [None]:
# Read the variants that are not supported. The file carries a .csv extension
# but its fields are tab-delimited, hence the explicit delimiter.
non_supp_var_df = pd.read_csv(
    "./not_supported_variants.csv",
    delimiter="\t",
)
non_supp_var_df

In [None]:
# Pull the variant_id column out of the table as a plain Python list.
ns_variant_id_list = list(non_supp_var_df.variant_id)

In [None]:
# For every not-supported variant, collect the IDs of the evidence items
# reachable through its molecular profiles and store them in a new
# "evidence_ids" column.
#
# Placeholder conventions (unchanged from the original implementation):
#   * variant found in CIViC but without evidence -> ""  (stays "" after explode)
#   * variant ID not present in CIViC             -> []  (becomes NaN after explode)

# Index the CIViC variants by ID once, so each query is a dictionary lookup
# instead of a scan over every CIViC variant.
civic_variants_by_id = {}
for variant in civic_variant_ids:
    # setdefault keeps the first record should an ID ever appear twice.
    civic_variants_by_id.setdefault(variant.id, variant)

ns_e_ids = []
for v in ns_variant_id_list:
    variant = civic_variants_by_id.get(int(v))
    if variant is None:
        ns_civic_e_ids = []
    else:
        # dict.fromkeys drops duplicate IDs while preserving first-seen order;
        # an empty result is replaced by the "" placeholder.
        ns_civic_e_ids = list(dict.fromkeys(
            e.id
            for mp in variant.molecular_profiles
            for e in mp.evidence_items
        )) or ""
    ns_e_ids.append(ns_civic_e_ids)

non_supp_var_df["evidence_ids"] = ns_e_ids
non_supp_var_df

In [None]:
# Expand the list-valued evidence_ids column so that every evidence ID gets its
# own row; the other columns are repeated for each expanded row.
non_supp_var_df = non_supp_var_df.explode("evidence_ids")
non_supp_var_df

In [None]:
# Pull the (now one-per-row) evidence_ids column out as a plain Python list.
ns_evidence_id_list = list(non_supp_var_df.evidence_ids)

In [None]:
# Look up the status, rating and level of every evidence ID attached to a
# not-supported variant and add them as three new columns.
#
# Each cell holds a one-element list (e.g. [status]) when the evidence ID is
# found in CIViC and an empty list otherwise, matching the original output.

# Index the CIViC evidence items by ID once, so each lookup is a dictionary
# access instead of a scan over every evidence item.
civic_evidence_by_id = {}
for evidence in civic_evidence_ids:
    # setdefault keeps the first record should an ID ever appear twice.
    civic_evidence_by_id.setdefault(evidence.id, evidence)

ns_e_status = []
ns_e_rating = []
ns_e_level = []
for e in ns_evidence_id_list:
    try:
        evidence = civic_evidence_by_id.get(int(e))
    except (TypeError, ValueError):
        # Rows for variants without evidence carry a "" or NaN placeholder
        # after explode(); int() cannot convert those, so treat them as
        # "no evidence item" instead of aborting the whole cell.
        evidence = None

    if evidence is None:
        ns_e_status.append([])
        ns_e_rating.append([])
        ns_e_level.append([])
    else:
        ns_e_status.append([evidence.status])
        ns_e_rating.append([evidence.rating])
        ns_e_level.append([evidence.evidence_level])

non_supp_var_df["evidence_status"] = ns_e_status
non_supp_var_df["evidence_rating"] = ns_e_rating
non_supp_var_df["evidence_level"] = ns_e_level
non_supp_var_df

In [None]:
# List the distinct category labels present in the not-supported table.
non_supp_var_df["category"].unique()

In [None]:
# Select every row whose evidence ID occurs more than once in the table;
# keep=False flags all members of each repeated group, not just the later ones.
shared_evidence_mask = non_supp_var_df.duplicated(subset='evidence_ids', keep=False)
duplicate = non_supp_var_df[shared_evidence_mask]
duplicate

In [None]:
# Number of distinct values in the evidence_ids column of the not-supported
# table (placeholder values count as one entry each).
len(set(non_supp_var_df["evidence_ids"]))

In [None]:
# Total number of rows in the exploded not-supported table, repeats included.
len(non_supp_var_df["evidence_ids"])

In [None]:
# Split the not-supported variants into one DataFrame per category label.
# The category column is looked up once and reused for every boolean mask.
ns_category = non_supp_var_df["category"]

expression_non_supp_var_df = non_supp_var_df[ns_category == 'Expression']
epigenetic_modification_non_supp_var_df = non_supp_var_df[ns_category == 'Epigenetic Modification']
fusion_non_supp_var_df = non_supp_var_df[ns_category == 'Fusion']
protein_consequence_non_supp_var_df = non_supp_var_df[ns_category == 'Protein Consequence']
gene_function_non_supp_var_df = non_supp_var_df[ns_category == 'Gene Function']
rearrangements_non_supp_var_df = non_supp_var_df[ns_category == 'Rearrangements']
copy_number_non_supp_var_df = non_supp_var_df[ns_category == 'Copy Number']
other_non_supp_var_df = non_supp_var_df[ns_category == 'Other']
genotypes_easy_non_supp_var_df = non_supp_var_df[ns_category == 'Genotypes Easy']
genotypes_compound_non_supp_var_df = non_supp_var_df[ns_category == 'Genotypes Compound']
region_defined_variant_non_supp_var_df = non_supp_var_df[ns_category == 'Region Defined Variant']
transcript_variant_non_supp_var_df = non_supp_var_df[ns_category == 'Transcript Variant']

In [None]:
# Display the not-supported rows whose category is 'Expression'.
expression_non_supp_var_df

In [None]:
# Number of distinct evidence_ids values among the 'Expression' rows.
len(set(expression_non_supp_var_df["evidence_ids"]))

In [None]:
# Number of rows (variant/evidence pairings) among the 'Expression' rows.
expression_non_supp_var_df.shape[0]

In [None]:
# Display the not-supported rows whose category is 'Epigenetic Modification'.
epigenetic_modification_non_supp_var_df

In [None]:
# Number of distinct evidence_ids values among the 'Epigenetic Modification' rows.
len(set(epigenetic_modification_non_supp_var_df["evidence_ids"]))

In [None]:
# Display the not-supported rows whose category is 'Fusion'.
fusion_non_supp_var_df

In [None]:
# Number of distinct evidence_ids values among the 'Fusion' rows.
len(set(fusion_non_supp_var_df["evidence_ids"]))

In [None]:
# Display the not-supported rows whose category is 'Protein Consequence'.
protein_consequence_non_supp_var_df 

In [None]:
# Number of distinct evidence_ids values among the 'Protein Consequence' rows.
len(set(protein_consequence_non_supp_var_df["evidence_ids"]))

In [None]:
# Display the not-supported rows whose category is 'Gene Function'.
gene_function_non_supp_var_df 

In [None]:
# Number of distinct evidence_ids values among the 'Gene Function' rows.
len(set(gene_function_non_supp_var_df["evidence_ids"]))

In [None]:
# Display the not-supported rows whose category is 'Rearrangements'.
rearrangements_non_supp_var_df 

In [None]:
# Number of distinct evidence_ids values among the 'Rearrangements' rows.
len(set(rearrangements_non_supp_var_df["evidence_ids"]))

In [None]:
# Display the not-supported rows whose category is 'Copy Number'.
copy_number_non_supp_var_df 

In [None]:
# Number of distinct evidence_ids values among the 'Copy Number' rows.
len(set(copy_number_non_supp_var_df["evidence_ids"]))

In [None]:
# Display the not-supported rows whose category is 'Other'.
other_non_supp_var_df

In [None]:
# Number of distinct evidence_ids values among the 'Other' rows.
len(set(other_non_supp_var_df["evidence_ids"]))

In [None]:
# Display the not-supported rows whose category is 'Genotypes Easy'.
genotypes_easy_non_supp_var_df 

In [None]:
# Number of distinct evidence_ids values among the 'Genotypes Easy' rows.
len(set(genotypes_easy_non_supp_var_df["evidence_ids"]))

In [None]:
# Display the not-supported rows whose category is 'Genotypes Compound'.
genotypes_compound_non_supp_var_df 

In [None]:
# Number of distinct evidence_ids values among the 'Genotypes Compound' rows.
len(set(genotypes_compound_non_supp_var_df["evidence_ids"]))

In [None]:
# Display the not-supported rows whose category is 'Region Defined Variant'.
region_defined_variant_non_supp_var_df 

In [None]:
# Number of distinct evidence_ids values among the 'Region Defined Variant' rows.
len(set(region_defined_variant_non_supp_var_df["evidence_ids"]))

In [None]:
# Display the not-supported rows whose category is 'Transcript Variant'.
transcript_variant_non_supp_var_df 

In [None]:
# Number of distinct evidence_ids values among the 'Transcript Variant' rows.
len(set(transcript_variant_non_supp_var_df["evidence_ids"]))