In [62]:
import os
from collections import Counter
from itertools import product

import pandas as pd
import plotly.express as px



## Join and Normalize GENIE Data
As part of the MetaKB v2 analysis, we use part of the Project GENIE dataset to assess how many interpretations the aggregate knowledgebase offers for the concepts (diseases, genes, and variants) observed in real patient samples. We join the mutation and clinical datasets, normalize their values to concept identifiers, and quantify the matching interpretations.

In [63]:
# Project GENIE mutation dataset (tab-separated); preview the first five rows.
muts = pd.read_csv(
    'genie_data/data_mutations_extended.txt',
    sep='\t',
    low_memory=False,
)
muts.head(5)

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,FILTER,Polyphen_Prediction,Polyphen_Score,SIFT_Prediction,SIFT_Score,SWISSPROT,n_depth,t_depth,Annotation_Status,mutationInCis_Flag
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,PASS,probably_damaging,0.991,deleterious,0.04,,,1623.0,SUCCESS,False
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,PASS,probably_damaging,0.963,deleterious,0.0,,,1031.0,SUCCESS,False
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,PASS,probably_damaging,1.0,deleterious,0.0,,,692.0,SUCCESS,False
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,PASS,possibly_damaging,0.643,tolerated,0.13,,,930.0,SUCCESS,False
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,PASS,benign,0.251,tolerated,0.06,,,2277.0,SUCCESS,False


In [64]:
# Project GENIE clinical sample dataset (tab-separated).
# header=4 takes the column names from row index 4 of the file.
clin = pd.read_csv(
    'genie_data/data_clinical_sample.txt',
    sep='\t',
    header=4,
    low_memory=False,
)
clin.head(5)

Unnamed: 0,PATIENT_ID,SAMPLE_ID,AGE_AT_SEQ_REPORT,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS
0,GENIE-VICC-101416,GENIE-VICC-101416-unk-1,52,MAAP,Primary,VICC-01-T7,Appendiceal Cancer,Mucinous Adenocarcinoma of the Appendix,Primary tumor,Tumor
1,GENIE-VICC-102225,GENIE-VICC-102225-unk-1,31,READ,Metastasis,VICC-01-T7,Colorectal Cancer,Rectal Adenocarcinoma,Metastasis site unspecified,Tumor
2,GENIE-VICC-102424,GENIE-VICC-102424-unk-1,63,SCCNOS,Metastasis,VICC-01-T7,Cancer of Unknown Primary,"Squamous Cell Carcinoma, NOS",Metastasis site unspecified,Tumor
3,GENIE-VICC-102966,GENIE-VICC-102966-unk-1,64,LUAD,Metastasis,VICC-01-T7,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Metastasis site unspecified,Tumor
4,GENIE-VICC-103244,GENIE-VICC-103244-unk-1,50,IDC,Primary,VICC-01-T5A,Breast Cancer,Breast Invasive Ductal Carcinoma,Primary tumor,Tumor


In [65]:
# Attach clinical annotations to every mutation row. The join key is the sample
# identifier (Tumor_Sample_Barcode in the mutation table, SAMPLE_ID in the
# clinical table); the left join keeps mutation rows without a clinical match.
df = muts.merge(
    clin,
    how="left",
    left_on='Tumor_Sample_Barcode',
    right_on='SAMPLE_ID',
)
df[0:5]

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,PATIENT_ID,SAMPLE_ID,AGE_AT_SEQ_REPORT,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor


In [82]:
# Row count of `df` as of when this cell was executed.
len(df)

1876024

In [66]:
# From the MetaKB v1 paper --> exclude Silent, 3'Flank, 3'UTR, 5'Flank, 5'UTR, Intron, Splice_Region.
# Consequence terms considered for exclusion: 3_prime_UTR_variant, 5_prime_UTR_variant,
# splice_region_variant, splice_acceptor_variant, splice_donor_variant, intron_variant, synonymous_variant
#
# Gather every individual consequence term. One entry may hold several
# comma-separated terms; float entries (presumably NaN for missing values) are skipped.
all_consequences = [
    term
    for entry in df['Consequence']
    if not isinstance(entry, float)
    for term in entry.split(',')
]
set(all_consequences)

# TODO: how do start/stop_retained variants function? are these silent?

{'3_prime_UTR_variant',
 '5_prime_UTR_variant',
 'NMD_transcript_variant',
 'coding_sequence_variant',
 'downstream_gene_variant',
 'frameshift_variant',
 'incomplete_terminal_codon_variant',
 'inframe_deletion',
 'inframe_insertion',
 'intron_variant',
 'mature_miRNA_variant',
 'missense_variant',
 'non_coding_transcript_exon_variant',
 'non_coding_transcript_variant',
 'protein_altering_variant',
 'splice_acceptor_variant',
 'splice_donor_variant',
 'splice_region_variant',
 'start_lost',
 'start_retained_variant',
 'stop_gained',
 'stop_lost',
 'stop_retained_variant',
 'synonymous_variant',
 'upstream_gene_variant'}

In [67]:
# Rows without a Consequence annotation (NA) are presumed silent (unconfirmed) and dropped.
has_consequence = df['Consequence'].notna()
df = df[has_consequence].reset_index(drop=True)
df[0:5]

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,PATIENT_ID,SAMPLE_ID,AGE_AT_SEQ_REPORT,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor


In [68]:
# Using the consequence terms listed above, exclude variants that carry any of
# the following consequence terms.
exclusion_criteria = ['3_prime_UTR_variant',
                      '5_prime_UTR_variant',
                      'splice_region_variant',
                      'splice_acceptor_variant',
                      'splice_donor_variant',
                      'intron_variant',
                      'synonymous_variant']

# A row is flagged when ANY of its comma-separated consequence terms is in the
# exclusion list. The flag is computed in one pass over the column instead of
# with iterrows()/df.at, which is very slow on a table with millions of rows.
excluded_terms = frozenset(exclusion_criteria)
df['Exclusion?'] = pd.Series(
    [not excluded_terms.isdisjoint(entry.split(',')) for entry in df['Consequence']],
    index=df.index,
    dtype=bool,
)

# Keep only the rows that were not flagged.
df = df[~df['Exclusion?']].reset_index(drop=True)
df[0:5]

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,SAMPLE_ID,AGE_AT_SEQ_REPORT,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS,Exclusion?
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False


In [9]:
# Get concept IDs for everything attached to the patients from the following source fields.
# Variant -> HGVSp_Short (combined with Hugo_Symbol into `var_norm_query` further down;
#            HGVSc is not used in the cells visible here)
# Gene -> Hugo_Symbol
# Disease -> CANCER_TYPE
# List the available columns to check that those fields are present.
df.keys()

Index(['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build', 'Chromosome',
       'Start_Position', 'End_Position', 'Strand', 'Consequence',
       'Variant_Classification', 'Variant_Type', 'Reference_Allele',
       'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS',
       'dbSNP_Val_Status', 'Tumor_Sample_Barcode',
       'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1',
       'Match_Norm_Seq_Allele2', 'Tumor_Validation_Allele1',
       'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1',
       'Match_Norm_Validation_Allele2', 'Verification_Status',
       'Validation_Status', 'Mutation_Status', 'Sequencing_Phase',
       'Sequence_Source', 'Validation_Method', 'Score', 'BAM_File',
       'Sequencer', 't_ref_count', 't_alt_count', 'n_ref_count', 'n_alt_count',
       'HGVSc', 'HGVSp', 'HGVSp_Short', 'Transcript_ID', 'RefSeq',
       'Protein_position', 'Codons', 'Exon_Number', 'gnomAD_AF',
       'gnomAD_AFR_AF', 'gnomAD_AMR_AF', 'gnomAD_ASJ_AF', 'gnomAD_EAS_AF

### Normalize

In [69]:
import requests 

def normalize_disease(term):
    """Look up the normalized concept identifier for a disease term.

    Sends ``term`` to the disease normalizer service and returns the value
    found at ``['disease']['id']`` in the JSON response.

    Args:
        term: Disease name or label to normalize.

    Returns:
        The identifier from the response, or ``None`` when ``term`` is missing
        (``None`` or NaN), the service does not answer with HTTP 200, or the
        response carries no disease identifier.

    Raises:
        requests.RequestException: Network-level failures are not caught here.
    """
    # Missing values arrive as None or float NaN (NaN is the only float that
    # is not equal to itself); there is nothing to look up for them.
    if term is None or (isinstance(term, float) and term != term):
        return None

    # Passing the term through `params` lets requests URL-encode it, so terms
    # containing characters such as '&', '#' or '+' reach the service intact.
    r = requests.get(
        'https://normalize.cancervariants.org/disease/normalize',
        params={'q': term},
    )
    if r.status_code != 200:
        # Previously this path hit an unbound `identifier` and raised.
        return None
    try:
        return r.json()['disease']['id']
    except (KeyError, TypeError, ValueError):
        # No match in the payload (missing key / null entry) or invalid JSON.
        return None

# Normalize each distinct CANCER_TYPE value once, then map the resulting
# identifiers back onto every row of df.
unique_diseases = [disease for disease in df['CANCER_TYPE'].unique()]
disease_identifiers = [None for _ in unique_diseases]
disease_wordlist = pd.DataFrame(
    {
        'unique_disease': unique_diseases,
        'disease_identifier': disease_identifiers,
    }
)
disease_wordlist['disease_identifier'] = disease_wordlist['unique_disease'].map(normalize_disease)
# TODO: why two variables?
disease_wordlist_map = disease_wordlist.set_index('unique_disease')['disease_identifier']
df['disease_concept_id'] = df['CANCER_TYPE'].map(disease_wordlist_map)
df[0:5]

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,AGE_AT_SEQ_REPORT,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS,Exclusion?,disease_concept_id
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926


In [70]:
def normalize_gene(term):
    """Look up the normalized concept identifier for a gene term.

    Sends ``term`` to the gene normalizer service and returns the value found
    at ``['gene']['id']`` in the JSON response.

    Args:
        term: Gene symbol or name to normalize.

    Returns:
        The identifier from the response, or ``None`` when ``term`` is missing
        (``None`` or NaN), the service does not answer with HTTP 200, or the
        response carries no gene identifier.

    Raises:
        requests.RequestException: Network-level failures are not caught here.
    """
    # Missing values arrive as None or float NaN (NaN is the only float that
    # is not equal to itself); there is nothing to look up for them.
    if term is None or (isinstance(term, float) and term != term):
        return None

    # Passing the term through `params` lets requests URL-encode it.
    r = requests.get(
        'https://normalize.cancervariants.org/gene/normalize',
        params={'q': term},
    )
    if r.status_code != 200:
        # Previously this path hit an unbound `identifier` and raised.
        return None
    try:
        return r.json()['gene']['id']
    except (KeyError, TypeError, ValueError):
        # No match in the payload (missing key / null entry) or invalid JSON.
        return None

# Normalize each distinct Hugo_Symbol value once, then map the resulting
# identifiers back onto every row of df.
unique_genes = [gene for gene in df['Hugo_Symbol'].unique()]
gene_identifiers = [None for _ in unique_genes]
gene_wordlist = pd.DataFrame(
    {
        'unique_gene': unique_genes,
        'gene_identifier': gene_identifiers,
    }
)

gene_wordlist['gene_identifier'] = gene_wordlist['unique_gene'].map(normalize_gene)
# From here on gene_wordlist is a Series: symbol -> identifier.
gene_wordlist = gene_wordlist.set_index('unique_gene')['gene_identifier']
df['gene_concept_id'] = df['Hugo_Symbol'].map(gene_wordlist)
df[0:5]

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS,Exclusion?,disease_concept_id,gene_concept_id
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:6407
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:1097
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:3236
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:11998
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:7989


In [71]:
def build_varnorm_query(HGVSp, Hugo_Symbol):
    """Build a "<gene> <protein change>" query string for variant normalization.

    Args:
        HGVSp: Protein-level change with a dotted prefix, e.g. ``"p.G12C"``.
        Hugo_Symbol: Gene symbol, e.g. ``"KRAS"``.

    Returns:
        ``"<Hugo_Symbol> <text after the first '.'>"`` (e.g. ``"KRAS G12C"``),
        or ``None`` when either input is missing (``None`` or a float such as
        NaN) or ``HGVSp`` is not a string containing a ``"."``.
    """
    # Missing gene symbols arrive as float NaN (or None).
    if Hugo_Symbol is None or isinstance(Hugo_Symbol, float):
        return None
    # Covers float NaN and None; only strings can be split below.
    if not isinstance(HGVSp, str):
        return None
    parts = HGVSp.split(".")
    # Without a "." there is no prefix to strip; previously this raised IndexError.
    if len(parts) < 2:
        return None
    return f'{Hugo_Symbol} {parts[1]}'
    
# Build the variant-normalizer query string for every row from its short
# protein change (HGVSp_Short) and gene symbol (Hugo_Symbol).
df['var_norm_query'] = [
    build_varnorm_query(hgvsp_short, hugo_symbol)
    for hgvsp_short, hugo_symbol in zip(df['HGVSp_Short'], df['Hugo_Symbol'])
]
df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS,Exclusion?,disease_concept_id,gene_concept_id,var_norm_query
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:6407,KRAS G12C
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:1097,BRAF V600E
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:3236,EGFR T790M
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:11998,TP53 R273H
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:7989,NRAS Q61R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876019,TBX3,6926.0,PROV,GRCh37,12,115112292,115112292,+,missense_variant,Missense_Mutation,...,Not Applicable or Heme,PROV-FOUNDATIONONELIQUIDCDX,UNKNOWN,UNKNOWN,Not applicable or hematologic malignancy,cfDNA,False,,normalize.gene.hgnc:11602,TBX3 T483M
1876020,AR,367.0,PROV,GRCh37,X,66765014,66765014,+,missense_variant,Missense_Mutation,...,Not Applicable or Heme,PROV-FOUNDATIONONELIQUIDCDX,UNKNOWN,UNKNOWN,Not applicable or hematologic malignancy,cfDNA,False,,normalize.gene.hgnc:644,AR R9K
1876021,MSH3,4437.0,PROV,GRCh37,5,80021325,80021325,+,missense_variant,Missense_Mutation,...,Not Applicable or Heme,PROV-FOUNDATIONONELIQUIDCDX,UNKNOWN,UNKNOWN,Not applicable or hematologic malignancy,cfDNA,False,,normalize.gene.hgnc:7326,MSH3 Y465C
1876022,ATM,472.0,PROV,GRCh37,11,108098589,108098589,+,missense_variant,Missense_Mutation,...,Not Applicable or Heme,PROV-FOUNDATIONONELIQUIDCDX,UNKNOWN,UNKNOWN,Not applicable or hematologic malignancy,cfDNA,False,,normalize.gene.hgnc:795,ATM K53N


In [13]:
# Number of distinct normalization queries; a missing value counts as one entry.
df['var_norm_query'].nunique(dropna=False)

767237

## Normalize Variants
Variant normalization takes a very long time to run, so run the block below only when needed. The data was most recently normalized on 2025-04-02. Otherwise, load the saved map from protein sequence consequence to normalized variant and join it onto the DataFrames as needed.

In [None]:
# TODO: Keep this for record of how normalize variants
# # Only run this once, takes a long long time

# import requests
# import pandas as pd
# from tqdm import tqdm

# tqdm.pandas()  # Enables progress_apply

# def normalize_variation(term):
#     try:
#         r = requests.get(f'http://127.0.0.1:8000/variation/normalize?q={term}', verify=False)
#         if r.status_code == 200:
#             try:
#                 identifier = r.json()['variation']['id']
#             except KeyError:
#                 identifier = 'Normalization Attempted'
#             return identifier
#     except requests.RequestException:
#         return None  # Return None if there's a network error

# def batch_process_variations(variations, batch_size=10000):
#     result_df = pd.DataFrame(columns=['unique_variation', 'variation_identifier'])
#     for i in tqdm(range(0, len(variations), batch_size)):
#         batch_variations = variations[i:i + batch_size]
#         # Convert numpy array slice to pandas Series to use progress_apply
#         batch_series = pd.Series(batch_variations)
#         batch_identifiers = batch_series.progress_apply(normalize_variation)
#         temp_df = pd.DataFrame({
#             'unique_variation': batch_variations,
#             'variation_identifier': batch_identifiers
#         })
#         result_df = pd.concat([result_df, temp_df], ignore_index=True)
#         print(i)
        
#         # Save periodically
#         temp_df.to_csv(f'reattempt_output_{i//batch_size}.csv', index=False)
#     return result_df

# # Main DataFrame from your data
# unique_variations = try_one_more_time['unique_variation'].unique()

# # Process the variations in batches
# final_df = batch_process_variations(unique_variations)

# # TODO: Make more efficient? Takes literally 236:39:46 (almost 10 days)
# # TODO: Clean up this whole section, do not resave a file
# # TODO: Fill in the blanks that are still missing for final_df and map over


## Continue Analysis
Load variants, map and aggregate

In [72]:
# Load the three saved variant-normalization workbooks, drop the index column
# written by the export ('Unnamed: 0'), and stack them into a single table.
norm_files = [
    'norm_data/0-89-variant-norm-20250324.xlsx',
    'norm_data/variant-norm-remainders-take1-20250331.xlsx',
    'norm_data/variant-norm-remainders-take2-20250402.xlsx',
]
tdf1, tdf2, tdf3 = (
    pd.read_excel(path).drop(columns='Unnamed: 0') for path in norm_files
)
variant_map = pd.concat([tdf1, tdf2, tdf3]).reset_index(drop=True)
variant_map

Unnamed: 0,unique_variation,variation_identifier
0,KRAS G12C,ga4gh:VA.udBCHwlrf8xNiRy_19bLi-h5LhnZLgCt
1,BRAF V600E,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L
2,EGFR T790M,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-
3,TP53 R273H,ga4gh:VA.NbmGko5VG3K44V5sy-uVWrQaBdnqJ227
4,NRAS Q61R,ga4gh:VA.CpnlaV2B8565obATF-UlE706sBYp0D6M
...,...,...
1258581,CHEK2 *544Sext*48,
1258582,TAF1 L1389W,
1258583,MAP3K14 *501*,
1258584,CDKN2A A13Gfs*2,


In [73]:
# Keep only rows that carry a real identifier: drop missing values and the
# 'Normalization Attempted' placeholder.
has_identifier = variant_map['variation_identifier'].notna()
not_placeholder = variant_map['variation_identifier'] != 'Normalization Attempted'
variant_map = variant_map[has_identifier & not_placeholder].reset_index(drop=True)

# NOTE(review): this prints the row count; rows are not de-duplicated here, so
# confirm it really equals the number of unique variants.
print(f'Total Unique Variants: {len(variant_map)}')

Total Unique Variants: 756267


In [74]:
# Map the identifiers back to the original DataFrame
variation_dict = variant_map.set_index('unique_variation')['variation_identifier'].to_dict()
df['variation_concept_id'] = df['var_norm_query'].map(variation_dict)
df[['var_norm_query','variation_concept_id']].value_counts()


var_norm_query  variation_concept_id                     
KRAS G12D       ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj    9903
KRAS G12V       ga4gh:VA.AhXGahW4SCTEgBSXlABoqX7N4mWXL-MD    7945
BRAF V600E      ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L    7019
PIK3CA E545K    ga4gh:VA.-dMnJf9oUBfl9De0llc3LqJaGdFzfATK    5034
KRAS G12C       ga4gh:VA.udBCHwlrf8xNiRy_19bLi-h5LhnZLgCt    4993
                                                             ... 
MAPK3 A342T     ga4gh:VA.5bwgw1bSxuOsZkLeuTc976O3G8Qj8rG9       1
MAPK3 A326D     ga4gh:VA.NJj3AWyoI6IIhAX21r6IYZDTd-LnQNSd       1
MAPK3 A277V     ga4gh:VA.Tu0cDmu-a3-yfP3Kg5GpMkqKT1ArodqU       1
MAPK3 A188V     ga4gh:VA.DRb75TA4O6OmC-0p2pEc1PHTFUQS1oaG       1
ABCB1 D457N     ga4gh:VA.pzoIug3WcO7UaFcEIatXAL2UEGv_RyZG       1
Name: count, Length: 662723, dtype: int64

In [75]:
# Percentage of rows in df whose variant has no normalized identifier.
maximum = len(df['variation_concept_id'])
unmapped = int(df['variation_concept_id'].isna().sum())
print(f'Variants Without Mapping: {(unmapped / maximum) * 100}')

Variants Without Mapping: 14.529664865694683


In [76]:
# These are our relevant fields for quantification
analysis_df = df[['PATIENT_ID','disease_concept_id','gene_concept_id','var_norm_query','variation_concept_id']]
analysis_df = analysis_df.groupby('PATIENT_ID').agg(pd.unique)

# Filter out NaNs
analysis_df['disease_concept_id'] = analysis_df['disease_concept_id'].apply(lambda lst: [x for x in lst if pd.notna(x) and x is not None])
analysis_df['gene_concept_id'] = analysis_df['gene_concept_id'].apply(lambda lst: [x for x in lst if pd.notna(x) and x is not None])
analysis_df['variation_concept_id'] = analysis_df['variation_concept_id'].apply(lambda lst: [x for x in lst if pd.notna(x) and x is not None])

analysis_df

Unnamed: 0_level_0,disease_concept_id,gene_concept_id,var_norm_query,variation_concept_id
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GENIE-CHOP-C1002819,[],"[normalize.gene.hgnc:6075, normalize.gene.hgnc...","[INPP4B P572R, ARID2 S989L, ARAF S172L]","[ga4gh:VA.cpPxlgVD0QOz1zvk_84ZHBUI7IYE_00h, ga..."
GENIE-CHOP-C1002942,[normalize.disease.ncit:C3161],"[normalize.gene.hgnc:1787, normalize.gene.hgnc...","[CDKN2A R80*, FLT3 D600del]","[ga4gh:VA.wrKO0LoBmRv1zWHRPksfj6FnG0uf5QU0, ga..."
GENIE-CHOP-C1003065,[normalize.disease.ncit:C3264],"[normalize.gene.hgnc:12363, normalize.gene.hgn...","[TSC2 V299Tfs*56, BRIP1 A551E, EP300 P873A]","[ga4gh:VA.baThgMy_ByqclJ6NnbPH9wMuSv6LdMUI, ga..."
GENIE-CHOP-C1003188,[normalize.disease.ncit:C3264],"[normalize.gene.hgnc:3942, normalize.gene.hgnc...","[MTOR Q1492R, MUTYH I220V, MSH6 K854M, XPO1 R4...","[ga4gh:VA.IAC5UiVyvpQuGjtELYfnnIL9BKnlsVd_, ga..."
GENIE-CHOP-C1003434,[normalize.disease.ncit:C3059],"[normalize.gene.hgnc:952, normalize.gene.hgnc:...","[BARD1 T598N, MET V919I, BRAF V600E, BRCA2 A19...","[ga4gh:VA.uIoCtTobAQxL2ux7fhTIaKYjQAmempA1, ga..."
...,...,...,...,...
GENIE-YALE-TPL1871,[normalize.disease.mondo:0005065],"[normalize.gene.hgnc:1101, normalize.gene.hgnc...","[BRCA2 S1982Rfs*22, NOTCH1 N104S, ATM N1005S, ...","[ga4gh:VA.J9aYt41fi60Bf08ll7QLHO7Kk25w84m8, ga..."
GENIE-YALE-TPL1872,[normalize.disease.ncit:C2926],"[normalize.gene.hgnc:1925, normalize.gene.hgnc...","[CHEK1 F83Y, EGFR S768_D770dup]",[ga4gh:VA.VIk43vHfYbi-5JeDg5QD_TqvCwtuUzgz]
GENIE-YALE-TPL1873,[normalize.disease.ncit:C4978],"[normalize.gene.hgnc:6407, normalize.gene.hgnc...","[KRAS G12D, PIK3CA E542K, TP53 R249M]","[ga4gh:VA.cdcXITbBWUXPjy3zPdhs7xjU1yvlkSrj, ga..."
GENIE-YALE-TPL1875,[normalize.disease.ncit:C2926],"[normalize.gene.hgnc:6973, normalize.gene.hgnc...","[MDM2 L142_C145del, MSH6 R1331Q, ARID1A M442L,...","[ga4gh:VA.VCZicRqC130uB4f-PsR0Fp1X8Hh7yHiY, ga..."


In [None]:
# TODO: Simulate a search for a patient, quantify the number of different interpretations for differing levels of concepts
# TODO: Granularity of evidence: Single concept, pairwise, full statement
# TODO: Patients with multiple attached concepts: de-aggregate data (row level), quantify per concept separate counts, all possibility statements, rejoin original statements? 

In [60]:
# Inspect every mutation row that belongs to one example patient.
is_example_patient = df['PATIENT_ID'] == 'GENIE-CHOP-C1002942'
df.loc[is_example_patient]

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS,Exclusion?,disease_concept_id,gene_concept_id,var_norm_query,variation_concept_id
1496352,CDKN2A,1029.0,CHOP,GRCh37,9,21971120,21971120,+,stop_gained,Nonsense_Mutation,...,CHOP-HEMEP,Leukemia,Acute Myeloid Leukemia,Local recurrence,Tumor,False,normalize.disease.ncit:C3161,normalize.gene.hgnc:1787,CDKN2A R80*,ga4gh:VA.wrKO0LoBmRv1zWHRPksfj6FnG0uf5QU0
1496353,FLT3,2322.0,CHOP,GRCh37,13,28608256,28608258,+,inframe_deletion,In_Frame_Del,...,CHOP-HEMEP,Leukemia,Acute Myeloid Leukemia,Local recurrence,Tumor,False,normalize.disease.ncit:C3161,normalize.gene.hgnc:3765,FLT3 D600del,ga4gh:VA.Z3ExHu3DToMcoLuODWa1gcrcLP0Y3Htx
1496354,CDKN2A,1029.0,CHOP,GRCh37,9,21971120,21971120,+,stop_gained,Nonsense_Mutation,...,CHOP-HEMEP,Leukemia,Acute Myeloid Leukemia,Local recurrence,Tumor,False,normalize.disease.ncit:C3161,normalize.gene.hgnc:1787,CDKN2A R80*,ga4gh:VA.wrKO0LoBmRv1zWHRPksfj6FnG0uf5QU0
1496355,FLT3,2322.0,CHOP,GRCh37,13,28608256,28608258,+,inframe_deletion,In_Frame_Del,...,CHOP-HEMEP,Leukemia,Acute Myeloid Leukemia,Local recurrence,Tumor,False,normalize.disease.ncit:C3161,normalize.gene.hgnc:3765,FLT3 D600del,ga4gh:VA.Z3ExHu3DToMcoLuODWa1gcrcLP0Y3Htx


#### Simple Counts (for use in simple statistics)

In [77]:
# Per-patient number of normalized concepts of each kind (length of each list).
stats = pd.DataFrame(
    {
        'no_gene_concepts': analysis_df['gene_concept_id'].map(len),
        'no_disease_concepts': analysis_df['disease_concept_id'].map(len),
        'no_variation_concepts': analysis_df['variation_concept_id'].map(len),
    }
)

stats

Unnamed: 0_level_0,no_gene_concepts,no_disease_concepts,no_variation_concepts
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GENIE-CHOP-C1002819,3,0,3
GENIE-CHOP-C1002942,2,1,2
GENIE-CHOP-C1003065,3,1,2
GENIE-CHOP-C1003188,10,1,9
GENIE-CHOP-C1003434,4,1,3
...,...,...,...
GENIE-YALE-TPL1871,6,1,4
GENIE-YALE-TPL1872,2,1,1
GENIE-YALE-TPL1873,3,1,3
GENIE-YALE-TPL1875,6,1,5


## Load the MetaKB v2 Dataset
The GENIE dataset is now normalized and ready for comparison. Next, load the MetaKB v2 dataset to quantify the available interpretations.

In [78]:
from neo4j import GraphDatabase

# Functions
def create_db_connection(uri, user, password):
    """Create a Neo4j driver for ``uri`` authenticated as ``user``/``password``.

    Returns:
        The object returned by ``GraphDatabase.driver``.
    """
    credentials = (user, password)
    return GraphDatabase.driver(uri, auth=credentials)

def execute_query(query):
    """Run a Cypher query against the Neo4j database and return all records.

    Connection settings default to the local instance used so far and can be
    overridden with the NEO4J_URI, NEO4J_USER and NEO4J_PASSWORD environment
    variables, so real credentials need not be written into the notebook.

    Args:
        query: Cypher query text.

    Returns:
        A list with every record yielded by ``session.run(query)``.
    """
    uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    user = os.environ.get("NEO4J_USER", "neo4j")
    password = os.environ.get("NEO4J_PASSWORD", "password")
    # NOTE: a new driver is opened for every query, as before.
    driver = create_db_connection(uri, user, password)
    try:
        with driver.session() as session:
            # Materialize the records while the session is still open.
            return list(session.run(query))
    finally:
        # Close the driver even when the query raises (it used to leak).
        driver.close()

def grab_subject_variant(study_id):
    """Return the id of the defining-context variation of a study statement.

    Args:
        study_id: Value of the ``id`` property of the StudyStatement node.

    Returns:
        ``['Context']['id']`` of the first matching record, or ``None`` when
        the query returns no record (consistent with grab_object_therapeutic).
    """
    # NOTE: study_id is interpolated directly into the query text, so an id
    # containing a single quote would break the query.
    query = f"""MATCH (s:StudyStatement) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_VARIANT]-(c)
        MATCH (c)-[:HAS_DEFINING_CONTEXT]-(v)
        RETURN properties(s) AS Study,
                properties(c) AS Variation,
                properties(v) AS Context
    """
    result = execute_query(query)
    if len(result) > 1:
        # Only the first record is used; report that more were available.
        print(f'{study_id} has {len(result)} subjects possible')

    # No variant attached to this statement; previously this raised IndexError.
    if not result:
        return None

    subject = result[0]['Context']['id']
    return subject

def grab_object_therapeutic(study_id):
    """Return the therapeutic concept attached to a study statement.

    Args:
        study_id: Value of the ``id`` property of the StudyStatement node.

    Returns:
        ``None`` when the query returns no record; the ``normalizer_id`` of
        the first record's therapeutic when present; otherwise a
        ``(groupType, id)`` tuple when the therapeutic has a ``groupType``;
        otherwise ``None``.
    """
    query = f"""MATCH (s:StudyStatement) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_THERAPEUTIC]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Therapeutic
    """
    records = execute_query(query)
    n_records = len(records)
    if n_records > 1:
        print(f'{study_id} has {n_records} subjects possible')

    # Nothing attached to this statement.
    if not records:
        return None

    # Only the first record is considered.
    therapeutic = records[0]['Therapeutic']

    # Single therapeutic: use its concept identifier.
    if 'normalizer_id' in therapeutic:
        return therapeutic['normalizer_id']

    # Group of therapeutics: report the group type together with the group id.
    if 'groupType' in therapeutic.keys():
        return (therapeutic['groupType'], therapeutic['id'])

    return None

def grab_qualifier_disease(study_id):
    """Return the disease (tumor type) attached to a study statement.

    Args:
        study_id: Value of the ``id`` property of the StudyStatement node.

    Returns:
        ``None`` when the query returns no record (consistent with
        grab_object_therapeutic); otherwise the first record's disease
        ``label`` when present, falling back to its ``id``.

    Raises:
        KeyError: The disease has neither a ``label`` nor an ``id``.
    """
    query = f"""MATCH (s:StudyStatement) WHERE s.id = '{study_id}'
        MATCH (s)-[:HAS_TUMOR_TYPE]-(c)
        RETURN properties(s) AS Study,
                properties(c) AS Disease
    """
    result = execute_query(query)
    if len(result) > 1:
        print(f'{study_id} has {len(result)} subjects possible')

    # No tumor type attached to this statement; previously this raised IndexError.
    if not result:
        return None

    disease = result[0]['Disease']
    # TODO: This id should be normalizer_id not disease_normalizer_id. Will likely
    # need to redo analysis, db IDs != normalizer IDs.
    # The first key is a deliberately non-matching placeholder kept from the
    # original lookup order, so in practice the label (then the id) is returned.
    # Explicit membership tests replace the former nested bare `except` blocks.
    for key in ('broken on purpose', 'label'):
        if key in disease:
            return disease[key]
    return disease['id']

In [37]:
# Queries
q_all_statements = """MATCH (s:StudyStatement)
    RETURN properties(s) AS Study
"""

# Run Query
result = execute_query(q_all_statements)

# Output column -> property name read from each StudyStatement's properties.
# Properties that are absent become None.
statement_fields = {
    'id': 'id',
    'studyType': 'type',
    'propositionType': 'propositionType',
    'description': 'description',
    'alleleOriginQualifier': 'alleleOriginQualifier',
    'direction': 'direction',
    'predicate': 'predicate',
}
data = [
    {column: study.get(prop, None) for column, prop in statement_fields.items()}
    for study in (record.get('Study', {}) for record in result)
]

metakb_df = pd.DataFrame(data)
metakb_df[0:5]


Unnamed: 0,id,studyType,propositionType,description,alleleOriginQualifier,direction,predicate
0,civic.eid:238,Statement,VariantTherapeuticResponseProposition,The T790M mutation in EGFR has been shown to c...,somatic,supports,predictsResistanceTo
1,civic.eid:1409,Statement,VariantTherapeuticResponseProposition,Phase 3 randomized clinical trial comparing ve...,somatic,supports,predictsSensitivityTo
2,civic.eid:1592,Statement,VariantTherapeuticResponseProposition,Osimertinib has been approved for the treatmen...,somatic,supports,predictsSensitivityTo
3,civic.eid:1867,Statement,VariantTherapeuticResponseProposition,"Randomized, international, open-label, phase 3...",somatic,supports,predictsSensitivityTo
4,civic.eid:2994,Statement,VariantTherapeuticResponseProposition,"On May 14, 2013, the U.S. Food and Drug Admini...",somatic,supports,predictsSensitivityTo


### Variant Therapeutic Response Proposition


In [79]:
# Restrict the MetaKB statements to variant therapeutic-response propositions.
data = metakb_df[metakb_df['propositionType']=='VariantTherapeuticResponseProposition'].reset_index(drop=True)
print(f'Studies found: {len(data)}')

# Create the three concept columns up front (this fixes their column order),
# then fill each one with one database lookup per statement id.
data[['Variant','Disease','Therapeutic']] = None 
data['Variant'] = data['id'].apply(grab_subject_variant)
data['Disease'] = data['id'].apply(grab_qualifier_disease)
# The value returned for the disease is sent through the disease normalizer.
data['Disease ID'] = data['Disease'].apply(normalize_disease)
data['Therapeutic'] = data['id'].apply(grab_object_therapeutic)
# Source knowledgebase = the part of the statement id before the first '.'.
data['source'] = data['id'].apply(lambda x: x.split('.')[0])
# Full statement tuple: (variant, predicate, therapeutic, disease id).
data['statement'] = data.apply(lambda row: (row['Variant'], row['predicate'], row['Therapeutic'], row['Disease ID']), axis=1)
# Reduced tuple (disease id, variant), the same element order that
# create_statements produces for patients.
data['statement_no_ther_or_gene'] = data.apply(lambda row: (row['Disease ID'], row['Variant']), axis=1)
print(len(data))

data[0:5]

Studies found: 1099
1099


Unnamed: 0,id,studyType,propositionType,description,alleleOriginQualifier,direction,predicate,Variant,Disease,Therapeutic,Disease ID,source,statement,statement_no_ther_or_gene
0,civic.eid:238,Statement,VariantTherapeuticResponseProposition,The T790M mutation in EGFR has been shown to c...,somatic,supports,predictsResistanceTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,Lung Non-small Cell Carcinoma,337525,normalize.disease.ncit:C2926,civic,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr...","(normalize.disease.ncit:C2926, ga4gh:VA.sMA9h8..."
1,civic.eid:1409,Statement,VariantTherapeuticResponseProposition,Phase 3 randomized clinical trial comparing ve...,somatic,supports,predictsSensitivityTo,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Skin Melanoma,1147220,normalize.disease.ncit:C3510,civic,"(ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L, pr...","(normalize.disease.ncit:C3510, ga4gh:VA.j4XnsL..."
2,civic.eid:1592,Statement,VariantTherapeuticResponseProposition,Osimertinib has been approved for the treatmen...,somatic,supports,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,Lung Non-small Cell Carcinoma,1721560,normalize.disease.ncit:C2926,civic,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr...","(normalize.disease.ncit:C2926, ga4gh:VA.sMA9h8..."
3,civic.eid:1867,Statement,VariantTherapeuticResponseProposition,"Randomized, international, open-label, phase 3...",somatic,supports,predictsSensitivityTo,ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-,Lung Non-small Cell Carcinoma,1721560,normalize.disease.ncit:C2926,civic,"(ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-, pr...","(normalize.disease.ncit:C2926, ga4gh:VA.sMA9h8..."
4,civic.eid:2994,Statement,VariantTherapeuticResponseProposition,"On May 14, 2013, the U.S. Food and Drug Admini...",somatic,supports,predictsSensitivityTo,ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ,Lung Non-small Cell Carcinoma,337525,normalize.disease.ncit:C2926,civic,"(ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ, pr...","(normalize.disease.ncit:C2926, ga4gh:VA.S41CcM..."


In [None]:
def create_statements(row):
    """Pair every disease concept with every variation concept of one row.

    Args:
        row: Mapping-like record (for example a DataFrame row) that exposes
            the 'disease_concept_id', 'gene_concept_id' and
            'variation_concept_id' entries.

    Returns:
        list: (disease, variation) tuples in ``itertools.product`` order, or
        an empty list as soon as any of the three entries is falsy.
    """
    disease_ids = row['disease_concept_id']
    gene_ids = row['gene_concept_id']
    variation_ids = row['variation_concept_id']

    # Genes only gate the result; they are not part of the emitted pairs.
    if not (disease_ids and gene_ids and variation_ids):
        return []

    # TODO: Genes ever included in metakb evidence statements? If so, switch to
    #       product(disease_ids, gene_ids, variation_ids).
    return list(product(disease_ids, variation_ids))
    
# Run create_statements on every row (axis=1) and store the returned list per row.
analysis_df['ther_response_statements'] = analysis_df.apply(create_statements, axis=1)

In [86]:
# Assess number of interpretations
# NOTE(review): `data` is rebound further down (prognostic section), so this
# cell must run while `data` still holds the therapeutic-response subset.

# Tally the knowledgebase statement keys once so that each candidate statement
# becomes a dictionary lookup instead of a full boolean-mask scan of `data`.
# Assumes the keys are hashable tuples, as the displayed output suggests.
kb_statement_counts = {}
for kb_statement in data['statement_no_ther_or_gene']:
    kb_statement_counts[kb_statement] = kb_statement_counts.get(kb_statement, 0) + 1

# One count per candidate statement, in the same order as the statements list.
analysis_df['ther_response_interp_counts'] = analysis_df['ther_response_statements'].apply(
    lambda statements: [kb_statement_counts.get(statement, 0) for statement in statements]
)

# Row total across all of that row's candidate statements (0 for an empty list).
analysis_df['#_ther_response_interps'] = analysis_df['ther_response_interp_counts'].apply(sum)

In [83]:
# Number of rows in the analysis frame
analysis_df.shape[0]

178800

In [87]:
# Preview the first five rows, including the newly added count columns
analysis_df.head()

Unnamed: 0_level_0,disease_concept_id,gene_concept_id,var_norm_query,variation_concept_id,ther_response_statements,ther_response_interp_counts,#_ther_response_interps
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GENIE-CHOP-C1002819,[],"[normalize.gene.hgnc:6075, normalize.gene.hgnc...","[INPP4B P572R, ARID2 S989L, ARAF S172L]","[ga4gh:VA.cpPxlgVD0QOz1zvk_84ZHBUI7IYE_00h, ga...",[],[],0
GENIE-CHOP-C1002942,[normalize.disease.ncit:C3161],"[normalize.gene.hgnc:1787, normalize.gene.hgnc...","[CDKN2A R80*, FLT3 D600del]","[ga4gh:VA.wrKO0LoBmRv1zWHRPksfj6FnG0uf5QU0, ga...","[(normalize.disease.ncit:C3161, ga4gh:VA.wrKO0...","[0, 0]",0
GENIE-CHOP-C1003065,[normalize.disease.ncit:C3264],"[normalize.gene.hgnc:12363, normalize.gene.hgn...","[TSC2 V299Tfs*56, BRIP1 A551E, EP300 P873A]","[ga4gh:VA.baThgMy_ByqclJ6NnbPH9wMuSv6LdMUI, ga...","[(normalize.disease.ncit:C3264, ga4gh:VA.baThg...","[0, 0]",0
GENIE-CHOP-C1003188,[normalize.disease.ncit:C3264],"[normalize.gene.hgnc:3942, normalize.gene.hgnc...","[MTOR Q1492R, MUTYH I220V, MSH6 K854M, XPO1 R4...","[ga4gh:VA.IAC5UiVyvpQuGjtELYfnnIL9BKnlsVd_, ga...","[(normalize.disease.ncit:C3264, ga4gh:VA.IAC5U...","[0, 0, 0, 0, 0, 0, 0, 0, 0]",0
GENIE-CHOP-C1003434,[normalize.disease.ncit:C3059],"[normalize.gene.hgnc:952, normalize.gene.hgnc:...","[BARD1 T598N, MET V919I, BRAF V600E, BRCA2 A19...","[ga4gh:VA.uIoCtTobAQxL2ux7fhTIaKYjQAmempA1, ga...","[(normalize.disease.ncit:C3059, ga4gh:VA.uIoCt...","[0, 0, 1]",1


In [99]:
# Histogram of therapeutic-response interpretation totals over every row of analysis_df.
fig = (
    px.histogram(
        analysis_df,
        x='#_ther_response_interps',
        title='# of Interpretations per Patient (Variant Therapeutic Response)',
        labels={'#_ther_response_interps': 'Number of Interpretations'},  # readable label for the column
        color_discrete_sequence=['#636EFA'],  # bar colour
        text_auto=True,  # write each bar's value on the bar
    )
    .update_layout(
        title_font_size=18,
        title_x=0.5,  # centre the title
        # NOTE(review): no `color` grouping is passed above, so the legend
        # probably carries no information — confirm it is wanted.
        legend_title_text='Legend',
        xaxis_title='Number of Interpretations',
        yaxis_title='# Patients',
        template='plotly_white',  # light theme
        showlegend=True,
    )
    .update_xaxes(title_font=dict(size=18), color='#333')
    .update_yaxes(title_font=dict(size=18), color='#333')
    .update_traces(
        marker_line_color='#333',
        marker_line_width=0.5,  # thin outline around each bar
        textfont_size=10,
        textposition='outside',  # labels sit outside the bars
        textangle=-90,  # rotate the labels
    )
)

# Render inline, then save a static PNG copy.
fig.show()
fig.write_image("histogram_ther_response_interpretations.png", scale=1.5, width=960, height=540)

In [101]:
import plotly.express as px

# Same histogram as the unfiltered therapeutic-response plot, restricted to
# rows whose '#_ther_response_interps' total is greater than zero.
fig = (
    px.histogram(
        analysis_df.loc[analysis_df['#_ther_response_interps'] > 0],
        x='#_ther_response_interps',
        title='# of Interpretations per Patient (Variant Therapeutic Response) (At least 1 Interpretation)',
        labels={'#_ther_response_interps': 'Number of Interpretations'},  # readable label for the column
        color_discrete_sequence=['#636EFA'],  # bar colour
        text_auto=True,  # write each bar's value on the bar
    )
    .update_layout(
        title_font_size=18,
        title_x=0.5,  # centre the title
        # NOTE(review): no `color` grouping is passed above, so the legend
        # probably carries no information — confirm it is wanted.
        legend_title_text='Legend',
        xaxis_title='Number of Interpretations',
        yaxis_title='# Patients',
        template='plotly_white',  # light theme
        showlegend=True,
    )
    .update_xaxes(title_font=dict(size=18), color='#333')
    .update_yaxes(title_font=dict(size=18), color='#333')
    .update_traces(
        marker_line_color='#333',
        marker_line_width=0.5,  # thin outline around each bar
        textfont_size=10,
        textposition='outside',  # labels sit outside the bars
        textangle=-90,  # rotate the labels
    )
)

# Render inline, then save a static PNG copy.
fig.show()
fig.write_image("histogram_ther_response_interpretations_at_least_1.png", scale=1.5, width=960, height=540)


### Variant Prognostic Evidence

In [91]:
# Keep only the prognostic propositions. This rebinds `data`, which the
# counting cell below reads.
data = metakb_df[metakb_df['propositionType']=='VariantPrognosticProposition'].reset_index(drop=True)
print(f'Studies found: {len(data)}')

# Derive the variant, disease label and normalized disease id for each
# statement id. No placeholder columns are needed: each assignment creates
# its column.
data['Variant'] = data['id'].apply(grab_subject_variant)
data['Disease'] = data['id'].apply(grab_qualifier_disease)
data['Disease ID'] = data['Disease'].apply(normalize_disease)

# (disease id, variant) key per statement; the counting cell compares these
# against the (disease, variation) tuples built by create_statements.
data['statement_no_ther_or_gene'] = data.apply(lambda row: (row['Disease ID'], row['Variant']), axis=1)
data[0:5]

Studies found: 156


Unnamed: 0,id,studyType,propositionType,description,alleleOriginQualifier,direction,predicate,Variant,Disease,Disease ID,statement_no_ther_or_gene
0,civic.eid:102,Statement,VariantPrognosticProposition,Unlike other studies that suggest a poorer out...,somatic,disputes,associatedWithWorseOutcomeFor,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Papillary Thyroid Carcinoma,normalize.disease.ncit:C4035,"(normalize.disease.ncit:C4035, ga4gh:VA.j4XnsL..."
1,civic.eid:103,Statement,VariantPrognosticProposition,V600E is associated with adverse pathological ...,somatic,supports,associatedWithWorseOutcomeFor,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Colorectal Cancer,normalize.disease.ncit:C4978,"(normalize.disease.ncit:C4978, ga4gh:VA.j4XnsL..."
2,civic.eid:656,Statement,VariantPrognosticProposition,In patients with papillary thyroid cancer harb...,somatic,supports,associatedWithWorseOutcomeFor,ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L,Papillary Thyroid Carcinoma,normalize.disease.ncit:C4035,"(normalize.disease.ncit:C4035, ga4gh:VA.j4XnsL..."
3,civic.eid:1420,Statement,VariantPrognosticProposition,Study examined rare SNPs on MAP2K7 among a dis...,germline,supports,associatedWithWorseOutcomeFor,ga4gh:VA.nE0U2q_0hSEFMnTRq7MM3ZrVByPOuTdE,Lung Cancer,normalize.disease.ncit:C7377,"(normalize.disease.ncit:C7377, ga4gh:VA.nE0U2q..."
4,civic.eid:26,Statement,VariantPrognosticProposition,"In acute myloid leukemia patients, D816 mutati...",somatic,supports,associatedWithWorseOutcomeFor,ga4gh:VA.nhiDwIq1klrGm3wtWO4a4BiS0jdW79Wd,Acute Myeloid Leukemia,normalize.disease.ncit:C3171,"(normalize.disease.ncit:C3171, ga4gh:VA.nhiDwI..."


In [92]:
# Candidate (disease, variation) statements per row, built by create_statements.
analysis_df['prognostic_statements'] = analysis_df.apply(create_statements, axis=1)

# Tally the knowledgebase statement keys once so that each candidate statement
# becomes a dictionary lookup instead of a full boolean-mask scan of `data`.
# Assumes the keys are hashable tuples, as the displayed output suggests.
kb_statement_counts = {}
for kb_statement in data['statement_no_ther_or_gene']:
    kb_statement_counts[kb_statement] = kb_statement_counts.get(kb_statement, 0) + 1

# One count per candidate statement, in the same order as the statements list.
analysis_df['prognostic_interp_counts'] = analysis_df['prognostic_statements'].apply(
    lambda statements: [kb_statement_counts.get(statement, 0) for statement in statements]
)

# Row total across all of that row's candidate statements (0 for an empty list).
analysis_df['#_prognostic_interps'] = analysis_df['prognostic_interp_counts'].apply(sum)

In [102]:
# Histogram of prognostic interpretation totals over every row of analysis_df.
fig = (
    px.histogram(
        analysis_df,
        x='#_prognostic_interps',
        title='# of Interpretations per Patient (Variant Prognostic)',
        labels={'#_prognostic_interps': 'Number of Interpretations'},  # readable label for the column
        color_discrete_sequence=['#636EFA'],  # bar colour
        text_auto=True,  # write each bar's value on the bar
    )
    .update_layout(
        title_font_size=18,
        title_x=0.5,  # centre the title
        # NOTE(review): no `color` grouping is passed above, so the legend
        # probably carries no information — confirm it is wanted.
        legend_title_text='Legend',
        xaxis_title='Number of Interpretations',
        yaxis_title='# of Patients',
        template='plotly_white',  # light theme
        showlegend=True,
    )
    .update_xaxes(title_font=dict(size=18), color='#333')
    .update_yaxes(title_font=dict(size=18), color='#333')
    .update_traces(
        marker_line_color='#333',
        marker_line_width=0.5,  # thin outline around each bar
        textfont_size=10,
        textposition='outside',  # labels sit outside the bars
        textangle=-90,  # rotate the labels
    )
)

# Render inline, then save a static PNG copy.
fig.show()
fig.write_image("histogram_interpretations_prognostic.png", scale=1.5, width=960, height=540)

In [103]:
# Same histogram as the unfiltered prognostic plot, restricted to rows whose
# '#_prognostic_interps' total is greater than zero.
fig = (
    px.histogram(
        analysis_df.loc[analysis_df['#_prognostic_interps'] > 0],
        x='#_prognostic_interps',
        title='# of Interpretations per Patient (Variant Prognostic) (At least 1 Interpretation)',
        labels={'#_prognostic_interps': 'Number of Interpretations'},  # readable label for the column
        color_discrete_sequence=['#636EFA'],  # bar colour
        text_auto=True,  # write each bar's value on the bar
    )
    .update_layout(
        title_font_size=18,
        title_x=0.5,  # centre the title
        # NOTE(review): no `color` grouping is passed above, so the legend
        # probably carries no information — confirm it is wanted.
        legend_title_text='Legend',
        xaxis_title='Number of Interpretations',
        yaxis_title='# of Patients',
        template='plotly_white',  # light theme
        showlegend=True,
    )
    .update_xaxes(title_font=dict(size=18), color='#333')
    .update_yaxes(title_font=dict(size=18), color='#333')
    .update_traces(
        marker_line_color='#333',
        marker_line_width=0.5,  # thin outline around each bar
        textfont_size=10,
        textposition='outside',  # labels sit outside the bars
        textangle=-90,  # rotate the labels
    )
)

# Render inline, then save a static PNG copy.
fig.show()
fig.write_image("histogram_interpretations_prognostic_at_least_1.png", scale=1.5, width=960, height=540)


### TODOs

In [98]:
# TODO: Get count of number of statements, as well as other granularity concerns from above
# Total number of candidate therapeutic-response statements across all rows.
# Summing the list lengths straight from the column avoids materialising a
# row Series per record, which is what iterrows does.
total_count = sum(len(statements) for statements in analysis_df['ther_response_statements'])

total_count

1471400

In [None]:
# TODO: Split out the graphs by resource (current is by aggregate) (illustrate gains by source)
# TODO: This has been done by variant id, would be interesting to look at by variant category
# TODO: Perhaps consider a category/tree (rule based) search, how to match concepts to things in the tree
#       for rolling up the tree for particular diseases

# For the specific stuff:
# TODO: Granularity: Single concept, pairwise, full statement
# TODO: Patients with multiple attached concepts: de-aggregate data (row level), quantify separate counts per concept and for all possible statements, then rejoin to the original statements?

# TODO: Make this notebook nice, add table of contents and consolidate blocks 