In [1]:
import pandas as pd

## Join and Normalize GENIE Data
As part of the MetaKB v2 analysis, we will utilize part of the GENIE dataset to look at available interpretations in the aggregate knowledgebase for concepts as determined by real patient samples. We will join the mutation and clinical datasets, normalize values, and quantify interpretations.

In [None]:
# Project GENIE Mutation Dataset (tab-separated text file)
muts = pd.read_csv(
    'genie_data/data_mutations_extended.txt',
    sep='\t',
    low_memory=False,  # parse the file in one pass rather than in chunks
)
muts.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,FILTER,Polyphen_Prediction,Polyphen_Score,SIFT_Prediction,SIFT_Score,SWISSPROT,n_depth,t_depth,Annotation_Status,mutationInCis_Flag
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,PASS,probably_damaging,0.991,deleterious,0.04,,,1623.0,SUCCESS,False
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,PASS,probably_damaging,0.963,deleterious,0.0,,,1031.0,SUCCESS,False
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,PASS,probably_damaging,1.0,deleterious,0.0,,,692.0,SUCCESS,False
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,PASS,possibly_damaging,0.643,tolerated,0.13,,,930.0,SUCCESS,False
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,PASS,benign,0.251,tolerated,0.06,,,2277.0,SUCCESS,False


In [None]:
# Project GENIE Clinical Dataset (tab-separated text file)
clin = pd.read_csv(
    'genie_data/data_clinical_sample.txt',
    sep='\t',
    header=4,  # column names are on row index 4; the rows above it are not read as data
    low_memory=False,  # parse the file in one pass rather than in chunks
)
clin.head()

Unnamed: 0,PATIENT_ID,SAMPLE_ID,AGE_AT_SEQ_REPORT,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS
0,GENIE-VICC-101416,GENIE-VICC-101416-unk-1,52,MAAP,Primary,VICC-01-T7,Appendiceal Cancer,Mucinous Adenocarcinoma of the Appendix,Primary tumor,Tumor
1,GENIE-VICC-102225,GENIE-VICC-102225-unk-1,31,READ,Metastasis,VICC-01-T7,Colorectal Cancer,Rectal Adenocarcinoma,Metastasis site unspecified,Tumor
2,GENIE-VICC-102424,GENIE-VICC-102424-unk-1,63,SCCNOS,Metastasis,VICC-01-T7,Cancer of Unknown Primary,"Squamous Cell Carcinoma, NOS",Metastasis site unspecified,Tumor
3,GENIE-VICC-102966,GENIE-VICC-102966-unk-1,64,LUAD,Metastasis,VICC-01-T7,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Metastasis site unspecified,Tumor
4,GENIE-VICC-103244,GENIE-VICC-103244-unk-1,50,IDC,Primary,VICC-01-T5A,Breast Cancer,Breast Invasive Ductal Carcinoma,Primary tumor,Tumor


In [None]:
# Join the two data sets on sample identifiers: each mutation row
# (Tumor_Sample_Barcode) is paired with the clinical row whose SAMPLE_ID matches.
# how='left' keeps every mutation row, including those with no clinical match.
df = muts.merge(
    clin,
    how='left',
    left_on='Tumor_Sample_Barcode',
    right_on='SAMPLE_ID',
)
df.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,PATIENT_ID,SAMPLE_ID,AGE_AT_SEQ_REPORT,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor


In [None]:
# From MetaKB v1 paper --> Exclude Silent, 3'Flank, 3'UTR, 5'Flank, 5'UTR, Intron, Splice_Region
# 3_prime_UTR_variant, 5_prime_UTR_variant, splice_region_variant, splice_acceptor_variant, splice_donor_variant, intron_variant, synonymous_variant
# TODO: how do start/stop_retained variants function? are these silent?

# Gather every consequence term in the data. A single Consequence value may
# hold several comma-separated terms; float entries (NaN) are skipped.
all_consequences = [
    term
    for entry in df['Consequence']
    if not isinstance(entry, float)
    for term in entry.split(',')
]
set(all_consequences)

{'3_prime_UTR_variant',
 '5_prime_UTR_variant',
 'NMD_transcript_variant',
 'coding_sequence_variant',
 'downstream_gene_variant',
 'frameshift_variant',
 'incomplete_terminal_codon_variant',
 'inframe_deletion',
 'inframe_insertion',
 'intron_variant',
 'mature_miRNA_variant',
 'missense_variant',
 'non_coding_transcript_exon_variant',
 'non_coding_transcript_variant',
 'protein_altering_variant',
 'splice_acceptor_variant',
 'splice_donor_variant',
 'splice_region_variant',
 'start_lost',
 'start_retained_variant',
 'stop_gained',
 'stop_lost',
 'stop_retained_variant',
 'synonymous_variant',
 'upstream_gene_variant'}

In [None]:
# Open question from the author: does a missing Consequence (NA) also mean silent?
# For now such rows are dropped, and the index is renumbered from 0.
df = df[df['Consequence'].notna()].reset_index(drop=True)
df.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,SAMPLE_ID,AGE_AT_SEQ_REPORT,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS,Exclusion?
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,GENIE-JHU-00006-00185,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False


In [None]:
# Using the above lists, exclude silent consequences.
# A row is excluded when ANY of its comma-separated consequence terms appears
# in this list.
exclusion_criteria = ['3_prime_UTR_variant',
                      '5_prime_UTR_variant',
                      'splice_region_variant',
                      'splice_acceptor_variant',
                      'splice_donor_variant',
                      'intron_variant',
                      'synonymous_variant']

# Flag rows in a single pass over the Consequence column. This replaces a
# Python-level iterrows() loop with per-cell df.at writes, which is very slow
# on a table with millions of rows; the resulting flags are the same.
# Consequence must not contain missing values here (they are dropped in the
# previous cell), otherwise .split raises on the float NaN.
_excluded_terms = frozenset(exclusion_criteria)
df['Exclusion?'] = (
    df['Consequence']
    .map(lambda entry: not _excluded_terms.isdisjoint(entry.split(',')))
    .astype(bool)
)

# Keep only the rows that were not flagged and renumber the index from 0.
df = df[~df['Exclusion?']].reset_index(drop=True)
df.head()

0          False
1          False
2          False
3          False
4          False
           ...  
2264728     True
2264729    False
2264730    False
2264731    False
2264732    False
Name: Exclusion?, Length: 2264733, dtype: bool

In [None]:
# Get concept IDs for everything attached to the patients by the following corresponding fields
# Variant -> HGVSc
#   (NOTE: the variant query built further down in this notebook uses
#    Hugo_Symbol + HGVSp_Short rather than HGVSc)
# Gene -> Hugo_Symbol
# Disease -> CANCER_TYPE
# List the available column labels.
df.columns

Index(['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build', 'Chromosome',
       'Start_Position', 'End_Position', 'Strand', 'Consequence',
       'Variant_Classification', 'Variant_Type', 'Reference_Allele',
       'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS',
       'dbSNP_Val_Status', 'Tumor_Sample_Barcode',
       'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1',
       'Match_Norm_Seq_Allele2', 'Tumor_Validation_Allele1',
       'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1',
       'Match_Norm_Validation_Allele2', 'Verification_Status',
       'Validation_Status', 'Mutation_Status', 'Sequencing_Phase',
       'Sequence_Source', 'Validation_Method', 'Score', 'BAM_File',
       'Sequencer', 't_ref_count', 't_alt_count', 'n_ref_count', 'n_alt_count',
       'HGVSc', 'HGVSp', 'HGVSp_Short', 'Transcript_ID', 'RefSeq',
       'Protein_position', 'Codons', 'Exon_Number', 'gnomAD_AF',
       'gnomAD_AFR_AF', 'gnomAD_AMR_AF', 'gnomAD_ASJ_AF', 'gnomAD_EAS_AF

### Normalize

In [None]:
import requests 

def normalize_disease(term):
    """Look up the normalized concept identifier for a disease term.

    Sends ``term`` as the ``q`` query parameter to the disease normalizer
    endpoint and reads ``['disease']['id']`` from the JSON response.

    Args:
        term: Disease label to normalize.

    Returns:
        The identifier found in the response, or ``None`` when the service
        answers with a non-200 status, the body cannot be decoded as JSON, or
        the payload has no ``disease`` -> ``id`` entry.
    """
    # Passing the term through ``params`` lets requests URL-encode it, so
    # labels containing characters such as '&', '#' or '+' are sent intact.
    r = requests.get('https://normalize.cancervariants.org/disease/normalize',
                     params={'q': term})
    # ``identifier`` used to be bound only inside the 200 branch, so any other
    # status raised UnboundLocalError at the return statement.
    if r.status_code != 200:
        return None
    try:
        return r.json()['disease']['id']
    except (LookupError, TypeError, ValueError):
        # Missing key, null/unexpected payload shape, or undecodable body.
        return None

# Normalize each distinct CANCER_TYPE label once, then map the identifiers
# back onto every row of df.
unique_diseases = list(df['CANCER_TYPE'].unique())
disease_identifiers = [None for _ in unique_diseases]  # placeholder values, replaced below
disease_wordlist = pd.DataFrame(
    {
        'unique_disease': unique_diseases,
        'disease_identifier': disease_identifiers,
    }
).assign(
    disease_identifier=lambda table: table['unique_disease'].apply(normalize_disease)
)
# Series keyed by the original label, used as the lookup for .map below.
disease_wordlist_map = disease_wordlist.set_index('unique_disease')['disease_identifier'] #TODO: why two variables?
df['disease_concept_id'] = df['CANCER_TYPE'].map(disease_wordlist_map)
df.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,AGE_AT_SEQ_REPORT,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS,Exclusion?,disease_concept_id
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,61,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926


In [None]:
def normalize_gene(term):
    """Look up the normalized concept identifier for a gene term.

    Sends ``term`` as the ``q`` query parameter to the gene normalizer
    endpoint and reads ``['gene']['id']`` from the JSON response.

    Args:
        term: Gene label to normalize.

    Returns:
        The identifier found in the response, or ``None`` when the service
        answers with a non-200 status, the body cannot be decoded as JSON, or
        the payload has no ``gene`` -> ``id`` entry.
    """
    # Passing the term through ``params`` lets requests URL-encode it, so
    # labels containing characters such as '&', '#' or '+' are sent intact.
    r = requests.get('https://normalize.cancervariants.org/gene/normalize',
                     params={'q': term})
    # ``identifier`` used to be bound only inside the 200 branch, so any other
    # status raised UnboundLocalError at the return statement.
    if r.status_code != 200:
        return None
    try:
        return r.json()['gene']['id']
    except (LookupError, TypeError, ValueError):
        # Missing key, null/unexpected payload shape, or undecodable body.
        return None

# Normalize each distinct Hugo_Symbol once, then map the identifiers back
# onto every row of df.
unique_genes = list(df['Hugo_Symbol'].unique())
gene_identifiers = [None for _ in unique_genes]  # placeholder values, replaced below

# Build the lookup table, fill in the identifiers, and reduce it to a Series
# keyed by gene symbol (gene_wordlist ends up as that Series).
gene_wordlist = (
    pd.DataFrame(
        {
            'unique_gene': unique_genes,
            'gene_identifier': gene_identifiers,
        }
    )
    .assign(gene_identifier=lambda table: table['unique_gene'].apply(normalize_gene))
    .set_index('unique_gene')['gene_identifier']
)
df['gene_concept_id'] = df['Hugo_Symbol'].map(gene_wordlist)
df.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,ONCOTREE_CODE,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS,Exclusion?,disease_concept_id,gene_concept_id
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:6407
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:1097
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:3236
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:11998
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,LUAD,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:7989


In [93]:
def build_varnorm_query(HGVSp, Hugo_Symbol):
    """Build a ``"<gene> <protein change>"`` query string for one variant.

    Args:
        HGVSp: Short protein-level HGVS string such as ``"p.G12C"``. ``None``
            or a float (NaN) is treated as missing.
        Hugo_Symbol: Gene symbol such as ``"KRAS"``. ``None`` or a float (NaN)
            is treated as missing.

    Returns:
        A string such as ``"KRAS G12C"``, or ``None`` when either input is
        missing or no protein change remains once the prefix is removed.
    """
    if HGVSp is None or isinstance(HGVSp, float):
        return None
    if Hugo_Symbol is None or isinstance(Hugo_Symbol, float):
        return None
    # Drop everything up to and including the first '.', i.e. the "p." prefix.
    # Unlike split(".")[1], this keeps text after any later '.' and does not
    # raise IndexError when the value has no '.' at all (it is used as-is).
    prefix, dot, remainder = HGVSp.partition('.')
    protein_change = remainder if dot else prefix
    if not protein_change:
        # e.g. "" or "p." - nothing to query for.
        return None
    return f'{Hugo_Symbol} {protein_change}'
    
# Build the variant query for every row from its HGVSp_Short / Hugo_Symbol pair.
df['var_norm_query'] = [
    build_varnorm_query(hgvsp_short, hugo_symbol)
    for hgvsp_short, hugo_symbol in zip(df['HGVSp_Short'], df['Hugo_Symbol'])
]
df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,SAMPLE_TYPE,SEQ_ASSAY_ID,CANCER_TYPE,CANCER_TYPE_DETAILED,SAMPLE_TYPE_DETAILED,SAMPLE_CLASS,Exclusion?,disease_concept_id,gene_concept_id,var_norm_query
0,KRAS,3845.0,JHU,GRCh37,12,25398285,25398285,+,missense_variant,Missense_Mutation,...,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:6407,KRAS G12C
1,BRAF,673.0,JHU,GRCh37,7,140453136,140453136,+,missense_variant,Missense_Mutation,...,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:1097,BRAF V600E
2,EGFR,1956.0,JHU,GRCh37,7,55249071,55249071,+,missense_variant,Missense_Mutation,...,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:3236,EGFR T790M
3,TP53,7157.0,JHU,GRCh37,17,7577120,7577120,+,missense_variant,Missense_Mutation,...,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:11998,TP53 R273H
4,NRAS,4893.0,JHU,GRCh37,1,115256529,115256529,+,missense_variant,Missense_Mutation,...,Primary,JHU-50GP,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,Primary tumor,Tumor,False,normalize.disease.ncit:C2926,normalize.gene.hgnc:7989,NRAS Q61R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876019,TBX3,6926.0,PROV,GRCh37,12,115112292,115112292,+,missense_variant,Missense_Mutation,...,Not Applicable or Heme,PROV-FOUNDATIONONELIQUIDCDX,UNKNOWN,UNKNOWN,Not applicable or hematologic malignancy,cfDNA,False,,normalize.gene.hgnc:11602,TBX3 T483M
1876020,AR,367.0,PROV,GRCh37,X,66765014,66765014,+,missense_variant,Missense_Mutation,...,Not Applicable or Heme,PROV-FOUNDATIONONELIQUIDCDX,UNKNOWN,UNKNOWN,Not applicable or hematologic malignancy,cfDNA,False,,normalize.gene.hgnc:644,AR R9K
1876021,MSH3,4437.0,PROV,GRCh37,5,80021325,80021325,+,missense_variant,Missense_Mutation,...,Not Applicable or Heme,PROV-FOUNDATIONONELIQUIDCDX,UNKNOWN,UNKNOWN,Not applicable or hematologic malignancy,cfDNA,False,,normalize.gene.hgnc:7326,MSH3 Y465C
1876022,ATM,472.0,PROV,GRCh37,11,108098589,108098589,+,missense_variant,Missense_Mutation,...,Not Applicable or Heme,PROV-FOUNDATIONONELIQUIDCDX,UNKNOWN,UNKNOWN,Not applicable or hematologic malignancy,cfDNA,False,,normalize.gene.hgnc:795,ATM K53N


In [96]:
# Number of distinct values in var_norm_query.
len(df['var_norm_query'].unique())

767237

In [None]:
# TODO: Get the variation normalization service up and running locally, 767237 unique values is too many to run on the Fast API
# Kori is updating some README instructions for loading Variation Normalizer locally
# In the meantime, just use the var_norm_query field to build out the notebook.

# from tqdm import tqdm
# tqdm.pandas()
# def normalize_variation(term):
#     r = requests.get(f'https://normalize.cancervariants.org/variation/normalize?q={term}')
#     # print(r.json())
#     if r.status_code == 200:
#         try:
#             identifier = r.json()['variation']['id']
#         except:
#             identifier = None
#     return identifier

# unique_variations = list(df['var_norm_query'].unique())
# variation_identifiers = [None] * len(unique_variations)
# variation_wordlist = pd.DataFrame({'unique_variation': unique_variations,
#                                  'variation_identifier': variation_identifiers})

# variation_wordlist['variation_identifier'] = variation_wordlist['unique_variation'].progress_apply(normalize_variation) 
# variation_wordlist = variation_wordlist.set_index('unique_variation')['variation_identifier'] 
# df['variation_concept_id'] = df['var_norm_query'].map(variation_wordlist)
# df[0:5]

In [None]:
# These are our relevant fields for quantification
df.loc[:, ['PATIENT_ID', 'disease_concept_id', 'gene_concept_id', 'var_norm_query']]

Unnamed: 0,PATIENT_ID,disease_concept_id,gene_concept_id,var_norm_query
0,GENIE-JHU-00006,normalize.disease.ncit:C2926,normalize.gene.hgnc:6407,KRAS G12C
1,GENIE-JHU-00006,normalize.disease.ncit:C2926,normalize.gene.hgnc:1097,BRAF V600E
2,GENIE-JHU-00006,normalize.disease.ncit:C2926,normalize.gene.hgnc:3236,EGFR T790M
3,GENIE-JHU-00006,normalize.disease.ncit:C2926,normalize.gene.hgnc:11998,TP53 R273H
4,GENIE-JHU-00006,normalize.disease.ncit:C2926,normalize.gene.hgnc:7989,NRAS Q61R
...,...,...,...,...
1876019,GENIE-PROV-9a7b04d343,,normalize.gene.hgnc:11602,TBX3 T483M
1876020,GENIE-PROV-9a7b04d343,,normalize.gene.hgnc:644,AR R9K
1876021,GENIE-PROV-9a7b04d343,,normalize.gene.hgnc:7326,MSH3 Y465C
1876022,GENIE-PROV-9a7b04d343,,normalize.gene.hgnc:795,ATM K53N


## Load the MetaKB v2 Dataset
The GENIE dataset is now normalized and ready for comparison. Next, load the MetaKB v2 dataset to quantify the available interpretations.