This notebook processes the annotated mutations to generate a 7776-dimensional feature vector per sample (324 genes × 6 bins × 4 aggregations), as in DruID. This is done for the 5 datasets considered - TCGA, CCLE 23Q4, Moore's, CBIO HCC and CBIO BRCA.

This notebook takes the ClinVar, GPD and Annovar annotations for each dataset and generates processed files for training. Each variant is assigned one GPD bin (PIU/LU/NCU) and one ClinVar bin (Pathogenic/VUS/Benign), giving 6 bins in total. Within each bin, all variants in the same gene for a patient are aggregated using count(variants), max(Annovar score), sum(Annovar score) and mean(Annovar score). The Annovar score used here is the `xon17_score` column.

The annotations are obtained from the pre-built vocabulary of all mutations.

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pickle

In [4]:
# Gene names used as the fixed feature columns: first column of the header-less gene2ind.txt.
GENES_324 = pd.read_csv("/data/druid_data/raw_data/gene2ind.txt", header=None)[0].tolist()

In [5]:
def get_alias_to_canonical_name_map(
    aliases_path="/data/druid_data/raw_data/gene_aliases.json",
    canonical_genes=None,
):
    """Build an ``ALIAS -> CANONICAL_NAME`` lookup from the gene-alias JSON file.

    The JSON maps each canonical gene name to either a single alias or a list
    of aliases. Keys and values of the returned dict are upper-cased.

    Parameters
    ----------
    aliases_path : str
        Path of the alias JSON file (defaults to the DruID raw-data location).
    canonical_genes : iterable of str, optional
        Canonical gene names; an alias that is itself one of these is skipped.
        Defaults to the module-level ``GENES_324``.

    Returns
    -------
    dict
        Upper-cased alias -> upper-cased canonical name. Aliases that are
        claimed by more than one canonical name are absent from the result.
    """
    if canonical_genes is None:
        canonical_genes = GENES_324
    canonical_genes = set(canonical_genes)

    with open(aliases_path, "r") as fp:
        aliases_on_disk = json.load(fp)

    alias_to_canonical_name_map = {}
    # Aliases already seen with two or more different canonical names; they must stay out of the map.
    conflicting_aliases = set()
    for canonical_name, aliases in aliases_on_disk.items():
        # Some canonical names have only one alias - convert those as list for consistency
        if not isinstance(aliases, list):
            aliases = [aliases]

        for alias in aliases:

            # If an alias is one of the canonical names, do not add it to the map
            # Else, we'd be renaming a canonical named column into something else
            if alias in canonical_genes:
                print(f"Alias {alias} is a canonical_name, skipping")
                continue

            # Convert all aliases to be upper case for consistency; the conflict check
            # uses the same upper-cased key that is stored in the map.
            alias_key = alias.upper()
            canonical_upper = canonical_name.upper()

            if alias_key in conflicting_aliases:
                # Third (or later) claim on an alias that was already dropped.
                continue

            if alias_key in alias_to_canonical_name_map:
                if alias_to_canonical_name_map[alias_key] == canonical_upper:
                    # Same alias listed twice for the same gene - not a conflict.
                    continue
                print(
                    f"Found multiple canonical names for alias - {alias} = {[canonical_name, alias_to_canonical_name_map[alias_key]]}"
                )
                # Drop aliases with conflicting canonical names per recommendation from clinicians.
                # `continue` is essential: without it the alias would be re-added just below.
                alias_to_canonical_name_map.pop(alias_key)
                conflicting_aliases.add(alias_key)
                continue

            alias_to_canonical_name_map[alias_key] = canonical_upper

    return alias_to_canonical_name_map

In [6]:
# Build the alias lookup once; convert2canonical reads it as a module-level global.
alias2canonicalmap = get_alias_to_canonical_name_map()
# Number of distinct canonical names reachable through at least one alias.
len({*alias2canonicalmap.values()})

Alias RAD54L is a canonical_name, skipping
Found multiple canonical names for alias - CDK4I = ['CDKN2B', 'CDKN2A']
Found multiple canonical names for alias - IDH = ['IDH2', 'IDH1']
Found multiple canonical names for alias - IDP = ['IDH2', 'IDH1']
Found multiple canonical names for alias - HDMX = ['MDM4', 'MDM2']
Found multiple canonical names for alias - HNPCC = ['MSH2', 'MLH1']
Found multiple canonical names for alias - MRP1 = ['MSH3', 'MDM4']
Alias KRAS is a canonical_name, skipping
Found multiple canonical names for alias - ADPRTL2 = ['PARP3', 'PARP2']
Found multiple canonical names for alias - ADPRTL3 = ['PARP3', 'PARP2']
Found multiple canonical names for alias - MCAP = ['PIK3CA', 'BRD4']
Found multiple canonical names for alias - PI3K = ['PIK3CB', 'PIK3CA']
Found multiple canonical names for alias - R51H3 = ['RAD51D', 'RAD51C']
Found multiple canonical names for alias - PTC = ['RET', 'PTCH1']
Found multiple canonical names for alias - SDH1 = ['SDHB', 'SDHA']
Found multiple canoni

311

In [7]:
def convert2canonical(gene):
    """Return the canonical name for ``gene``, or NaN when it is not recognised.

    A name already present in ``GENES_324`` is returned unchanged; otherwise
    the alias map is consulted; anything else yields ``np.nan`` so the caller
    can filter the row out.
    """
    if gene in GENES_324:  # already a canonical name
        return gene
    # NOTE(review): keys of alias2canonicalmap are upper-cased when the map is built,
    # but this lookup is exact - an alias containing lower-case letters will not match. Confirm intended.
    if gene in alias2canonicalmap:  # not canonical name => convert to canonical name
        return alias2canonicalmap[gene]
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan


In [8]:
def get_matrices(df, merged_df, criteria="GPD_unit", criteria_value="PIU", index_name = "DepMap_ID", column_name = "Hugo_Symbol", genes=None):
    """Pivot aggregated scores into a dense patient x gene matrix for one bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated scores; must contain ``criteria``, ``index_name``,
        ``column_name`` and ``xon17_score`` columns.
    merged_df : pandas.DataFrame
        Frame whose ``index_name`` column lists every patient that must get a
        row, even when the patient has no variant in this bin.
    criteria, criteria_value
        Rows of ``df`` are kept where ``df[criteria] == criteria_value``.
    index_name, column_name : str
        Columns of ``df`` used as the row (patient) and column (gene) labels.
    genes : sequence of str, optional
        Output columns, in order. Defaults to the module-level ``GENES_324``.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by patient id, columns exactly ``genes``; every missing
        patient/gene combination is 0. The shape of the raw pivot (before
        padding) is printed as a side effect.
    """
    if genes is None:
        genes = GENES_324
    genes = list(genes)

    df_reduced = df[df[criteria] == criteria_value]
    # pivot_table uses its default aggregation (mean) if a patient/gene pair occurs more than once.
    df_reduced_matrix = pd.pivot_table(df_reduced, index=index_name, columns=column_name, values="xon17_score")
    print(df_reduced_matrix.shape)

    # Rows: every patient seen in the pivot or in merged_df. Columns: exactly `genes`.
    # A single reindex replaces the per-column insertion and the row-by-row use of the
    # private DataFrame._append, which copied the whole frame for each missing patient.
    all_patients = pd.Index(merged_df[index_name].unique(), name=index_name)
    full_index = df_reduced_matrix.index.union(all_patients).rename(index_name)

    df_reduced_matrix = (
        df_reduced_matrix
        .reindex(index=full_index, columns=genes)
        .fillna(0)
        .sort_index()
    )

    return df_reduced_matrix

In [9]:
# Pre-built vocabulary (created by predict-ai-data-processing.ipynb); the first CSV column becomes the index.
vocab_df = pd.read_csv(
    filepath_or_buffer="/data/ajayago/papers_data/systematic_assessment/processed/vocab_predict_ai_ccle_cbio_icgc_moores_tcga_genie_nci60_nuh_union.csv",
    index_col=0,
)
vocab_df

Unnamed: 0,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,primateai_pred,...,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,gpd_LU,gpd_NCU,gpd_PIU
A1BG@A191T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
A1BG@A268V,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
A1BG@A295T,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
A1BG@A332E,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
A1BG@A353V,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZZEF1@Y255H,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
ZZEF1@Y2618S,1,1,1,1,1,0,1,0,1,0,...,1,1,1,1,0,0,1,1,0,0
ZZEF1@Y399H,1,1,1,1,1,0,0,0,0,1,...,1,1,1,1,0,0,1,1,0,0
ZZEF1@Y702D,1,1,1,1,1,0,1,0,1,0,...,1,0,1,1,0,0,1,0,0,1


In [10]:
# List the annotation columns available in the vocabulary.
vocab_df.columns

Index(['sift_pred', 'sift4g_pred', 'lrt_pred', 'mutationtaster_pred',
       'mutationassessor_pred', 'fathmm_pred', 'provean_pred', 'metasvm_pred',
       'm_cap_pred', 'primateai_pred', 'deogen2_pred', 'bayesdel_addaf_pred',
       'bayesdel_noaf_pred', 'clinpred_pred', 'list_s2_pred',
       'fathmm_mkl_coding_pred', 'fathmm_xf_coding_pred', 'clinvar_Pathogenic',
       'clinvar_Benign', 'clinvar_Unknown', 'gpd_LU', 'gpd_NCU', 'gpd_PIU'],
      dtype='object')

In [11]:
# xon17_score: row-wise sum of the 17 `*_pred` columns listed below, divided by 17.
xon17_predictor_columns = [
    'sift_pred', 'sift4g_pred', 'lrt_pred', 'mutationtaster_pred',
    'mutationassessor_pred', 'fathmm_pred', 'provean_pred', 'metasvm_pred',
    'm_cap_pred', 'primateai_pred', 'deogen2_pred', 'bayesdel_addaf_pred',
    'bayesdel_noaf_pred', 'clinpred_pred', 'list_s2_pred',
    'fathmm_mkl_coding_pred', 'fathmm_xf_coding_pred',
]
vocab_df["xon17_score"] = vocab_df[xon17_predictor_columns].sum(axis=1) / 17
vocab_df

Unnamed: 0,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,primateai_pred,...,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
A1BG@A191T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0.000000
A1BG@A268V,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0.000000
A1BG@A295T,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0.058824
A1BG@A332E,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0.117647
A1BG@A353V,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0.058824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZZEF1@Y255H,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0.058824
ZZEF1@Y2618S,1,1,1,1,1,0,1,0,1,0,...,1,1,1,0,0,1,1,0,0,0.764706
ZZEF1@Y399H,1,1,1,1,1,0,0,0,0,1,...,1,1,1,0,0,1,1,0,0,0.588235
ZZEF1@Y702D,1,1,1,1,1,0,1,0,1,0,...,0,1,1,0,0,1,0,0,1,0.705882


### CCLE 23Q4

In [12]:
# Load the CCLE mutation calls and resolve each gene symbol to its canonical name (NaN when unknown).
ccle_mutations = pd.read_csv(
    "/data/druid_data/CCLE_23Q4/patient_gene_alteration(mutation).csv"
).assign(
    canonical_gene_name=lambda frame: frame["gene"].map(convert2canonical)
)
ccle_mutations

Unnamed: 0,depmap_id,gene,alteration,canonical_gene_name
0,PR-sxFiuq,SAMD11,L76V,
1,PR-DNEoiz,SAMD11,P107S,
2,PR-2ei6MD,SAMD11,E160K,
3,PR-CYz5sB,SAMD11,A218V,
4,PR-xcsbEI,SAMD11,N285S,
...,...,...,...,...
885436,PR-MX9ndc,KDM5D,R68H,
885437,PR-AiAKPa,EIF1AY,D83Y,
885438,PR-MX9ndc,RPS4Y2,T115A,
885439,PR-Bs4EcD,RPS4Y2,P152S,


In [13]:
# Keep only rows whose gene resolved to a canonical name, then build the "<GENE>@<alteration>" vocabulary key.
has_canonical_name = ccle_mutations["canonical_gene_name"].astype(str).ne("nan")
ccle_mutations = ccle_mutations.loc[has_canonical_name].reset_index(drop=True)
ccle_mutations = ccle_mutations.assign(
    mutations=ccle_mutations["canonical_gene_name"] + "@" + ccle_mutations["alteration"]
)
ccle_mutations

Unnamed: 0,depmap_id,gene,alteration,canonical_gene_name,mutations
0,PR-rsoNmY,TNFRSF14,G68R,TNFRSF14,TNFRSF14@G68R
1,PR-yDgpga,TNFRSF14,G68R,TNFRSF14,TNFRSF14@G68R
2,PR-sgXEkc,TNFRSF14,G89C,TNFRSF14,TNFRSF14@G89C
3,PR-kRqGcx,TNFRSF14,A140S,TNFRSF14,TNFRSF14@A140S
4,PR-ZhEuUF,TNFRSF14,R149M,TNFRSF14,TNFRSF14@R149M
...,...,...,...,...,...
29444,PR-6Ybf3z,BCORL1,P1755QfsTer20,BCORL1,BCORL1@P1755QfsTer20
29445,PR-81oclJ,BCORL1,P1755QfsTer20,BCORL1,BCORL1@P1755QfsTer20
29446,PR-Qvs2q6,BCORL1,P1755QfsTer20,BCORL1,BCORL1@P1755QfsTer20
29447,PR-EaZDJD,BCORL1,E1767K,BCORL1,BCORL1@E1767K


In [14]:
# Number of distinct CCLE samples left after the gene filter.
ccle_mutations["depmap_id"].nunique(dropna=False)

2313

In [15]:
# Attach the per-variant score and the ClinVar / GPD flags from the vocabulary.
# The left join keeps every mutation row; variants missing from the vocabulary get NaN.
merged_ccle = ccle_mutations.merge(vocab_df[["xon17_score", 'clinvar_Pathogenic', 'clinvar_Benign', 'clinvar_Unknown', 'gpd_LU', 'gpd_NCU', 'gpd_PIU']], left_on="mutations", right_on=vocab_df.index, how="left")
# Fill the defaults with one DataFrame-level fillna. The previous
# `merged_ccle[col].fillna(..., inplace=True)` form is chained assignment on a
# column selection and does not write back to the frame under pandas Copy-on-Write.
merged_ccle = merged_ccle.fillna({
    "xon17_score": 0,
    # NaNs are set to VUS
    "clinvar_Pathogenic": 0,
    "clinvar_Benign": 0,
    "clinvar_Unknown": 1,
    # NaNs are set to NCU
    "gpd_LU": 0,
    "gpd_PIU": 0,
    "gpd_NCU": 1,
})
merged_ccle

Unnamed: 0,depmap_id,gene,alteration,canonical_gene_name,mutations,xon17_score,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,gpd_LU,gpd_NCU,gpd_PIU
0,PR-rsoNmY,TNFRSF14,G68R,TNFRSF14,TNFRSF14@G68R,0.117647,0.0,0.0,1.0,1.0,0.0,0.0
1,PR-yDgpga,TNFRSF14,G68R,TNFRSF14,TNFRSF14@G68R,0.117647,0.0,0.0,1.0,1.0,0.0,0.0
2,PR-sgXEkc,TNFRSF14,G89C,TNFRSF14,TNFRSF14@G89C,0.705882,0.0,0.0,1.0,0.0,0.0,1.0
3,PR-kRqGcx,TNFRSF14,A140S,TNFRSF14,TNFRSF14@A140S,0.058824,0.0,0.0,1.0,0.0,0.0,1.0
4,PR-ZhEuUF,TNFRSF14,R149M,TNFRSF14,TNFRSF14@R149M,0.294118,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
29444,PR-6Ybf3z,BCORL1,P1755QfsTer20,BCORL1,BCORL1@P1755QfsTer20,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
29445,PR-81oclJ,BCORL1,P1755QfsTer20,BCORL1,BCORL1@P1755QfsTer20,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
29446,PR-Qvs2q6,BCORL1,P1755QfsTer20,BCORL1,BCORL1@P1755QfsTer20,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
29447,PR-EaZDJD,BCORL1,E1767K,BCORL1,BCORL1@E1767K,0.000000,0.0,0.0,1.0,0.0,1.0,0.0


In [16]:
# Sanity check: number of merged rows without a canonical gene name.
merged_ccle["canonical_gene_name"].isna().sum()

0

In [17]:
# Column sums of the ClinVar / GPD flag columns.
merged_ccle.loc[
    :,
    ["clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown", "gpd_LU", "gpd_NCU", "gpd_PIU"],
].sum()

clinvar_Pathogenic     2150.0
clinvar_Benign           32.0
clinvar_Unknown       27267.0
gpd_LU                 6366.0
gpd_NCU                9718.0
gpd_PIU               13365.0
dtype: float64

#### GPD features

In [18]:
# Max feature
# Select xon17_score before aggregating so the unrelated text columns are not reduced and then discarded.
ccle_gpd_max = (
    merged_ccle
    .groupby(["depmap_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .max()
    .reset_index()
)
ccle_gpd_max

Unnamed: 0,depmap_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,PR-00UtU3,MAP3K13,1.0,0.0,0.0,0.588235
1,PR-00UtU3,P2RY8,1.0,0.0,0.0,0.176471
2,PR-00UtU3,PTEN,0.0,0.0,1.0,0.941176
3,PR-04VvBz,BARD1,0.0,1.0,0.0,0.000000
4,PR-04VvBz,KDM6A,1.0,0.0,0.0,0.529412
...,...,...,...,...,...,...
27225,PR-zyM15A,NRAS,0.0,0.0,1.0,0.823529
27226,PR-zyM15A,PIK3CA,1.0,0.0,0.0,0.529412
27227,PR-zyM15A,POLD1,0.0,0.0,1.0,0.823529
27228,PR-zyM15A,RNF43,0.0,1.0,0.0,0.000000


In [19]:
# One sample x gene matrix per GPD unit (LU, PIU, NCU) from the max-aggregated scores.
ccle_gpd_max_lu_matrix, ccle_gpd_max_piu_matrix, ccle_gpd_max_ncu_matrix = (
    get_matrices(ccle_gpd_max, merged_ccle, criteria=gpd_flag, criteria_value=1, index_name='depmap_id', column_name='canonical_gene_name')
    for gpd_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(*[m.shape for m in (ccle_gpd_max_lu_matrix, ccle_gpd_max_piu_matrix, ccle_gpd_max_ncu_matrix)], sep="\n")

(1741, 279)
(2180, 303)
(2056, 309)
(2313, 324)
(2313, 324)
(2313, 324)


In [20]:
# Sum feature
# Select xon17_score before aggregating so the text columns are not summed (string-concatenated) and then discarded.
ccle_gpd_sum = (
    merged_ccle
    .groupby(["depmap_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .sum()
    .reset_index()
)
ccle_gpd_sum

Unnamed: 0,depmap_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,PR-00UtU3,MAP3K13,1.0,0.0,0.0,0.588235
1,PR-00UtU3,P2RY8,1.0,0.0,0.0,0.176471
2,PR-00UtU3,PTEN,0.0,0.0,1.0,0.941176
3,PR-04VvBz,BARD1,0.0,1.0,0.0,0.000000
4,PR-04VvBz,KDM6A,1.0,0.0,0.0,0.529412
...,...,...,...,...,...,...
27225,PR-zyM15A,NRAS,0.0,0.0,1.0,0.823529
27226,PR-zyM15A,PIK3CA,1.0,0.0,0.0,0.529412
27227,PR-zyM15A,POLD1,0.0,0.0,1.0,0.823529
27228,PR-zyM15A,RNF43,0.0,1.0,0.0,0.000000


In [21]:
# One sample x gene matrix per GPD unit (LU, PIU, NCU) from the sum-aggregated scores.
ccle_gpd_sum_lu_matrix, ccle_gpd_sum_piu_matrix, ccle_gpd_sum_ncu_matrix = (
    get_matrices(ccle_gpd_sum, merged_ccle, criteria=gpd_flag, criteria_value=1, index_name='depmap_id', column_name='canonical_gene_name')
    for gpd_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(*[m.shape for m in (ccle_gpd_sum_lu_matrix, ccle_gpd_sum_piu_matrix, ccle_gpd_sum_ncu_matrix)], sep="\n")

(1741, 279)
(2180, 303)
(2056, 309)
(2313, 324)
(2313, 324)
(2313, 324)


In [22]:
# Mean feature
ccle_gpd_mean = (
    merged_ccle
    .groupby(["depmap_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .mean()
    .reset_index()
)
ccle_gpd_mean

Unnamed: 0,depmap_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,PR-00UtU3,MAP3K13,1.0,0.0,0.0,0.588235
1,PR-00UtU3,P2RY8,1.0,0.0,0.0,0.176471
2,PR-00UtU3,PTEN,0.0,0.0,1.0,0.941176
3,PR-04VvBz,BARD1,0.0,1.0,0.0,0.000000
4,PR-04VvBz,KDM6A,1.0,0.0,0.0,0.529412
...,...,...,...,...,...,...
27225,PR-zyM15A,NRAS,0.0,0.0,1.0,0.823529
27226,PR-zyM15A,PIK3CA,1.0,0.0,0.0,0.529412
27227,PR-zyM15A,POLD1,0.0,0.0,1.0,0.823529
27228,PR-zyM15A,RNF43,0.0,1.0,0.0,0.000000


In [23]:
# One sample x gene matrix per GPD unit (LU, PIU, NCU) from the mean-aggregated scores.
ccle_gpd_mean_lu_matrix, ccle_gpd_mean_piu_matrix, ccle_gpd_mean_ncu_matrix = (
    get_matrices(ccle_gpd_mean, merged_ccle, criteria=gpd_flag, criteria_value=1, index_name='depmap_id', column_name='canonical_gene_name')
    for gpd_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(*[m.shape for m in (ccle_gpd_mean_lu_matrix, ccle_gpd_mean_piu_matrix, ccle_gpd_mean_ncu_matrix)], sep="\n")

(1741, 279)
(2180, 303)
(2056, 309)
(2313, 324)
(2313, 324)
(2313, 324)


In [24]:
# Count feature
# Select xon17_score before aggregating; the resulting xon17_score column holds the number of variants per group.
ccle_gpd_count = (
    merged_ccle
    .groupby(["depmap_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .count()
    .reset_index()
)
ccle_gpd_count

Unnamed: 0,depmap_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,PR-00UtU3,MAP3K13,1.0,0.0,0.0,1
1,PR-00UtU3,P2RY8,1.0,0.0,0.0,1
2,PR-00UtU3,PTEN,0.0,0.0,1.0,1
3,PR-04VvBz,BARD1,0.0,1.0,0.0,1
4,PR-04VvBz,KDM6A,1.0,0.0,0.0,1
...,...,...,...,...,...,...
27225,PR-zyM15A,NRAS,0.0,0.0,1.0,1
27226,PR-zyM15A,PIK3CA,1.0,0.0,0.0,1
27227,PR-zyM15A,POLD1,0.0,0.0,1.0,1
27228,PR-zyM15A,RNF43,0.0,1.0,0.0,1


In [25]:
# One sample x gene matrix per GPD unit (LU, PIU, NCU) from the variant counts.
ccle_gpd_count_lu_matrix, ccle_gpd_count_piu_matrix, ccle_gpd_count_ncu_matrix = (
    get_matrices(ccle_gpd_count, merged_ccle, criteria=gpd_flag, criteria_value=1, index_name='depmap_id', column_name='canonical_gene_name')
    for gpd_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(*[m.shape for m in (ccle_gpd_count_lu_matrix, ccle_gpd_count_piu_matrix, ccle_gpd_count_ncu_matrix)], sep="\n")

(1741, 279)
(2180, 303)
(2056, 309)
(2313, 324)
(2313, 324)
(2313, 324)


#### ClinVar

In [26]:
# Max feature
ccle_clinvar_max = (
    merged_ccle
    .groupby(["depmap_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .max()
    .reset_index()
)
ccle_clinvar_max

Unnamed: 0,depmap_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,PR-00UtU3,MAP3K13,0.0,0.0,1.0,0.588235
1,PR-00UtU3,P2RY8,0.0,0.0,1.0,0.176471
2,PR-00UtU3,PTEN,0.0,0.0,1.0,0.941176
3,PR-04VvBz,BARD1,0.0,0.0,1.0,0.000000
4,PR-04VvBz,KDM6A,0.0,0.0,1.0,0.529412
...,...,...,...,...,...,...
25918,PR-zyM15A,NRAS,1.0,0.0,0.0,0.823529
25919,PR-zyM15A,PIK3CA,1.0,0.0,0.0,0.529412
25920,PR-zyM15A,POLD1,0.0,0.0,1.0,0.823529
25921,PR-zyM15A,RNF43,0.0,0.0,1.0,0.000000


In [27]:
# One sample x gene matrix per ClinVar class (Pathogenic, VUS, Benign) from the max-aggregated scores.
ccle_clinvar_max_pathogenic_matrix, ccle_clinvar_max_vus_matrix, ccle_clinvar_max_benign_matrix = (
    get_matrices(ccle_clinvar_max, merged_ccle, criteria=clinvar_flag, criteria_value=1, index_name='depmap_id', column_name='canonical_gene_name')
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(*[m.shape for m in (ccle_clinvar_max_pathogenic_matrix, ccle_clinvar_max_vus_matrix, ccle_clinvar_max_benign_matrix)], sep="\n")

(1390, 105)
(2301, 318)
(32, 15)
(2313, 324)
(2313, 324)
(2313, 324)


In [28]:
# Sum feature
# Select xon17_score before aggregating so the text columns are not summed (string-concatenated) and then discarded.
ccle_clinvar_sum = (
    merged_ccle
    .groupby(["depmap_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .sum()
    .reset_index()
)
ccle_clinvar_sum

Unnamed: 0,depmap_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,PR-00UtU3,MAP3K13,0.0,0.0,1.0,0.588235
1,PR-00UtU3,P2RY8,0.0,0.0,1.0,0.176471
2,PR-00UtU3,PTEN,0.0,0.0,1.0,0.941176
3,PR-04VvBz,BARD1,0.0,0.0,1.0,0.000000
4,PR-04VvBz,KDM6A,0.0,0.0,1.0,0.529412
...,...,...,...,...,...,...
25918,PR-zyM15A,NRAS,1.0,0.0,0.0,0.823529
25919,PR-zyM15A,PIK3CA,1.0,0.0,0.0,0.529412
25920,PR-zyM15A,POLD1,0.0,0.0,1.0,0.823529
25921,PR-zyM15A,RNF43,0.0,0.0,1.0,0.000000


In [29]:
# One sample x gene matrix per ClinVar class (Pathogenic, VUS, Benign) from the sum-aggregated scores.
ccle_clinvar_sum_pathogenic_matrix, ccle_clinvar_sum_vus_matrix, ccle_clinvar_sum_benign_matrix = (
    get_matrices(ccle_clinvar_sum, merged_ccle, criteria=clinvar_flag, criteria_value=1, index_name='depmap_id', column_name='canonical_gene_name')
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(*[m.shape for m in (ccle_clinvar_sum_pathogenic_matrix, ccle_clinvar_sum_vus_matrix, ccle_clinvar_sum_benign_matrix)], sep="\n")

(1390, 105)
(2301, 318)
(32, 15)
(2313, 324)
(2313, 324)
(2313, 324)


In [30]:
# Mean feature
ccle_clinvar_mean = (
    merged_ccle
    .groupby(["depmap_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .mean()
    .reset_index()
)
ccle_clinvar_mean

Unnamed: 0,depmap_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,PR-00UtU3,MAP3K13,0.0,0.0,1.0,0.588235
1,PR-00UtU3,P2RY8,0.0,0.0,1.0,0.176471
2,PR-00UtU3,PTEN,0.0,0.0,1.0,0.941176
3,PR-04VvBz,BARD1,0.0,0.0,1.0,0.000000
4,PR-04VvBz,KDM6A,0.0,0.0,1.0,0.529412
...,...,...,...,...,...,...
25918,PR-zyM15A,NRAS,1.0,0.0,0.0,0.823529
25919,PR-zyM15A,PIK3CA,1.0,0.0,0.0,0.529412
25920,PR-zyM15A,POLD1,0.0,0.0,1.0,0.823529
25921,PR-zyM15A,RNF43,0.0,0.0,1.0,0.000000


In [31]:
# One sample x gene matrix per ClinVar class (Pathogenic, VUS, Benign) from the mean-aggregated scores.
ccle_clinvar_mean_pathogenic_matrix, ccle_clinvar_mean_vus_matrix, ccle_clinvar_mean_benign_matrix = (
    get_matrices(ccle_clinvar_mean, merged_ccle, criteria=clinvar_flag, criteria_value=1, index_name='depmap_id', column_name='canonical_gene_name')
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(*[m.shape for m in (ccle_clinvar_mean_pathogenic_matrix, ccle_clinvar_mean_vus_matrix, ccle_clinvar_mean_benign_matrix)], sep="\n")

(1390, 105)
(2301, 318)
(32, 15)
(2313, 324)
(2313, 324)
(2313, 324)


In [32]:
# Count feature
# Select xon17_score before aggregating; the resulting xon17_score column holds the number of variants per group.
ccle_clinvar_count = (
    merged_ccle
    .groupby(["depmap_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .count()
    .reset_index()
)
ccle_clinvar_count

Unnamed: 0,depmap_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,PR-00UtU3,MAP3K13,0.0,0.0,1.0,1
1,PR-00UtU3,P2RY8,0.0,0.0,1.0,1
2,PR-00UtU3,PTEN,0.0,0.0,1.0,1
3,PR-04VvBz,BARD1,0.0,0.0,1.0,1
4,PR-04VvBz,KDM6A,0.0,0.0,1.0,1
...,...,...,...,...,...,...
25918,PR-zyM15A,NRAS,1.0,0.0,0.0,1
25919,PR-zyM15A,PIK3CA,1.0,0.0,0.0,1
25920,PR-zyM15A,POLD1,0.0,0.0,1.0,1
25921,PR-zyM15A,RNF43,0.0,0.0,1.0,1


In [33]:
# One sample x gene matrix per ClinVar class (Pathogenic, VUS, Benign) from the variant counts.
ccle_clinvar_count_pathogenic_matrix, ccle_clinvar_count_vus_matrix, ccle_clinvar_count_benign_matrix = (
    get_matrices(ccle_clinvar_count, merged_ccle, criteria=clinvar_flag, criteria_value=1, index_name='depmap_id', column_name='canonical_gene_name')
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(*[m.shape for m in (ccle_clinvar_count_pathogenic_matrix, ccle_clinvar_count_vus_matrix, ccle_clinvar_count_benign_matrix)], sep="\n")

(1390, 105)
(2301, 318)
(32, 15)
(2313, 324)
(2313, 324)
(2313, 324)


In [34]:
# Add suffixes to identify columns
# NOTE: each name is rebound to its suffixed frame, so re-running this cell alone appends the suffix twice;
# re-run the matrix-building cells first.
ccle_gpd_max_piu_matrix = ccle_gpd_max_piu_matrix.add_suffix('_piu_max')
ccle_gpd_sum_piu_matrix = ccle_gpd_sum_piu_matrix.add_suffix("_piu_sum")
ccle_gpd_mean_piu_matrix = ccle_gpd_mean_piu_matrix.add_suffix("_piu_mean")
ccle_gpd_count_piu_matrix = ccle_gpd_count_piu_matrix.add_suffix("_piu_count")
ccle_gpd_max_lu_matrix = ccle_gpd_max_lu_matrix.add_suffix("_lu_max")
ccle_gpd_sum_lu_matrix = ccle_gpd_sum_lu_matrix.add_suffix("_lu_sum")
ccle_gpd_mean_lu_matrix = ccle_gpd_mean_lu_matrix.add_suffix("_lu_mean")
# Fixed: this was built from ccle_gpd_count_ncu_matrix, so the *_lu_count features held NCU counts.
ccle_gpd_count_lu_matrix = ccle_gpd_count_lu_matrix.add_suffix("_lu_count")
ccle_gpd_max_ncu_matrix = ccle_gpd_max_ncu_matrix.add_suffix("_ncu_max")
ccle_gpd_sum_ncu_matrix = ccle_gpd_sum_ncu_matrix.add_suffix("_ncu_sum")
ccle_gpd_mean_ncu_matrix = ccle_gpd_mean_ncu_matrix.add_suffix("_ncu_mean")
ccle_gpd_count_ncu_matrix = ccle_gpd_count_ncu_matrix.add_suffix("_ncu_count")

ccle_clinvar_max_pathogenic_matrix = ccle_clinvar_max_pathogenic_matrix.add_suffix("_pathogenic_max")
ccle_clinvar_sum_pathogenic_matrix = ccle_clinvar_sum_pathogenic_matrix.add_suffix("_pathogenic_sum")
ccle_clinvar_mean_pathogenic_matrix = ccle_clinvar_mean_pathogenic_matrix.add_suffix("_pathogenic_mean")
ccle_clinvar_count_pathogenic_matrix = ccle_clinvar_count_pathogenic_matrix.add_suffix("_pathogenic_count")
ccle_clinvar_max_vus_matrix = ccle_clinvar_max_vus_matrix.add_suffix("_vus_max")
ccle_clinvar_sum_vus_matrix = ccle_clinvar_sum_vus_matrix.add_suffix("_vus_sum")
ccle_clinvar_mean_vus_matrix = ccle_clinvar_mean_vus_matrix.add_suffix("_vus_mean")
ccle_clinvar_count_vus_matrix = ccle_clinvar_count_vus_matrix.add_suffix("_vus_count")
ccle_clinvar_max_benign_matrix = ccle_clinvar_max_benign_matrix.add_suffix("_benign_max")
ccle_clinvar_sum_benign_matrix = ccle_clinvar_sum_benign_matrix.add_suffix("_benign_sum")
ccle_clinvar_mean_benign_matrix = ccle_clinvar_mean_benign_matrix.add_suffix("_benign_mean")
ccle_clinvar_count_benign_matrix = ccle_clinvar_count_benign_matrix.add_suffix("_benign_count")

In [35]:
# Column-block order: for each of PIU, LU, NCU, Pathogenic, VUS, Benign -> Max, Sum, Mean, Count.
ccle_feature_blocks = [
    # GPD: PIU
    ccle_gpd_max_piu_matrix, ccle_gpd_sum_piu_matrix, ccle_gpd_mean_piu_matrix, ccle_gpd_count_piu_matrix,
    # GPD: LU
    ccle_gpd_max_lu_matrix, ccle_gpd_sum_lu_matrix, ccle_gpd_mean_lu_matrix, ccle_gpd_count_lu_matrix,
    # GPD: NCU
    ccle_gpd_max_ncu_matrix, ccle_gpd_sum_ncu_matrix, ccle_gpd_mean_ncu_matrix, ccle_gpd_count_ncu_matrix,
    # ClinVar: Pathogenic
    ccle_clinvar_max_pathogenic_matrix, ccle_clinvar_sum_pathogenic_matrix, ccle_clinvar_mean_pathogenic_matrix, ccle_clinvar_count_pathogenic_matrix,
    # ClinVar: VUS
    ccle_clinvar_max_vus_matrix, ccle_clinvar_sum_vus_matrix, ccle_clinvar_mean_vus_matrix, ccle_clinvar_count_vus_matrix,
    # ClinVar: Benign
    ccle_clinvar_max_benign_matrix, ccle_clinvar_sum_benign_matrix, ccle_clinvar_mean_benign_matrix, ccle_clinvar_count_benign_matrix,
]
# Side-by-side concatenation; rows are aligned on the shared sample index.
ccle_feature_matrix = pd.concat(ccle_feature_blocks, axis="columns")
ccle_feature_matrix.shape

(2313, 7776)

In [36]:
# Write the CCLE feature matrix to disk (the sample id index is written as the first CSV column).
ccle_feature_matrix.to_csv(
    path_or_buf="/data/ajayago/papers_data/systematic_assessment/raw/annotated_mutation_matrices/clinvar_gpd_annovar_annotated_CCLE_23Q4_feature_matrix.csv"
)

### Moore's

In [37]:
# Load the Moore's mutation calls and resolve each gene symbol to its canonical name (NaN when unknown).
moores_mutations = pd.read_csv(
    "/data/druid_data/Moores/patient_gene_alteration(mutation).csv"
).assign(
    canonical_gene_name=lambda frame: frame["gene"].map(convert2canonical)
)
moores_mutations

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name
0,1,PTEN,splice site 493-1 G>A,PTEN
1,2,TP53,P151A,TP53
2,3,ESR1,Y537S,ESR1
3,4,PTEN,I67K,PTEN
4,4,CTNNB1,T257I,CTNNB1
...,...,...,...,...
220,84,GATA3,G335fs*18,GATA3
221,85,TP53,H168R,TP53
222,85,GATA3,N332fs*21,GATA3
223,86,MLL2,A4571T,MLL2


In [38]:
# Keep only rows whose gene resolved to a canonical name, then build the "<GENE>@<alteration>" vocabulary key.
has_canonical_name = moores_mutations["canonical_gene_name"].astype(str).ne("nan")
moores_mutations = moores_mutations.loc[has_canonical_name].reset_index(drop=True)
moores_mutations = moores_mutations.assign(
    mutations=moores_mutations["canonical_gene_name"] + "@" + moores_mutations["alteration"]
)
moores_mutations

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name,mutations
0,1,PTEN,splice site 493-1 G>A,PTEN,PTEN@splice site 493-1 G>A
1,2,TP53,P151A,TP53,TP53@P151A
2,3,ESR1,Y537S,ESR1,ESR1@Y537S
3,4,PTEN,I67K,PTEN,PTEN@I67K
4,4,CTNNB1,T257I,CTNNB1,CTNNB1@T257I
...,...,...,...,...,...
211,84,GATA3,G335fs*18,GATA3,GATA3@G335fs*18
212,85,TP53,H168R,TP53,TP53@H168R
213,85,GATA3,N332fs*21,GATA3,GATA3@N332fs*21
214,86,MLL2,A4571T,MLL2,MLL2@A4571T


In [39]:
# Number of distinct Moore's patients left after the gene filter.
moores_mutations["patient_id"].nunique(dropna=False)

81

In [40]:
# Attach the per-variant score and the ClinVar / GPD flags from the vocabulary.
# The left join keeps every mutation row; variants missing from the vocabulary get NaN.
merged_moores = moores_mutations.merge(vocab_df[["xon17_score", 'clinvar_Pathogenic', 'clinvar_Benign', 'clinvar_Unknown', 'gpd_LU', 'gpd_NCU', 'gpd_PIU']], left_on="mutations", right_on=vocab_df.index, how="left")
# Fill the defaults with one DataFrame-level fillna. The previous
# `merged_moores[col].fillna(..., inplace=True)` form is chained assignment on a
# column selection and does not write back to the frame under pandas Copy-on-Write.
merged_moores = merged_moores.fillna({
    "xon17_score": 0,
    # NaNs are set to VUS
    "clinvar_Pathogenic": 0,
    "clinvar_Benign": 0,
    "clinvar_Unknown": 1,
    # NaNs are set to NCU
    "gpd_LU": 0,
    "gpd_PIU": 0,
    "gpd_NCU": 1,
})
merged_moores

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name,mutations,xon17_score,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,gpd_LU,gpd_NCU,gpd_PIU
0,1,PTEN,splice site 493-1 G>A,PTEN,PTEN@splice site 493-1 G>A,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
1,2,TP53,P151A,TP53,TP53@P151A,0.941176,1.0,0.0,0.0,0.0,0.0,1.0
2,3,ESR1,Y537S,ESR1,ESR1@Y537S,1.000000,0.0,0.0,1.0,0.0,0.0,1.0
3,4,PTEN,I67K,PTEN,PTEN@I67K,1.000000,0.0,0.0,1.0,0.0,0.0,1.0
4,4,CTNNB1,T257I,CTNNB1,CTNNB1@T257I,0.941176,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
211,84,GATA3,G335fs*18,GATA3,GATA3@G335fs*18,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
212,85,TP53,H168R,TP53,TP53@H168R,0.882353,0.0,0.0,1.0,0.0,0.0,1.0
213,85,GATA3,N332fs*21,GATA3,GATA3@N332fs*21,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
214,86,MLL2,A4571T,MLL2,MLL2@A4571T,0.000000,0.0,0.0,1.0,0.0,1.0,0.0


In [41]:
# Sanity check: number of merged rows without a canonical gene name.
merged_moores["canonical_gene_name"].isna().sum()

0

In [42]:
# Column sums of the ClinVar / GPD flag columns.
merged_moores.loc[
    :,
    ["clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown", "gpd_LU", "gpd_NCU", "gpd_PIU"],
].sum()

clinvar_Pathogenic     73.0
clinvar_Benign          3.0
clinvar_Unknown       140.0
gpd_LU                 25.0
gpd_NCU               102.0
gpd_PIU                89.0
dtype: float64

#### GPD features

In [43]:
# Max feature
# Select xon17_score before aggregating so the unrelated text columns are not reduced and then discarded.
moores_gpd_max = (
    merged_moores
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .max()
    .reset_index()
)
moores_gpd_max

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,1,PTEN,0.0,1.0,0.0,0.000000
1,2,TP53,0.0,0.0,1.0,0.941176
2,3,ESR1,0.0,0.0,1.0,1.000000
3,4,CTNNB1,0.0,0.0,1.0,0.941176
4,4,PTEN,0.0,0.0,1.0,1.000000
...,...,...,...,...,...,...
197,84,GATA3,0.0,1.0,0.0,0.000000
198,85,GATA3,0.0,1.0,0.0,0.000000
199,85,TP53,0.0,0.0,1.0,0.882353
200,86,MLL2,0.0,1.0,0.0,0.000000


In [44]:
# One patient x gene matrix per GPD unit (LU, PIU, NCU) from the max-aggregated scores.
moores_gpd_max_lu_matrix, moores_gpd_max_piu_matrix, moores_gpd_max_ncu_matrix = (
    get_matrices(moores_gpd_max, merged_moores, criteria=gpd_flag, criteria_value=1, index_name='patient_id', column_name='canonical_gene_name')
    for gpd_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(*[m.shape for m in (moores_gpd_max_lu_matrix, moores_gpd_max_piu_matrix, moores_gpd_max_ncu_matrix)], sep="\n")

(21, 12)
(55, 28)
(53, 46)
(81, 324)
(81, 324)
(81, 324)


In [45]:
# Sum feature
# Select xon17_score before aggregating so the text columns are not summed (string-concatenated) and then discarded.
moores_gpd_sum = (
    merged_moores
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .sum()
    .reset_index()
)
moores_gpd_sum

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,1,PTEN,0.0,1.0,0.0,0.000000
1,2,TP53,0.0,0.0,1.0,0.941176
2,3,ESR1,0.0,0.0,1.0,1.000000
3,4,CTNNB1,0.0,0.0,1.0,0.941176
4,4,PTEN,0.0,0.0,1.0,1.000000
...,...,...,...,...,...,...
197,84,GATA3,0.0,1.0,0.0,0.000000
198,85,GATA3,0.0,1.0,0.0,0.000000
199,85,TP53,0.0,0.0,1.0,0.882353
200,86,MLL2,0.0,1.0,0.0,0.000000


In [46]:
# One patient x gene matrix per GPD unit (LU, PIU, NCU) from the sum-aggregated scores.
moores_gpd_sum_lu_matrix, moores_gpd_sum_piu_matrix, moores_gpd_sum_ncu_matrix = (
    get_matrices(moores_gpd_sum, merged_moores, criteria=gpd_flag, criteria_value=1, index_name='patient_id', column_name='canonical_gene_name')
    for gpd_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(*[m.shape for m in (moores_gpd_sum_lu_matrix, moores_gpd_sum_piu_matrix, moores_gpd_sum_ncu_matrix)], sep="\n")

(21, 12)
(55, 28)
(53, 46)
(81, 324)
(81, 324)
(81, 324)


In [47]:
# Mean feature
moores_gpd_mean = (
    merged_moores
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .mean()
    .reset_index()
)
moores_gpd_mean

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,1,PTEN,0.0,1.0,0.0,0.000000
1,2,TP53,0.0,0.0,1.0,0.941176
2,3,ESR1,0.0,0.0,1.0,1.000000
3,4,CTNNB1,0.0,0.0,1.0,0.941176
4,4,PTEN,0.0,0.0,1.0,1.000000
...,...,...,...,...,...,...
197,84,GATA3,0.0,1.0,0.0,0.000000
198,85,GATA3,0.0,1.0,0.0,0.000000
199,85,TP53,0.0,0.0,1.0,0.882353
200,86,MLL2,0.0,1.0,0.0,0.000000


In [48]:
# One "mean" matrix per GPD bin flag (LU / PIU / NCU).
# get_matrices is defined earlier in this notebook.
moores_gpd_mean_lu_matrix = get_matrices(
    moores_gpd_mean, merged_moores,
    criteria="gpd_LU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_gpd_mean_piu_matrix = get_matrices(
    moores_gpd_mean, merged_moores,
    criteria="gpd_PIU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_gpd_mean_ncu_matrix = get_matrices(
    moores_gpd_mean, merged_moores,
    criteria="gpd_NCU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    moores_gpd_mean_lu_matrix.shape,
    moores_gpd_mean_piu_matrix.shape,
    moores_gpd_mean_ncu_matrix.shape,
    sep="\n",
)

(21, 12)
(55, 28)
(53, 46)
(81, 324)
(81, 324)
(81, 324)


In [49]:
# Count feature: number of non-null xon17_score rows per (patient, gene, GPD bin) group.
# The score column is selected before aggregating so that only xon17_score is
# counted, instead of counting every column and discarding all but one.
moores_gpd_count = (
    merged_moores
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .count()
    .reset_index()
)
moores_gpd_count

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,1,PTEN,0.0,1.0,0.0,1
1,2,TP53,0.0,0.0,1.0,1
2,3,ESR1,0.0,0.0,1.0,1
3,4,CTNNB1,0.0,0.0,1.0,1
4,4,PTEN,0.0,0.0,1.0,1
...,...,...,...,...,...,...
197,84,GATA3,0.0,1.0,0.0,1
198,85,GATA3,0.0,1.0,0.0,1
199,85,TP53,0.0,0.0,1.0,1
200,86,MLL2,0.0,1.0,0.0,1


In [50]:
# One "count" matrix per GPD bin flag (LU / PIU / NCU).
# get_matrices is defined earlier in this notebook.
moores_gpd_count_lu_matrix = get_matrices(
    moores_gpd_count, merged_moores,
    criteria="gpd_LU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_gpd_count_piu_matrix = get_matrices(
    moores_gpd_count, merged_moores,
    criteria="gpd_PIU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_gpd_count_ncu_matrix = get_matrices(
    moores_gpd_count, merged_moores,
    criteria="gpd_NCU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    moores_gpd_count_lu_matrix.shape,
    moores_gpd_count_piu_matrix.shape,
    moores_gpd_count_ncu_matrix.shape,
    sep="\n",
)

(21, 12)
(55, 28)
(53, 46)
(81, 324)
(81, 324)
(81, 324)


#### ClinVar

In [51]:
# Max feature: largest xon17_score per (patient, gene, ClinVar bin) group.
moores_clinvar_max = (
    merged_moores
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .max()
    .reset_index()
)
moores_clinvar_max

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,1,PTEN,0.0,0.0,1.0,0.000000
1,2,TP53,1.0,0.0,0.0,0.941176
2,3,ESR1,0.0,0.0,1.0,1.000000
3,4,CTNNB1,0.0,0.0,1.0,0.941176
4,4,PTEN,0.0,0.0,1.0,1.000000
...,...,...,...,...,...,...
199,84,GATA3,0.0,0.0,1.0,0.000000
200,85,GATA3,0.0,0.0,1.0,0.000000
201,85,TP53,0.0,0.0,1.0,0.882353
202,86,MLL2,0.0,0.0,1.0,0.000000


In [52]:
# One "max" matrix per ClinVar bin flag (Pathogenic / Unknown, i.e. VUS / Benign).
# get_matrices is defined earlier in this notebook.
moores_clinvar_max_pathogenic_matrix = get_matrices(
    moores_clinvar_max, merged_moores,
    criteria="clinvar_Pathogenic", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_clinvar_max_vus_matrix = get_matrices(
    moores_clinvar_max, merged_moores,
    criteria="clinvar_Unknown", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_clinvar_max_benign_matrix = get_matrices(
    moores_clinvar_max, merged_moores,
    criteria="clinvar_Benign", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    moores_clinvar_max_pathogenic_matrix.shape,
    moores_clinvar_max_vus_matrix.shape,
    moores_clinvar_max_benign_matrix.shape,
    sep="\n",
)

(49, 18)
(66, 58)
(3, 3)
(81, 324)
(81, 324)
(81, 324)


In [53]:
# Sum feature: total xon17_score per (patient, gene, ClinVar bin) group.
# The score column is selected before aggregating so that only xon17_score is
# summed, instead of summing every column and discarding all but one.
moores_clinvar_sum = (
    merged_moores
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .sum()
    .reset_index()
)
moores_clinvar_sum

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,1,PTEN,0.0,0.0,1.0,0.000000
1,2,TP53,1.0,0.0,0.0,0.941176
2,3,ESR1,0.0,0.0,1.0,1.000000
3,4,CTNNB1,0.0,0.0,1.0,0.941176
4,4,PTEN,0.0,0.0,1.0,1.000000
...,...,...,...,...,...,...
199,84,GATA3,0.0,0.0,1.0,0.000000
200,85,GATA3,0.0,0.0,1.0,0.000000
201,85,TP53,0.0,0.0,1.0,0.882353
202,86,MLL2,0.0,0.0,1.0,0.000000


In [54]:
# One "sum" matrix per ClinVar bin flag (Pathogenic / Unknown, i.e. VUS / Benign).
# get_matrices is defined earlier in this notebook.
moores_clinvar_sum_pathogenic_matrix = get_matrices(
    moores_clinvar_sum, merged_moores,
    criteria="clinvar_Pathogenic", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_clinvar_sum_vus_matrix = get_matrices(
    moores_clinvar_sum, merged_moores,
    criteria="clinvar_Unknown", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_clinvar_sum_benign_matrix = get_matrices(
    moores_clinvar_sum, merged_moores,
    criteria="clinvar_Benign", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    moores_clinvar_sum_pathogenic_matrix.shape,
    moores_clinvar_sum_vus_matrix.shape,
    moores_clinvar_sum_benign_matrix.shape,
    sep="\n",
)

(49, 18)
(66, 58)
(3, 3)
(81, 324)
(81, 324)
(81, 324)


In [55]:
# Mean feature: average xon17_score per (patient, gene, ClinVar bin) group.
moores_clinvar_mean = (
    merged_moores
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .mean()
    .reset_index()
)
moores_clinvar_mean

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,1,PTEN,0.0,0.0,1.0,0.000000
1,2,TP53,1.0,0.0,0.0,0.941176
2,3,ESR1,0.0,0.0,1.0,1.000000
3,4,CTNNB1,0.0,0.0,1.0,0.941176
4,4,PTEN,0.0,0.0,1.0,1.000000
...,...,...,...,...,...,...
199,84,GATA3,0.0,0.0,1.0,0.000000
200,85,GATA3,0.0,0.0,1.0,0.000000
201,85,TP53,0.0,0.0,1.0,0.882353
202,86,MLL2,0.0,0.0,1.0,0.000000


In [56]:
# One "mean" matrix per ClinVar bin flag (Pathogenic / Unknown, i.e. VUS / Benign).
# get_matrices is defined earlier in this notebook.
moores_clinvar_mean_pathogenic_matrix = get_matrices(
    moores_clinvar_mean, merged_moores,
    criteria="clinvar_Pathogenic", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_clinvar_mean_vus_matrix = get_matrices(
    moores_clinvar_mean, merged_moores,
    criteria="clinvar_Unknown", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_clinvar_mean_benign_matrix = get_matrices(
    moores_clinvar_mean, merged_moores,
    criteria="clinvar_Benign", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    moores_clinvar_mean_pathogenic_matrix.shape,
    moores_clinvar_mean_vus_matrix.shape,
    moores_clinvar_mean_benign_matrix.shape,
    sep="\n",
)

(49, 18)
(66, 58)
(3, 3)
(81, 324)
(81, 324)
(81, 324)


In [57]:
# Count feature: number of non-null xon17_score rows per (patient, gene, ClinVar bin) group.
# The score column is selected before aggregating so that only xon17_score is
# counted, instead of counting every column and discarding all but one.
moores_clinvar_count = (
    merged_moores
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .count()
    .reset_index()
)
moores_clinvar_count

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,1,PTEN,0.0,0.0,1.0,1
1,2,TP53,1.0,0.0,0.0,1
2,3,ESR1,0.0,0.0,1.0,1
3,4,CTNNB1,0.0,0.0,1.0,1
4,4,PTEN,0.0,0.0,1.0,1
...,...,...,...,...,...,...
199,84,GATA3,0.0,0.0,1.0,1
200,85,GATA3,0.0,0.0,1.0,1
201,85,TP53,0.0,0.0,1.0,1
202,86,MLL2,0.0,0.0,1.0,1


In [58]:
# One "count" matrix per ClinVar bin flag (Pathogenic / Unknown, i.e. VUS / Benign).
# get_matrices is defined earlier in this notebook.
moores_clinvar_count_pathogenic_matrix = get_matrices(
    moores_clinvar_count, merged_moores,
    criteria="clinvar_Pathogenic", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_clinvar_count_vus_matrix = get_matrices(
    moores_clinvar_count, merged_moores,
    criteria="clinvar_Unknown", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
moores_clinvar_count_benign_matrix = get_matrices(
    moores_clinvar_count, merged_moores,
    criteria="clinvar_Benign", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    moores_clinvar_count_pathogenic_matrix.shape,
    moores_clinvar_count_vus_matrix.shape,
    moores_clinvar_count_benign_matrix.shape,
    sep="\n",
)

(49, 18)
(66, 58)
(3, 3)
(81, 324)
(81, 324)
(81, 324)


In [59]:
# Add suffixes to identify columns: every gene column becomes "<gene>_<bin>_<aggregate>".
# NOTE: each matrix is rebound to its suffixed version, so running this cell a
# second time without rebuilding the matrices appends the suffix again.
moores_gpd_max_piu_matrix = moores_gpd_max_piu_matrix.add_suffix("_piu_max")
moores_gpd_sum_piu_matrix = moores_gpd_sum_piu_matrix.add_suffix("_piu_sum")
moores_gpd_mean_piu_matrix = moores_gpd_mean_piu_matrix.add_suffix("_piu_mean")
moores_gpd_count_piu_matrix = moores_gpd_count_piu_matrix.add_suffix("_piu_count")
moores_gpd_max_lu_matrix = moores_gpd_max_lu_matrix.add_suffix("_lu_max")
moores_gpd_sum_lu_matrix = moores_gpd_sum_lu_matrix.add_suffix("_lu_sum")
moores_gpd_mean_lu_matrix = moores_gpd_mean_lu_matrix.add_suffix("_lu_mean")
# Bug fix: the LU count columns must come from the LU count matrix. The NCU
# count matrix was suffixed here before, which stored NCU counts under "_lu_count".
moores_gpd_count_lu_matrix = moores_gpd_count_lu_matrix.add_suffix("_lu_count")
moores_gpd_max_ncu_matrix = moores_gpd_max_ncu_matrix.add_suffix("_ncu_max")
moores_gpd_sum_ncu_matrix = moores_gpd_sum_ncu_matrix.add_suffix("_ncu_sum")
moores_gpd_mean_ncu_matrix = moores_gpd_mean_ncu_matrix.add_suffix("_ncu_mean")
moores_gpd_count_ncu_matrix = moores_gpd_count_ncu_matrix.add_suffix("_ncu_count")

moores_clinvar_max_pathogenic_matrix = moores_clinvar_max_pathogenic_matrix.add_suffix("_pathogenic_max")
moores_clinvar_sum_pathogenic_matrix = moores_clinvar_sum_pathogenic_matrix.add_suffix("_pathogenic_sum")
moores_clinvar_mean_pathogenic_matrix = moores_clinvar_mean_pathogenic_matrix.add_suffix("_pathogenic_mean")
moores_clinvar_count_pathogenic_matrix = moores_clinvar_count_pathogenic_matrix.add_suffix("_pathogenic_count")
moores_clinvar_max_vus_matrix = moores_clinvar_max_vus_matrix.add_suffix("_vus_max")
moores_clinvar_sum_vus_matrix = moores_clinvar_sum_vus_matrix.add_suffix("_vus_sum")
moores_clinvar_mean_vus_matrix = moores_clinvar_mean_vus_matrix.add_suffix("_vus_mean")
moores_clinvar_count_vus_matrix = moores_clinvar_count_vus_matrix.add_suffix("_vus_count")
moores_clinvar_max_benign_matrix = moores_clinvar_max_benign_matrix.add_suffix("_benign_max")
moores_clinvar_sum_benign_matrix = moores_clinvar_sum_benign_matrix.add_suffix("_benign_sum")
moores_clinvar_mean_benign_matrix = moores_clinvar_mean_benign_matrix.add_suffix("_benign_mean")
moores_clinvar_count_benign_matrix = moores_clinvar_count_benign_matrix.add_suffix("_benign_count")

In [60]:
# Column-block order of the final matrix; each bin contributes Max, Sum, Mean, Count:
# PIU, LU, NCU, Pathogenic, VUS, Benign
moores_feature_matrix = pd.concat(
    [
        # GPD - PIU
        moores_gpd_max_piu_matrix,
        moores_gpd_sum_piu_matrix,
        moores_gpd_mean_piu_matrix,
        moores_gpd_count_piu_matrix,
        # GPD - LU
        moores_gpd_max_lu_matrix,
        moores_gpd_sum_lu_matrix,
        moores_gpd_mean_lu_matrix,
        moores_gpd_count_lu_matrix,
        # GPD - NCU
        moores_gpd_max_ncu_matrix,
        moores_gpd_sum_ncu_matrix,
        moores_gpd_mean_ncu_matrix,
        moores_gpd_count_ncu_matrix,
        # ClinVar - Pathogenic
        moores_clinvar_max_pathogenic_matrix,
        moores_clinvar_sum_pathogenic_matrix,
        moores_clinvar_mean_pathogenic_matrix,
        moores_clinvar_count_pathogenic_matrix,
        # ClinVar - VUS
        moores_clinvar_max_vus_matrix,
        moores_clinvar_sum_vus_matrix,
        moores_clinvar_mean_vus_matrix,
        moores_clinvar_count_vus_matrix,
        # ClinVar - Benign
        moores_clinvar_max_benign_matrix,
        moores_clinvar_sum_benign_matrix,
        moores_clinvar_mean_benign_matrix,
        moores_clinvar_count_benign_matrix,
    ],
    axis="columns",
)
moores_feature_matrix.shape

(81, 7776)

In [61]:
# Persist the Moores feature matrix (index included) as CSV.
moores_feature_matrix.to_csv(
    path_or_buf="/data/ajayago/papers_data/systematic_assessment/raw/annotated_mutation_matrices/clinvar_gpd_annovar_annotated_Moores_feature_matrix.csv"
)

### TCGA

In [62]:
# Load the TCGA patient/gene/alteration table and map each gene symbol through
# convert2canonical (defined earlier in this notebook).
tcga_mutations = pd.read_csv("/data/druid_data/Tcga//patient_gene_alteration(mutation).csv")
tcga_mutations["canonical_gene_name"] = tcga_mutations["gene"].apply(convert2canonical)
tcga_mutations

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name
0,TCGA-50-5931,CAMTA1,V870E,
1,TCGA-50-5931,CATSPER4,P365=,
2,TCGA-50-5931,KDF1,I243T,
3,TCGA-50-5931,CSMD2,T417S,
4,TCGA-50-5931,SFPQ,G647C,
...,...,...,...,...
3093849,TCGA-YD-A9TA,CNGA2,G303G,
3093850,TCGA-YD-A9TA,MAGEA12,R243R,
3093851,TCGA-YD-A9TA,ZNF275,L224L,
3093852,TCGA-YD-A9TA,L1CAM,P279P,


In [63]:
# Keep only rows whose canonical gene name does not stringify to "nan", then
# build the "<gene>@<alteration>" key stored in the "mutations" column.
tcga_mutations = (
    tcga_mutations
    .loc[tcga_mutations["canonical_gene_name"].astype(str) != "nan"]
    .reset_index(drop=True)
)
tcga_mutations = tcga_mutations.assign(
    mutations=tcga_mutations["canonical_gene_name"] + "@" + tcga_mutations["alteration"]
)
tcga_mutations

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name,mutations
0,TCGA-50-5931,DNMT3A,V258M,DNMT3A,DNMT3A@V258M
1,TCGA-50-5931,MSH2,I356V,MSH2,MSH2@I356V
2,TCGA-50-5931,RICTOR,L42F,RICTOR,RICTOR@L42F
3,TCGA-50-5931,NOTCH1,D297G,NOTCH1,NOTCH1@D297G
4,TCGA-50-5931,MLL2,E668*,MLL2,MLL2@E668*
...,...,...,...,...,...
100931,TCGA-YD-A9TA,HNF1A,V251V,HNF1A,HNF1A@V251V
100932,TCGA-YD-A9TA,HNF1A,I268I,HNF1A,HNF1A@I268I
100933,TCGA-YD-A9TA,BCR,S317S,BCR,BCR@S317S
100934,TCGA-YD-A9TA,BCR,I688I,BCR,BCR@I688I


In [64]:
# Number of distinct TCGA patients left after filtering.
len(tcga_mutations["patient_id"].unique())

9178

In [65]:
# Left-join every TCGA mutation onto the vocabulary (keyed by its index) to
# pick up the score and the one-hot ClinVar / GPD bin flags.
merged_tcga = tcga_mutations.merge(vocab_df[["xon17_score", 'clinvar_Pathogenic', 'clinvar_Benign', 'clinvar_Unknown', 'gpd_LU', 'gpd_NCU', 'gpd_PIU']], left_on="mutations", right_on=vocab_df.index, how="left")
# Mutations missing from the vocabulary come back as NaN after the left join.
# Fill them with a single DataFrame.fillna mapping instead of
# `df[col].fillna(..., inplace=True)`: the chained in-place form acts on a
# temporary Series and leaves the frame unfilled under pandas Copy-on-Write.
merged_tcga = merged_tcga.fillna({
    "xon17_score": 0,
    # NaNs are set to VUS
    "clinvar_Pathogenic": 0,
    "clinvar_Benign": 0,
    "clinvar_Unknown": 1,
    # NaNs are set to NCU
    "gpd_LU": 0,
    "gpd_PIU": 0,
    "gpd_NCU": 1,
})
merged_tcga

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name,mutations,xon17_score,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,gpd_LU,gpd_NCU,gpd_PIU
0,TCGA-50-5931,DNMT3A,V258M,DNMT3A,DNMT3A@V258M,0.764706,0.0,0.0,1.0,0.0,0.0,1.0
1,TCGA-50-5931,MSH2,I356V,MSH2,MSH2@I356V,0.882353,0.0,0.0,1.0,0.0,0.0,1.0
2,TCGA-50-5931,RICTOR,L42F,RICTOR,RICTOR@L42F,0.823529,0.0,0.0,1.0,1.0,0.0,0.0
3,TCGA-50-5931,NOTCH1,D297G,NOTCH1,NOTCH1@D297G,0.941176,0.0,0.0,1.0,0.0,0.0,1.0
4,TCGA-50-5931,MLL2,E668*,MLL2,MLL2@E668*,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
100931,TCGA-YD-A9TA,HNF1A,V251V,HNF1A,HNF1A@V251V,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
100932,TCGA-YD-A9TA,HNF1A,I268I,HNF1A,HNF1A@I268I,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
100933,TCGA-YD-A9TA,BCR,S317S,BCR,BCR@S317S,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
100934,TCGA-YD-A9TA,BCR,I688I,BCR,BCR@I688I,0.000000,0.0,0.0,1.0,0.0,1.0,0.0


In [66]:
# Sanity check: count of rows without a canonical gene name.
merged_tcga["canonical_gene_name"].isna().sum()

0

In [67]:
# Column totals of the one-hot ClinVar and GPD bin flags.
merged_tcga[
    [
        "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown",
        "gpd_LU", "gpd_NCU", "gpd_PIU",
    ]
].sum()

clinvar_Pathogenic    10065.0
clinvar_Benign          335.0
clinvar_Unknown       90536.0
gpd_LU                16188.0
gpd_NCU               47156.0
gpd_PIU               37592.0
dtype: float64

#### GPD features

In [68]:
# Max feature: largest xon17_score per (patient, gene, GPD bin) group.
# The score column is selected before aggregating so that only xon17_score is
# reduced, instead of taking the max of every column and discarding all but one.
tcga_gpd_max = (
    merged_tcga
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .max()
    .reset_index()
)
tcga_gpd_max

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,TCGA-02-0003,EGFR,0.0,0.0,1.0,0.823529
1,TCGA-02-0003,PIK3R1,0.0,0.0,1.0,0.941176
2,TCGA-02-0003,TP53,0.0,0.0,1.0,0.941176
3,TCGA-02-0033,FGFR2,0.0,0.0,1.0,0.529412
4,TCGA-02-0033,PIK3CA,0.0,0.0,1.0,0.705882
...,...,...,...,...,...,...
75508,TCGA-ZS-A9CG,PRDM1,0.0,1.0,0.0,0.000000
75509,TCGA-ZS-A9CG,SMO,1.0,0.0,0.0,0.176471
75510,TCGA-ZU-A8S4,ARID1A,0.0,1.0,0.0,0.000000
75511,TCGA-ZU-A8S4,BRAF,0.0,0.0,1.0,0.705882


In [69]:
# One "max" matrix per GPD bin flag (LU / PIU / NCU).
# get_matrices is defined earlier in this notebook.
tcga_gpd_max_lu_matrix = get_matrices(
    tcga_gpd_max, merged_tcga,
    criteria="gpd_LU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_gpd_max_piu_matrix = get_matrices(
    tcga_gpd_max, merged_tcga,
    criteria="gpd_PIU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_gpd_max_ncu_matrix = get_matrices(
    tcga_gpd_max, merged_tcga,
    criteria="gpd_NCU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    tcga_gpd_max_lu_matrix.shape,
    tcga_gpd_max_piu_matrix.shape,
    tcga_gpd_max_ncu_matrix.shape,
    sep="\n",
)

(4782, 297)
(7961, 306)
(6890, 321)
(9178, 324)
(9178, 324)
(9178, 324)


In [70]:
# Sum feature: total xon17_score per (patient, gene, GPD bin) group.
# The score column is selected before aggregating so that only xon17_score is
# summed, instead of summing every column and discarding all but one.
tcga_gpd_sum = (
    merged_tcga
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .sum()
    .reset_index()
)
tcga_gpd_sum

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,TCGA-02-0003,EGFR,0.0,0.0,1.0,0.823529
1,TCGA-02-0003,PIK3R1,0.0,0.0,1.0,0.941176
2,TCGA-02-0003,TP53,0.0,0.0,1.0,1.882353
3,TCGA-02-0033,FGFR2,0.0,0.0,1.0,0.529412
4,TCGA-02-0033,PIK3CA,0.0,0.0,1.0,0.705882
...,...,...,...,...,...,...
75508,TCGA-ZS-A9CG,PRDM1,0.0,1.0,0.0,0.000000
75509,TCGA-ZS-A9CG,SMO,1.0,0.0,0.0,0.176471
75510,TCGA-ZU-A8S4,ARID1A,0.0,1.0,0.0,0.000000
75511,TCGA-ZU-A8S4,BRAF,0.0,0.0,1.0,0.705882


In [71]:
# One "sum" matrix per GPD bin flag (LU / PIU / NCU).
# get_matrices is defined earlier in this notebook.
tcga_gpd_sum_lu_matrix = get_matrices(
    tcga_gpd_sum, merged_tcga,
    criteria="gpd_LU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_gpd_sum_piu_matrix = get_matrices(
    tcga_gpd_sum, merged_tcga,
    criteria="gpd_PIU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_gpd_sum_ncu_matrix = get_matrices(
    tcga_gpd_sum, merged_tcga,
    criteria="gpd_NCU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    tcga_gpd_sum_lu_matrix.shape,
    tcga_gpd_sum_piu_matrix.shape,
    tcga_gpd_sum_ncu_matrix.shape,
    sep="\n",
)

(4782, 297)
(7961, 306)
(6890, 321)
(9178, 324)
(9178, 324)
(9178, 324)


In [72]:
# Mean feature: average xon17_score per (patient, gene, GPD bin) group.
tcga_gpd_mean = (
    merged_tcga
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .mean()
    .reset_index()
)
tcga_gpd_mean

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,TCGA-02-0003,EGFR,0.0,0.0,1.0,0.823529
1,TCGA-02-0003,PIK3R1,0.0,0.0,1.0,0.941176
2,TCGA-02-0003,TP53,0.0,0.0,1.0,0.941176
3,TCGA-02-0033,FGFR2,0.0,0.0,1.0,0.529412
4,TCGA-02-0033,PIK3CA,0.0,0.0,1.0,0.705882
...,...,...,...,...,...,...
75508,TCGA-ZS-A9CG,PRDM1,0.0,1.0,0.0,0.000000
75509,TCGA-ZS-A9CG,SMO,1.0,0.0,0.0,0.176471
75510,TCGA-ZU-A8S4,ARID1A,0.0,1.0,0.0,0.000000
75511,TCGA-ZU-A8S4,BRAF,0.0,0.0,1.0,0.705882


In [73]:
# One "mean" matrix per GPD bin flag (LU / PIU / NCU).
# get_matrices is defined earlier in this notebook.
tcga_gpd_mean_lu_matrix = get_matrices(
    tcga_gpd_mean, merged_tcga,
    criteria="gpd_LU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_gpd_mean_piu_matrix = get_matrices(
    tcga_gpd_mean, merged_tcga,
    criteria="gpd_PIU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_gpd_mean_ncu_matrix = get_matrices(
    tcga_gpd_mean, merged_tcga,
    criteria="gpd_NCU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    tcga_gpd_mean_lu_matrix.shape,
    tcga_gpd_mean_piu_matrix.shape,
    tcga_gpd_mean_ncu_matrix.shape,
    sep="\n",
)

(4782, 297)
(7961, 306)
(6890, 321)
(9178, 324)
(9178, 324)
(9178, 324)


In [74]:
# Count feature: number of non-null xon17_score rows per (patient, gene, GPD bin) group.
# The score column is selected before aggregating so that only xon17_score is
# counted, instead of counting every column and discarding all but one.
tcga_gpd_count = (
    merged_tcga
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .count()
    .reset_index()
)
tcga_gpd_count

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,TCGA-02-0003,EGFR,0.0,0.0,1.0,1
1,TCGA-02-0003,PIK3R1,0.0,0.0,1.0,1
2,TCGA-02-0003,TP53,0.0,0.0,1.0,2
3,TCGA-02-0033,FGFR2,0.0,0.0,1.0,1
4,TCGA-02-0033,PIK3CA,0.0,0.0,1.0,1
...,...,...,...,...,...,...
75508,TCGA-ZS-A9CG,PRDM1,0.0,1.0,0.0,1
75509,TCGA-ZS-A9CG,SMO,1.0,0.0,0.0,1
75510,TCGA-ZU-A8S4,ARID1A,0.0,1.0,0.0,1
75511,TCGA-ZU-A8S4,BRAF,0.0,0.0,1.0,1


In [75]:
# One "count" matrix per GPD bin flag (LU / PIU / NCU).
# get_matrices is defined earlier in this notebook.
tcga_gpd_count_lu_matrix = get_matrices(
    tcga_gpd_count, merged_tcga,
    criteria="gpd_LU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_gpd_count_piu_matrix = get_matrices(
    tcga_gpd_count, merged_tcga,
    criteria="gpd_PIU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_gpd_count_ncu_matrix = get_matrices(
    tcga_gpd_count, merged_tcga,
    criteria="gpd_NCU", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    tcga_gpd_count_lu_matrix.shape,
    tcga_gpd_count_piu_matrix.shape,
    tcga_gpd_count_ncu_matrix.shape,
    sep="\n",
)

(4782, 297)
(7961, 306)
(6890, 321)
(9178, 324)
(9178, 324)
(9178, 324)


#### ClinVar

In [76]:
# Max feature: largest xon17_score per (patient, gene, ClinVar bin) group.
tcga_clinvar_max = (
    merged_tcga
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .max()
    .reset_index()
)
tcga_clinvar_max

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,TCGA-02-0003,EGFR,0.0,0.0,1.0,0.823529
1,TCGA-02-0003,PIK3R1,1.0,0.0,0.0,0.941176
2,TCGA-02-0003,TP53,0.0,0.0,1.0,0.941176
3,TCGA-02-0003,TP53,1.0,0.0,0.0,0.941176
4,TCGA-02-0033,FGFR2,0.0,0.0,1.0,0.529412
...,...,...,...,...,...,...
70255,TCGA-ZS-A9CG,PRDM1,0.0,0.0,1.0,0.000000
70256,TCGA-ZS-A9CG,SMO,0.0,0.0,1.0,0.176471
70257,TCGA-ZU-A8S4,ARID1A,0.0,0.0,1.0,0.000000
70258,TCGA-ZU-A8S4,BRAF,1.0,0.0,0.0,0.705882


In [77]:
# One "max" matrix per ClinVar bin flag (Pathogenic / Unknown, i.e. VUS / Benign).
# get_matrices is defined earlier in this notebook.
tcga_clinvar_max_pathogenic_matrix = get_matrices(
    tcga_clinvar_max, merged_tcga,
    criteria="clinvar_Pathogenic", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_clinvar_max_vus_matrix = get_matrices(
    tcga_clinvar_max, merged_tcga,
    criteria="clinvar_Unknown", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_clinvar_max_benign_matrix = get_matrices(
    tcga_clinvar_max, merged_tcga,
    criteria="clinvar_Benign", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    tcga_clinvar_max_pathogenic_matrix.shape,
    tcga_clinvar_max_vus_matrix.shape,
    tcga_clinvar_max_benign_matrix.shape,
    sep="\n",
)

(5713, 165)
(8497, 321)
(244, 87)
(9178, 324)
(9178, 324)
(9178, 324)


In [78]:
# Sum feature: total xon17_score per (patient, gene, ClinVar bin) group.
# The score column is selected before aggregating so that only xon17_score is
# summed, instead of summing every column and discarding all but one.
tcga_clinvar_sum = (
    merged_tcga
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .sum()
    .reset_index()
)
tcga_clinvar_sum

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,TCGA-02-0003,EGFR,0.0,0.0,1.0,0.823529
1,TCGA-02-0003,PIK3R1,1.0,0.0,0.0,0.941176
2,TCGA-02-0003,TP53,0.0,0.0,1.0,0.941176
3,TCGA-02-0003,TP53,1.0,0.0,0.0,0.941176
4,TCGA-02-0033,FGFR2,0.0,0.0,1.0,0.529412
...,...,...,...,...,...,...
70255,TCGA-ZS-A9CG,PRDM1,0.0,0.0,1.0,0.000000
70256,TCGA-ZS-A9CG,SMO,0.0,0.0,1.0,0.176471
70257,TCGA-ZU-A8S4,ARID1A,0.0,0.0,1.0,0.000000
70258,TCGA-ZU-A8S4,BRAF,1.0,0.0,0.0,0.705882


In [79]:
# One "sum" matrix per ClinVar bin flag (Pathogenic / Unknown, i.e. VUS / Benign).
# get_matrices is defined earlier in this notebook.
tcga_clinvar_sum_pathogenic_matrix = get_matrices(
    tcga_clinvar_sum, merged_tcga,
    criteria="clinvar_Pathogenic", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_clinvar_sum_vus_matrix = get_matrices(
    tcga_clinvar_sum, merged_tcga,
    criteria="clinvar_Unknown", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_clinvar_sum_benign_matrix = get_matrices(
    tcga_clinvar_sum, merged_tcga,
    criteria="clinvar_Benign", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    tcga_clinvar_sum_pathogenic_matrix.shape,
    tcga_clinvar_sum_vus_matrix.shape,
    tcga_clinvar_sum_benign_matrix.shape,
    sep="\n",
)

(5713, 165)
(8497, 321)
(244, 87)
(9178, 324)
(9178, 324)
(9178, 324)


In [80]:
# Mean feature: average xon17_score per (patient, gene, ClinVar bin) group.
tcga_clinvar_mean = (
    merged_tcga
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .mean()
    .reset_index()
)
tcga_clinvar_mean

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,TCGA-02-0003,EGFR,0.0,0.0,1.0,0.823529
1,TCGA-02-0003,PIK3R1,1.0,0.0,0.0,0.941176
2,TCGA-02-0003,TP53,0.0,0.0,1.0,0.941176
3,TCGA-02-0003,TP53,1.0,0.0,0.0,0.941176
4,TCGA-02-0033,FGFR2,0.0,0.0,1.0,0.529412
...,...,...,...,...,...,...
70255,TCGA-ZS-A9CG,PRDM1,0.0,0.0,1.0,0.000000
70256,TCGA-ZS-A9CG,SMO,0.0,0.0,1.0,0.176471
70257,TCGA-ZU-A8S4,ARID1A,0.0,0.0,1.0,0.000000
70258,TCGA-ZU-A8S4,BRAF,1.0,0.0,0.0,0.705882


In [81]:
# One "mean" matrix per ClinVar bin flag (Pathogenic / Unknown, i.e. VUS / Benign).
# get_matrices is defined earlier in this notebook.
tcga_clinvar_mean_pathogenic_matrix = get_matrices(
    tcga_clinvar_mean, merged_tcga,
    criteria="clinvar_Pathogenic", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_clinvar_mean_vus_matrix = get_matrices(
    tcga_clinvar_mean, merged_tcga,
    criteria="clinvar_Unknown", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_clinvar_mean_benign_matrix = get_matrices(
    tcga_clinvar_mean, merged_tcga,
    criteria="clinvar_Benign", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    tcga_clinvar_mean_pathogenic_matrix.shape,
    tcga_clinvar_mean_vus_matrix.shape,
    tcga_clinvar_mean_benign_matrix.shape,
    sep="\n",
)

(5713, 165)
(8497, 321)
(244, 87)
(9178, 324)
(9178, 324)
(9178, 324)


In [82]:
# Count feature: number of non-null xon17_score rows per (patient, gene, ClinVar bin) group.
# The score column is selected before aggregating so that only xon17_score is
# counted, instead of counting every column and discarding all but one.
tcga_clinvar_count = (
    merged_tcga
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .count()
    .reset_index()
)
tcga_clinvar_count

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,TCGA-02-0003,EGFR,0.0,0.0,1.0,1
1,TCGA-02-0003,PIK3R1,1.0,0.0,0.0,1
2,TCGA-02-0003,TP53,0.0,0.0,1.0,1
3,TCGA-02-0003,TP53,1.0,0.0,0.0,1
4,TCGA-02-0033,FGFR2,0.0,0.0,1.0,1
...,...,...,...,...,...,...
70255,TCGA-ZS-A9CG,PRDM1,0.0,0.0,1.0,1
70256,TCGA-ZS-A9CG,SMO,0.0,0.0,1.0,1
70257,TCGA-ZU-A8S4,ARID1A,0.0,0.0,1.0,1
70258,TCGA-ZU-A8S4,BRAF,1.0,0.0,0.0,1


In [83]:
# One "count" matrix per ClinVar bin flag (Pathogenic / Unknown, i.e. VUS / Benign).
# get_matrices is defined earlier in this notebook.
tcga_clinvar_count_pathogenic_matrix = get_matrices(
    tcga_clinvar_count, merged_tcga,
    criteria="clinvar_Pathogenic", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_clinvar_count_vus_matrix = get_matrices(
    tcga_clinvar_count, merged_tcga,
    criteria="clinvar_Unknown", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
tcga_clinvar_count_benign_matrix = get_matrices(
    tcga_clinvar_count, merged_tcga,
    criteria="clinvar_Benign", criteria_value=1,
    index_name="patient_id", column_name="canonical_gene_name",
)
# Report the three shapes, one per line.
print(
    tcga_clinvar_count_pathogenic_matrix.shape,
    tcga_clinvar_count_vus_matrix.shape,
    tcga_clinvar_count_benign_matrix.shape,
    sep="\n",
)

(5713, 165)
(8497, 321)
(244, 87)
(9178, 324)
(9178, 324)
(9178, 324)


In [84]:
# Add suffixes to identify columns: every gene column becomes "<gene>_<bin>_<aggregate>".
# NOTE: each matrix is rebound to its suffixed version, so running this cell a
# second time without rebuilding the matrices appends the suffix again.
tcga_gpd_max_piu_matrix = tcga_gpd_max_piu_matrix.add_suffix("_piu_max")
tcga_gpd_sum_piu_matrix = tcga_gpd_sum_piu_matrix.add_suffix("_piu_sum")
tcga_gpd_mean_piu_matrix = tcga_gpd_mean_piu_matrix.add_suffix("_piu_mean")
tcga_gpd_count_piu_matrix = tcga_gpd_count_piu_matrix.add_suffix("_piu_count")
tcga_gpd_max_lu_matrix = tcga_gpd_max_lu_matrix.add_suffix("_lu_max")
tcga_gpd_sum_lu_matrix = tcga_gpd_sum_lu_matrix.add_suffix("_lu_sum")
tcga_gpd_mean_lu_matrix = tcga_gpd_mean_lu_matrix.add_suffix("_lu_mean")
# Bug fix: the LU count columns must come from the LU count matrix. The NCU
# count matrix was suffixed here before, which stored NCU counts under "_lu_count".
tcga_gpd_count_lu_matrix = tcga_gpd_count_lu_matrix.add_suffix("_lu_count")
tcga_gpd_max_ncu_matrix = tcga_gpd_max_ncu_matrix.add_suffix("_ncu_max")
tcga_gpd_sum_ncu_matrix = tcga_gpd_sum_ncu_matrix.add_suffix("_ncu_sum")
tcga_gpd_mean_ncu_matrix = tcga_gpd_mean_ncu_matrix.add_suffix("_ncu_mean")
tcga_gpd_count_ncu_matrix = tcga_gpd_count_ncu_matrix.add_suffix("_ncu_count")

tcga_clinvar_max_pathogenic_matrix = tcga_clinvar_max_pathogenic_matrix.add_suffix("_pathogenic_max")
tcga_clinvar_sum_pathogenic_matrix = tcga_clinvar_sum_pathogenic_matrix.add_suffix("_pathogenic_sum")
tcga_clinvar_mean_pathogenic_matrix = tcga_clinvar_mean_pathogenic_matrix.add_suffix("_pathogenic_mean")
tcga_clinvar_count_pathogenic_matrix = tcga_clinvar_count_pathogenic_matrix.add_suffix("_pathogenic_count")
tcga_clinvar_max_vus_matrix = tcga_clinvar_max_vus_matrix.add_suffix("_vus_max")
tcga_clinvar_sum_vus_matrix = tcga_clinvar_sum_vus_matrix.add_suffix("_vus_sum")
tcga_clinvar_mean_vus_matrix = tcga_clinvar_mean_vus_matrix.add_suffix("_vus_mean")
tcga_clinvar_count_vus_matrix = tcga_clinvar_count_vus_matrix.add_suffix("_vus_count")
tcga_clinvar_max_benign_matrix = tcga_clinvar_max_benign_matrix.add_suffix("_benign_max")
tcga_clinvar_sum_benign_matrix = tcga_clinvar_sum_benign_matrix.add_suffix("_benign_sum")
tcga_clinvar_mean_benign_matrix = tcga_clinvar_mean_benign_matrix.add_suffix("_benign_mean")
tcga_clinvar_count_benign_matrix = tcga_clinvar_count_benign_matrix.add_suffix("_benign_count")

In [85]:
# Column-block order of the final matrix; each bin contributes Max, Sum, Mean, Count:
# PIU, LU, NCU, Pathogenic, VUS, Benign
tcga_feature_matrix = pd.concat(
    [
        # GPD - PIU
        tcga_gpd_max_piu_matrix,
        tcga_gpd_sum_piu_matrix,
        tcga_gpd_mean_piu_matrix,
        tcga_gpd_count_piu_matrix,
        # GPD - LU
        tcga_gpd_max_lu_matrix,
        tcga_gpd_sum_lu_matrix,
        tcga_gpd_mean_lu_matrix,
        tcga_gpd_count_lu_matrix,
        # GPD - NCU
        tcga_gpd_max_ncu_matrix,
        tcga_gpd_sum_ncu_matrix,
        tcga_gpd_mean_ncu_matrix,
        tcga_gpd_count_ncu_matrix,
        # ClinVar - Pathogenic
        tcga_clinvar_max_pathogenic_matrix,
        tcga_clinvar_sum_pathogenic_matrix,
        tcga_clinvar_mean_pathogenic_matrix,
        tcga_clinvar_count_pathogenic_matrix,
        # ClinVar - VUS
        tcga_clinvar_max_vus_matrix,
        tcga_clinvar_sum_vus_matrix,
        tcga_clinvar_mean_vus_matrix,
        tcga_clinvar_count_vus_matrix,
        # ClinVar - Benign
        tcga_clinvar_max_benign_matrix,
        tcga_clinvar_sum_benign_matrix,
        tcga_clinvar_mean_benign_matrix,
        tcga_clinvar_count_benign_matrix,
    ],
    axis="columns",
)
tcga_feature_matrix.shape

(9178, 7776)

In [86]:
# Persist the TCGA feature matrix (index included) as CSV.
tcga_feature_matrix.to_csv(
    path_or_buf="/data/ajayago/papers_data/systematic_assessment/raw/annotated_mutation_matrices/clinvar_gpd_annovar_annotated_Tcga_feature_matrix.csv"
)

### CBIO HCC

In [87]:
# Load the CBIO HCC patient/gene/alteration table and map each gene symbol
# through convert2canonical (defined earlier in this notebook).
cbio_hcc_mutations = pd.read_csv("/data/druid_data/CBIO/hcc_mskimpact_2018/patient_gene_alteration(mutation).csv")
cbio_hcc_mutations["canonical_gene_name"] = cbio_hcc_mutations["gene"].apply(convert2canonical)
cbio_hcc_mutations

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name
0,P-0005038-T02-IM6,TNFRSF14,Q242R,TNFRSF14
1,P-0005038-T02-IM6,JAK1,S729C,JAK1
2,P-0005038-T02-IM6,MEN1,X224_splice,MEN1
3,P-0005038-T02-IM6,ALK,E717K,ALK
4,P-0015203-T01-IM6,ZRSR2,C172S,
...,...,...,...,...
531,P-0012182-T01-IM5,NEGR1,Q8L,
532,P-0012182-T01-IM5,SETD2,S2479A,SETD2
533,P-0012182-T01-IM5,POLE,V544M,POLE
534,P-0012182-T01-IM5,AXIN1,E291*,AXIN1


In [88]:
# Keep only rows whose canonical gene name does not stringify to "nan", then
# build the "<gene>@<alteration>" key stored in the "mutations" column.
cbio_hcc_mutations = (
    cbio_hcc_mutations
    .loc[cbio_hcc_mutations["canonical_gene_name"].astype(str) != "nan"]
    .reset_index(drop=True)
)
cbio_hcc_mutations = cbio_hcc_mutations.assign(
    mutations=cbio_hcc_mutations["canonical_gene_name"] + "@" + cbio_hcc_mutations["alteration"]
)
cbio_hcc_mutations

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name,mutations
0,P-0005038-T02-IM6,TNFRSF14,Q242R,TNFRSF14,TNFRSF14@Q242R
1,P-0005038-T02-IM6,JAK1,S729C,JAK1,JAK1@S729C
2,P-0005038-T02-IM6,MEN1,X224_splice,MEN1,MEN1@X224_splice
3,P-0005038-T02-IM6,ALK,E717K,ALK,ALK@E717K
4,P-0015203-T01-IM6,CTNNB1,S37P,CTNNB1,CTNNB1@S37P
...,...,...,...,...,...
398,P-0014167-T01-IM5,KDR,R720W,KDR,KDR@R720W
399,P-0012182-T01-IM5,SETD2,S2479A,SETD2,SETD2@S2479A
400,P-0012182-T01-IM5,POLE,V544M,POLE,POLE@V544M
401,P-0012182-T01-IM5,AXIN1,E291*,AXIN1,AXIN1@E291*


In [89]:
# Number of distinct CBIO HCC patients left after filtering.
len(cbio_hcc_mutations["patient_id"].unique())

117

In [90]:
# Left-join every CBIO HCC mutation onto the vocabulary (keyed by its index)
# to pick up the score and the one-hot ClinVar / GPD bin flags.
merged_cbio_hcc = cbio_hcc_mutations.merge(vocab_df[["xon17_score", 'clinvar_Pathogenic', 'clinvar_Benign', 'clinvar_Unknown', 'gpd_LU', 'gpd_NCU', 'gpd_PIU']], left_on="mutations", right_on=vocab_df.index, how="left")
# Mutations missing from the vocabulary come back as NaN after the left join.
# Fill them with a single DataFrame.fillna mapping instead of
# `df[col].fillna(..., inplace=True)`: the chained in-place form acts on a
# temporary Series and leaves the frame unfilled under pandas Copy-on-Write.
merged_cbio_hcc = merged_cbio_hcc.fillna({
    "xon17_score": 0,
    # NaNs are set to VUS
    "clinvar_Pathogenic": 0,
    "clinvar_Benign": 0,
    "clinvar_Unknown": 1,
    # NaNs are set to NCU
    "gpd_LU": 0,
    "gpd_PIU": 0,
    "gpd_NCU": 1,
})
merged_cbio_hcc

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name,mutations,xon17_score,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,gpd_LU,gpd_NCU,gpd_PIU
0,P-0005038-T02-IM6,TNFRSF14,Q242R,TNFRSF14,TNFRSF14@Q242R,0.352941,0.0,0.0,1.0,0.0,0.0,1.0
1,P-0005038-T02-IM6,JAK1,S729C,JAK1,JAK1@S729C,0.352941,0.0,0.0,1.0,0.0,0.0,1.0
2,P-0005038-T02-IM6,MEN1,X224_splice,MEN1,MEN1@X224_splice,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
3,P-0005038-T02-IM6,ALK,E717K,ALK,ALK@E717K,0.411765,0.0,0.0,1.0,1.0,0.0,0.0
4,P-0015203-T01-IM6,CTNNB1,S37P,CTNNB1,CTNNB1@S37P,0.882353,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
398,P-0014167-T01-IM5,KDR,R720W,KDR,KDR@R720W,0.588235,0.0,0.0,1.0,0.0,0.0,1.0
399,P-0012182-T01-IM5,SETD2,S2479A,SETD2,SETD2@S2479A,0.705882,0.0,0.0,1.0,0.0,0.0,1.0
400,P-0012182-T01-IM5,POLE,V544M,POLE,POLE@V544M,0.941176,0.0,0.0,1.0,1.0,0.0,0.0
401,P-0012182-T01-IM5,AXIN1,E291*,AXIN1,AXIN1@E291*,0.000000,0.0,0.0,1.0,0.0,1.0,0.0


In [91]:
# Sanity check: count rows whose canonical gene name is missing after the merge.
merged_cbio_hcc["canonical_gene_name"].isna().sum()

0

In [92]:
# Column totals of the ClinVar and GPD indicator columns.
merged_cbio_hcc.loc[
    :,
    ["clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown", "gpd_LU", "gpd_NCU", "gpd_PIU"],
].sum()

clinvar_Pathogenic     52.0
clinvar_Benign          1.0
clinvar_Unknown       350.0
gpd_LU                 74.0
gpd_NCU               128.0
gpd_PIU               201.0
dtype: float64

#### GPD features

In [93]:
# Max feature
# Maximum xon17_score within each (patient, gene, GPD flags) group.
# The score column is selected before aggregating so only it is reduced; previously every
# column (including the string columns) was aggregated and then thrown away.
cbio_hcc_gpd_max = (
    merged_cbio_hcc
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .max()
    .reset_index()
)
cbio_hcc_gpd_max

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,P-0000037-T02-IM3,AXIN1,0.0,0.0,1.0,0.882353
1,P-0000037-T02-IM3,ERCC4,1.0,0.0,0.0,0.764706
2,P-0000182-T01-IM3,WT1,0.0,1.0,0.0,0.000000
3,P-0000218-T01-IM3,KDM5C,0.0,0.0,1.0,0.470588
4,P-0000218-T01-IM3,NTRK2,1.0,0.0,0.0,0.411765
...,...,...,...,...,...,...
386,P-0021780-T01-IM6,ERCC4,0.0,0.0,1.0,0.529412
387,P-0021780-T01-IM6,HRAS,0.0,0.0,1.0,0.941176
388,P-0021780-T01-IM6,TP53,0.0,0.0,1.0,0.941176
389,P-0022348-T01-IM6,BAP1,0.0,1.0,0.0,0.000000


In [94]:
# One matrix per GPD flag from the max-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# LU -> PIU -> NCU call order of the original cell is kept.
(
    cbio_hcc_gpd_max_lu_matrix,
    cbio_hcc_gpd_max_piu_matrix,
    cbio_hcc_gpd_max_ncu_matrix,
) = (
    get_matrices(
        cbio_hcc_gpd_max,
        merged_cbio_hcc,
        criteria=unit_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for unit_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(
    cbio_hcc_gpd_max_lu_matrix.shape,
    cbio_hcc_gpd_max_piu_matrix.shape,
    cbio_hcc_gpd_max_ncu_matrix.shape,
    sep="\n",
)

(54, 49)
(99, 93)
(77, 52)
(117, 324)
(117, 324)
(117, 324)


In [95]:
# Sum feature
# Sum of xon17_score within each (patient, gene, GPD flags) group.
# The score column is selected before aggregating so the string columns are never summed
# (they were previously aggregated and then discarded).
cbio_hcc_gpd_sum = (
    merged_cbio_hcc
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .sum()
    .reset_index()
)
cbio_hcc_gpd_sum

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,P-0000037-T02-IM3,AXIN1,0.0,0.0,1.0,0.882353
1,P-0000037-T02-IM3,ERCC4,1.0,0.0,0.0,0.764706
2,P-0000182-T01-IM3,WT1,0.0,1.0,0.0,0.000000
3,P-0000218-T01-IM3,KDM5C,0.0,0.0,1.0,0.470588
4,P-0000218-T01-IM3,NTRK2,1.0,0.0,0.0,0.411765
...,...,...,...,...,...,...
386,P-0021780-T01-IM6,ERCC4,0.0,0.0,1.0,0.529412
387,P-0021780-T01-IM6,HRAS,0.0,0.0,1.0,0.941176
388,P-0021780-T01-IM6,TP53,0.0,0.0,1.0,0.941176
389,P-0022348-T01-IM6,BAP1,0.0,1.0,0.0,0.000000


In [96]:
# One matrix per GPD flag from the sum-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# LU -> PIU -> NCU call order of the original cell is kept.
(
    cbio_hcc_gpd_sum_lu_matrix,
    cbio_hcc_gpd_sum_piu_matrix,
    cbio_hcc_gpd_sum_ncu_matrix,
) = (
    get_matrices(
        cbio_hcc_gpd_sum,
        merged_cbio_hcc,
        criteria=unit_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for unit_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(
    cbio_hcc_gpd_sum_lu_matrix.shape,
    cbio_hcc_gpd_sum_piu_matrix.shape,
    cbio_hcc_gpd_sum_ncu_matrix.shape,
    sep="\n",
)

(54, 49)
(99, 93)
(77, 52)
(117, 324)
(117, 324)
(117, 324)


In [97]:
# Mean feature
# Average xon17_score within each (patient, gene, GPD flags) group.
cbio_hcc_gpd_mean = (
    merged_cbio_hcc
    .groupby(by=["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .mean()
    .reset_index()
)
cbio_hcc_gpd_mean

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,P-0000037-T02-IM3,AXIN1,0.0,0.0,1.0,0.882353
1,P-0000037-T02-IM3,ERCC4,1.0,0.0,0.0,0.764706
2,P-0000182-T01-IM3,WT1,0.0,1.0,0.0,0.000000
3,P-0000218-T01-IM3,KDM5C,0.0,0.0,1.0,0.470588
4,P-0000218-T01-IM3,NTRK2,1.0,0.0,0.0,0.411765
...,...,...,...,...,...,...
386,P-0021780-T01-IM6,ERCC4,0.0,0.0,1.0,0.529412
387,P-0021780-T01-IM6,HRAS,0.0,0.0,1.0,0.941176
388,P-0021780-T01-IM6,TP53,0.0,0.0,1.0,0.941176
389,P-0022348-T01-IM6,BAP1,0.0,1.0,0.0,0.000000


In [98]:
# One matrix per GPD flag from the mean-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# LU -> PIU -> NCU call order of the original cell is kept.
(
    cbio_hcc_gpd_mean_lu_matrix,
    cbio_hcc_gpd_mean_piu_matrix,
    cbio_hcc_gpd_mean_ncu_matrix,
) = (
    get_matrices(
        cbio_hcc_gpd_mean,
        merged_cbio_hcc,
        criteria=unit_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for unit_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(
    cbio_hcc_gpd_mean_lu_matrix.shape,
    cbio_hcc_gpd_mean_piu_matrix.shape,
    cbio_hcc_gpd_mean_ncu_matrix.shape,
    sep="\n",
)

(54, 49)
(99, 93)
(77, 52)
(117, 324)
(117, 324)
(117, 324)


In [99]:
# Count feature
# Number of (non-null) xon17_score entries within each (patient, gene, GPD flags) group.
# The score column is selected before aggregating so only it is counted; previously every
# column was counted and then discarded.
cbio_hcc_gpd_count = (
    merged_cbio_hcc
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .count()
    .reset_index()
)
cbio_hcc_gpd_count

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,P-0000037-T02-IM3,AXIN1,0.0,0.0,1.0,1
1,P-0000037-T02-IM3,ERCC4,1.0,0.0,0.0,1
2,P-0000182-T01-IM3,WT1,0.0,1.0,0.0,1
3,P-0000218-T01-IM3,KDM5C,0.0,0.0,1.0,1
4,P-0000218-T01-IM3,NTRK2,1.0,0.0,0.0,1
...,...,...,...,...,...,...
386,P-0021780-T01-IM6,ERCC4,0.0,0.0,1.0,1
387,P-0021780-T01-IM6,HRAS,0.0,0.0,1.0,1
388,P-0021780-T01-IM6,TP53,0.0,0.0,1.0,1
389,P-0022348-T01-IM6,BAP1,0.0,1.0,0.0,1


In [100]:
# One matrix per GPD flag from the per-group variant counts.
# get_matrices appears to print a shape on every call (see the cell output), so the
# LU -> PIU -> NCU call order of the original cell is kept.
(
    cbio_hcc_gpd_count_lu_matrix,
    cbio_hcc_gpd_count_piu_matrix,
    cbio_hcc_gpd_count_ncu_matrix,
) = (
    get_matrices(
        cbio_hcc_gpd_count,
        merged_cbio_hcc,
        criteria=unit_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for unit_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(
    cbio_hcc_gpd_count_lu_matrix.shape,
    cbio_hcc_gpd_count_piu_matrix.shape,
    cbio_hcc_gpd_count_ncu_matrix.shape,
    sep="\n",
)

(54, 49)
(99, 93)
(77, 52)
(117, 324)
(117, 324)
(117, 324)


#### ClinVar

In [101]:
# Max feature
# Maximum xon17_score within each (patient, gene, ClinVar flags) group.
cbio_hcc_clinvar_max = (
    merged_cbio_hcc
    .groupby(by=["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .max()
    .reset_index()
)
cbio_hcc_clinvar_max

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,P-0000037-T02-IM3,AXIN1,0.0,0.0,1.0,0.882353
1,P-0000037-T02-IM3,ERCC4,0.0,0.0,1.0,0.764706
2,P-0000182-T01-IM3,WT1,0.0,0.0,1.0,0.000000
3,P-0000218-T01-IM3,KDM5C,0.0,0.0,1.0,0.470588
4,P-0000218-T01-IM3,NTRK2,0.0,0.0,1.0,0.411765
...,...,...,...,...,...,...
385,P-0021780-T01-IM6,ERCC4,0.0,0.0,1.0,0.529412
386,P-0021780-T01-IM6,HRAS,1.0,0.0,0.0,0.941176
387,P-0021780-T01-IM6,TP53,1.0,0.0,0.0,0.941176
388,P-0022348-T01-IM6,BAP1,0.0,0.0,1.0,0.000000


In [102]:
# One matrix per ClinVar flag from the max-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# Pathogenic -> Unknown (VUS) -> Benign call order of the original cell is kept.
(
    cbio_hcc_clinvar_max_pathogenic_matrix,
    cbio_hcc_clinvar_max_vus_matrix,
    cbio_hcc_clinvar_max_benign_matrix,
) = (
    get_matrices(
        cbio_hcc_clinvar_max,
        merged_cbio_hcc,
        criteria=clinvar_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(
    cbio_hcc_clinvar_max_pathogenic_matrix.shape,
    cbio_hcc_clinvar_max_vus_matrix.shape,
    cbio_hcc_clinvar_max_benign_matrix.shape,
    sep="\n",
)

(46, 13)
(110, 132)
(1, 1)
(117, 324)
(117, 324)
(117, 324)


In [103]:
# Sum feature
# Sum of xon17_score within each (patient, gene, ClinVar flags) group.
# The score column is selected before aggregating so the string columns are never summed
# (they were previously aggregated and then discarded).
cbio_hcc_clinvar_sum = (
    merged_cbio_hcc
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .sum()
    .reset_index()
)
cbio_hcc_clinvar_sum

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,P-0000037-T02-IM3,AXIN1,0.0,0.0,1.0,0.882353
1,P-0000037-T02-IM3,ERCC4,0.0,0.0,1.0,0.764706
2,P-0000182-T01-IM3,WT1,0.0,0.0,1.0,0.000000
3,P-0000218-T01-IM3,KDM5C,0.0,0.0,1.0,0.470588
4,P-0000218-T01-IM3,NTRK2,0.0,0.0,1.0,0.411765
...,...,...,...,...,...,...
385,P-0021780-T01-IM6,ERCC4,0.0,0.0,1.0,0.529412
386,P-0021780-T01-IM6,HRAS,1.0,0.0,0.0,0.941176
387,P-0021780-T01-IM6,TP53,1.0,0.0,0.0,0.941176
388,P-0022348-T01-IM6,BAP1,0.0,0.0,1.0,0.000000


In [104]:
# One matrix per ClinVar flag from the sum-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# Pathogenic -> Unknown (VUS) -> Benign call order of the original cell is kept.
(
    cbio_hcc_clinvar_sum_pathogenic_matrix,
    cbio_hcc_clinvar_sum_vus_matrix,
    cbio_hcc_clinvar_sum_benign_matrix,
) = (
    get_matrices(
        cbio_hcc_clinvar_sum,
        merged_cbio_hcc,
        criteria=clinvar_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(
    cbio_hcc_clinvar_sum_pathogenic_matrix.shape,
    cbio_hcc_clinvar_sum_vus_matrix.shape,
    cbio_hcc_clinvar_sum_benign_matrix.shape,
    sep="\n",
)

(46, 13)
(110, 132)
(1, 1)
(117, 324)
(117, 324)
(117, 324)


In [105]:
# Mean feature
# Average xon17_score within each (patient, gene, ClinVar flags) group.
cbio_hcc_clinvar_mean = (
    merged_cbio_hcc
    .groupby(by=["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .mean()
    .reset_index()
)
cbio_hcc_clinvar_mean

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,P-0000037-T02-IM3,AXIN1,0.0,0.0,1.0,0.882353
1,P-0000037-T02-IM3,ERCC4,0.0,0.0,1.0,0.764706
2,P-0000182-T01-IM3,WT1,0.0,0.0,1.0,0.000000
3,P-0000218-T01-IM3,KDM5C,0.0,0.0,1.0,0.470588
4,P-0000218-T01-IM3,NTRK2,0.0,0.0,1.0,0.411765
...,...,...,...,...,...,...
385,P-0021780-T01-IM6,ERCC4,0.0,0.0,1.0,0.529412
386,P-0021780-T01-IM6,HRAS,1.0,0.0,0.0,0.941176
387,P-0021780-T01-IM6,TP53,1.0,0.0,0.0,0.941176
388,P-0022348-T01-IM6,BAP1,0.0,0.0,1.0,0.000000


In [106]:
# One matrix per ClinVar flag from the mean-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# Pathogenic -> Unknown (VUS) -> Benign call order of the original cell is kept.
(
    cbio_hcc_clinvar_mean_pathogenic_matrix,
    cbio_hcc_clinvar_mean_vus_matrix,
    cbio_hcc_clinvar_mean_benign_matrix,
) = (
    get_matrices(
        cbio_hcc_clinvar_mean,
        merged_cbio_hcc,
        criteria=clinvar_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(
    cbio_hcc_clinvar_mean_pathogenic_matrix.shape,
    cbio_hcc_clinvar_mean_vus_matrix.shape,
    cbio_hcc_clinvar_mean_benign_matrix.shape,
    sep="\n",
)

(46, 13)
(110, 132)
(1, 1)
(117, 324)
(117, 324)
(117, 324)


In [107]:
# Count feature
# Number of (non-null) xon17_score entries within each (patient, gene, ClinVar flags) group.
# The score column is selected before aggregating so only it is counted; previously every
# column was counted and then discarded.
cbio_hcc_clinvar_count = (
    merged_cbio_hcc
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .count()
    .reset_index()
)
cbio_hcc_clinvar_count

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,P-0000037-T02-IM3,AXIN1,0.0,0.0,1.0,1
1,P-0000037-T02-IM3,ERCC4,0.0,0.0,1.0,1
2,P-0000182-T01-IM3,WT1,0.0,0.0,1.0,1
3,P-0000218-T01-IM3,KDM5C,0.0,0.0,1.0,1
4,P-0000218-T01-IM3,NTRK2,0.0,0.0,1.0,1
...,...,...,...,...,...,...
385,P-0021780-T01-IM6,ERCC4,0.0,0.0,1.0,1
386,P-0021780-T01-IM6,HRAS,1.0,0.0,0.0,1
387,P-0021780-T01-IM6,TP53,1.0,0.0,0.0,1
388,P-0022348-T01-IM6,BAP1,0.0,0.0,1.0,1


In [108]:
# One matrix per ClinVar flag from the per-group variant counts.
# get_matrices appears to print a shape on every call (see the cell output), so the
# Pathogenic -> Unknown (VUS) -> Benign call order of the original cell is kept.
(
    cbio_hcc_clinvar_count_pathogenic_matrix,
    cbio_hcc_clinvar_count_vus_matrix,
    cbio_hcc_clinvar_count_benign_matrix,
) = (
    get_matrices(
        cbio_hcc_clinvar_count,
        merged_cbio_hcc,
        criteria=clinvar_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(
    cbio_hcc_clinvar_count_pathogenic_matrix.shape,
    cbio_hcc_clinvar_count_vus_matrix.shape,
    cbio_hcc_clinvar_count_benign_matrix.shape,
    sep="\n",
)

(46, 13)
(110, 132)
(1, 1)
(117, 324)
(117, 324)
(117, 324)


In [109]:
# Add suffixes to identify columns: every gene column becomes "<gene>_<bucket>_<aggregate>".
# NOTE: each name is rebound to its suffixed copy, so this cell is not idempotent —
# running it a second time appends the suffix again.

# GPD buckets: PIU, LU, NCU
cbio_hcc_gpd_max_piu_matrix = cbio_hcc_gpd_max_piu_matrix.add_suffix("_piu_max")
cbio_hcc_gpd_sum_piu_matrix = cbio_hcc_gpd_sum_piu_matrix.add_suffix("_piu_sum")
cbio_hcc_gpd_mean_piu_matrix = cbio_hcc_gpd_mean_piu_matrix.add_suffix("_piu_mean")
cbio_hcc_gpd_count_piu_matrix = cbio_hcc_gpd_count_piu_matrix.add_suffix("_piu_count")
cbio_hcc_gpd_max_lu_matrix = cbio_hcc_gpd_max_lu_matrix.add_suffix("_lu_max")
cbio_hcc_gpd_sum_lu_matrix = cbio_hcc_gpd_sum_lu_matrix.add_suffix("_lu_sum")
cbio_hcc_gpd_mean_lu_matrix = cbio_hcc_gpd_mean_lu_matrix.add_suffix("_lu_mean")
# Fixed: the LU count matrix was previously built from the NCU count matrix, so the
# "*_lu_count" columns silently carried NCU counts.
cbio_hcc_gpd_count_lu_matrix = cbio_hcc_gpd_count_lu_matrix.add_suffix("_lu_count")
cbio_hcc_gpd_max_ncu_matrix = cbio_hcc_gpd_max_ncu_matrix.add_suffix("_ncu_max")
cbio_hcc_gpd_sum_ncu_matrix = cbio_hcc_gpd_sum_ncu_matrix.add_suffix("_ncu_sum")
cbio_hcc_gpd_mean_ncu_matrix = cbio_hcc_gpd_mean_ncu_matrix.add_suffix("_ncu_mean")
cbio_hcc_gpd_count_ncu_matrix = cbio_hcc_gpd_count_ncu_matrix.add_suffix("_ncu_count")

# ClinVar buckets: Pathogenic, VUS, Benign
cbio_hcc_clinvar_max_pathogenic_matrix = cbio_hcc_clinvar_max_pathogenic_matrix.add_suffix("_pathogenic_max")
cbio_hcc_clinvar_sum_pathogenic_matrix = cbio_hcc_clinvar_sum_pathogenic_matrix.add_suffix("_pathogenic_sum")
cbio_hcc_clinvar_mean_pathogenic_matrix = cbio_hcc_clinvar_mean_pathogenic_matrix.add_suffix("_pathogenic_mean")
cbio_hcc_clinvar_count_pathogenic_matrix = cbio_hcc_clinvar_count_pathogenic_matrix.add_suffix("_pathogenic_count")
cbio_hcc_clinvar_max_vus_matrix = cbio_hcc_clinvar_max_vus_matrix.add_suffix("_vus_max")
cbio_hcc_clinvar_sum_vus_matrix = cbio_hcc_clinvar_sum_vus_matrix.add_suffix("_vus_sum")
cbio_hcc_clinvar_mean_vus_matrix = cbio_hcc_clinvar_mean_vus_matrix.add_suffix("_vus_mean")
cbio_hcc_clinvar_count_vus_matrix = cbio_hcc_clinvar_count_vus_matrix.add_suffix("_vus_count")
cbio_hcc_clinvar_max_benign_matrix = cbio_hcc_clinvar_max_benign_matrix.add_suffix("_benign_max")
cbio_hcc_clinvar_sum_benign_matrix = cbio_hcc_clinvar_sum_benign_matrix.add_suffix("_benign_sum")
cbio_hcc_clinvar_mean_benign_matrix = cbio_hcc_clinvar_mean_benign_matrix.add_suffix("_benign_mean")
cbio_hcc_clinvar_count_benign_matrix = cbio_hcc_clinvar_count_benign_matrix.add_suffix("_benign_count")

In [110]:
# Column-wise concatenation of the 24 suffixed matrices.
# Bucket order: PIU, LU, NCU, Pathogenic, VUS, Benign; within each bucket: Max, Sum, Mean, Count.
cbio_hcc_feature_matrix = pd.concat(
    [
        # GPD - PIU
        cbio_hcc_gpd_max_piu_matrix,
        cbio_hcc_gpd_sum_piu_matrix,
        cbio_hcc_gpd_mean_piu_matrix,
        cbio_hcc_gpd_count_piu_matrix,
        # GPD - LU
        cbio_hcc_gpd_max_lu_matrix,
        cbio_hcc_gpd_sum_lu_matrix,
        cbio_hcc_gpd_mean_lu_matrix,
        cbio_hcc_gpd_count_lu_matrix,
        # GPD - NCU
        cbio_hcc_gpd_max_ncu_matrix,
        cbio_hcc_gpd_sum_ncu_matrix,
        cbio_hcc_gpd_mean_ncu_matrix,
        cbio_hcc_gpd_count_ncu_matrix,
        # ClinVar - Pathogenic
        cbio_hcc_clinvar_max_pathogenic_matrix,
        cbio_hcc_clinvar_sum_pathogenic_matrix,
        cbio_hcc_clinvar_mean_pathogenic_matrix,
        cbio_hcc_clinvar_count_pathogenic_matrix,
        # ClinVar - VUS
        cbio_hcc_clinvar_max_vus_matrix,
        cbio_hcc_clinvar_sum_vus_matrix,
        cbio_hcc_clinvar_mean_vus_matrix,
        cbio_hcc_clinvar_count_vus_matrix,
        # ClinVar - Benign
        cbio_hcc_clinvar_max_benign_matrix,
        cbio_hcc_clinvar_sum_benign_matrix,
        cbio_hcc_clinvar_mean_benign_matrix,
        cbio_hcc_clinvar_count_benign_matrix,
    ],
    axis="columns",
)
cbio_hcc_feature_matrix.shape

(117, 7776)

In [111]:
# Persist the CBIO HCC feature matrix as CSV (index included, pandas default).
cbio_hcc_feature_matrix.to_csv(
    path_or_buf="/data/ajayago/papers_data/systematic_assessment/raw/annotated_mutation_matrices/clinvar_gpd_annovar_annotated_cbio_hcc_feature_matrix.csv"
)

### CBIO BRCA

In [112]:
# Load the CBIO BRCA mutation calls and add a canonical gene name for every row's gene symbol
# (convert2canonical is defined earlier in the notebook).
cbio_brca_mutations = pd.read_csv(
    "/data/druid_data/CBIO/brca_mskcc_2019//patient_gene_alteration(mutation).csv"
).assign(
    canonical_gene_name=lambda frame: frame["gene"].apply(convert2canonical)
)
cbio_brca_mutations

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name
0,s_DS_bkm_077_T,VTCN1,S192L,
1,s_DS_bkm_078_T2,NOTCH2,D1582N,NOTCH2
2,s_DS_bkm_078_T1,NOTCH2,D1582N,NOTCH2
3,s_DS_bkm_074_T,NOTCH2,T1303P,NOTCH2
4,s_DS_bkm_064_T2,NOTCH2,P6Rfs*27,NOTCH2
...,...,...,...,...
653,s_DS_bkm_058_T,NCOR1,A750V,
654,s_DS_bkm_058_T,BCOR,N193T,BCOR
655,s_DS_bkm_059_T,SF3B1,I641V,SF3B1
656,s_DS_bkm_059_T,ESR1,L536R,ESR1


In [113]:
# Drop rows whose canonical gene name stringifies to "nan", renumber the index, and build the
# "GENE@alteration" key used to look mutations up in the vocabulary.
cbio_brca_mutations = (
    cbio_brca_mutations
    .loc[lambda frame: frame["canonical_gene_name"].astype(str) != "nan"]
    .reset_index(drop=True)
    .assign(mutations=lambda frame: frame["canonical_gene_name"] + "@" + frame["alteration"])
)
cbio_brca_mutations

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name,mutations
0,s_DS_bkm_078_T2,NOTCH2,D1582N,NOTCH2,NOTCH2@D1582N
1,s_DS_bkm_078_T1,NOTCH2,D1582N,NOTCH2,NOTCH2@D1582N
2,s_DS_bkm_074_T,NOTCH2,T1303P,NOTCH2,NOTCH2@T1303P
3,s_DS_bkm_064_T2,NOTCH2,P6Rfs*27,NOTCH2,NOTCH2@P6Rfs*27
4,s_DS_bkm_064_T1,NOTCH2,P6Rfs*27,NOTCH2,NOTCH2@P6Rfs*27
...,...,...,...,...,...
482,s_DS_bkm_058_T,PIK3CA,H1047L,PIK3CA,PIK3CA@H1047L
483,s_DS_bkm_058_T,BCOR,N193T,BCOR,BCOR@N193T
484,s_DS_bkm_059_T,SF3B1,I641V,SF3B1,SF3B1@I641V
485,s_DS_bkm_059_T,ESR1,L536R,ESR1,ESR1@L536R


In [114]:
# Number of distinct patient ids in the CBIO BRCA mutation table (a missing id, if any, counts once).
cbio_brca_mutations["patient_id"].nunique(dropna=False)

70

In [115]:
# Left-join the vocabulary annotations onto every BRCA mutation via the "GENE@alteration" key,
# then fill the annotation columns of mutations that are missing from the vocabulary.
# A single DataFrame.fillna(dict) replaces the chained `df[col].fillna(..., inplace=True)` calls:
# those operate on a temporary under pandas Copy-on-Write and would leave the frame unfilled.
merged_cbio_brca = cbio_brca_mutations.merge(
    vocab_df[["xon17_score", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown", "gpd_LU", "gpd_NCU", "gpd_PIU"]],
    left_on="mutations",
    right_on=vocab_df.index,
    how="left",
).fillna(
    {
        "xon17_score": 0,
        # NaNs are set to VUS
        "clinvar_Pathogenic": 0,
        "clinvar_Benign": 0,
        "clinvar_Unknown": 1,
        # NaNs are set to NCU
        "gpd_LU": 0,
        "gpd_PIU": 0,
        "gpd_NCU": 1,
    }
)
merged_cbio_brca

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name,mutations,xon17_score,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,gpd_LU,gpd_NCU,gpd_PIU
0,s_DS_bkm_078_T2,NOTCH2,D1582N,NOTCH2,NOTCH2@D1582N,0.470588,0.0,0.0,1.0,0.0,0.0,1.0
1,s_DS_bkm_078_T1,NOTCH2,D1582N,NOTCH2,NOTCH2@D1582N,0.470588,0.0,0.0,1.0,0.0,0.0,1.0
2,s_DS_bkm_074_T,NOTCH2,T1303P,NOTCH2,NOTCH2@T1303P,0.470588,0.0,0.0,1.0,1.0,0.0,0.0
3,s_DS_bkm_064_T2,NOTCH2,P6Rfs*27,NOTCH2,NOTCH2@P6Rfs*27,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
4,s_DS_bkm_064_T1,NOTCH2,P6Rfs*27,NOTCH2,NOTCH2@P6Rfs*27,0.000000,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
482,s_DS_bkm_058_T,PIK3CA,H1047L,PIK3CA,PIK3CA@H1047L,0.529412,1.0,0.0,0.0,1.0,0.0,0.0
483,s_DS_bkm_058_T,BCOR,N193T,BCOR,BCOR@N193T,0.294118,0.0,0.0,1.0,0.0,0.0,1.0
484,s_DS_bkm_059_T,SF3B1,I641V,SF3B1,SF3B1@I641V,0.411765,0.0,0.0,1.0,1.0,0.0,0.0
485,s_DS_bkm_059_T,ESR1,L536R,ESR1,ESR1@L536R,1.000000,1.0,0.0,0.0,0.0,0.0,1.0


In [116]:
# Sanity check: count rows whose canonical gene name is missing after the merge.
merged_cbio_brca["canonical_gene_name"].isna().sum()

0

In [117]:
# Column totals of the ClinVar and GPD indicator columns.
merged_cbio_brca.loc[
    :,
    ["clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown", "gpd_LU", "gpd_NCU", "gpd_PIU"],
].sum()

clinvar_Pathogenic     72.0
clinvar_Benign         63.0
clinvar_Unknown       352.0
gpd_LU                146.0
gpd_NCU               108.0
gpd_PIU               233.0
dtype: float64

#### GPD features

In [118]:
# Max feature
# Maximum xon17_score within each (patient, gene, GPD flags) group.
# The score column is selected before aggregating so only it is reduced; previously every
# column (including the string columns) was aggregated and then thrown away.
cbio_brca_gpd_max = (
    merged_cbio_brca
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .max()
    .reset_index()
)
cbio_brca_gpd_max

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,s_DS_bkm_001_T,ALK,0.0,0.0,1.0,0.294118
1,s_DS_bkm_001_T,MED12,0.0,0.0,1.0,0.882353
2,s_DS_bkm_001_T,MLL,0.0,1.0,0.0,0.000000
3,s_DS_bkm_001_T,PDGFRA,0.0,0.0,1.0,0.117647
4,s_DS_bkm_001_T,TET2,1.0,0.0,0.0,0.058824
...,...,...,...,...,...,...
455,s_DS_bkm_084_T,HNF1A,0.0,0.0,1.0,0.235294
456,s_DS_bkm_084_T,IKBKE,0.0,0.0,1.0,0.705882
457,s_DS_bkm_084_T,PIK3CA,1.0,0.0,0.0,0.529412
458,s_DS_bkm_084_T,RB1,0.0,1.0,0.0,0.000000


In [119]:
# One matrix per GPD flag from the max-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# LU -> PIU -> NCU call order of the original cell is kept.
(
    cbio_brca_gpd_max_lu_matrix,
    cbio_brca_gpd_max_piu_matrix,
    cbio_brca_gpd_max_ncu_matrix,
) = (
    get_matrices(
        cbio_brca_gpd_max,
        merged_cbio_brca,
        criteria=unit_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for unit_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(
    cbio_brca_gpd_max_lu_matrix.shape,
    cbio_brca_gpd_max_piu_matrix.shape,
    cbio_brca_gpd_max_ncu_matrix.shape,
    sep="\n",
)

(53, 65)
(67, 93)
(55, 49)
(70, 324)
(70, 324)
(70, 324)


In [120]:
# Sum feature
# Sum of xon17_score within each (patient, gene, GPD flags) group.
# The score column is selected before aggregating so the string columns are never summed
# (they were previously aggregated and then discarded).
cbio_brca_gpd_sum = (
    merged_cbio_brca
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .sum()
    .reset_index()
)
cbio_brca_gpd_sum

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,s_DS_bkm_001_T,ALK,0.0,0.0,1.0,0.294118
1,s_DS_bkm_001_T,MED12,0.0,0.0,1.0,1.470588
2,s_DS_bkm_001_T,MLL,0.0,1.0,0.0,0.000000
3,s_DS_bkm_001_T,PDGFRA,0.0,0.0,1.0,0.117647
4,s_DS_bkm_001_T,TET2,1.0,0.0,0.0,0.058824
...,...,...,...,...,...,...
455,s_DS_bkm_084_T,HNF1A,0.0,0.0,1.0,0.235294
456,s_DS_bkm_084_T,IKBKE,0.0,0.0,1.0,0.705882
457,s_DS_bkm_084_T,PIK3CA,1.0,0.0,0.0,0.529412
458,s_DS_bkm_084_T,RB1,0.0,1.0,0.0,0.000000


In [121]:
# One matrix per GPD flag from the sum-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# LU -> PIU -> NCU call order of the original cell is kept.
(
    cbio_brca_gpd_sum_lu_matrix,
    cbio_brca_gpd_sum_piu_matrix,
    cbio_brca_gpd_sum_ncu_matrix,
) = (
    get_matrices(
        cbio_brca_gpd_sum,
        merged_cbio_brca,
        criteria=unit_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for unit_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(
    cbio_brca_gpd_sum_lu_matrix.shape,
    cbio_brca_gpd_sum_piu_matrix.shape,
    cbio_brca_gpd_sum_ncu_matrix.shape,
    sep="\n",
)

(53, 65)
(67, 93)
(55, 49)
(70, 324)
(70, 324)
(70, 324)


In [122]:
# Mean feature
# Average xon17_score within each (patient, gene, GPD flags) group.
cbio_brca_gpd_mean = (
    merged_cbio_brca
    .groupby(by=["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .mean()
    .reset_index()
)
cbio_brca_gpd_mean

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,s_DS_bkm_001_T,ALK,0.0,0.0,1.0,0.294118
1,s_DS_bkm_001_T,MED12,0.0,0.0,1.0,0.735294
2,s_DS_bkm_001_T,MLL,0.0,1.0,0.0,0.000000
3,s_DS_bkm_001_T,PDGFRA,0.0,0.0,1.0,0.117647
4,s_DS_bkm_001_T,TET2,1.0,0.0,0.0,0.058824
...,...,...,...,...,...,...
455,s_DS_bkm_084_T,HNF1A,0.0,0.0,1.0,0.235294
456,s_DS_bkm_084_T,IKBKE,0.0,0.0,1.0,0.705882
457,s_DS_bkm_084_T,PIK3CA,1.0,0.0,0.0,0.529412
458,s_DS_bkm_084_T,RB1,0.0,1.0,0.0,0.000000


In [123]:
# One matrix per GPD flag from the mean-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# LU -> PIU -> NCU call order of the original cell is kept.
(
    cbio_brca_gpd_mean_lu_matrix,
    cbio_brca_gpd_mean_piu_matrix,
    cbio_brca_gpd_mean_ncu_matrix,
) = (
    get_matrices(
        cbio_brca_gpd_mean,
        merged_cbio_brca,
        criteria=unit_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for unit_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(
    cbio_brca_gpd_mean_lu_matrix.shape,
    cbio_brca_gpd_mean_piu_matrix.shape,
    cbio_brca_gpd_mean_ncu_matrix.shape,
    sep="\n",
)

(53, 65)
(67, 93)
(55, 49)
(70, 324)
(70, 324)
(70, 324)


In [124]:
# Count feature
# Number of (non-null) xon17_score entries within each (patient, gene, GPD flags) group.
# The score column is selected before aggregating so only it is counted; previously every
# column was counted and then discarded.
cbio_brca_gpd_count = (
    merged_cbio_brca
    .groupby(["patient_id", "canonical_gene_name", "gpd_LU", "gpd_NCU", "gpd_PIU"])[["xon17_score"]]
    .count()
    .reset_index()
)
cbio_brca_gpd_count

Unnamed: 0,patient_id,canonical_gene_name,gpd_LU,gpd_NCU,gpd_PIU,xon17_score
0,s_DS_bkm_001_T,ALK,0.0,0.0,1.0,1
1,s_DS_bkm_001_T,MED12,0.0,0.0,1.0,2
2,s_DS_bkm_001_T,MLL,0.0,1.0,0.0,1
3,s_DS_bkm_001_T,PDGFRA,0.0,0.0,1.0,1
4,s_DS_bkm_001_T,TET2,1.0,0.0,0.0,1
...,...,...,...,...,...,...
455,s_DS_bkm_084_T,HNF1A,0.0,0.0,1.0,1
456,s_DS_bkm_084_T,IKBKE,0.0,0.0,1.0,1
457,s_DS_bkm_084_T,PIK3CA,1.0,0.0,0.0,1
458,s_DS_bkm_084_T,RB1,0.0,1.0,0.0,1


In [125]:
# One matrix per GPD flag from the per-group variant counts.
# get_matrices appears to print a shape on every call (see the cell output), so the
# LU -> PIU -> NCU call order of the original cell is kept.
(
    cbio_brca_gpd_count_lu_matrix,
    cbio_brca_gpd_count_piu_matrix,
    cbio_brca_gpd_count_ncu_matrix,
) = (
    get_matrices(
        cbio_brca_gpd_count,
        merged_cbio_brca,
        criteria=unit_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for unit_flag in ("gpd_LU", "gpd_PIU", "gpd_NCU")
)
print(
    cbio_brca_gpd_count_lu_matrix.shape,
    cbio_brca_gpd_count_piu_matrix.shape,
    cbio_brca_gpd_count_ncu_matrix.shape,
    sep="\n",
)

(53, 65)
(67, 93)
(55, 49)
(70, 324)
(70, 324)
(70, 324)


#### ClinVar

In [126]:
# Max feature
# Maximum xon17_score within each (patient, gene, ClinVar flags) group.
cbio_brca_clinvar_max = (
    merged_cbio_brca
    .groupby(by=["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .max()
    .reset_index()
)
cbio_brca_clinvar_max

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,s_DS_bkm_001_T,ALK,0.0,0.0,1.0,0.294118
1,s_DS_bkm_001_T,MED12,0.0,0.0,1.0,0.882353
2,s_DS_bkm_001_T,MLL,0.0,0.0,1.0,0.000000
3,s_DS_bkm_001_T,PDGFRA,0.0,0.0,1.0,0.117647
4,s_DS_bkm_001_T,TET2,0.0,0.0,1.0,0.058824
...,...,...,...,...,...,...
445,s_DS_bkm_084_T,HNF1A,0.0,0.0,1.0,0.235294
446,s_DS_bkm_084_T,IKBKE,0.0,0.0,1.0,0.705882
447,s_DS_bkm_084_T,PIK3CA,1.0,0.0,0.0,0.529412
448,s_DS_bkm_084_T,RB1,0.0,0.0,1.0,0.000000


In [127]:
# One matrix per ClinVar flag from the max-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# Pathogenic -> Unknown (VUS) -> Benign call order of the original cell is kept.
(
    cbio_brca_clinvar_max_pathogenic_matrix,
    cbio_brca_clinvar_max_vus_matrix,
    cbio_brca_clinvar_max_benign_matrix,
) = (
    get_matrices(
        cbio_brca_clinvar_max,
        merged_cbio_brca,
        criteria=clinvar_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(
    cbio_brca_clinvar_max_pathogenic_matrix.shape,
    cbio_brca_clinvar_max_vus_matrix.shape,
    cbio_brca_clinvar_max_benign_matrix.shape,
    sep="\n",
)

(49, 17)
(70, 138)
(26, 28)
(70, 324)
(70, 324)
(70, 324)


In [128]:
# Sum feature
# Sum of xon17_score within each (patient, gene, ClinVar flags) group.
# The score column is selected before aggregating so the string columns are never summed
# (they were previously aggregated and then discarded).
cbio_brca_clinvar_sum = (
    merged_cbio_brca
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .sum()
    .reset_index()
)
cbio_brca_clinvar_sum

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,s_DS_bkm_001_T,ALK,0.0,0.0,1.0,0.294118
1,s_DS_bkm_001_T,MED12,0.0,0.0,1.0,1.470588
2,s_DS_bkm_001_T,MLL,0.0,0.0,1.0,0.000000
3,s_DS_bkm_001_T,PDGFRA,0.0,0.0,1.0,0.117647
4,s_DS_bkm_001_T,TET2,0.0,0.0,1.0,0.058824
...,...,...,...,...,...,...
445,s_DS_bkm_084_T,HNF1A,0.0,0.0,1.0,0.235294
446,s_DS_bkm_084_T,IKBKE,0.0,0.0,1.0,0.705882
447,s_DS_bkm_084_T,PIK3CA,1.0,0.0,0.0,0.529412
448,s_DS_bkm_084_T,RB1,0.0,0.0,1.0,0.000000


In [129]:
# One matrix per ClinVar flag from the sum-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# Pathogenic -> Unknown (VUS) -> Benign call order of the original cell is kept.
(
    cbio_brca_clinvar_sum_pathogenic_matrix,
    cbio_brca_clinvar_sum_vus_matrix,
    cbio_brca_clinvar_sum_benign_matrix,
) = (
    get_matrices(
        cbio_brca_clinvar_sum,
        merged_cbio_brca,
        criteria=clinvar_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(
    cbio_brca_clinvar_sum_pathogenic_matrix.shape,
    cbio_brca_clinvar_sum_vus_matrix.shape,
    cbio_brca_clinvar_sum_benign_matrix.shape,
    sep="\n",
)

(49, 17)
(70, 138)
(26, 28)
(70, 324)
(70, 324)
(70, 324)


In [130]:
# Mean feature
# Average xon17_score within each (patient, gene, ClinVar flags) group.
cbio_brca_clinvar_mean = (
    merged_cbio_brca
    .groupby(by=["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .mean()
    .reset_index()
)
cbio_brca_clinvar_mean

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,s_DS_bkm_001_T,ALK,0.0,0.0,1.0,0.294118
1,s_DS_bkm_001_T,MED12,0.0,0.0,1.0,0.735294
2,s_DS_bkm_001_T,MLL,0.0,0.0,1.0,0.000000
3,s_DS_bkm_001_T,PDGFRA,0.0,0.0,1.0,0.117647
4,s_DS_bkm_001_T,TET2,0.0,0.0,1.0,0.058824
...,...,...,...,...,...,...
445,s_DS_bkm_084_T,HNF1A,0.0,0.0,1.0,0.235294
446,s_DS_bkm_084_T,IKBKE,0.0,0.0,1.0,0.705882
447,s_DS_bkm_084_T,PIK3CA,1.0,0.0,0.0,0.529412
448,s_DS_bkm_084_T,RB1,0.0,0.0,1.0,0.000000


In [131]:
# One matrix per ClinVar flag from the mean-aggregated scores.
# get_matrices appears to print a shape on every call (see the cell output), so the
# Pathogenic -> Unknown (VUS) -> Benign call order of the original cell is kept.
(
    cbio_brca_clinvar_mean_pathogenic_matrix,
    cbio_brca_clinvar_mean_vus_matrix,
    cbio_brca_clinvar_mean_benign_matrix,
) = (
    get_matrices(
        cbio_brca_clinvar_mean,
        merged_cbio_brca,
        criteria=clinvar_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(
    cbio_brca_clinvar_mean_pathogenic_matrix.shape,
    cbio_brca_clinvar_mean_vus_matrix.shape,
    cbio_brca_clinvar_mean_benign_matrix.shape,
    sep="\n",
)

(49, 17)
(70, 138)
(26, 28)
(70, 324)
(70, 324)
(70, 324)


In [132]:
# Count feature
# Number of (non-null) xon17_score entries within each (patient, gene, ClinVar flags) group.
# The score column is selected before aggregating so only it is counted; previously every
# column was counted and then discarded.
cbio_brca_clinvar_count = (
    merged_cbio_brca
    .groupby(["patient_id", "canonical_gene_name", "clinvar_Pathogenic", "clinvar_Benign", "clinvar_Unknown"])[["xon17_score"]]
    .count()
    .reset_index()
)
cbio_brca_clinvar_count

Unnamed: 0,patient_id,canonical_gene_name,clinvar_Pathogenic,clinvar_Benign,clinvar_Unknown,xon17_score
0,s_DS_bkm_001_T,ALK,0.0,0.0,1.0,1
1,s_DS_bkm_001_T,MED12,0.0,0.0,1.0,2
2,s_DS_bkm_001_T,MLL,0.0,0.0,1.0,1
3,s_DS_bkm_001_T,PDGFRA,0.0,0.0,1.0,1
4,s_DS_bkm_001_T,TET2,0.0,0.0,1.0,1
...,...,...,...,...,...,...
445,s_DS_bkm_084_T,HNF1A,0.0,0.0,1.0,1
446,s_DS_bkm_084_T,IKBKE,0.0,0.0,1.0,1
447,s_DS_bkm_084_T,PIK3CA,1.0,0.0,0.0,1
448,s_DS_bkm_084_T,RB1,0.0,0.0,1.0,1


In [133]:
# One matrix per ClinVar flag from the per-group variant counts.
# get_matrices appears to print a shape on every call (see the cell output), so the
# Pathogenic -> Unknown (VUS) -> Benign call order of the original cell is kept.
(
    cbio_brca_clinvar_count_pathogenic_matrix,
    cbio_brca_clinvar_count_vus_matrix,
    cbio_brca_clinvar_count_benign_matrix,
) = (
    get_matrices(
        cbio_brca_clinvar_count,
        merged_cbio_brca,
        criteria=clinvar_flag,
        criteria_value=1,
        index_name="patient_id",
        column_name="canonical_gene_name",
    )
    for clinvar_flag in ("clinvar_Pathogenic", "clinvar_Unknown", "clinvar_Benign")
)
print(
    cbio_brca_clinvar_count_pathogenic_matrix.shape,
    cbio_brca_clinvar_count_vus_matrix.shape,
    cbio_brca_clinvar_count_benign_matrix.shape,
    sep="\n",
)

(49, 17)
(70, 138)
(26, 28)
(70, 324)
(70, 324)
(70, 324)


In [134]:
# Add suffixes to identify columns: every column of each matrix is renamed to
# "<column>_<bucket>_<aggregate>" so that the 24 matrices keep distinct column
# names when they are concatenated column-wise into a single feature matrix.
# NOTE: each line rebinds the matrix to its suffixed copy, so re-running this cell
# without re-creating the matrices first appends the suffix a second time.

# GPD buckets: PIU / LU / NCU
cbio_brca_gpd_max_piu_matrix = cbio_brca_gpd_max_piu_matrix.add_suffix('_piu_max')
cbio_brca_gpd_sum_piu_matrix = cbio_brca_gpd_sum_piu_matrix.add_suffix("_piu_sum")
cbio_brca_gpd_mean_piu_matrix = cbio_brca_gpd_mean_piu_matrix.add_suffix("_piu_mean")
cbio_brca_gpd_count_piu_matrix = cbio_brca_gpd_count_piu_matrix.add_suffix("_piu_count")
cbio_brca_gpd_max_lu_matrix = cbio_brca_gpd_max_lu_matrix.add_suffix("_lu_max")
cbio_brca_gpd_sum_lu_matrix = cbio_brca_gpd_sum_lu_matrix.add_suffix("_lu_sum")
cbio_brca_gpd_mean_lu_matrix = cbio_brca_gpd_mean_lu_matrix.add_suffix("_lu_mean")
# Fixed: this line previously suffixed cbio_brca_gpd_count_ncu_matrix, which made
# the "_lu_count" columns a copy of the NCU counts and dropped the real LU counts.
# Assumes cbio_brca_gpd_count_lu_matrix was built by the earlier GPD count cell,
# like its PIU/NCU siblings - TODO confirm.
cbio_brca_gpd_count_lu_matrix = cbio_brca_gpd_count_lu_matrix.add_suffix("_lu_count")
cbio_brca_gpd_max_ncu_matrix = cbio_brca_gpd_max_ncu_matrix.add_suffix("_ncu_max")
cbio_brca_gpd_sum_ncu_matrix = cbio_brca_gpd_sum_ncu_matrix.add_suffix("_ncu_sum")
cbio_brca_gpd_mean_ncu_matrix = cbio_brca_gpd_mean_ncu_matrix.add_suffix("_ncu_mean")
cbio_brca_gpd_count_ncu_matrix = cbio_brca_gpd_count_ncu_matrix.add_suffix("_ncu_count")

# ClinVar buckets: Pathogenic / VUS / Benign
cbio_brca_clinvar_max_pathogenic_matrix = cbio_brca_clinvar_max_pathogenic_matrix.add_suffix("_pathogenic_max")
cbio_brca_clinvar_sum_pathogenic_matrix = cbio_brca_clinvar_sum_pathogenic_matrix.add_suffix("_pathogenic_sum")
cbio_brca_clinvar_mean_pathogenic_matrix = cbio_brca_clinvar_mean_pathogenic_matrix.add_suffix("_pathogenic_mean")
cbio_brca_clinvar_count_pathogenic_matrix = cbio_brca_clinvar_count_pathogenic_matrix.add_suffix("_pathogenic_count")
cbio_brca_clinvar_max_vus_matrix = cbio_brca_clinvar_max_vus_matrix.add_suffix("_vus_max")
cbio_brca_clinvar_sum_vus_matrix = cbio_brca_clinvar_sum_vus_matrix.add_suffix("_vus_sum")
cbio_brca_clinvar_mean_vus_matrix = cbio_brca_clinvar_mean_vus_matrix.add_suffix("_vus_mean")
cbio_brca_clinvar_count_vus_matrix = cbio_brca_clinvar_count_vus_matrix.add_suffix("_vus_count")
cbio_brca_clinvar_max_benign_matrix = cbio_brca_clinvar_max_benign_matrix.add_suffix("_benign_max")
cbio_brca_clinvar_sum_benign_matrix = cbio_brca_clinvar_sum_benign_matrix.add_suffix("_benign_sum")
cbio_brca_clinvar_mean_benign_matrix = cbio_brca_clinvar_mean_benign_matrix.add_suffix("_benign_mean")
cbio_brca_clinvar_count_benign_matrix = cbio_brca_clinvar_count_benign_matrix.add_suffix("_benign_count")

In [135]:
# Concatenate the 24 suffixed matrices column-wise. Bucket order is
# PIU, LU, NCU, Pathogenic, VUS, Benign; within each bucket the order is
# Max, Sum, Mean, Count.
cbio_brca_feature_matrix = pd.concat(
    [
        # GPD - PIU
        cbio_brca_gpd_max_piu_matrix,
        cbio_brca_gpd_sum_piu_matrix,
        cbio_brca_gpd_mean_piu_matrix,
        cbio_brca_gpd_count_piu_matrix,
        # GPD - LU
        cbio_brca_gpd_max_lu_matrix,
        cbio_brca_gpd_sum_lu_matrix,
        cbio_brca_gpd_mean_lu_matrix,
        cbio_brca_gpd_count_lu_matrix,
        # GPD - NCU
        cbio_brca_gpd_max_ncu_matrix,
        cbio_brca_gpd_sum_ncu_matrix,
        cbio_brca_gpd_mean_ncu_matrix,
        cbio_brca_gpd_count_ncu_matrix,
        # ClinVar - Pathogenic
        cbio_brca_clinvar_max_pathogenic_matrix,
        cbio_brca_clinvar_sum_pathogenic_matrix,
        cbio_brca_clinvar_mean_pathogenic_matrix,
        cbio_brca_clinvar_count_pathogenic_matrix,
        # ClinVar - VUS
        cbio_brca_clinvar_max_vus_matrix,
        cbio_brca_clinvar_sum_vus_matrix,
        cbio_brca_clinvar_mean_vus_matrix,
        cbio_brca_clinvar_count_vus_matrix,
        # ClinVar - Benign
        cbio_brca_clinvar_max_benign_matrix,
        cbio_brca_clinvar_sum_benign_matrix,
        cbio_brca_clinvar_mean_benign_matrix,
        cbio_brca_clinvar_count_benign_matrix,
    ],
    axis=1,
)
cbio_brca_feature_matrix.shape

(70, 7776)

In [136]:
# Persist the CBIO BRCA feature matrix (index included) as CSV.
cbio_brca_feature_matrix.to_csv(
    path_or_buf="/data/ajayago/papers_data/systematic_assessment/raw/annotated_mutation_matrices/clinvar_gpd_annovar_annotated_cbio_brca_feature_matrix.csv",
)