# TCGA Mutations

In [72]:
import numpy as np 
import pandas as pd

# Mutation Filtering 

Here I'm going to keep the mutations from the TCGA folders that have been annotated through the tcga pancancer atlas paper from 2018 (unified data resource)

I will take the mutations that are nonsense / splice site / frameshift / TSS or missense with SIFT predicted deleterious 
    I am not using polyphen because it's predictions are shit (e.g. predicts NRAS G12D as benign)
    
From there I keep only the top genes mutated in 7 or more patient samples (manual cutoff to include known driver genes in AML) 
Next I incorporate the VAF data from the original TCGA 2013 data source (note that it doesn't apply for every mutation I've caught). 
    When the VAF is not available but the mutation is present (often splice site etc) because it wasn't caught in the original TCGA paper, i set the VAF to 0 in the resulting matrix. When the mutation is not present, i have the VAF as NAN. 
   
   
### NOTE THAT THIS DATASET DOES NOT INCLUDE FLT3-ITD

In [73]:
lab_path="/Users/andyzeng/Drive/Laptop/Dick_Lab/"

In [74]:
# tcga2013 data (with VAF)
tcga2013_clindata = pd.read_csv(lab_path+"CIBERSORT/TCGA_LAML/other_data/mutations/tcga_2013/data_mutations_mskcc.txt", sep='\t')
tcga2013_clindata = tcga2013_clindata[['Tumor_Sample_Barcode', 'Hugo_Symbol', 'Protein_position', 'Variant_Classification', 'TumorVAF_WU']]
tcga2013_clindata.head()

# pancancer atlas data 
mutations = pd.read_csv(lab_path+"CIBERSORT/TCGA_LAML/other_data/mutations/tcga_pancancer_atlas/data_mutations_mskcc.txt", sep='\t')

# FLT3 ITD
flt3ITD = mutations[(mutations['Variant_Classification'].isin(['In_Frame_Ins'])) & (mutations['Hugo_Symbol'].isin(['FLT3']))]
flt3ITD['Hugo_Symbol'] = 'FLT3-ITD'

# get missense and nonsense
nonsense = mutations[mutations['Variant_Classification'].isin(['Frame_Shift_Ins', 'Nonsense_Mutation', 'Frame_Shift_Del', 'Splice_Site', 'Translation_Start_Site'])]
missense = mutations[mutations['Variant_Classification'].isin(['Missense_Mutation'])]
missense = missense[~missense['SIFT'].isna()]
missense = missense[missense['SIFT'].str.contains('deleterious')]
mutations = missense.append(nonsense).append(flt3ITD)

# Frequently mutated genes
keygenes = mutations['Hugo_Symbol'].value_counts()[mutations['Hugo_Symbol'].value_counts() >= 7].index.values
mutations = mutations[mutations['Hugo_Symbol'].isin(keygenes)]

# merge with tcga2013 data 
mutations = mutations[['Tumor_Sample_Barcode', 'Hugo_Symbol', 'Amino_acids', 'Protein_position', 'Variant_Classification', 'SIFT']]
mutations = mutations.merge(tcga2013_clindata, how='left', left_on = ['Tumor_Sample_Barcode', 'Hugo_Symbol', 'Protein_position', 'Variant_Classification'], 
               right_on = ['Tumor_Sample_Barcode', 'Hugo_Symbol', 'Protein_position', 'Variant_Classification'])

# clean up AA and SIFT and drop duplicates
mutations['Amino_acids'] = np.where(mutations['Amino_acids'] == ".", np.nan, mutations['Amino_acids'])
mutations['SIFT'] = np.where(mutations['SIFT'] == ".", np.nan, mutations['SIFT'])
mutations = mutations.drop_duplicates()
mutations.head()

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,Tumor_Sample_Barcode,Hugo_Symbol,Amino_acids,Protein_position,Variant_Classification,SIFT,TumorVAF_WU
0,TCGA-AB-2802-03,DNMT3A,R/C,882.0,Missense_Mutation,deleterious(0.02),28.33
1,TCGA-AB-2802-03,IDH1,R/H,132.0,Missense_Mutation,deleterious_low_confidence(0.01),41.05
2,TCGA-AB-2802-03,PTPN11,D/N,61.0,Missense_Mutation,deleterious(0.01),31.08
3,TCGA-AB-2805-03,IDH2,R/Q,140.0,Missense_Mutation,deleterious_low_confidence(0),42.68
4,TCGA-AB-2807-03,IDH2,R/Q,140.0,Missense_Mutation,deleterious_low_confidence(0),45.81


In [28]:
# cleaning to remove  
mutations = mutations[~(((mutations['Hugo_Symbol'] == "KRAS") & (mutations['Protein_position'] == 36)) |
         ((mutations['Hugo_Symbol'] == "ASXL1") & (mutations['Protein_position'] == 157)) | 
          (mutations['Hugo_Symbol'].isin(['HPS3', 'ABCA6', 'SENP6'])))]

In [108]:
pd.set_option('display.max_columns', None)

mutations['present'] = 1
mutations_binary = mutations.reset_index().pivot_table(index='Tumor_Sample_Barcode', columns='Hugo_Symbol', values='present', aggfunc='max').fillna(0).astype('int')
mutations_binary = mutations_binary.add_suffix("_mut")
mutations_binary

Hugo_Symbol,ASXL1_mut,BPIFC_mut,CEBPA_mut,DNMT3A_mut,FLT3_mut,FLT3-ITD_mut,IDH1_mut,IDH2_mut,KIT_mut,KRAS_mut,NPM1_mut,NRAS_mut,PHF6_mut,PTPN11_mut,RUNX1_mut,SF3B1_mut,SMC1A_mut,SMC3_mut,STAG2_mut,TET2_mut,TP53_mut,U2AF1_mut,WT1_mut
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
TCGA-AB-2802-03,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
TCGA-AB-2804-03,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
TCGA-AB-2805-03,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
TCGA-AB-2807-03,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
TCGA-AB-2808-03,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-AB-3002-03,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-3006-03,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
TCGA-AB-3007-03,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-3009-03,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1


In [109]:
pd.reset_option('max_columns')

In [113]:
mutations_binary.to_csv("TCGA_Mutations_binary.csv")

In [80]:
mutations_vaf = mutations.fillna(0).groupby(['Tumor_Sample_Barcode', 'Hugo_Symbol']).agg({'TumorVAF_WU' : 'max'}).reset_index().pivot(index='Tumor_Sample_Barcode', columns='Hugo_Symbol', values='TumorVAF_WU')
mutations_vaf = mutations_vaf.add_suffix("_VAF").fillna("WT")
mutations_vaf.replace(0, np.nan, inplace=True)
mutations_vaf.head()

Hugo_Symbol,ASXL1_VAF,BPIFC_VAF,CEBPA_VAF,DNMT3A_VAF,FLT3_VAF,FLT3-ITD_VAF,IDH1_VAF,IDH2_VAF,KIT_VAF,KRAS_VAF,...,PTPN11_VAF,RUNX1_VAF,SF3B1_VAF,SMC1A_VAF,SMC3_VAF,STAG2_VAF,TET2_VAF,TP53_VAF,U2AF1_VAF,WT1_VAF
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-AB-2802-03,WT,WT,WT,28.33,WT,WT,41.05,WT,WT,WT,...,31.08,WT,WT,WT,WT,WT,WT,WT,WT,WT
TCGA-AB-2804-03,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
TCGA-AB-2805-03,WT,WT,WT,WT,WT,WT,WT,42.68,WT,WT,...,WT,39.69,WT,WT,WT,WT,WT,WT,WT,WT
TCGA-AB-2807-03,33.7,WT,WT,WT,WT,WT,WT,45.81,WT,WT,...,WT,WT,WT,,WT,WT,WT,WT,WT,WT
TCGA-AB-2808-03,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [81]:
mutations_vaf.to_csv("TCGA_Mutations_VAF.csv")

In [48]:
DNMT3A_FLT3 = mutations[mutations.Hugo_Symbol.isin(['FLT3', 'DNMT3A'])]
DNMT3A_FLT3['DNMT3A_R882'] = np.where((DNMT3A_FLT3.Hugo_Symbol == "DNMT3A") & (DNMT3A_FLT3.Protein_position == 882), 1, 0)
DNMT3A_FLT3['DNMT3A_NonSense'] = np.where((DNMT3A_FLT3.Hugo_Symbol == "DNMT3A") & (DNMT3A_FLT3.Variant_Classification != "Missense_Mutation"), 1, 0)
DNMT3A_FLT3['FLT3_D835'] = np.where((DNMT3A_FLT3.Hugo_Symbol == "FLT3") & (DNMT3A_FLT3.Protein_position == 835), 1, 0)
DNMT3A_FLT3['FLT3_NonSense'] = np.where((DNMT3A_FLT3.Hugo_Symbol == "FLT3") & (DNMT3A_FLT3.Variant_Classification != "Missense_Mutation"), 1, 0)
DNMT3A_FLT3 = DNMT3A_FLT3[['Tumor_Sample_Barcode', 'DNMT3A_R882', 'DNMT3A_NonSense', 'FLT3_D835', 'FLT3_NonSense']].groupby('Tumor_Sample_Barcode').agg(sum)
DNMT3A_FLT3.to_csv("TCGA_DNMT3A_FLT3_mutations.csv")
DNMT3A_FLT3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

Unnamed: 0_level_0,DNMT3A_R882,DNMT3A_NonSense,FLT3_D835,FLT3_NonSense
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-AB-2802-03,1,0,0,0
TCGA-AB-2809-03,1,0,0,0
TCGA-AB-2811-03,1,0,1,0
TCGA-AB-2814-03,0,0,1,0
TCGA-AB-2816-03,1,0,1,0
...,...,...,...,...
TCGA-AB-2975-03,1,0,0,0
TCGA-AB-2981-03,1,0,0,0
TCGA-AB-2987-03,1,0,0,0
TCGA-AB-2988-03,0,0,0,0


# Fusion Genes

Recording the common fusion genes - first by gene and then by fusion. 

In [53]:
fusions = pd.read_csv("tcga_pancancer_atlas/data_fusions.txt", sep='\t')
fusions['value'] = "1"
fusions.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,Tumor_Sample_Barcode,Fusion,DNA_support,RNA_support,Method,Frame,value
0,PML,,WashU,TCGA-AB-2803-03,PML-RARA,,,,in-frame,1
1,RPL32,,WashU,TCGA-AB-2803-03,RPL32-RP11-1084A12.1,,,,,1
2,DYRK1A,,WashU,TCGA-AB-2807-03,DYRK1A-TTC3,,,,frameshift,1
3,CBFB,,WashU,TCGA-AB-2815-03,CBFB-MYH11,,,,frameshift,1
4,BCR,,WashU,TCGA-AB-2817-03,BCR-ABL1,,,,in-frame,1


In [54]:
common_fusion_genes = fusions['Hugo_Symbol'].value_counts().index.values[fusions['Hugo_Symbol'].value_counts() >= 4]
fusion_genes = fusions[fusions['Hugo_Symbol'].isin(common_fusion_genes)][['Tumor_Sample_Barcode', 'Hugo_Symbol', 'value']].drop_duplicates().pivot(index='Tumor_Sample_Barcode', columns='Hugo_Symbol', values='value').isna().replace(True, "0").replace(False, "1")
fusion_genes = fusion_genes.add_suffix("_fusion")
fusion_genes.head()

Hugo_Symbol,ABL1_fusion,ABR_fusion,BCR_fusion,CBFB_fusion,KMT2A_fusion,LYZ_fusion,MLLT10_fusion,MPO_fusion,MYH11_fusion,PML_fusion,RARA_fusion,RUNX1_fusion,RUNX1T1_fusion,YWHAE_fusion
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
TCGA-AB-2803-03,0,0,0,0,0,0,0,0,0,1,1,0,0,0
TCGA-AB-2815-03,0,0,0,1,0,0,0,0,1,0,0,0,0,0
TCGA-AB-2817-03,1,0,1,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-2819-03,0,0,0,0,0,0,0,0,0,0,0,1,1,0
TCGA-AB-2823-03,0,0,0,0,0,0,0,0,0,1,1,0,0,0


In [84]:
fusion_genes.to_csv("TCGA_Fusion_Genes.csv")

In [70]:
common_fusions = fusions['Fusion'].value_counts().index.values[fusions['Fusion'].value_counts() >= 4].tolist() + ['KMT2A-MLLT3']
fusions_fullnames = fusions[fusions['Fusion'].isin(common_fusions)][['Tumor_Sample_Barcode', 'Fusion', 'value']].drop_duplicates().pivot(index='Tumor_Sample_Barcode', columns='Fusion', values='value').isna().replace(True, "0").replace(False, "1")
fusions_fullnames

Fusion,ABR-YWHAE,BCR-ABL1,CBFB-MYH11,KMT2A-ELL,KMT2A-MLLT10,KMT2A-MLLT3,LYZ-SMG1,NUP98-NSD1,PICALM-MLLT10,PML-RARA,RPL13-MPO,RPS8-MPO,RUNX1-RUNX1T1,WNK1-RAD52
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
TCGA-AB-2803-03,0,0,0,0,0,0,0,0,0,1,0,0,0,0
TCGA-AB-2815-03,0,0,1,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-2817-03,0,1,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-2819-03,0,0,0,0,0,0,0,0,0,0,0,0,1,0
TCGA-AB-2823-03,0,0,0,0,0,0,0,0,0,1,0,0,0,0
TCGA-AB-2828-03,0,0,1,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-2832-03,1,0,0,0,1,0,0,0,0,0,0,0,0,0
TCGA-AB-2834-03,0,0,0,1,0,0,0,0,0,0,0,0,0,0
TCGA-AB-2840-03,0,0,0,0,0,0,0,0,0,1,0,0,0,0
TCGA-AB-2841-03,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [71]:
fusions_fullnames.to_csv("TCGA_Fusion_fullnames.csv")

# Copy Number Alterations

In [97]:
cna.set_index('Hugo_Symbol').drop('Entrez_Gene_Id', axis=1).T['KMT2A'].value_counts()

 0    170
 2     12
 1      6
-2      3
Name: KMT2A, dtype: int64

In [99]:
cna_genes=['KMT2A', 'ERG', 'TMPRSS2', 'U2AF1', 'XRCC2', 'KMT2C', 'APC', 'BRAF', 'CD74', 'CSF1R', 'EGR1', 'EZH2', 'IL3', 'RUNX1', 'CBFB', 'ETS1']

In [111]:
cna = pd.read_csv("tcga_pancancer_atlas/data_CNA.txt", sep='\t')
cna = cna.set_index('Hugo_Symbol').drop('Entrez_Gene_Id', axis=1).T[cna_genes]
cna = cna.add_suffix("_CNA")
cna.index.name='Tumor_Sample_Barcode'
cna

Hugo_Symbol,KMT2A_CNA,ERG_CNA,TMPRSS2_CNA,U2AF1_CNA,XRCC2_CNA,KMT2C_CNA,APC_CNA,BRAF_CNA,CD74_CNA,CSF1R_CNA,EGR1_CNA,EZH2_CNA,IL3_CNA,RUNX1_CNA,CBFB_CNA,ETS1_CNA
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
TCGA-AB-2803-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-2804-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-2805-03,2,0,0,0,-2,-2,0,-2,0,0,0,-2,0,0,0,2
TCGA-AB-2806-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-2807-03,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-AB-3007-03,0,0,0,0,-1,-1,0,-1,0,0,0,-1,0,0,0,1
TCGA-AB-3008-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-3009-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-AB-3011-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [112]:
cna.to_csv("TCGA_CNA_keygenes.csv")

## Specific Mutations

In [10]:
fusions[fusions.Hugo_Symbol == 'KMT2A']

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,Tumor_Sample_Barcode,Fusion,DNA_support,RNA_support,Method,Frame,value
16,KMT2A,,WashU,TCGA-AB-2832-03,KMT2A-MLLT10,,,,frameshift,1
18,KMT2A,,WashU,TCGA-AB-2834-03,KMT2A-ELL,,,,frameshift,1
23,KMT2A,,WashU,TCGA-AB-2842-03,KMT2A-MLLT10,,,,frameshift,1
25,KMT2A,,WashU,TCGA-AB-2844-03,KMT2A-ELL,,,,frameshift,1
66,KMT2A,,WashU,TCGA-AB-2911-03,KMT2A-ELL,,,,in-frame,1
100,KMT2A,,WashU,TCGA-AB-2956-03,KMT2A-MLLT3,,,,frameshift,1
110,KMT2A,,WashU,TCGA-AB-2985-03,KMT2A-MLLT10,,,,frameshift,1
124,KMT2A,,WashU,TCGA-AB-3005-03,KMT2A-MLLT10,,,,frameshift,1
