In [1]:
import pandas as pd

## read in data files
drivers = pd.read_csv('Data/TCGA_driver_genes.csv')
sampleA = pd.read_csv('Data/sample_A.csv')
sampleB = pd.read_csv('Data/sample_B.csv')
sampleC = pd.read_csv('Data/sample_C.csv')
sampleD = pd.read_csv('Data/sample_D.csv')
sampleE = pd.read_csv('Data/sample_E.csv')


In [2]:
## add sample identifier column
sampleA['sample'] = 'A'
sampleB['sample'] = 'B'
sampleC['sample'] = 'C'
sampleD['sample'] = 'D'
sampleE['sample'] = 'E'

sampleA['value'] = True
sampleB['value'] = True
sampleC['value'] = True
sampleD['value'] = True
sampleE['value'] = True


In [5]:
## table of all mutations
samples = pd.concat([sampleA,sampleB,sampleC,sampleD,sampleE])
mutations = samples[['chr', 'pos', 'alt', 'ref', 'gene']].drop_duplicates()

## add FALSE value for missing mutations
missing = mutations[~mutations.isin(sampleA)].dropna()
missing['value'] = False
sampleA = pd.concat([sampleA,missing], sort=True)

missing = mutations[~mutations.isin(sampleB)].dropna()
missing['value'] = False
sampleB = pd.concat([sampleB,missing], sort=True)

missing = mutations[~mutations.isin(sampleC)].dropna()
missing['value'] = False
sampleC = pd.concat([sampleC,missing], sort=True)

missing = mutations[~mutations.isin(sampleD)].dropna()
missing['value'] = False
sampleD = pd.concat([sampleD,missing], sort=True)

missing = mutations[~mutations.isin(sampleE)].dropna()
missing['value'] = False
sampleE = pd.concat([sampleE,missing], sort=True)


In [6]:
## concatenate all samples into one table
samples = pd.concat([sampleA,sampleB,sampleC,sampleD,sampleE])

## pivot samples table
samples = samples.pivot_table(index=['chr', 'pos', 'alt', 'ref', 'gene'], columns='sample', values='value')
samples = samples.reset_index()


In [7]:
samples.head()

sample,chr,pos,alt,ref,gene,A,B,C,D,E
0,1,57002737.0,G,A,PLPP3,True,,,,True
1,1,64643492.0,G,A,ROR1,,,True,,
2,1,153177307.0,C,T,LELP1,True,True,True,True,True
3,1,154841924.0,T,C,KCNN3,True,True,True,True,True
4,1,161139480.0,G,C,PPOX,True,,True,True,True


In [9]:
sampleA.head()

Unnamed: 0,alt,chr,gene,pos,ref,sample,value
0,G,1,PLPP3,57002737.0,A,A,True
1,C,1,LELP1,153177307.0,T,A,True
2,T,1,KCNN3,154841924.0,C,A,True
3,G,1,PPOX,161139480.0,C,A,True
4,C,1,CFHR5,196963240.0,T,A,True


In [10]:
## identify cancer genes
cancer_genes = samples[samples.gene.isin(drivers.Gene.drop_duplicates().to_list())]['gene']
print(cancer_genes)


14     KRAS
27     TP53
29    SMAD4
46    FBXW7
49      APC
Name: gene, dtype: object


In [12]:
## annotate cancer_genes
cancer_genes_data = drivers[drivers.Gene.isin(cancer_genes)]
cancer_genes_data.head()


Unnamed: 0,Gene,Tissue,Cancer,Tissue Frequency
4,TP53,Adrenal gland,Adrenocortical carcinoma,0.1778
20,FBXW7,Bladder,Bladder Urothelial Carcinoma,0.0907
31,KRAS,Bladder,Bladder Urothelial Carcinoma,0.0415
46,TP53,Bladder,Bladder Urothelial Carcinoma,0.4845
60,FBXW7,Breast,Breast invasive carcinoma,0.018


In [14]:
## sort by tissue frequency and print
cancer_genes_data = cancer_genes_data.sort_values('Tissue Frequency', ascending=False)
cancer_genes_data[cancer_genes_data.Gene == 'APC'].sort_values('Tissue Frequency', ascending=False).head()


Unnamed: 0,Gene,Tissue,Cancer,Tissue Frequency
112,APC,Colorectum,Colon adenocarcinoma,0.8196
420,APC,Stomach,Stomach adenocarcinoma,0.0896
379,APC,Prostate,Prostate adenocarcinoma,0.021


In [15]:
cancer_genes_data[cancer_genes_data.Gene == 'KRAS'].sort_values('Tissue Frequency', ascending=False).head()


Unnamed: 0,Gene,Tissue,Cancer,Tissue Frequency
367,KRAS,Pancreatic,Pancreatic adenocarcinoma,0.7355
118,KRAS,Colorectum,Colon adenocarcinoma,0.4778
314,KRAS,Lung,Lung adenocarcinoma,0.3155
485,KRAS,Uterus,Uterine Corpus Endometrial Carcinoma,0.2096
517,KRAS,Uterus,Uterine Carcinosarcoma,0.1273


In [16]:
cancer_genes_data[cancer_genes_data.Gene == 'SMAD4'].sort_values('Tissue Frequency', ascending=False).head()


Unnamed: 0,Gene,Tissue,Cancer,Tissue Frequency
369,SMAD4,Pancreatic,Pancreatic adenocarcinoma,0.2323
124,SMAD4,Colorectum,Colon adenocarcinoma,0.1203
442,SMAD4,Stomach,Stomach adenocarcinoma,0.0784
153,SMAD4,Esophagus,Esophageal carcinoma,0.064
101,SMAD4,Cervical,Cervical squamous cell carcinoma and endocervi...,0.0365


In [17]:
cancer_genes_data[cancer_genes_data.Gene == 'TP53'].sort_values('Tissue Frequency', ascending=False).head()


Unnamed: 0,Gene,Tissue,Cancer,Tissue Frequency
525,TP53,Uterus,Uterine Carcinosarcoma,0.9273
360,TP53,Ovarian,Ovarian serous cystadenocarcinoma,0.9118
154,TP53,Esophagus,Esophageal carcinoma,0.8779
348,TP53,Lung,Lung squamous cell carcinoma,0.8685
128,TP53,Colorectum,Colon adenocarcinoma,0.7278


In [18]:
cancer_genes_data[cancer_genes_data.Gene == 'FBXW7'].sort_values('Tissue Frequency', ascending=False).head()

Unnamed: 0,Gene,Tissue,Cancer,Tissue Frequency
516,FBXW7,Uterus,Uterine Carcinosarcoma,0.3818
478,FBXW7,Uterus,Uterine Corpus Endometrial Carcinoma,0.1641
116,FBXW7,Colorectum,Colon adenocarcinoma,0.1203
86,FBXW7,Cervical,Cervical squamous cell carcinoma and endocervi...,0.1131
20,FBXW7,Bladder,Bladder Urothelial Carcinoma,0.0907


In [19]:
## sort columns & look for patterns
samples = samples.sort_values(['A','B','C','D','E'])

## subset cancer genes from samples
samples_genes = samples[samples.gene.isin(cancer_genes)]
print(samples_genes)

sample chr          pos alt ref   gene     A     B     C     D     E
49       5  112175639.0   C   T    APC  True  True  True  True  True
14      12   25398285.0   G   T   KRAS  True   NaN  True  True  True
29      18  153249384.0   C   T  SMAD4  True   NaN   NaN   NaN  True
27      17    7578406.0   G   A   TP53  True   NaN   NaN   NaN   NaN
46       4  153249384.0   G   A  FBXW7   NaN   NaN  True   NaN   NaN
