We decided to try a similar process with the breast cancer dataset. The goal is to find breast cancer patients with no known driver mutations and with normal copy number variations across all genes.

In [2]:
# Import libraries

import cptac
import pandas as pd

In [3]:
# Load the breast cancer dataset

br = cptac.Brca()

In [4]:
# Get all patient ids - clinical data is most likely to contain all patients

clinical = br.get_clinical("mssm")
patient_ids = set(clinical.index)
len(patient_ids)

134

In [None]:
# Get breast cancer somatic mutation data

mutations = br.get_somatic_mutation("harmonized")
mutations.head()

Name,Gene,Mutation,Location,Entrez_Gene_Id,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Type,...,HGNC_UniProt_ID(supplied_by_UniProt),HGNC_Ensembl_ID(supplied_by_Ensembl),HGNC_UCSC_ID(supplied_by_UCSC),Oreganno_Build,Simple_Uniprot_alt_uniprot_accessions,dbSNP_TOPMED,HGNC_Entrez_Gene_ID(supplied_by_NCBI),COHORT,getz,washu
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001,CCDC136,Intron,,64753.0,hg38,chr7,128815741,128815741,+,SNP,...,Q96JN2,ENSG00000128596,uc003vnv.3,,A4D1K1|A7MCY7|A8MYA7|Q6ZVK7|Q9H8M3|Q9UFE1,,64753.0,BRCA,True,True
01BR001,MYBPC1,Splice_Site,,4604.0,hg38,chr12,101661264,101661264,+,SNP,...,Q00872,ENSG00000196091,uc001tih.4,,B4DKR5|B7Z8G8|B7ZL02|B7ZL09|B7ZL10|E7ESM5|E7EW...,,4604.0,BRCA,True,True
01BR001,KRT77,Silent,p.G516G,374454.0,hg38,chr12,52691354,52691354,+,SNP,...,Q7Z794,ENSG00000189182,uc001saw.4,,Q7RTS8,,374454.0,BRCA,True,
01BR001,TENM4,Missense_Mutation,p.E19K,26011.0,hg38,chr11,79069890,79069890,+,SNP,...,Q6N022,ENSG00000149256,uc001ozl.5,hg38,A6ND26|Q7Z3C7|Q96MS6|Q9P2P4|Q9Y4S2,,26011.0,BRCA,True,True
01BR001,PPFIA1,Missense_Mutation,p.H795L,8500.0,hg38,chr11,70355707,70355707,+,SNP,...,Q13136,ENSG00000131626,uc001opo.4,,A6NLE3|Q13135|Q14567|Q8N4I2,,8500.0,BRCA,True,True


In [6]:
# Load driver mutation data into a dataframe

driver_mutations_df = pd.read_csv("breast_driver_mutations.tsv", sep="\t")
driver_mutations_df.head()

Unnamed: 0,Symbol,Mutations,Samples,Samples (%),Cohorts
0,TP53,1190,992,30.74,14
1,PIK3CA,1021,891,27.61,12
2,GATA3,347,225,6.97,6
3,KMT2C,297,212,6.57,11
4,CDH1,299,205,6.35,10


In [7]:
len(driver_mutations_df)

118

In [35]:
# Save driver mutations into a list

driver_genes = driver_mutations_df["Symbol"]
driver_genes = list(driver_genes)
%store driver_genes

Stored 'driver_genes' (list)


In [9]:
# Find number patients with driver mutations

driver_mutations = mutations[mutations["Gene"].isin(driver_genes)]
patients_with_driver_mutations = set(driver_mutations.index)
len(patients_with_driver_mutations)

111

In [34]:
driver_patients_mut = mutations.loc[list(patients_with_driver_mutations)]
genes_per_patient = (
    driver_mutations.reset_index()
      .groupby("Patient_ID")["Gene"]
      .agg(lambda x: sorted(set(x)))
      .to_frame(name="Mutated_Genes")
)
%store genes_per_patient
genes_per_patient

#This is useful for plotting patients by mutations later


Stored 'genes_per_patient' (DataFrame)


Unnamed: 0_level_0,Mutated_Genes
Patient_ID,Unnamed: 1_level_1
01BR001,"[ARID1A, FBXW7, TP53]"
01BR008,"[FAT4, HOXC13, MYH9, TP53]"
01BR009,"[BRCA1, FAT4, TP53]"
01BR010,"[HRAS, PIK3CA]"
01BR015,"[AFDN, MAP2K4, PIK3CA, PTPRD, TP53]"
...,...
21BR002,[PIK3CA]
21BR010,"[MAP2K4, NCOR1, PIK3CA]"
22BR005,"[ARID1A, KMT2D, PIK3CA, RUNX1, TP53]"
22BR006,"[ASXL1, CBFB, GATA3, KMT2C, PIK3CA]"


In [10]:
patients_without_driver_mutations = patient_ids - patients_with_driver_mutations
len(patients_without_driver_mutations)

23

In [11]:
patients_without_driver_mutations

{'01BR023',
 '01BR025',
 '01BR028',
 '03BR002',
 '03BR005',
 '03BR009',
 '03BR012',
 '05BR001',
 '05BR003',
 '05BR031',
 '06BR009',
 '11BR015',
 '11BR027',
 '11BR049',
 '11BR057',
 '11BR069',
 '11BR076',
 '14BR007',
 '14BR020',
 '18BR004',
 '18BR017',
 '21BR003',
 '22BR003'}

There are 111 breast cancer patients who have known driver mutations, which means 23 patients do not have known driver mutations. These are the cases we are looking to analyze further.

In [12]:
# Filter mutations to only include those from patients without driver mutations

candidate_patient_mutations = mutations[mutations.index.isin(patients_without_driver_mutations)]
candidate_patient_mutations

Name,Gene,Mutation,Location,Entrez_Gene_Id,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Type,...,HGNC_UniProt_ID(supplied_by_UniProt),HGNC_Ensembl_ID(supplied_by_Ensembl),HGNC_UCSC_ID(supplied_by_UCSC),Oreganno_Build,Simple_Uniprot_alt_uniprot_accessions,dbSNP_TOPMED,HGNC_Entrez_Gene_ID(supplied_by_NCBI),COHORT,getz,washu
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR023,BCHE,In_Frame_Del,p.F385del,590.0,hg38,chr3,165829880,165829882,+,DEL,...,P06276,ENSG00000114200,uc003fem.5,,A8K7P8,,590.0,BRCA,True,True
01BR023,FLG,Missense_Mutation,p.E2287K,2312.0,hg38,chr1,152308027,152308027,+,SNP,...,P20930,ENSG00000143631,uc001ezu.1,,Q01720|Q5T583|Q9UC71,"0.99991239806320081,0.00001592762487257,0.0000...",2312.0,BRCA,,True
01BR023,MFSD9,Intron,,84804.0,hg38,chr2,102732136,102732136,+,SNP,...,Q8NBP5,ENSG00000135953,uc002tcb.3,,Q4ZG89|Q53TU0|Q96GQ4|Q9BRI8,"0.99995221712538226,0.00004778287461773",84804.0,BRCA,True,
01BR023,TRPC1,Missense_Mutation,p.R707C,7220.0,hg38,chr3,142806074,142806074,+,SNP,...,P48995,ENSG00000144935,uc003evb.4,,Q14CE4,,7220.0,BRCA,True,True
01BR023,UGDH,Missense_Mutation,p.R142H,7358.0,hg38,chr4,39510701,39510701,+,SNP,...,O60701,ENSG00000109814,uc003guk.3,,B3KUU2|B4DN25|O60589,,7358.0,BRCA,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18BR017,AEN,Missense_Mutation,p.G192R,64782.0,hg38,chr15,88629259,88629259,+,SNP,...,Q8WTP8,ENSG00000181026,uc002bmt.3,hg38,C9J571|Q9BSA5|Q9H9X7,,64782.0,BRCA,True,True
18BR017,FBRSL1,Missense_Mutation,p.A153V,57666.0,hg38,chr12,132508319,132508319,+,SNP,...,Q9HCM7,ENSG00000112787,uc001ukf.4,,Q86XQ1,"0.99996018093781855,0.00003981906218144",57666.0,BRCA,True,
18BR017,EP400,Missense_Mutation,p.E2190D,57634.0,hg38,chr12,132044296,132044296,+,SNP,...,Q96L91,ENSG00000183495,uc001ujn.3,,O15411|Q6P2F5|Q8N8Q7|Q8NE05|Q96JK7|Q9P230,,57634.0,BRCA,True,True
18BR017,IFT81,Missense_Mutation,p.D583N,28981.0,hg38,chr12,110205625,110205625,+,SNP,...,Q8WYA0,ENSG00000122970,uc001tqi.4,,Q2YDY1|Q8NB51|Q9BSV2|Q9UNY8,,28981.0,BRCA,True,True


We checked if there are any mutations that are not considered driver mutations that are common across candidate patients. None of these mutations were particuarly common.

In [13]:
# Check most common mutations in candidate patients

most_common_mutations = candidate_patient_mutations["Gene"].value_counts()
most_common_mutations.head(20)

Gene
GOLGA6L2    3
ARMCX4      3
CHD1        3
GABRG3      2
GATA2       2
PRSS56      2
LAMC1       2
FUCA2       2
MN1         2
DOP1A       2
LRRC4B      2
IFT172      2
TMCO4       2
OBSCN       2
TTN         2
RERE        2
PLXND1      2
SPHKAP      2
ZC3H14      2
ABCC6       2
Name: count, dtype: int64

In [14]:
# Download copy number variation data

cnv = br.get_CNV("washu")
cnv.head()

Name,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
Database_ID,ENSG00000121410.10,ENSG00000148584.13,ENSG00000175899.13,ENSG00000166535.18,ENSG00000184389.9,ENSG00000128274.14,ENSG00000118017.3,ENSG00000094914.11,ENSG00000081760.15,ENSG00000114771.12,...,ENSG00000086827.7,ENSG00000174442.10,ENSG00000122952.15,ENSG00000070476.13,ENSG00000203995.8,ENSG00000162378.11,ENSG00000159840.14,ENSG00000074755.13,ENSG00000036549.11,ENSG00000091436.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
01BR001,-0.07265,-0.00966,-0.11622,-0.11622,0.81754,-0.01887,-0.06602,-0.00563,-0.16064,-0.06602,...,0.01914,-0.26696,-0.00966,-0.06602,0.13021,0.13021,0.05187,0.2208,-0.09706,0.09183
01BR008,-0.06644,0.1155,0.31413,0.31413,0.08417,-0.07571,-0.09075,-0.02567,-0.02567,-0.09075,...,-0.04312,0.00988,0.1155,-0.09075,0.08417,0.08417,-0.00576,-0.02004,0.08417,-0.05192
01BR009,0.28468,-0.14743,-0.26372,-0.26372,-0.13078,0.04398,0.26289,-0.22827,-0.17576,0.26289,...,-0.15614,-0.15576,-0.14743,0.26289,-0.15498,-0.15498,0.06942,0.34555,-0.15498,0.21653
01BR010,0.0037,-0.14666,-0.00332,-0.00332,0.03008,-0.0094,-0.00744,-0.00332,-0.00332,-0.00744,...,0.039,-0.18501,-0.14666,-0.00744,0.03008,0.03008,-0.03744,-0.14187,0.03008,0.14108
01BR015,0.0961,0.13746,0.03751,0.03751,-0.17893,-0.24742,0.09962,0.12874,-0.21528,0.09962,...,-0.20074,-0.2397,0.13746,0.09962,0.40474,0.40474,-0.27216,-0.1866,0.40474,-0.23689


In [15]:
candidate_cnv_patients = cnv[cnv.index.isin(patients_without_driver_mutations)]
candidate_cnv_patients.head()

Name,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
Database_ID,ENSG00000121410.10,ENSG00000148584.13,ENSG00000175899.13,ENSG00000166535.18,ENSG00000184389.9,ENSG00000128274.14,ENSG00000118017.3,ENSG00000094914.11,ENSG00000081760.15,ENSG00000114771.12,...,ENSG00000086827.7,ENSG00000174442.10,ENSG00000122952.15,ENSG00000070476.13,ENSG00000203995.8,ENSG00000162378.11,ENSG00000159840.14,ENSG00000074755.13,ENSG00000036549.11,ENSG00000091436.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
01BR023,-0.08158,0.13422,0.0093,0.0093,-0.40016,0.40444,-0.04538,0.0093,0.0093,-0.04538,...,-0.51938,0.04571,0.13422,0.10406,-0.40016,-0.40016,-0.1468,0.22234,0.17992,0.03808
01BR025,0.00226,0.01699,-0.00168,-0.00168,-0.00043,-0.00361,0.00382,-0.00168,-0.00168,0.00382,...,0.01101,-0.00746,0.01699,0.00382,-0.00043,-0.00043,0.01416,0.0007,-0.00043,0.0003
03BR002,0.00103,-0.01013,0.0042,0.0042,-0.01605,0.03513,-0.00456,0.0042,0.0042,-0.00456,...,-0.01032,-0.00219,-0.01013,-0.00456,-0.01605,-0.01605,-0.00116,-0.00172,-0.01605,-0.00515
03BR005,0.00851,0.02339,-0.00907,-0.00907,0.03822,-0.07876,-0.01027,-0.00907,-0.05621,-0.01027,...,0.02211,-0.00448,0.02339,-0.01027,0.03822,0.03822,-0.02272,0.02987,0.03822,-0.01255
05BR001,-0.01268,-0.009,-0.05628,-0.05628,0.00892,-0.04714,-0.03693,-0.02298,-0.02298,-0.03693,...,-0.03977,-0.03174,-0.009,-0.03693,0.00892,0.00892,-0.01057,-0.00411,0.00892,0.02052


In [16]:
all_patients = cnv.index
%store all_patients

Stored 'all_patients' (Index)


In [17]:
# Find patients without driver mutations that do not have CNV data - these will be filtered out

cnv_patients = set(candidate_cnv_patients.index)
mismatched_patients = patients_without_driver_mutations - cnv_patients
len(mismatched_patients)

12

In [22]:
# Filter CNV data to include only patients without driver mutations

candidate_cnv_patients = cnv[cnv.index.isin(patients_without_driver_mutations)]
%store candidate_cnv_patients
candidate_cnv_patients

Stored 'candidate_cnv_patients' (DataFrame)


Name,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
Database_ID,ENSG00000121410.10,ENSG00000148584.13,ENSG00000175899.13,ENSG00000166535.18,ENSG00000184389.9,ENSG00000128274.14,ENSG00000118017.3,ENSG00000094914.11,ENSG00000081760.15,ENSG00000114771.12,...,ENSG00000086827.7,ENSG00000174442.10,ENSG00000122952.15,ENSG00000070476.13,ENSG00000203995.8,ENSG00000162378.11,ENSG00000159840.14,ENSG00000074755.13,ENSG00000036549.11,ENSG00000091436.15
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
01BR023,-0.08158,0.13422,0.0093,0.0093,-0.40016,0.40444,-0.04538,0.0093,0.0093,-0.04538,...,-0.51938,0.04571,0.13422,0.10406,-0.40016,-0.40016,-0.1468,0.22234,0.17992,0.03808
01BR025,0.00226,0.01699,-0.00168,-0.00168,-0.00043,-0.00361,0.00382,-0.00168,-0.00168,0.00382,...,0.01101,-0.00746,0.01699,0.00382,-0.00043,-0.00043,0.01416,0.0007,-0.00043,0.0003
03BR002,0.00103,-0.01013,0.0042,0.0042,-0.01605,0.03513,-0.00456,0.0042,0.0042,-0.00456,...,-0.01032,-0.00219,-0.01013,-0.00456,-0.01605,-0.01605,-0.00116,-0.00172,-0.01605,-0.00515
03BR005,0.00851,0.02339,-0.00907,-0.00907,0.03822,-0.07876,-0.01027,-0.00907,-0.05621,-0.01027,...,0.02211,-0.00448,0.02339,-0.01027,0.03822,0.03822,-0.02272,0.02987,0.03822,-0.01255
05BR001,-0.01268,-0.009,-0.05628,-0.05628,0.00892,-0.04714,-0.03693,-0.02298,-0.02298,-0.03693,...,-0.03977,-0.03174,-0.009,-0.03693,0.00892,0.00892,-0.01057,-0.00411,0.00892,0.02052
05BR003,0.02303,0.04916,0.04856,0.04856,0.05898,0.02389,0.04901,0.04856,0.04856,0.04901,...,-0.30921,0.04876,0.04916,0.04901,0.05898,0.05898,-0.35318,-0.32033,0.05898,-0.31456
11BR015,0.05911,0.20725,0.09614,0.09614,0.24075,0.0025,0.08565,0.09614,0.09614,0.08565,...,-0.58485,-0.34654,0.20725,0.15122,0.24075,0.24075,0.00264,0.16272,0.24075,-0.16318
11BR027,-0.01774,0.13475,0.0835,0.0835,0.04634,-0.05547,0.11669,-0.00864,-0.01412,0.11669,...,-0.49018,0.05053,0.13475,0.11669,0.10378,0.10378,0.07326,0.02744,0.18119,0.0657
11BR049,-0.11482,0.07614,0.15983,0.15983,0.08886,-0.4559,0.03501,0.27074,-0.02475,0.03501,...,0.05216,-0.00125,0.07614,0.03501,0.08886,0.08886,-0.07558,0.00723,0.08886,0.03968
18BR004,-0.03606,0.13876,0.0145,0.0145,-0.49707,-0.00432,0.02782,0.0145,0.0419,0.02782,...,0.09579,0.06894,0.13876,0.12324,-0.47364,-0.47364,-0.05963,0.1429,-0.47364,-0.16305


In [None]:
# Identify patients with normal CNV profiles

normal_patients = candidate_cnv_patients.loc[:, cnv.apply(lambda col: col.between(-0.3, 0.3).all())]
normal_patients = list(normal_patients.index)
normal_patients

['01BR023',
 '01BR025',
 '03BR002',
 '03BR005',
 '05BR001',
 '05BR003',
 '11BR015',
 '11BR027',
 '11BR049',
 '18BR004',
 '18BR017']

We found 11 patients who have no known driver mutations and normal copy number variation profiles. This suggests that something else is responsible for causing these cases of cancer.

In [None]:
normal_patient_mutations = candidate_patient_mutations[candidate_patient_mutations.index.isin(normal_patients)]

%store normal_patients
%store normal_patient_mutations

Stored 'normal_patients' (list)
Stored 'normal_patient_mutations' (DataFrame)
