# Parse biomarkers from the Cancer Genome Interpreter (CGI) biomarkers database

In [1]:
import pandas as pd

In [60]:
#Load the biomarker table downloaded from CGI (https://www.cancergenomeinterpreter.org/biomarkers)
df = pd.read_csv('cgi_biomarkers.tsv', sep='\t')

#Keep mutation-type alterations only
df = df.loc[df['Alteration type'].eq('MUT')]

#Keep biomarkers associated with drug response
df = df.loc[df['Association'].eq('Responsive')]

#Restrict to the skin and lung tumor types of interest
selection = ['CM','LUAD','LUSC','NSCLC']
df = df.loc[df['Primary Tumor type'].isin(selection)]

#Exclude biomarkers that are not specific non-synonymous missense mutations
generic_biomarker = df['Biomarker'].str.contains('oncogenic mutation|proximal exon|mutation in exon|inframe insertion')
df = df.loc[~generic_biomarker]

#Select the useful columns and keep the first row of each (biomarker, tumor type) pair
useful_columns = ['Biomarker','Gene','Alteration','Drug','Primary Tumor type','Drug full name','Drug status']
df = df[useful_columns].drop_duplicates(subset=['Biomarker','Primary Tumor type'], keep='first')

#Rebuild the alteration column from the parenthesised part of the biomarker name
after_open_paren = df['Biomarker'].str.split('(', expand=True)[1]
df = df.assign(Alteration=after_open_paren.str.split(')', expand=True)[0])

#Exclude biomarkers without an explicit alteration (no parenthesised part)
df = df.loc[df['Alteration'].notnull()]

#Exclude biomarkers without an explicit drug name (the literal empty list '[]')
df = df.loc[df['Drug'].ne('[]')]

#Display one mutation per row
#Every comma-separated entry of 'Alteration' becomes its own row. All other
#columns of the source row are copied unchanged, so no column is lost for
#the rows that get split (every output row has exactly len(df.columns) values).
alt_pos = df.columns.get_loc('Alteration')
df_list = df.values.tolist()
final_list = []
for row in df_list:
    #str.split returns a one-element list when there is no comma, so rows with
    #a single alteration pass through this same path unchanged
    for alteration in row[alt_pos].split(','):
        final_list.append(row[:alt_pos] + [alteration] + row[alt_pos + 1:])
final_df = pd.DataFrame(final_list,columns=df.columns)

#Fix Biomarker column: rebuild it as <Gene>_<Alteration> for the per-alteration rows
final_df['Biomarker'] = final_df['Gene'] +'_'+ final_df['Alteration']

#Exclude non-specific alterations (any alteration whose text contains '-')
final_df = final_df[~final_df['Alteration'].str.contains('-')]

#Fix Inhibitor type
def inh_type(x):
    """Derive the inhibitor type from a 'Drug full name' string.

    Rules, in order:
      * no '(' in ``x``: ``x`` is returned unchanged;
      * '(' and the substring 'etc' both present: the text before the
        first ' (' is returned;
      * otherwise: the text after the first '(' and before the next ')'
        is returned.

    Illustrative example: 'Brigatinib (Pan-TK inhibitor)' -> 'Pan-TK inhibitor'.
    """
    if '(' not in x:
        return x
    if 'etc' in x:
        # The name in front of the parenthesis is kept for 'etc' lists
        return x.split(' (')[0]
    inside_parens = x.split('(')[1]
    return inside_parens.split(')')[0]

#Derive the inhibitor type of every row from its full drug name
final_df['Inhibitor type'] = final_df['Drug full name'].apply(inh_type)

#Fix Drug name column
def drug_name(x):
    """Return the drug name without its surrounding square brackets.

    When ``x`` contains '[', the text after the first '[' and before the
    next ']' is returned; otherwise ``x`` is returned unchanged.
    """
    return x.split('[')[1].split(']')[0] if '[' in x else x
#final_df was produced by boolean filtering; work on an independent copy so the
#column assignments below do not trigger pandas' SettingWithCopyWarning
final_df = final_df.copy()

final_df['Drug'] = final_df['Drug'].apply(drug_name)

#Fix Approved column
#Only these exact statuses count as approved. Any other status, including a
#missing one, is False, so the column is always strictly boolean.
approved_statuses = ['Approved','FDA approved']
final_df['Approved'] = final_df['Drug status'].isin(approved_statuses)

#Drop non-useful columns
final_df = final_df.drop(columns=['Drug full name','Drug status'])

#Select unique biomarkers per cancer type (one row per approval state)
final_df = final_df.drop_duplicates(['Biomarker','Primary Tumor type','Approved'],keep='first')

#One biomarker with an approved drug and a not approved drug -> select the approved one
#Applied to every (biomarker, tumor type) pair, and a no-op when no pair is
#duplicated (no positional [0] lookup that could raise IndexError).
pair_cols = ['Biomarker','Primary Tumor type']

#Rows that repeat an earlier (biomarker, tumor type) pair; kept for inspection
dupl = final_df[final_df.duplicated(pair_cols)]

#True on every row whose pair has at least one approved row
is_approved = final_df['Approved']==True
pair_has_approved = is_approved.groupby([final_df[col] for col in pair_cols]).transform('any')

#Drop the not-approved rows only where an approved alternative exists for the same pair
final_df = final_df[~(pair_has_approved & (final_df['Approved']==False))]

final_df

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,Approved
0,ALK_E1408V,ALK,E1408V,Brigatinib,LUAD,Pan-TK inhibitor,False
1,ALK_L1196M,ALK,L1196M,Brigatinib,LUAD,Pan-TK inhibitor,False
3,ALK_S1206Y,ALK,S1206Y,Ceritinib,LUAD,ALK inhibitor,False
4,ALK_G1269A,ALK,G1269A,Ceritinib,LUAD,ALK inhibitor,False
5,ALK_I1171T,ALK,I1171T,Ceritinib,LUAD,ALK inhibitor,False
6,ALK_L1198F,ALK,L1198F,Crizotinib,LUAD,ALK inhibitor,False
7,ALK_C1156Y,ALK,C1156Y,Lorlatinib,LUAD,ALK&ROS1 inhibitor,False
8,ARAF_S214C,ARAF,S214C,Sorafenib,LUAD,Pan-TK inhibitor,True
9,BRAF_K601R,BRAF,K601R,Trametinib,CM,MEK inhibitors,False
10,BRAF_L597R,BRAF,L597R,Trametinib,CM,MEK inhibitors,False


In [61]:
#Non-null entries per column, split by approval status
rows_by_approval = final_df.groupby('Approved')
rows_by_approval.count()

Unnamed: 0_level_0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type
Approved,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,54,54,54,54,54,54
True,2,2,2,2,2,2


In [62]:
#Biomarkers whose drug is flagged as approved
final_df.loc[final_df['Approved'].eq(True)]

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,Approved
8,ARAF_S214C,ARAF,S214C,Sorafenib,LUAD,Pan-TK inhibitor,True
54,EGFR_L858R,EGFR,L858R,Erlotinib,NSCLC,EGFR inhibitor 1st gen,True


In [63]:
#One row per distinct (biomarker, tumor type) pair
pair_keys = ['Biomarker','Primary Tumor type']
df1 = final_df.groupby(pair_keys, as_index=False).count()
n_total = len(df1)
print(n_total,'biomarkers in total')

56 biomarkers in total


In [64]:
#Biomarkers for CM, counted by distinct biomarker name
cm_df = final_df.loc[final_df['Primary Tumor type'].eq('CM')]
n_cm = cm_df['Biomarker'].nunique()
print(n_cm, 'biomarkers for CM')

21 biomarkers for CM


In [65]:
#Display the biomarkers selected for CM
cm_df

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,Approved
9,BRAF_K601R,BRAF,K601R,Trametinib,CM,MEK inhibitors,False
10,BRAF_L597R,BRAF,L597R,Trametinib,CM,MEK inhibitors,False
11,BRAF_V600R,BRAF,V600R,Trametinib,CM,MEK inhibitors,False
14,BRAF_V600E,BRAF,V600E,Dabrafenib;Trametinib,CM,BRAF inhibitor + MEK inhibitor,False
15,BRAF_V600K,BRAF,V600K,Dabrafenib;Trametinib,CM,BRAF inhibitor + MEK inhibitor,False
18,BRAF_D594G,BRAF,D594G,Sorafenib,CM,Pan-TK inhibitor,False
19,BRAF_G469E,BRAF,G469E,Sorafenib,CM,Pan-TK inhibitor,False
20,BRAF_V600D,BRAF,V600D,Vemurafenib,CM,BRAF inhibitor,False
22,BRAF_V600M,BRAF,V600M,Vemurafenib,CM,BRAF inhibitor,False
23,BRAF_V600G,BRAF,V600G,Vemurafenib,CM,BRAF inhibitor,False


In [78]:
#Biomarkers for any of the lung tumor types, counted by distinct biomarker name
lungs = ['LUAD','LUSC','NSCLC']
is_lung = final_df['Primary Tumor type'].isin(lungs)
lung_df = final_df.loc[is_lung]
n_lung = lung_df['Biomarker'].nunique()
print(n_lung,'biomarkers for lung cancer')

35 biomarkers for lung cancer


In [79]:
#Display the biomarkers selected for the lung tumor types
lung_df

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,Approved
0,ALK_E1408V,ALK,E1408V,Brigatinib,LUAD,Pan-TK inhibitor,False
1,ALK_L1196M,ALK,L1196M,Brigatinib,LUAD,Pan-TK inhibitor,False
3,ALK_S1206Y,ALK,S1206Y,Ceritinib,LUAD,ALK inhibitor,False
4,ALK_G1269A,ALK,G1269A,Ceritinib,LUAD,ALK inhibitor,False
5,ALK_I1171T,ALK,I1171T,Ceritinib,LUAD,ALK inhibitor,False
6,ALK_L1198F,ALK,L1198F,Crizotinib,LUAD,ALK inhibitor,False
7,ALK_C1156Y,ALK,C1156Y,Lorlatinib,LUAD,ALK&ROS1 inhibitor,False
8,ARAF_S214C,ARAF,S214C,Sorafenib,LUAD,Pan-TK inhibitor,True
12,BRAF_V600E,BRAF,V600E,Dabrafenib,NSCLC,BRAF inhibitor,False
16,BRAF_G466V,BRAF,G466V,Dasatinib,LUAD,BCR-ABL inhibitor 2nd gen,False


In [66]:
#Biomarkers for LUAD, counted by distinct biomarker name
luad_df = final_df.loc[final_df['Primary Tumor type'].eq('LUAD')]
n_luad = luad_df['Biomarker'].nunique()
print(n_luad,'biomarkers for LUAD')

14 biomarkers for LUAD


In [67]:
#Display the biomarkers selected for LUAD
luad_df

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,Approved
0,ALK_E1408V,ALK,E1408V,Brigatinib,LUAD,Pan-TK inhibitor,False
1,ALK_L1196M,ALK,L1196M,Brigatinib,LUAD,Pan-TK inhibitor,False
3,ALK_S1206Y,ALK,S1206Y,Ceritinib,LUAD,ALK inhibitor,False
4,ALK_G1269A,ALK,G1269A,Ceritinib,LUAD,ALK inhibitor,False
5,ALK_I1171T,ALK,I1171T,Ceritinib,LUAD,ALK inhibitor,False
6,ALK_L1198F,ALK,L1198F,Crizotinib,LUAD,ALK inhibitor,False
7,ALK_C1156Y,ALK,C1156Y,Lorlatinib,LUAD,ALK&ROS1 inhibitor,False
8,ARAF_S214C,ARAF,S214C,Sorafenib,LUAD,Pan-TK inhibitor,True
16,BRAF_G466V,BRAF,G466V,Dasatinib,LUAD,BCR-ABL inhibitor 2nd gen,False
17,BRAF_Y472C,BRAF,Y472C,Dasatinib,LUAD,BCR-ABL inhibitor 2nd gen,False


In [76]:
#Biomarkers for LUSC, counted by distinct biomarker name
lusc_df = final_df.loc[final_df['Primary Tumor type'].eq('LUSC')]
n_lusc = lusc_df['Biomarker'].nunique()
print(n_lusc,'biomarkers for LUSC')

7 biomarkers for LUSC


In [69]:
#Display the biomarkers selected for LUSC
lusc_df

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,Approved
25,DDR2_I638F,DDR2,I638F,Dasatinib,LUSC,BCR-ABL inhibitor 2nd gen,False
26,DDR2_L239R,DDR2,L239R,Dasatinib,LUSC,BCR-ABL inhibitor 2nd gen,False
27,DDR2_G253C,DDR2,G253C,Dasatinib,LUSC,BCR-ABL inhibitor 2nd gen,False
28,DDR2_G774V,DDR2,G774V,Dasatinib,LUSC,BCR-ABL inhibitor 2nd gen,False
29,DDR2_L63V,DDR2,L63V,Dasatinib,LUSC,BCR-ABL inhibitor 2nd gen,False
30,DDR2_G505S,DDR2,G505S,Dasatinib,LUSC,BCR-ABL inhibitor 2nd gen,False
31,DDR2_S768R,DDR2,S768R,Dasatinib,LUSC,BCR-ABL inhibitor 2nd gen,False


In [77]:
#Biomarkers for NSCLC, counted by distinct biomarker name
nsclc_df = final_df.loc[final_df['Primary Tumor type'].eq('NSCLC')]
n_nsclc = nsclc_df['Biomarker'].nunique()
print(n_nsclc,'biomarkers for NSCLC')

14 biomarkers for NSCLC


In [71]:
#Display the biomarkers selected for NSCLC
nsclc_df

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,Approved
12,BRAF_V600E,BRAF,V600E,Dabrafenib,NSCLC,BRAF inhibitor,False
32,EGFR_T790M,EGFR,T790M,"Rociletinib,HM61713",NSCLC,EGFR inhibitor 3rd gens,False
34,EGFR_L861Q,EGFR,L861Q,Afatinib,NSCLC,ERBB2 inhibitor&EGFR inhibitor 2nd gen,False
35,EGFR_G719A,EGFR,G719A,Afatinib,NSCLC,ERBB2 inhibitor&EGFR inhibitor 2nd gen,False
36,EGFR_G719S,EGFR,G719S,Afatinib,NSCLC,ERBB2 inhibitor&EGFR inhibitor 2nd gen,False
37,EGFR_G719C,EGFR,G719C,Afatinib,NSCLC,ERBB2 inhibitor&EGFR inhibitor 2nd gen,False
38,EGFR_G719D,EGFR,G719D,Afatinib,NSCLC,ERBB2 inhibitor&EGFR inhibitor 2nd gen,False
39,EGFR_L747S,EGFR,L747S,Afatinib,NSCLC,ERBB2 inhibitor&EGFR inhibitor 2nd gen,False
40,EGFR_S768I,EGFR,S768I,Afatinib,NSCLC,ERBB2 inhibitor&EGFR inhibitor 2nd gen,False
41,EGFR_L861P,EGFR,L861P,Afatinib,NSCLC,ERBB2 inhibitor&EGFR inhibitor 2nd gen,False


In [80]:
#Save the selected biomarkers as a tab-separated file, without the row index
final_df.to_csv(
    'selected_biomarkers.tsv',
    sep='\t',
    index=False,
)

In [81]:
#Read the saved file back to check what was written (this rebinds `df`)
df = pd.read_csv(
    'selected_biomarkers.tsv',
    sep='\t',
)
df

Unnamed: 0,Biomarker,Gene,Alteration,Drug,Primary Tumor type,Inhibitor type,Approved
0,ALK_E1408V,ALK,E1408V,Brigatinib,LUAD,Pan-TK inhibitor,False
1,ALK_L1196M,ALK,L1196M,Brigatinib,LUAD,Pan-TK inhibitor,False
2,ALK_S1206Y,ALK,S1206Y,Ceritinib,LUAD,ALK inhibitor,False
3,ALK_G1269A,ALK,G1269A,Ceritinib,LUAD,ALK inhibitor,False
4,ALK_I1171T,ALK,I1171T,Ceritinib,LUAD,ALK inhibitor,False
5,ALK_L1198F,ALK,L1198F,Crizotinib,LUAD,ALK inhibitor,False
6,ALK_C1156Y,ALK,C1156Y,Lorlatinib,LUAD,ALK&ROS1 inhibitor,False
7,ARAF_S214C,ARAF,S214C,Sorafenib,LUAD,Pan-TK inhibitor,True
8,BRAF_K601R,BRAF,K601R,Trametinib,CM,MEK inhibitors,False
9,BRAF_L597R,BRAF,L597R,Trametinib,CM,MEK inhibitors,False
