In [3]:
!pip install bravado



In [1]:
import pandas as pd
import numpy as np
from bravado.client import SwaggerClient

In [2]:
# Build the cBioPortal API client from its published Swagger document;
# request, response and spec validation are all switched off.
cbioportal = SwaggerClient.from_url(
    'https://www.cbioportal.org/api/v2/api-docs',
    config=dict(
        validate_requests=False,
        validate_responses=False,
        validate_swagger_spec=False,
    ),
)

In [16]:
# Fields kept from the flattened cBioPortal mutation records, in output order.
_MAF_SOURCE_COLUMNS = [
    'chr',
    'endPosition',
    'mutationType',
    'proteinChange',
    'referenceAllele',
    'sampleId',
    'startPosition',
    'studyId',
    'tumorAltCount',
    'tumorRefCount',
    'variantAllele',
    'variantType',
    'hugoGeneSymbol',
]

# cBioPortal field name -> column name in the returned table.
# NOTE(review): 'chr', 'tumorAltCount' and 'tumorRefCount' are deliberately left
# unrenamed to keep the existing output schema; confirm whether maftools needs
# them renamed (e.g. 'Chromosome') before relying on this table there.
_MAF_COLUMN_NAMES = {
    'hugoGeneSymbol': 'Hugo_Symbol',
    'sampleId': 'Tumor_Sample_Barcode',
    'proteinChange': 'Protein_Change',
    'mutationType': 'Variant_Classification',
    'variantType': 'Variant_Type',
    'startPosition': 'Start_Position',
    'endPosition': 'End_Position',
    'referenceAllele': 'Reference_Allele',
    'variantAllele': 'Tumor_Seq_Allele2',
    'studyId': 'Study_ID',
}


def _mutations_to_frame(mutations):
    """Flatten mutation records, merged with their nested ``gene``, into a DataFrame."""
    records = [
        dict(
            {k: getattr(m, k) for k in dir(m)},
            # Gene attributes are merged last, so they win on a name clash.
            **{k: getattr(m.gene, k) for k in dir(m.gene)})
        for m in mutations
    ]
    return pd.DataFrame(records)


def mutation_to_maf(path, client=None):
    """Download the mutations of every study listed in a TSV and build a MAF-style table.

    Parameters
    ----------
    path : str, path object or file-like
        Tab-separated file (anything ``pandas.read_csv`` accepts) with a
        ``'Study ID'`` column. Each distinct, non-missing study id is queried
        with molecular profile ``'<study>_mutations'`` and sample list
        ``'<study>_all'``.
    client : object, optional
        API client exposing
        ``Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(...).result()``.
        Defaults to the module-level ``cbioportal`` client.

    Returns
    -------
    pandas.DataFrame
        De-duplicated mutations with the columns of ``_MAF_SOURCE_COLUMNS``
        renamed through ``_MAF_COLUMN_NAMES``. Row labels are left as produced
        by the concatenation (they restart for each study), so they are not
        unique. When no study returns any mutation, an empty frame with the
        same columns is returned.

    Raises
    ------
    KeyError
        If the input file has no ``'Study ID'`` column, or the downloaded
        records lack one of the expected fields.
    """
    api = cbioportal if client is None else client

    combined = pd.read_csv(path, sep="\t", dtype='object')
    # Missing study ids would otherwise turn into bogus 'nan_mutations' requests.
    studies = combined['Study ID'].dropna().unique()

    # Fetch each study's mutations and convert them to a DataFrame right away.
    frames = []
    for study in studies:
        mutations = api.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
            molecularProfileId='{}_mutations'.format(study),
            sampleListId='{}_all'.format(study),
            projection='DETAILED').result()
        # A study without mutations would yield a column-less frame; skip it.
        if mutations:
            frames.append(_mutations_to_frame(mutations))

    if not frames:
        return pd.DataFrame(columns=_MAF_SOURCE_COLUMNS).rename(columns=_MAF_COLUMN_NAMES)

    study_mutations = pd.concat(frames, axis=0)  # stack the per-study frames

    # Keep only the selected fields, drop exact duplicate rows, then rename.
    maf = study_mutations[_MAF_SOURCE_COLUMNS].drop_duplicates()
    return maf.rename(columns=_MAF_COLUMN_NAMES)

In [3]:
# Gene table (GENE plus CNV/DEL/DUP columns) read from the leukemia gene lists.
list_genes = pd.read_csv(filepath_or_buffer="genes_leukemia/BALL.csv")
list_genes

Unnamed: 0,GENE,CNV,DEL,DUP
0,ASXL1,0,5,3
1,BRAF,1,6,5
2,CALR,0,3,1
3,CBL,0,6,5
4,CDKN2A,1,66,13
5,CDKN2B,1,57,10
6,ETV6,0,29,3
7,EZH2,0,7,5
8,FBXW7,0,4,23
9,GATA1,0,5,34


In [17]:
# Build the MAF-style mutation table for every study listed in the combined TSV.
leukemia = mutation_to_maf(path="combined_leukemia_studies.tsv")

  studie_mutations = pd.concat(df_list, axis=0) #concatenando os dataframes


Unnamed: 0,chr,End_Position,Variant_Classification,Protein_Change,Reference_Allele,Tumor_Sample_Barcode,Start_Position,Study_ID,tumorAltCount,tumorRefCount,Tumor_Seq_Allele2,Variant_Type,Hugo_Symbol
0,4,106156043,Frame_Shift_Del,Q317Rfs*30,C,aml_ohsu_2018_09-00705,106156043,aml_ohsu_2018,68.0,151.0,-,DEL,TET2
1,4,106190830,Frame_Shift_Del,V1371Sfs*77,G,aml_ohsu_2018_09-00705,106190830,aml_ohsu_2018,28.0,74.0,-,DEL,TET2
2,5,170837544,Frame_Shift_Ins,W288Cfs*12,-,aml_ohsu_2018_09-00705,170837543,aml_ohsu_2018,17.0,59.0,TCTG,INS,NPM1
3,2,25457242,Missense_Mutation,R882H,C,aml_ohsu_2018_10-00136,25457242,aml_ohsu_2018,11.0,28.0,T,SNP,DNMT3A
4,11,32456652,Frame_Shift_Del,A82Pfs*15,C,aml_ohsu_2018_10-00136,32456652,aml_ohsu_2018,18.0,51.0,-,DEL,WT1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1835,11,1272152,Missense_Mutation,T4681I,C,TCGA-AB-2988-03,1272152,laml_tcga_pub,1.0,0.0,T,SNP,MUC5B
1836,7,75051473,Missense_Mutation,G688R,C,TCGA-AB-2803-03,75051473,laml_tcga_pub,1.0,9.0,G,SNP,POM121C
1837,16,33629937,Missense_Mutation,H54Q,C,TCGA-AB-2980-03,33629937,laml_tcga_pub,110.0,557.0,G,SNP,IGHV3OR16-13
1838,7,148964299,Missense_Mutation,R220K,G,TCGA-AB-2868-03,148964299,laml_tcga_pub,103.0,248.0,A,SNP,ZNF783


In [33]:
# Keep only the mutations whose gene symbol appears in the loaded gene table,
# then list which of those genes are actually present.
subset_leuk = leukemia.loc[leukemia['Hugo_Symbol'].isin(list_genes['GENE'].tolist())]
subset_leuk['Hugo_Symbol'].unique()

array(['NPM1', 'WT1', 'SRSF2', 'IDH1', 'PHF6', 'JAK2', 'ZRSR2', 'CDKN2A',
       'PTEN', 'GATA1'], dtype=object)

In [34]:
# Write the filtered mutations as a tab-separated MAF file. The row index is
# omitted: it carries no information and would otherwise be written as an
# unnamed leading column ahead of the MAF fields.
# NOTE(review): the gene list was read from "genes_leukemia/BALL.csv" while the
# output is named "TALL_maf.txt" — confirm the file name matches the gene list.
subset_leuk.to_csv("TALL_maf.txt", sep='\t', index=False)