In [1]:
import requests

import pandas as pd
import numpy as np


In [2]:
def get_data_from_cbio_portal(query):
    '''
    SEND ONE QUERY TO THE CBIOPORTAL WEB SERVICE
    PARAMETERS:
        query: the query-string part of the request (everything after the "?"),
            for example "cmd=getTypesOfCancer". It is appended as given; no
            URL-encoding is applied here.
    RETURN PANDAS DATAFRAME:
        the response as parsed by pd.read_table, decoded as iso-8859-1.
    '''
    endpoint = 'http://www.cbioportal.org/webservice.do'
    request_url = '?'.join((endpoint, query))
    return pd.read_table(request_url, encoding='iso-8859-1')

def modify_array_to_string(value):
    '''
    TURN A LIST INTO THE "+"-SEPARATED FORM USED IN QUERY STRINGS
    PARAMETERS:
        value: any object; it is converted with str() first, so a list such as
            ["TP53", "EGFR"] becomes "TP53+EGFR".
    RETURN STRING:
        str(value) with surrounding whitespace stripped, square brackets,
        single quotes and spaces removed, and every comma replaced by "+".
        The substitutions are purely textual, so these characters are also
        altered when they occur inside an element.
    '''
    # One translation table covers all five single-character substitutions.
    substitutions = str.maketrans({'[': None, ']': None, ',': '+', '\'': None, ' ': None})
    return str(value).strip().translate(substitutions)

In [3]:
def get_all_types_of_cancer():
    '''
    LIST EVERY TYPE OF CANCER KNOWN TO THE PORTAL
    Takes no arguments; issues the "getTypesOfCancer" command.
    RETURN PANDAS DATAFRAME:
        type_of_cancer_id: a unique text identifier used to identify the type of cancer.
            For example, "gbm" identifies Glioblastoma multiforme.
        name: short name of the type of cancer.
    '''
    return get_data_from_cbio_portal(query='cmd=getTypesOfCancer')

def get_all_cancer_studies():
    '''
    LIST EVERY CANCER STUDY STORED ON THE PORTAL
    Takes no arguments; issues the "getCancerStudies" command.
    RETURN PANDAS DATAFRAME:
        cancer_study_id: a unique ID that should be used to identify the cancer study in subsequent interface calls.
        name: short name of the cancer study.
        description: short description of the cancer study.
    '''
    return get_data_from_cbio_portal(query='cmd=getCancerStudies')

def get_all_genetic_profiles_for_specific_cancer_study(cancer_id):
    '''
    GET ALL GENETIC PROFILES FOR A SPECIFIC CANCER STUDY
    PARAMETERS:
        cancer_id: cancer study ID (required); sent to the web service as cancer_study_id.
            Either a single ID or a list/tuple of IDs. For a list/tuple one request is made
            per ID and the results are stacked into one frame with a fresh 0..n-1 index.
    RETURN PANDAS DATAFRAME:
        genetic_profile_id: a unique ID used to identify the genetic profile ID in subsequent interface calls. This is a human readable ID. For example, "gbm_mutations" identifies the TCGA GBM mutation genetic profile.
        genetic_profile_name: short profile name.
        genetic_profile_description: short profile description.
        cancer_study_id: cancer study ID tied to this genetic profile. Will match the input cancer_study_id.
        genetic_alteration_type: indicates the profile type. Will be one of:
            MUTATION
            MUTATION_EXTENDED
            COPY_NUMBER_ALTERATION
            MRNA_EXPRESSION
            METHYLATION
        show_profile_in_analysis_tab: a boolean flag used for internal purposes (you can safely ignore it).
    RAISES:
        ValueError (from pd.concat) when an empty list/tuple is given.
    '''
    base_query = 'cmd=getGeneticProfiles&cancer_study_id='

    # isinstance rather than `type(...) is list`: a tuple or list subclass would
    # otherwise be stringified whole into a single malformed request.
    if isinstance(cancer_id, (list, tuple)):
        frames = [get_data_from_cbio_portal(query=base_query + str(study)) for study in cancer_id]
        return pd.concat(frames, ignore_index=True)

    return get_data_from_cbio_portal(query=base_query + str(cancer_id))

def get_all_case_list_for_specific_cancer_study(cancer_study_id):
    '''
    GET ALL CASE LISTS FOR A SPECIFIC CANCER STUDY
    PARAMETERS:
        cancer_study_id: cancer study ID (required). Either a single ID or a list/tuple
            of IDs. For a list/tuple one request is made per ID and the results are
            stacked into one frame with a fresh 0..n-1 index.
    RETURN PANDAS DATAFRAME:
        case_list_id: a unique ID used to identify the case list ID in subsequent interface calls. This is a human readable ID. For example, "gbm_all" identifies all cases profiles in the TCGA GBM study.
        case_list_name: short name for the case list.
        case_list_description: short description of the case list.
        cancer_study_id: cancer study ID tied to this genetic profile. Will match the input cancer_study_id.
        case_ids: space delimited list of all case IDs that make up this case list.
    RAISES:
        ValueError (from pd.concat) when an empty list/tuple is given.
    '''
    base_query = 'cmd=getCaseLists&cancer_study_id='

    # Branch on the argument itself. The previous code tested a name `cancer_id`
    # that is not a parameter, so it either raised NameError or silently used
    # whatever global of that name happened to exist in the kernel.
    if isinstance(cancer_study_id, (list, tuple)):
        frames = [get_data_from_cbio_portal(query=base_query + str(study)) for study in cancer_study_id]
        return pd.concat(frames, ignore_index=True)

    return get_data_from_cbio_portal(query=base_query + str(cancer_study_id))


def get_clinical_data(case_set_id):
    '''
    GET CLINICAL DATA
    PARAMETERS:
        case_set_id: case set ID (required). Either a single ID or a list/tuple of IDs.
            For a list/tuple one request is made per ID and the results are stacked
            into one frame with a fresh 0..n-1 index.
    RETURN PANDAS DATAFRAME:
        The columns present depend on the case set queried; the list below is indicative.
        case_id: Unique Case Identifier.
        overall_survival_months: Overall survival, in months.
        overall_survival_status: Overall survival status, usually indicated as "LIVING" or "DECEASED".
        disease_free_survival_months: Disease free survival, in months.
        disease_free_survival_status: Disease free survival status, usually indicated as "DiseaseFree" or "Recurred/Progressed".
        age_at_diagnosis: Age at diagnosis.
    RAISES:
        ValueError (from pd.concat) when an empty list/tuple is given.
    '''
    base_query = 'cmd=getClinicalData&case_set_id='

    # isinstance rather than `type(...) is list`: a tuple or list subclass would
    # otherwise be stringified whole into a single malformed request.
    if isinstance(case_set_id, (list, tuple)):
        frames = [get_data_from_cbio_portal(query=base_query + str(case_set)) for case_set in case_set_id]
        return pd.concat(frames, ignore_index=True)

    return get_data_from_cbio_portal(query=base_query + str(case_set_id))

def get_extended_mutation_data(genetic_profile_id, case_set_id, gene_list):
    '''
    GET EXTENDED MUTATION DATA
    PARAMETERS:
        genetic_profile_id: one or more mutation profile IDs (required). A list/tuple is joined
            with "+" (a URL encoded space); a string is sent as given, so multiple IDs inside
            one string must already be separated by "," or "+".
        case_set_id: case set ID (optional). Pass None to leave it out of the request, in which
            case all cases that have data in the specified mutation profiles will be queried.
            A list/tuple triggers one request per case set, with the results stacked into one
            frame with a fresh 0..n-1 index.
        gene_list: one or more genes, specified as HUGO Gene Symbols or Entrez Gene IDs (required).
            A list/tuple is joined with "+"; a single value is converted with str().
    RETURN PANDAS DATAFRAME:
        entrez_gene_id: Entrez Gene ID.
        gene_symbol: HUGO Gene Symbol.
        case_id: Case ID.
        sequencing_center: Sequencer Center responsible for identifying this mutation. For example: broad.mit.edu.
        mutation_status: somatic or germline mutation status. all mutations returned will be of type somatic.
        mutation_type: mutation type, such as nonsense, missense, or frameshift_ins.
        validation_status: validation status. Usually valid, invalid, or unknown.
        amino_acid_change: amino acid change resulting from the mutation.
        functional_impact_score: predicted functional impact score, as predicted by: Mutation Assessor.
        xvar_link: Link to the Mutation Assessor web site.
        xvar_link_pdb: Link to the Protein Data Bank (PDB) View within Mutation Assessor web site.
        xvar_link_msa: Link the Multiple Sequence Alignment (MSA) view within the Mutation Assessor web site.
        chr: chromosome where mutation occurs.
        start_position: start position of mutation.
        end_position: end position of mutation.
        genetic_profile_id: mutation profile id.
    RAISES:
        ValueError (from pd.concat) when case_set_id is an empty list/tuple.
    '''
    # list(...) first: the join helper works on the text of str(value) and only
    # removes square brackets, so a tuple's parentheses would leak into the query.
    if isinstance(genetic_profile_id, (list, tuple)):
        genetic_profile_id = modify_array_to_string(list(genetic_profile_id))

    if isinstance(gene_list, (list, tuple)):
        gene_list = modify_array_to_string(list(gene_list))

    # str() lets a bare integer (e.g. a single Entrez Gene ID) be concatenated
    # instead of raising TypeError.
    shared_tail = '&genetic_profile_id=' + str(genetic_profile_id) + '&gene_list=' + str(gene_list)

    def build_query(case_set):
        # None means "no case set filter": omit the parameter rather than
        # sending the literal text "None".
        if case_set is None:
            return 'cmd=getMutationData' + shared_tail
        return 'cmd=getMutationData&case_set_id=' + str(case_set) + shared_tail

    if isinstance(case_set_id, (list, tuple)):
        frames = [get_data_from_cbio_portal(query=build_query(case_set)) for case_set in case_set_id]
        return pd.concat(frames, ignore_index=True)

    return get_data_from_cbio_portal(query=build_query(case_set_id))
    
    
def get_profile_data(case_set_id, genetic_profile_id, gene_list):
    '''
    GET PROFILE DATA
    Issues the "getProfileData" command.
    PARAMETERS:
        case_set_id: case set ID (required). A list/tuple triggers one request per case set,
            with the results stacked into one frame with a fresh 0..n-1 index.
        genetic_profile_id: one or more genetic profile IDs (required). A list/tuple is joined
            with "+" (a URL encoded space); a string is sent as given, so multiple IDs inside
            one string must already be separated by "," or "+".
        gene_list: one or more genes, specified as HUGO Gene Symbols or Entrez Gene IDs (required).
            A list/tuple is joined with "+"; a single value is converted with str().
    RETURN PANDAS DATAFRAME:
        Response Format 1
        When requesting one or multiple genes and a single genetic profile ID (see above), you will receive a tab-delimited matrix with the following columns:

        GENE_ID: Entrez Gene ID
        COMMON: HUGO Gene Symbol
        Columns 3 - N: Data for each case

        Response Format 2
        When requesting a single gene and multiple genetic profile IDs (see above), you will receive a tab-delimited matrix with the following columns:

        GENETIC_PROFILE_ID: The Genetic Profile ID.
        ALTERATION_TYPE: The Genetic Alteration Type, e.g. MUTATION, MUTATION_EXTENDED, COPY_NUMBER_ALTERATION, or MRNA_EXPRESSION.
        GENE_ID: Entrez Gene ID.
        COMMON: HUGO Gene Symbol.
        Columns 5 - N: Data for each case.

        NOTE(review): per-case data arrive as columns, so stacking several case sets
        row-wise presumably leaves NaN where a case is absent from a case set - confirm
        this is the intended shape.
    RAISES:
        ValueError (from pd.concat) when case_set_id is an empty list/tuple.
    '''
    # list(...) first: the join helper works on the text of str(value) and only
    # removes square brackets, so a tuple's parentheses would leak into the query.
    if isinstance(genetic_profile_id, (list, tuple)):
        genetic_profile_id = modify_array_to_string(list(genetic_profile_id))

    if isinstance(gene_list, (list, tuple)):
        gene_list = modify_array_to_string(list(gene_list))

    # str() lets a bare integer (e.g. a single Entrez Gene ID) be concatenated
    # instead of raising TypeError.
    shared_tail = '&genetic_profile_id=' + str(genetic_profile_id) + '&gene_list=' + str(gene_list)

    def build_query(case_set):
        return 'cmd=getProfileData&case_set_id=' + str(case_set) + shared_tail

    if isinstance(case_set_id, (list, tuple)):
        frames = [get_data_from_cbio_portal(query=build_query(case_set)) for case_set in case_set_id]
        return pd.concat(frames, ignore_index=True)

    return get_data_from_cbio_portal(query=build_query(case_set_id))
 

In [4]:
# Clinical data for two case sets in one call: passing a list makes one request
# per case set and stacks the results into a single frame.
get_clinical_data(case_set_id=['gbm_tcga_all', 'all_stjude_2016_sequenced'])

Unnamed: 0,AGE,AGE_CLASS,CANCER_TYPE,CANCER_TYPE_DETAILED,CASE_ID,CYTOGENETICS,DAYS_TO_BIRTH,DAYS_TO_COLLECTION,DAYS_TO_DEATH,DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS,...,SAMPLE_TYPE_ID,SEX,SHORTEST_DIMENSION,SPECIMEN_SECOND_LONGEST_DIMENSION,TISSUE_SOURCE_SITE,TREATMENT_OUTCOME_FIRST_COURSE,TUMOR_STATUS,TUMOR_TISSUE_SITE,VIAL_NUMBER,VITAL_STATUS
0,47.00,,Glioma,Glioblastoma Multiforme,TCGA-32-4213-01,,-17237.0,,,0.0,...,1.0,Female,0.2,0.7,32,,WITH TUMOR,Brain,A,Alive
1,60.00,,Glioma,Glioblastoma Multiforme,TCGA-14-1452-01,,-22106.0,,216.0,0.0,...,1.0,Male,0.2,0.8,14,,WITH TUMOR,Brain,A,Dead
2,63.00,,Glioma,Glioblastoma Multiforme,TCGA-08-0509-01,,-23273.0,,382.0,0.0,...,1.0,Male,0.1,0.9,8,,WITH TUMOR,Brain,A,Dead
3,52.00,,Glioma,Glioblastoma Multiforme,TCGA-14-1395-01,,-19237.0,,42.0,0.0,...,1.0,Male,0.4,0.6,14,,WITH TUMOR,Brain,B,Dead
4,59.00,,Glioma,Glioblastoma Multiforme,TCGA-27-1838-01,,-21706.0,,350.0,0.0,...,1.0,Female,0.3,0.5,27,,WITH TUMOR,Brain,A,Dead
5,21.00,,Glioma,Glioblastoma Multiforme,TCGA-15-1444-01,,-7827.0,,1537.0,0.0,...,1.0,Male,0.5,1.1,15,Complete Remission/Response,WITH TUMOR,Brain,A,Dead
6,85.00,,Glioma,Glioblastoma Multiforme,TCGA-76-4928-01,,-31267.0,,94.0,0.0,...,1.0,Female,0.3,1.1,76,,WITH TUMOR,Brain,B,Dead
7,26.00,,Glioma,Glioblastoma Multiforme,TCGA-02-0271-01,,-9578.0,,440.0,0.0,...,1.0,Male,0.8,0.8,2,,,Brain,A,Dead
8,71.00,,Glioma,Glioblastoma Multiforme,TCGA-28-5207-01,,-25975.0,,343.0,0.0,...,1.0,Male,0.3,0.8,28,,WITH TUMOR,Brain,A,Dead
9,45.00,,Glioma,Glioblastoma Multiforme,TCGA-28-2514-01,,-16737.0,,,0.0,...,1.0,Male,0.3,1.0,28,,,Brain,A,Alive


In [5]:
# List the cancer types the portal knows about (id + short name).
get_all_types_of_cancer()

Unnamed: 0,type_of_cancer_id,name
0,aa,Aggressive Angiomyxoma
1,aastr,Anaplastic Astrocytoma
2,abc,Activated B-cell Type
3,abl,Acute Basophilic Leukemia
4,aca,Adrenocortical Adenoma
5,acbc,Adenoid Cystic Breast Cancer
6,acc,Adrenocortical Carcinoma
7,accc,Acinic Cell Carcinoma
8,acml,"Atypical Chronic Myeloid Leukemia, BCR-ABL1-"
9,acn,"Acinar Cell Carcinoma, NOS"


In [6]:
# List the available studies; the cancer_study_id column supplies the IDs used in later calls.
get_all_cancer_studies()

Unnamed: 0,cancer_study_id,name,description
0,paac_jhu_2014,Acinar Cell Carcinoma of the Pancreas (Johns H...,Mutation data from whole exome sequencing of 2...
1,all_stjude_2016,"Acute Lymphoblastic Leukemia (St Jude, Nat Gen...",Whole-genome and/or whole-exome sequencing was...
2,laml_tcga_pub,"Acute Myeloid Leukemia (TCGA, NEJM 2013)","TCGA Acute Myeloid Leukemia, analysis of 200 a..."
3,laml_tcga_pan_can_atlas_2018,"Acute Myeloid Leukemia (TCGA, PanCancer Atlas)",TCGA PanCanAtlas genomic data from 11k cases a...
4,laml_tcga,"Acute Myeloid Leukemia (TCGA, Provisional)",TCGA Acute Myeloid Leukemia; raw data at the <...
5,acyc_fmi_2014,"Adenoid Cystic Carcinoma (FMI, Am J Surg Pathl...",Comprehensive genomic profiling of 28 metastat...
6,acyc_mda_2015,"Adenoid Cystic Carcinoma (MDA, Clin Cancer Res...",WGS of 21 salivary ACCs and targeted molecular...
7,acyc_mskcc_2013,"Adenoid Cystic Carcinoma (MSKCC, Nat Genet 2013)",Exome profiling of 60 adenoid cystic carcinoma...
8,acyc_sanger_2013,"Adenoid Cystic Carcinoma (Sanger/MDA, JCI 2013)",Whole exome sequencing of 24 ACCs.
9,acbc_mskcc_2015,"Adenoid Cystic Carcinoma of the Breast (MSKCC,...",Whole exome sequencing of 12 breast AdCCs.


In [7]:
# Study IDs passed to the per-study lookups in the following cells.
cancer_id=['all_stjude_2016', 'paac_jhu_2014']
# NOTE(review): case_id is not used by any cell visible in this part of the notebook.
case_id=['SJBALL020595_D1', 'SJERG005_D']

In [8]:
# Case lists for each study ID in cancer_id.
get_all_case_list_for_specific_cancer_study(cancer_id)

Unnamed: 0,case_list_id,case_list_name,case_list_description,cancer_study_id,case_ids
0,all_stjude_2016_all,All Tumors,All tumor samples (73 samples),920,SJBALL020595_D1 SJERG005_D SJERG023_D_WES SJER...
1,all_stjude_2016_sequenced,Sequenced Tumors,All (Next-Gen) sequenced samples (73 samples),920,SJBALL020595_D1 SJERG005_D SJERG023_D_WES SJER...
2,paac_jhu_2014_all,All Tumors,All tumor samples (23 samples),815,ACINAR01 ACINAR02 ACINAR03 ACINAR04 ACINAR05 A...
3,paac_jhu_2014_sequenced,Sequenced Tumors,All (Next-Gen) sequenced samples (23 samples),815,ACINAR01 ACINAR02 ACINAR03 ACINAR04 ACINAR05 A...


In [9]:
# Genetic profiles for each study ID in cancer_id (one request per study, results stacked).
get_all_genetic_profiles_for_specific_cancer_study(cancer_id)

Unnamed: 0,genetic_profile_id,genetic_profile_name,genetic_profile_description,cancer_study_id,genetic_alteration_type,show_profile_in_analysis_tab
0,all_stjude_2016_mutations,Mutations,Mutation data from WGS/WES.,920,MUTATION_EXTENDED,True
1,paac_jhu_2014_mutations,Mutations,Mutation data from whole exome sequencing of 2...,815,MUTATION_EXTENDED,True
