In [403]:
import os
import re

import numpy as np
import pandas as pd
# Show every row when a DataFrame/Series is displayed (no truncation).
pd.options.display.max_rows = None

In [254]:
# Load the clinical workbook; its first column is used as the row index.
filepath_new = 'mmc1-tony.xlsx'
data_new = pd.read_excel(
    filepath_new,
    index_col=0,
)

In [560]:
# Breast         - 'BRCA'
# Head-neck      - 'HNSC'
# Melanoma(skin) - 'SKCM'
# Stomach        - 'STAD'
# Bladder        - 'BLCA'
# Sarcoma        - 'SARC'
# Pancreatic     - 'PAAD'

def create_dataset(cancer_type, new_tumor_event_site, data_new, data_dir=''):
    """Join the expression samples of one cancer type with matching clinical rows.

    Reads ``<data_dir>/<cancer_type>/TCGA-<cancer_type>.htseq_fpkm.tsv``,
    transposes it so that every row is one sample, derives a patient barcode
    from each sample id, and inner-joins the samples with the rows of
    ``data_new`` whose ``type`` equals ``cancer_type`` and whose
    ``new_tumor_event_site`` equals ``new_tumor_event_site``.

    Parameters
    ----------
    cancer_type : str
        Value compared against ``data_new['type']``; also used to build the
        expression file path.
    new_tumor_event_site : str
        Value compared against ``data_new['new_tumor_event_site']``.
    data_new : pandas.DataFrame
        Clinical table. Must contain the columns ``'type'``,
        ``'bcr_patient_barcode'`` and ``'new_tumor_event_site'``.
    data_dir : str, optional
        Directory that contains the per-cancer-type sub-directories. The
        default (``''``) resolves the path relative to the current working
        directory.

    Returns
    -------
    pandas.DataFrame
        One row per matching sample, indexed by ``sample_id``. Columns are the
        ``Ensembl_ID`` values of the TSV followed by ``'sample_id'``,
        ``'bcr_patient_barcode'`` and ``'new_tumor_event_site'``. The frame is
        empty when nothing matches.

    Raises
    ------
    FileNotFoundError
        If the expression TSV does not exist.
    KeyError
        If a required column is missing from ``data_new`` or the TSV.
    """
    filepath_ge = os.path.join(
        data_dir, str(cancer_type), 'TCGA-' + str(cancer_type) + '.htseq_fpkm.tsv'
    )

    # The TSV has one row per 'Ensembl_ID' value and one column per sample;
    # transpose so that samples become rows.
    data_ge = (
        pd.read_csv(filepath_ge, delimiter='\t')
        .rename(columns={'Ensembl_ID': 'Sample'})
        .set_index('Sample')
        .T
    )
    data_ge['sample_id'] = data_ge.index
    # Dropping the last four characters of the sample id yields the join key.
    # NOTE(review): assumes every sample id ends in a 4-character sample
    # suffix (e.g. '-01A') appended to the patient barcode -- confirm.
    data_ge['bcr_patient_barcode'] = [sample_id[:-4] for sample_id in data_ge.index]

    # Select the clinical rows of interest *before* joining: this yields the
    # same rows as joining first and filtering afterwards, without merging the
    # wide expression matrix against patients that would be discarded anyway.
    wanted = (data_new['type'] == cancer_type) & (
        data_new['new_tumor_event_site'] == new_tumor_event_site
    )
    data_clinical_subset = data_new.loc[
        wanted, ['bcr_patient_barcode', 'new_tumor_event_site']
    ]

    # Inner join keeps only samples whose patient barcode has a clinical row.
    # The merge result is a fresh frame, so callers can modify it safely.
    data_final = pd.merge(
        data_ge, data_clinical_subset, on='bcr_patient_barcode', how='inner'
    )
    data_final.index = data_final['sample_id']

    return data_final

In [574]:
# BRCA (breast cancer) samples whose new tumor event site is 'Bone'
data_BRCA_bone = create_dataset(
    cancer_type='BRCA',
    new_tumor_event_site='Bone',
    data_new=data_new,
)

In [578]:
# HNSC (head-neck squamous cell carcinoma) samples whose new tumor event site is 'Oral Cavity'
data_HNSC_oral = create_dataset(
    cancer_type='HNSC',
    new_tumor_event_site='Oral Cavity',
    data_new=data_new,
)

In [586]:
# SKCM (melanoma, skin cancer) samples whose new tumor event site is 'Lung'
data_SKCM_lung = create_dataset(
    cancer_type='SKCM',
    new_tumor_event_site='Lung',
    data_new=data_new,
)

In [587]:
# STAD (stomach cancer) samples whose new tumor event site is 'Liver'
data_STAD_liver = create_dataset(
    cancer_type='STAD',
    new_tumor_event_site='Liver',
    data_new=data_new,
)

In [None]:
# BLCA (bladder cancer) samples whose new tumor event site is 'Lung'
data_BLCA_lung = create_dataset(
    cancer_type='BLCA',
    new_tumor_event_site='Lung',
    data_new=data_new,
)

In [570]:
# SARC (sarcoma) samples whose new tumor event site is 'Lung'
data_SARC_lung = create_dataset(
    cancer_type='SARC',
    new_tumor_event_site='Lung',
    data_new=data_new,
)

In [572]:
# PAAD (pancreatic cancer) samples whose new tumor event site is 'Liver'
data_PAAD_liver = create_dataset(
    cancer_type='PAAD',
    new_tumor_event_site='Liver',
    data_new=data_new,
)

In [612]:
# Get counts of new tumor event sites for each cancer type

# for cancer_type in data_new['type'].unique():
#     data_test = data_new[data_new['type'] == cancer_type]
#     print(cancer_type)
#     print(data_test['new_tumor_event_site'].value_counts())
#     print('=========================')

In [143]:
# # Phenotype data

# # Breast Cancer
# filepath_breast_ph = 'Breast/TCGA-BRCA.GDC_phenotype.tsv'
# data_breast_ph = pd.read_csv(filepath_breast_ph, delimiter = '\t')

# # Bladder Cancer
# filepath_bladder_ph = 'Bladder/TCGA-BLCA.GDC_phenotype.tsv'
# data_bladder_ph = pd.read_csv(filepath_bladder_ph,delimiter = '\t')

# # Cervical Cancer
# filepath_cervical_ph = 'Cervical/TCGA-CESC.GDC_phenotype.tsv'
# data_cervical_ph = pd.read_csv(filepath_cervical_ph,delimiter = '\t')

# # Colon Cancer
# filepath_colon_ph = 'Colon/TCGA-COAD.GDC_phenotype.tsv'
# data_colon_ph = pd.read_csv(filepath_colon_ph,delimiter = '\t')

# # Melanoma Cancer
# filepath_melanoma_ph = 'Melanoma/TCGA-SKCM.GDC_phenotype.tsv'
# data_melanoma_ph = pd.read_csv(filepath_melanoma_ph,delimiter = '\t')

# # Prostate Cancer
# filepath_prostate_ph = 'Prostate/TCGA-PRAD.GDC_phenotype.tsv'
# data_prostate_ph = pd.read_csv(filepath_prostate_ph,delimiter = '\t')

# # Rectal Cancer
# filepath_rectal_ph = 'Rectal/TCGA-READ.GDC_phenotype.tsv'
# data_rectal_ph = pd.read_csv(filepath_rectal_ph,delimiter = '\t')

In [None]:
# # Get counts of types of sample, either tumor, normal or metastatic
# data_ph['sample_type.samples'].value_counts()

# # Get counts of nulls
# data_ph['sample_type.samples'].isnull().sum()