In [2]:
import numpy as np
import pandas as pd

We get the subtypes for our patients using SNF algorithm. However, the algorithm can only splits the patients into groups with no information about which particular classes the groups belong to. Thus, here, we find the consensus between the true labels and our results. 

One route can be finding the differences between our grouping and the labels from TCGA. Our assumption here is that SNF tend to produce similar outputs as TCGA pipeline does.

Another route is that we ignore the true label, continue analysis with our grouping to see if our grouping make any sense.

### Evaluate SNF result comparing with TCGA subtype

In [230]:
group = pd.read_csv('/Users/yuanqizhao/Desktop/SNFtool/Result/3clusters/group.txt',delimiter= ' ',index_col=0)
group.shape

(347, 1)

In [12]:
group['x'].unique()

array([3, 1, 2])

In [232]:
TCGA_subtype = pd.read_csv('data/true_subtype.txt', header=0,index_col=0 , sep='\t')
TCGA_subtype[:3]

Unnamed: 0,GeneExp_Subtype
TCGA-02-0001-01,1
TCGA-02-0003-01,3
TCGA-02-0007-01,3


In [233]:
group.index = TCGA_subtype.index
group[:3]

Unnamed: 0,x
TCGA-02-0001-01,3
TCGA-02-0003-01,3
TCGA-02-0007-01,1


In [7]:
group.to_csv('/Users/yuanqizhao/Desktop/SNFtool/Result/SNF_subtype4.txt', header=False, index=True, sep='\t', mode='a')

In [8]:
TCGA_group = TCGA_subtype.groupby(['GeneExp_Subtype'])
# TCGA_group.groups

In [9]:
SNF_group = group.groupby(['x'])
# SNF_group.groups

##### Use largest common elements to reassign labels

In [10]:
best_group = []
for i in TCGA_group.__iter__():
    sub_tcga = set(list(i[1].index))
    inter_len=[]
    for j in SNF_group.__iter__():
        sub_group = set(list(j[1].index))
        inter = len(list(sub_tcga & sub_group))
#         print(j[0],': ', inter)
        inter_len.append(inter/len(sub_group))   ###percentage of match
    max_per = np.argmax(inter_len)
    best_group.append((max_per,inter_len[max_per]))
best_group

[(0, 0.3858267716535433),
 (0, 0.2204724409448819),
 (3, 0.6078431372549019),
 (2, 0.5802469135802469)]

We changed to three clusters.

https://www.nature.com/articles/nrclinonc.2017.122#:~:text=Glioblastoma%20(GBM)%20was%20previously%20classified,nonmalignant%20cells%20in%20the%20analysis

### 3 Clusters clinical differences

In [23]:
group.shape

(347, 1)

In [17]:
clinical = pd.read_csv('/Users/yuanqizhao/Desktop/CompMed_Project/data/GBM_clinicalMatrix', sep='\t',header=0,index_col=0)
clinical[:3]

Unnamed: 0_level_0,CDE_DxAge,CDE_alk_chemoradiation_standard,CDE_chemo_adjuvant_alk,CDE_chemo_adjuvant_tmz,CDE_chemo_alk,CDE_chemo_alk_days,CDE_chemo_alk_long,CDE_chemo_tmz,CDE_chemo_tmz_days,CDE_chemo_tmz_long,...,_GENOMIC_ID_TCGA_GBM_exp_u133a,_GENOMIC_ID_TCGA_GBM_exp_HiSeqV2_PANCAN,_GENOMIC_ID_TCGA_GBM_RPPA_RBN,_GENOMIC_ID_TCGA_GBM_gistic2thd,_GENOMIC_ID_TCGA_GBM_PDMRNAseqCNV,_GENOMIC_ID_TCGA_GBM_mutation,_GENOMIC_ID_TCGA_GBM_exp_HiSeqV2_percentile,_GENOMIC_ID_TCGA_GBM_exp_HiSeqV2_exon,_GENOMIC_ID_TCGA_GBM_gistic2,_GENOMIC_ID_TCGA_GBM_hMethyl450
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-02-0001-01,44.3,False,False,False,False,0.0,False,False,0.0,False,...,03c14838-bbab-4487-80c3-c5fdf6ec83f9,,,TCGA-02-0001-01C-01D-0182-01,,,,,TCGA-02-0001-01C-01D-0182-01,
TCGA-02-0002-01,,,,,,,,,,,...,,,,,,,,,,
TCGA-02-0003-01,50.21,False,False,False,False,0.0,False,False,0.0,False,...,4298ad72-e8eb-4749-95da-cce602107724,,TCGA-02-0003-01A-21-1898-20,TCGA-02-0003-01A-01D-0182-01,,TCGA-02-0003-01A-01D-1490-08,,,TCGA-02-0003-01A-01D-0182-01,


In [19]:
clinical.columns

Index(['CDE_DxAge', 'CDE_alk_chemoradiation_standard',
       'CDE_chemo_adjuvant_alk', 'CDE_chemo_adjuvant_tmz', 'CDE_chemo_alk',
       'CDE_chemo_alk_days', 'CDE_chemo_alk_long', 'CDE_chemo_tmz',
       'CDE_chemo_tmz_days', 'CDE_chemo_tmz_long',
       ...
       '_GENOMIC_ID_TCGA_GBM_exp_u133a',
       '_GENOMIC_ID_TCGA_GBM_exp_HiSeqV2_PANCAN',
       '_GENOMIC_ID_TCGA_GBM_RPPA_RBN', '_GENOMIC_ID_TCGA_GBM_gistic2thd',
       '_GENOMIC_ID_TCGA_GBM_PDMRNAseqCNV', '_GENOMIC_ID_TCGA_GBM_mutation',
       '_GENOMIC_ID_TCGA_GBM_exp_HiSeqV2_percentile',
       '_GENOMIC_ID_TCGA_GBM_exp_HiSeqV2_exon', '_GENOMIC_ID_TCGA_GBM_gistic2',
       '_GENOMIC_ID_TCGA_GBM_hMethyl450'],
      dtype='object', length=128)

### Append subtype labels

In [25]:
df = group.merge(clinical,how='inner',left_index=True, right_index=True)

In [26]:
df = df.rename(columns={'x':'subtype'})
df[:3]

Unnamed: 0,subtype,CDE_DxAge,CDE_alk_chemoradiation_standard,CDE_chemo_adjuvant_alk,CDE_chemo_adjuvant_tmz,CDE_chemo_alk,CDE_chemo_alk_days,CDE_chemo_alk_long,CDE_chemo_tmz,CDE_chemo_tmz_days,...,_GENOMIC_ID_TCGA_GBM_exp_u133a,_GENOMIC_ID_TCGA_GBM_exp_HiSeqV2_PANCAN,_GENOMIC_ID_TCGA_GBM_RPPA_RBN,_GENOMIC_ID_TCGA_GBM_gistic2thd,_GENOMIC_ID_TCGA_GBM_PDMRNAseqCNV,_GENOMIC_ID_TCGA_GBM_mutation,_GENOMIC_ID_TCGA_GBM_exp_HiSeqV2_percentile,_GENOMIC_ID_TCGA_GBM_exp_HiSeqV2_exon,_GENOMIC_ID_TCGA_GBM_gistic2,_GENOMIC_ID_TCGA_GBM_hMethyl450
TCGA-02-0001-01,3,44.3,False,False,False,False,0.0,False,False,0.0,...,03c14838-bbab-4487-80c3-c5fdf6ec83f9,,,TCGA-02-0001-01C-01D-0182-01,,,,,TCGA-02-0001-01C-01D-0182-01,
TCGA-02-0003-01,3,50.21,False,False,False,False,0.0,False,False,0.0,...,4298ad72-e8eb-4749-95da-cce602107724,,TCGA-02-0003-01A-21-1898-20,TCGA-02-0003-01A-01D-0182-01,,TCGA-02-0003-01A-01D-1490-08,,,TCGA-02-0003-01A-01D-0182-01,
TCGA-02-0007-01,1,40.53,True,True,True,True,306.0,True,True,306.0,...,b972e653-da0c-4db0-9769-e82cc8a408e0,,,TCGA-02-0007-01A-01D-0182-01,,,,,TCGA-02-0007-01A-01D-0182-01,


### Subtype Clinical Analysis

In [27]:
list(df.columns)

['subtype',
 'CDE_DxAge',
 'CDE_alk_chemoradiation_standard',
 'CDE_chemo_adjuvant_alk',
 'CDE_chemo_adjuvant_tmz',
 'CDE_chemo_alk',
 'CDE_chemo_alk_days',
 'CDE_chemo_alk_long',
 'CDE_chemo_tmz',
 'CDE_chemo_tmz_days',
 'CDE_chemo_tmz_long',
 'CDE_missing',
 'CDE_missingflag',
 'CDE_previously_treated',
 'CDE_radiation_adjuvant',
 'CDE_radiation_adjuvant_standard',
 'CDE_radiation_adjuvant_standard_probable',
 'CDE_radiation_any',
 'CDE_radiation_standard',
 'CDE_radiation_standard_probable',
 'CDE_sourcesite',
 'CDE_survival_time',
 'CDE_suspect',
 'CDE_therapy',
 'CDE_tmz_chemoradiation_standard',
 'CDE_vital_status',
 'G_CIMP_STATUS',
 'GeneExp_Subtype',
 'In_Cancer_Cell_Paper',
 '_INTEGRATION',
 '_PANCAN_CNA_PANCAN_K8',
 '_PANCAN_Cluster_Cluster_PANCAN',
 '_PANCAN_DNAMethyl_GBM',
 '_PANCAN_DNAMethyl_PANCAN',
 '_PANCAN_RPPA_PANCAN_K8',
 '_PANCAN_UNC_RNAseq_PANCAN_K16',
 '_PANCAN_mutation_PANCAN',
 '_PATIENT',
 '_cohort',
 '_primary_disease',
 '_primary_site',
 'additional_chemo_th

In [30]:
drop_ls = [name for name in df.columns if '_GENOMIC_ID' in name]
df = df.drop(columns = drop_ls)

In [33]:
df.shape

(347, 107)

In [34]:
drop_ls = [name for name in df.columns if 'id' in name]
df = df.drop(columns = drop_ls)
df.shape

(347, 104)

In [36]:
df.dtypes

subtype                                   int64
CDE_DxAge                               float64
CDE_alk_chemoradiation_standard          object
CDE_chemo_adjuvant_alk                   object
CDE_chemo_adjuvant_tmz                   object
                                         ...   
tissue_source_site                       object
tumor_tissue_site                        object
vial_number                              object
vital_status                             object
year_of_initial_pathologic_diagnosis    float64
Length: 104, dtype: object

In [37]:
continuous = [name for name in df.columns if df[name].dtype=='float64']
continuous

['CDE_DxAge',
 'CDE_chemo_alk_days',
 'CDE_chemo_tmz_days',
 'CDE_sourcesite',
 'CDE_survival_time',
 'age_at_initial_pathologic_diagnosis',
 'days_to_birth',
 'days_to_collection',
 'days_to_initial_pathologic_diagnosis',
 'days_to_last_followup',
 'days_to_new_tumor_event_additional_surgery_procedure',
 'days_to_tumor_recurrence',
 'eastern_cancer_oncology_group',
 'initial_weight',
 'intermediate_dimension',
 'karnofsky_performance_score',
 'longest_dimension',
 'shortest_dimension',
 'year_of_initial_pathologic_diagnosis']

In [43]:
continuous_df = df[continuous]
continuous_df.shape

(347, 19)

In [38]:
discrete = [name for name in df.columns if df[name].dtype=='object']
discrete

['CDE_alk_chemoradiation_standard',
 'CDE_chemo_adjuvant_alk',
 'CDE_chemo_adjuvant_tmz',
 'CDE_chemo_alk',
 'CDE_chemo_alk_long',
 'CDE_chemo_tmz',
 'CDE_chemo_tmz_long',
 'CDE_missing',
 'CDE_missingflag',
 'CDE_previously_treated',
 'CDE_radiation_adjuvant',
 'CDE_radiation_adjuvant_standard',
 'CDE_radiation_adjuvant_standard_probable',
 'CDE_radiation_any',
 'CDE_radiation_standard',
 'CDE_radiation_standard_probable',
 'CDE_suspect',
 'CDE_therapy',
 'CDE_tmz_chemoradiation_standard',
 'CDE_vital_status',
 'G_CIMP_STATUS',
 'GeneExp_Subtype',
 'In_Cancer_Cell_Paper',
 '_INTEGRATION',
 '_PANCAN_CNA_PANCAN_K8',
 '_PANCAN_Cluster_Cluster_PANCAN',
 '_PANCAN_DNAMethyl_GBM',
 '_PANCAN_DNAMethyl_PANCAN',
 '_PANCAN_RPPA_PANCAN_K8',
 '_PANCAN_UNC_RNAseq_PANCAN_K16',
 '_PANCAN_mutation_PANCAN',
 '_PATIENT',
 '_cohort',
 '_primary_disease',
 '_primary_site',
 'additional_chemo_therapy',
 'additional_drug_therapy',
 'additional_immuno_therapy',
 'additional_pharmaceutical_therapy',
 'additio

In [44]:
discrete_df = df[discrete]
discrete_df.shape

(347, 84)

In [46]:
discrete_df['CDE_chemo_tmz']

TCGA-02-0001-01    False
TCGA-02-0003-01    False
TCGA-02-0007-01     True
TCGA-02-0009-01    False
TCGA-02-0010-01     True
                   ...  
TCGA-76-6193-01      NaN
TCGA-76-6282-01      NaN
TCGA-76-6285-01      NaN
TCGA-81-5910-01    False
TCGA-87-5896-01    False
Name: CDE_chemo_tmz, Length: 347, dtype: object

#### Differential analysis of subtypes on continuous variables

In [47]:
!pip install bioinfokit

Collecting bioinfokit
[?25l  Downloading https://files.pythonhosted.org/packages/f5/48/d117e32009dfdfbac82d9e17bdcea730b81ba6e137b29f14d59811a7488e/bioinfokit-1.0.4.tar.gz (76kB)
[K     |████████████████████████████████| 81kB 5.3MB/s eta 0:00:011
Collecting matplotlib_venn (from bioinfokit)
  Downloading https://files.pythonhosted.org/packages/ca/e8/53441d7feb29ab10de3bd46c05358c41f0ba2f57395e88ffbb62d6b69df3/matplotlib-venn-0.11.6.tar.gz
Collecting tabulate (from bioinfokit)
  Downloading https://files.pythonhosted.org/packages/c4/f4/770ae9385990f5a19a91431163d262182d3203662ea2b5739d0fcfc080f1/tabulate-0.8.7-py3-none-any.whl
Collecting textwrap3 (from bioinfokit)
  Downloading https://files.pythonhosted.org/packages/77/9c/a53e561d496ee5866bbeea4d3a850b3b545ed854f8a21007c1e0d872e94d/textwrap3-0.9.2-py2.py3-none-any.whl
Collecting adjustText (from bioinfokit)
  Downloading https://files.pythonhosted.org/packages/9e/15/4157718bf323fd5f5b81c891c660d0f388e042d2689a558bf1389632dc44/adjust

In [64]:
df_c = pd.concat([continuous_df,df['subtype']],axis=1)
df_c[:3]

Unnamed: 0,CDE_DxAge,CDE_chemo_alk_days,CDE_chemo_tmz_days,CDE_sourcesite,CDE_survival_time,age_at_initial_pathologic_diagnosis,days_to_birth,days_to_collection,days_to_initial_pathologic_diagnosis,days_to_last_followup,days_to_new_tumor_event_additional_surgery_procedure,days_to_tumor_recurrence,eastern_cancer_oncology_group,initial_weight,intermediate_dimension,karnofsky_performance_score,longest_dimension,shortest_dimension,year_of_initial_pathologic_diagnosis,subtype
TCGA-02-0001-01,44.3,0.0,0.0,2.0,353.0,44.0,-16179.0,,0.0,279.0,,137.0,,,0.7,80.0,2.0,0.2,2002.0,3
TCGA-02-0003-01,50.21,0.0,0.0,2.0,144.0,50.0,-18341.0,,0.0,144.0,,40.0,,,0.9,100.0,2.1,0.1,2003.0,3
TCGA-02-0007-01,40.53,306.0,306.0,2.0,705.0,40.0,-14806.0,,0.0,705.0,,,,,,80.0,,,2002.0,1


In [66]:
df_d = pd.concat([discrete_df,df['subtype']],axis=1)

In [96]:
import itertools

In [117]:
val_dict = {}
groups = [df_c.groupby(['subtype']).get_group(i) for i in range(1,4)]
for name in df_c.columns:
    if name == 'subtype':
        continue
    df_name = [group[name].values for group in groups]
    g1_n = ['G1']*df_name[0].shape[0]
    g2_n = ['G2']*df_name[1].shape[0]
    g3_n = ['G3']*df_name[2].shape[0]
    subtype = pd.Series(g1_n+g2_n+g3_n)
    df_name = pd.Series(itertools.chain.from_iterable(df_name))
    df_name = pd.concat([subtype,df_name],axis=1)
    df_name.columns = ['subtype','values']
    val_dict[name] = df_name

In [49]:
from bioinfokit.analys import stat

In [118]:
sig_diff_ls = []
for key,dfs in val_dict.items():
    
    ### deal with nas
    if pd.isna(dfs).sum().sum()>= dfs.size*.4:
        continue
    
    res = stat()
    res.tukey_hsd(df=dfs, res_var='values', xfac_var='subtype', anova_model='values ~ C(subtype)')
    pval = res.tukey_summary['p-value']
    if pd.Series.min(pval)<=0.05:
        sig_diff_ls.append(key)
        print(key,'\n',res.tukey_summary)

CDE_DxAge 
   group1 group2      Diff     Lower      Upper   q-value   p-value
0     G1     G2  8.831249  3.897849  13.764649  5.961064  0.001000
1     G1     G3  2.983769 -1.253978   7.221516  2.344651  0.223419
2     G2     G3  5.847480  0.513526  11.181434  3.650627  0.027681
CDE_chemo_alk_days 
   group1 group2       Diff      Lower       Upper   q-value   p-value
0     G1     G2   5.800078 -51.503308   63.103463  0.337056  0.900000
1     G1     G3  66.977677  17.754576  116.200777  4.531157  0.004246
2     G2     G3  61.177599  -0.778383  123.133581  3.288192  0.053820
CDE_survival_time 
   group1 group2        Diff       Lower       Upper   q-value   p-value
0     G1     G2  207.221875   38.746034  375.697716  4.095987  0.011223
1     G1     G3  112.787500  -32.420414  257.995414  2.586612  0.161963
2     G2     G3  320.009375  137.663669  502.355081  5.844236  0.001000
age_at_initial_pathologic_diagnosis 
   group1 group2      Diff     Lower      Upper   q-value   p-value
0     



days_to_last_followup 
   group1 group2        Diff       Lower       Upper   q-value   p-value
0     G1     G2  185.727572   21.210444  350.244700  3.758575  0.022388
1     G1     G3   91.460967  -50.269388  233.191322  2.148478  0.283637
2     G2     G3  277.188539  100.967531  453.409547  5.236915  0.001000


In [126]:
import matplotlib.pyplot as plt
import numpy as np
from lifelines import *

In [131]:
ages = []
for g in groups.__iter__():
    ages.append(pd.Series.mean(g['days_to_birth']))
ages

[-22474.591463414636, -19580.681159420288, -21452.61261261261]

In [132]:
avg_ages = [i/365 for i in ages]
avg_ages

[-61.574223187437354, -53.645701806630925, -58.77428113044551]

In [135]:
for name in sig_diff_ls:
    print(name)
    for g in groups.__iter__():
        print(pd.Series.mean(g[name]))

CDE_DxAge
61.37515527950317
52.543906250000006
58.391386138613875
CDE_chemo_alk_days
148.98757763975155
143.1875
82.00990099009901
CDE_survival_time
426.3875
633.609375
313.6
age_at_initial_pathologic_diagnosis
61.02439024390244
53.11594202898551
58.27927927927928
days_to_birth
-22474.591463414636
-19580.681159420288
-21452.61261261261
days_to_last_followup
420.7361963190184
606.463768115942
329.27522935779814


#### Discrete Independence

In [155]:
from scipy.stats import chi2_contingency as chi2_contingency

In [211]:
groups_d = [df_d.groupby(['subtype']).get_group(i) for i in range(1,4)]
name_count = {}
for name in df_d.columns:
    contingency_t = []
    categories = df_d[name].unique()
    for group in groups_d.__iter__():
        counts = []
        for c in categories:
            counts.append(group[name][group[name]==c].count())
        contingency_t.append(pd.Series(counts))
    contingency_t = pd.DataFrame(contingency_t)
    idx = ['G1','G2','G3']
    contingency_t.index = idx
    contingency_t.columns = categories
    name_count[name] = contingency_t

In [200]:
name_count['CDE_alk_chemoradiation_standard']

Unnamed: 0,False,True,NaN
G1,89,72,0
G2,30,34,0
G3,58,43,0


In [212]:
for key, value in name_count.items():
    ls1 = value.sum(axis =1)
#     print('h',ls1)
    idx1 = np.where(np.asarray(ls1)==0)[0]
#     print('d',idx1)
    if len(idx1)!=0:
        name_count[key] = pd.DataFrame([])
        continue
#     print(key,name_count[key])
    ls = value.sum(axis=0)
    idx = np.where(np.asarray(ls)==0)[0]
    if len(idx)!=0:
        drop_col = [value.columns[i] for i in idx]
        new_df = value.drop(drop_col,axis=1)
        name_count[key] = new_df
name_count    

{'CDE_alk_chemoradiation_standard':     False  True 
 G1     89     72
 G2     30     34
 G3     58     43, 'CDE_chemo_adjuvant_alk':     False  True 
 G1     50    111
 G2     21     43
 G3     31     70, 'CDE_chemo_adjuvant_tmz':     False  True 
 G1     63     98
 G2     24     40
 G3     35     66, 'CDE_chemo_alk':     False  True 
 G1     39    122
 G2     16     48
 G3     29     72, 'CDE_chemo_alk_long':     False  True 
 G1     44    117
 G2     20     44
 G3     37     64, 'CDE_chemo_tmz':     False  True 
 G1     53    108
 G2     19     45
 G3     31     70, 'CDE_chemo_tmz_long':     False  True 
 G1     54    107
 G2     23     41
 G3     38     63, 'CDE_missing':     histological_type,  vital_status,
 G1                   0              1
 G2                   1              0
 G3                   0              1, 'CDE_missingflag':     False  True 
 G1    160      1
 G2     63      1
 G3    100      1, 'CDE_previously_treated':     False  True 
 G1    154      7
 G2    

In [213]:
sig_d =[]
for key,value in name_count.items():
    if value.size ==0 or value.shape[1]<2:
        continue
    print(name_count[key])
    chi2,pval,dof,exp  = chi2_contingency(name_count[key])
   
    if pval<=0.05:
        sig_d.append((key, pval))
sig_d


    False  True 
G1     89     72
G2     30     34
G3     58     43
    False  True 
G1     50    111
G2     21     43
G3     31     70
    False  True 
G1     63     98
G2     24     40
G3     35     66
    False  True 
G1     39    122
G2     16     48
G3     29     72
    False  True 
G1     44    117
G2     20     44
G3     37     64
    False  True 
G1     53    108
G2     19     45
G3     31     70
    False  True 
G1     54    107
G2     23     41
G3     38     63
    histological_type,  vital_status,
G1                   0              1
G2                   1              0
G3                   0              1
    False  True 
G1    160      1
G2     63      1
G3    100      1
    False  True 
G1    154      7
G2     57      6
G3     96      5
    True   False
G1    121     40
G2     56      8
G3     82     19
    True   False
G1    100     61
G2     46     18
G3     58     43
    True   False
G1    105     56
G2     50     14
G3     61     40
    True   False
G1    127     3

[('CDE_therapy', 0.0239712693970924),
 ('CDE_vital_status', 0.04219955419397384),
 ('G_CIMP_STATUS', 1.897978420038419e-15),
 ('GeneExp_Subtype', 8.534673419184012e-22),
 ('_PANCAN_CNA_PANCAN_K8', 2.1989441476890682e-14),
 ('_PANCAN_DNAMethyl_GBM', 9.889506786244145e-30),
 ('_PANCAN_DNAMethyl_PANCAN', 2.253317472999156e-15),
 ('_PANCAN_mutation_PANCAN', 0.002746407829379185),
 ('form_completion_date', 0.04856771110301229),
 ('hormonal_therapy', 0.004905154740707577),
 ('initial_pathologic_diagnosis_method', 0.04956822543700062),
 ('tissue_source_site', 7.59186908651961e-05),
 ('vital_status', 0.023497201679813208),
 ('subtype', 6.940552177863327e-149)]

In [215]:
name_count['CDE_therapy']

Unnamed: 0,StandardRadiation,TMZChemoradiation.TMZChemoLong,NonstandardRadiation,StandardRadiation.TMZChemoLong,StandardRadiation.TMZChemo,StandardRadiation.AlkChemoLong,TMZChemoLong,NonstandardRadiation.AlkChemoLong,NonstandardRadiation.TMZChemoLong,NonstandardRadiation.TMZChemo,AlkChemoradiation.AlkChemoLong,AlkChemo,StandardRadiation.AlkChemo,AlkChemoLong
G1,8,72,5,24,0,7,4,2,7,1,0,3,1,1
G2,7,32,2,8,3,1,1,0,0,1,2,0,0,0
G3,3,43,11,14,3,1,2,0,4,4,0,1,0,0


In [216]:
name_count['CDE_vital_status']

Unnamed: 0,DECEASED,LIVING
G1,124,36
G2,40,24
G3,67,33


In [217]:
name_count['G_CIMP_STATUS']

Unnamed: 0,NON G-CIMP,G-CIMP
G1,161,4
G2,42,24
G3,108,5


Recently, a subset of relatively favorable prognosis GBMs has been identified. These glioma CpG island methylator phenotype, or G-CIMP tumors, have distinct genomic copy number aberrations, DNA methylation patterns, and (mRNA) expression profiles compared to other GBMs. While the standard method for identifying G-CIMP tumors is based on genome-wide DNA methylation data.

In [218]:
name_count['GeneExp_Subtype']

Unnamed: 0,Classical,Proneural,Mesenchymal,Neural
G1,68,25,38,34
G2,4,48,12,5
G3,23,18,55,17


In [219]:
name_count['_PANCAN_CNA_PANCAN_K8']

Unnamed: 0,High,GBM,Kirc+,Squamous,Iq,BRCA-LUAD+,Quiet,COAD-READ
G1,0,143,11,0,1,10,0,0
G2,4,25,22,8,1,8,0,1
G3,2,80,18,0,0,7,5,1


In [220]:
name_count['_PANCAN_DNAMethyl_GBM']

Unnamed: 0,cluster 1,cluster 3,cluster 4,cluster 5,cluster 6,cluster 2
G1,12,66,36,3,13,31
G2,3,12,8,21,19,6
G3,35,15,6,4,6,46


In [221]:
name_count['hormonal_therapy']

Unnamed: 0,NO,YES
G1,115,38
G2,48,14
G3,87,8


In [222]:
name_count['initial_pathologic_diagnosis_method']

Unnamed: 0,Tumor resection,Excisional Biopsy,Incisional Biopsy,Fine needle aspiration biopsy,"Other method, specify:"
G1,131,28,2,2,1
G2,64,4,1,0,0
G3,104,5,1,1,0


In [223]:
name_count['tissue_source_site']

Unnamed: 0,02,06,12,14,15,16,19,26,27,28,32,41,76,81,87
G1,13,29,31,20,1,8,18,7,14,6,10,4,2,0,1
G2,13,11,3,12,2,3,5,2,2,7,4,2,3,0,0
G3,18,29,3,6,1,2,12,3,1,14,11,3,7,1,0


In [224]:
name_count['vital_status']

Unnamed: 0,DECEASED,LIVING
G1,138,25
G2,48,21
G3,84,27


In [225]:
cnv = pd.read_csv('data/cnv.txt',delimiter= '\t',index_col=0,header=0)

In [231]:
group

Unnamed: 0,x
1,3
2,3
3,1
4,1
5,2
...,...
343,3
344,3
345,3
346,3


In [240]:
cnv = cnv.merge(group,how='inner',left_index=True, right_index=True)

In [241]:
cnv.columns

Index(['ACAP3', 'ACTRT2', 'AGRN', 'ANKRD65', 'ATAD3A', 'ATAD3B', 'ATAD3C',
       'AURKAIP1', 'B3GALT6', 'C1orf159',
       ...
       'SNORA56', 'TMLHE', 'VBP1', 'IL9R|ENSG00000124334.12',
       'SPRY3|ENSG00000168939.6', 'VAMP7|ENSG00000124333.10',
       'WASH6P|ENSG00000182484.10', 'WASIR1|ENSG00000185203.7', 'x_x', 'x_y'],
      dtype='object', length=24778)

In [244]:
idh1 = cnv[['x_x', 'IDH1']]

In [248]:
g2 = idh1[idh1['x_x']==2]

In [250]:
g2.shape

(69, 2)

In [252]:
g2[g2['IDH1']!=0].shape

(63, 2)

In [251]:
g1 = idh1[idh1['x_x']==1]
g1.shape

(165, 2)

In [253]:
g1[g1['IDH1']!=0].shape

(149, 2)

In [254]:
g3 = idh1[idh1['x_x']==3]
g3.shape

(113, 2)

In [255]:
g3[g3['IDH1']!=0].shape

(101, 2)

In [256]:
[63/69,149/165,101/113]

[0.9130434782608695, 0.9030303030303031, 0.8938053097345132]