In [14]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os
import scipy
import numpy as np
import glob
from scipy.stats import mannwhitneyu
from matplotlib.pyplot import subplot_mosaic as mosaic
import matplotlib.transforms as mtransforms
import math
import matplotlib

### Read results from the first part of the test protocol and save to csv for faster plotting

In [5]:
# --- Settings for the first part of the test protocol ---------------------
# Result files are looked up relative to the directory the notebook runs in.
cwd = os.getcwd()

n = 100       # nb of random partitions
k = 5000      # nb of max top_k_edges selected
m = 10        # number of iterations on confounder-based partitions
alpha = 0.05  # threshold the MWU p-values are compared against

# Cohort identifiers; they appear both as sub-directory names and inside
# the result file names.
ct_sels_init = [
    'BRCA_metabric', 'BRCA', 'CESC', 'COAD', 'GBM', 'HNSC', 'KIRC',
    'KIRP', 'LUAD', 'LUSC', 'PCPG', 'READ', 'STAD',
]

# Raw confounder/variable identifiers as they appear in the file names.
confs_init = [
    'tobacco_smoking_history',
    'alcohol_history.exposures',
    'age_quartile',
    'age_at_initial_pathologic_diagnosis',
    'race.demographic',
    'gender.demographic',
    'tumor_stage.diagnoses',
    'Tumor Stage',
]

# string identifiers of algorithms
algs = ['ARACNE', 'CEMITOOL', 'GRNBOOST2', 'WGCNA']

In [6]:
# Collect the per-partition Jaccard-index (JI) tables of the first part of the
# test protocol. Files are looked up under <cwd>/<method>/<cohort>/ as
#   cb_<i>_<method>_<variable>_<cohort>_jaccInd.csv    for i in range(m)
#   rnd_<i>_<method>_<variable>_<cohort>_jaccInd.csv   for i in range(n)
# Each table is tagged with the partition id, partition type, method, cohort
# and variable it belongs to. Combinations without a readable file are skipped.
ji_columns = ['Confounder/\nvariable', 'Cohort', 'Method', 'Partition type', 'partID', 'k', 'mean JI']
partition_specs = [
    # (file prefix, number of partitions, label stored in 'Partition type')
    ('cb', m, 'confounder-based partition'),
    ('rnd', n, 'random partition'),
]

# Seed the list with an empty, labelled frame so that the column order (and the
# result when no file is found at all) is the same as for a table grown from
# an empty DataFrame.
frames = [pd.DataFrame(columns=ji_columns)]

# The partition type is the outermost loop so that all confounder-based rows
# precede all random-partition rows in the final table.
for prefix, n_parts, partition_type in partition_specs:
    for alg_sel in algs:
        for conf_sel in confs_init:
            for ct_sel in ct_sels_init:
                path = os.path.join(cwd, alg_sel, str(ct_sel))
                for i in range(n_parts):
                    filename = os.path.join(
                        path, f'{prefix}_{i}_{alg_sel}_{conf_sel}_{ct_sel}_jaccInd.csv')
                    try:
                        df_part = pd.read_csv(filename, sep=',', header=0)
                    except (OSError, ValueError):
                        # Missing/unreadable file (OSError) or empty/malformed
                        # CSV (pandas' EmptyDataError and ParserError derive
                        # from ValueError): this combination was not computed.
                        # A bare `except:` would also hide KeyboardInterrupt
                        # and genuine programming errors.
                        continue
                    df_part['partID'] = i
                    df_part['Partition type'] = partition_type
                    df_part['Method'] = alg_sel
                    df_part['Cohort'] = ct_sel
                    df_part['Confounder/\nvariable'] = conf_sel
                    frames.append(df_part)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# whole accumulated table for every file that is read.
JI = pd.concat(frames)
JI_all = JI.copy()

In [7]:
# adjust fields: map raw identifiers to the labels used in the figures.
variable_labels = {
    'race': 'ethnicity',
    'race.demographic': 'ethnicity',
    'age_quartile': 'age',
    'age_at_initial_pathologic_diagnosis': 'age',
    'gender.demographic': 'sex',
    'tumor_stage.diagnoses': 'stage',
    'Tumor Stage': 'stage',
    'alcohol_history.exposures': 'alcohol history',
    'tobacco_smoking_history': 'smoking history',
}
method_labels = {
    'ARACNE': 'ARACNe-AP',
    'CEMITOOL': 'CEMiTool',
    'GRNBOOST2': 'GRNBoost2',
}

JI_all['Confounder/\nvariable'] = JI_all['Confounder/\nvariable'].replace(variable_labels)
JI_all['Cohort'] = JI_all['Cohort'].replace({'BRCA_metabric': 'METABRIC'})
JI_all = JI_all.rename(columns={'mean JI': 'Mean JI'})

# Keep only k = 10, 110, 210, ... (every value with k % 100 == 10).
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
JI_all = JI_all[JI_all['k'] % 100 == 10].copy()

JI_all['Method'] = JI_all['Method'].replace(method_labels)
JI_all['Cohort x\nConfounder'] = JI_all['Cohort'] + ' x ' + JI_all['Confounder/\nvariable']

In [8]:
# Persist the cleaned first-part table (row index omitted) in the working directory.
JI_all.to_csv(path_or_buf='JI_all.csv', index=False)

### Read results from the second part of the test protocol and save to csv for faster plotting

In [9]:
# Collect the mean-JI tables of the second part of the test protocol.
# For every method / confounder / cohort / demographic block, partitions
# fro..to-1 are read for both sampling schemes ('conf' and 'rnd') from
# <cwd>/mean_JI_G_all_comparisons/. Combinations without a readable file
# are skipped.
alg_sels = ['WGCNA', 'CEMITOOL', 'ARACNE', 'GRNBOOST2']
ct_sels = ['METABRIC', 'BRCA', 'CESC', 'STAD', 'KIRC', 'KIRP', 'LUAD']
conf_sels = ['age_at_initial_pathologic_diagnosis','race.demographic','gender.demographic']
# Demographic blocks available per confounder (they appear in the file names).
blocks = {'age_at_initial_pathologic_diagnosis': ['lower', 'upper'],
          'race.demographic': ['asian', 'black or african american', 'white'],
          'gender.demographic': ['female', 'male']}
fro = 0
to = 10
cwd = os.getcwd()
# All result files live in one directory, so the path is built once.
path = os.path.join(cwd, 'mean_JI_G_all_comparisons')
# File-name tag -> label stored in the 'Sampling' column.
sampling_labels = {'conf': 'Demography-based', 'rnd': 'Size-matched random'}

JI = {'Confounder': [], 'Cohort': [], 'Method': [], 'Demographic group': [], 'Sampling': [], 'partID': [], 'k': [], 'Mean JI': []}
for alg_sel in alg_sels:
    for conf_sel in conf_sels:
        for ct_sel in ct_sels:
            for block in blocks[conf_sel]:
                # The display label depends only on confounder and block.
                if conf_sel == 'age_at_initial_pathologic_diagnosis':
                    subgroup = f'{block.capitalize()} age quartile'
                else:
                    subgroup = block.capitalize()
                for i in range(fro, to):
                    for t in ['conf', 'rnd']:
                        if ct_sel == 'METABRIC':
                            # METABRIC files always carry the 'age_quartile' /
                            # 'BRCA_metabric' tags, whatever conf_sel is.
                            filename = f'g_all_{t}_{i}_{alg_sel}_age_quartile_BRCA_metabric_{block}_jaccInd.csv'
                        else:
                            filename = f'g_all_{t}_{i}_{alg_sel}_{conf_sel}_{ct_sel}_{block}_jaccInd.csv'
                        try:
                            res = pd.read_csv(os.path.join(path, filename), sep=',', usecols=['k','mean JI', 'state'])
                        except (OSError, ValueError):
                            # OSError: file missing or unreadable. ValueError:
                            # empty/malformed CSV or a requested column is
                            # absent. Only the read is guarded, so errors in
                            # the bookkeeping below are no longer hidden (a
                            # bare `except:` also swallowed KeyboardInterrupt).
                            continue
                        num_rows = res.shape[0]
                        JI['Confounder'] += [conf_sel] * num_rows
                        JI['Cohort'] += [ct_sel] * num_rows
                        JI['Method'] += [alg_sel] * num_rows
                        JI['Demographic group'] += [subgroup] * num_rows
                        JI['Sampling'] += [sampling_labels[t]] * num_rows
                        JI['partID'] += [i] * num_rows
                        JI['k'] += list(res['k'])
                        JI['Mean JI'] += list(res['mean JI'])
JI = pd.DataFrame(data=JI)

In [10]:
# Map raw identifiers to the labels used in the figures.
JI['Confounder'] = JI['Confounder'].replace({
    'age_at_initial_pathologic_diagnosis': 'age',
    'race.demographic': 'ethnicity',
    'gender.demographic': 'sex',
})
JI = (
    JI
    .replace('ARACNE', 'ARACNe-AP')
    .replace('CEMITOOL', 'CEMiTool')
    .replace('GRNBOOST2', 'GRNBoost2')
    # str.capitalize() lower-cased 'African American' when the label was built.
    .replace('Black or african american', 'Black or African American')
)

# Keep only k = 10, 110, 210, ... (every value with k % 100 == 10).
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
JI = JI[JI['k'] % 100 == 10].copy()

JI['Cohort – confounder'] = JI['Cohort'] + ' – ' + JI['Confounder']

# Fixed category order for the methods; order of first appearance for the rest.
JI['Method'] = pd.Categorical(JI['Method'], ['ARACNe-AP', 'CEMiTool', 'GRNBoost2', 'WGCNA'])
JI['Cohort – confounder'] = pd.Categorical(JI['Cohort – confounder'], list(JI['Cohort – confounder'].unique()))
JI['Demographic group'] = pd.Categorical(JI['Demographic group'], list(JI['Demographic group'].unique()))

In [11]:
# Persist the cleaned second-part table (row index omitted) in the working directory.
JI.to_csv(path_or_buf='JI_all_g_all.csv', index=False)

### Compute MWU tests

In [15]:
confs = ['age', 'ethnicity', 'sex', 'alcohol history', 'smoking history', 'stage']
ct_sels = ['METABRIC', 'BRCA', 'CESC', 'COAD', 'GBM', 'HNSC', 'KIRC', 'KIRP', 'LUAD', 'LUSC', 'PCPG', 'READ', 'STAD']

mwu_columns = ['Method', 'Cohort', 'Confounder/\nvariable', 'k', 'mwu.pvalue']
group_keys = ['Method', 'Cohort', 'Confounder/\nvariable']

# One-sided Mann-Whitney U test per (method, cohort, variable, k): mean JI of
# the confounder-based partitions vs. mean JI of the random partitions.
# alternative='less' tests whether the first sample (confounder-based) is
# stochastically smaller than the second (random).
# Results are gathered as plain rows and turned into a DataFrame once, instead
# of growing a DataFrame with pd.concat on every iteration.
records = []
for alg_sel in ['ARACNe-AP', 'CEMiTool', 'GRNBoost2', 'WGCNA']:
    JI_ = JI_all[JI_all['Method'] == alg_sel]
    for cohort in ct_sels:
        coh = JI_[JI_['Cohort'] == cohort]
        for conf_sel in confs:
            conf = coh[coh['Confounder/\nvariable'] == conf_sel]
            if len(conf) == 0:
                # this cohort/variable combination has no results
                continue
            c = conf[conf['Partition type'] == 'confounder-based partition']
            r = conf[conf['Partition type'] == 'random partition']
            # The loop variable is called top_k (not k) so that the
            # configuration constant `k` is not overwritten.
            for top_k in sorted(set(c['k'])):
                c_k = c[c['k'] == top_k]
                r_k = r[r['k'] == top_k]
                mwu = mannwhitneyu(c_k['Mean JI'], r_k['Mean JI'], alternative='less')
                records.append([alg_sel, cohort, conf_sel, top_k, mwu.pvalue])

# manh keeps one p-value per k; mwus drops k for the per-group counts below.
manh = pd.DataFrame(records, columns=mwu_columns)
mwus = manh.drop(columns='k')

# Number of tests per (method, cohort, variable) ...
count = mwus.groupby(group_keys).count().reset_index()
count = count.rename(columns={'mwu.pvalue': 'total.pvalues'})

# ... and how many of them have p < alpha.
sign_ = mwus[mwus['mwu.pvalue'] < alpha]
sign_ = sign_.groupby(group_keys).count().reset_index()
sign_ = sign_.rename(columns={'mwu.pvalue': 'sign.pvalues'})

# Left merge: groups without any p < alpha get NaN, which fillna turns into 0.
sign = pd.merge(count, sign_, how='left', on=group_keys).fillna(0)
sign['frac'] = sign['sign.pvalues'] / sign['total.pvalues']

# reset_index() adds an 'index' column; it is kept because _mwus is written to
# CSV with that column.
_mwus = sign.reset_index()
# .copy() so that the assignment below acts on an independent frame and does
# not trigger pandas' SettingWithCopyWarning.
_mwus = _mwus[~_mwus['Method'].isin(['GRNBoost2', 'GENIE3'])].copy()
_mwus['Confounder/\nvariable'] = pd.Categorical(_mwus['Confounder/\nvariable'], confs)

# Pairwise correlation between methods of the fraction of significant tests,
# computed across (cohort, variable) combinations.
corr_coeff = _mwus.pivot(columns='Method', values='frac', index=['Cohort', 'Confounder/\nvariable']).corr()

In [16]:
# Persist the per-group MWU summary (row index omitted) in the working directory.
_mwus.to_csv(path_or_buf='mwus_fast_methods.csv', index=False)

In [17]:
# Per-k one-sided Mann-Whitney U p-values for GRNBoost2 only (confounder-based
# vs. random partitions), plus the derived columns written to CSV.
mwu_columns = ['Method', 'Cohort', 'Confounder/\nvariable', 'k', 'mwu.pvalue']
# NOTE(review): this rebinds `algs`, which the loading cell of the first part
# uses with the raw identifiers ('GRNBOOST2', ...). Re-running that cell after
# this one would look for a 'GRNBoost2' directory. The binding is kept for
# compatibility with any later cell that reads it.
algs = ['GRNBoost2']

# Rows are gathered in a list and turned into a DataFrame once, instead of
# growing a DataFrame with pd.concat on every iteration.
records = []
for alg_sel in algs:
    JI_ = JI_all[JI_all['Method'] == alg_sel]
    for cohort in ct_sels:
        coh = JI_[JI_['Cohort'] == cohort]
        for conf_sel in confs:
            conf = coh[coh['Confounder/\nvariable'] == conf_sel]
            if len(conf) == 0:
                # this cohort/variable combination has no results
                continue
            c = conf[conf['Partition type'] == 'confounder-based partition']
            r = conf[conf['Partition type'] == 'random partition']
            # The loop variable is called top_k (not k) so that the
            # configuration constant `k` is not overwritten.
            for top_k in sorted(set(c['k'])):
                c_k = c[c['k'] == top_k]
                r_k = r[r['k'] == top_k]
                mwu = mannwhitneyu(c_k['Mean JI'], r_k['Mean JI'], alternative='less')
                records.append([alg_sel, cohort, conf_sel, top_k, mwu.pvalue])

mwus = pd.DataFrame(records, columns=mwu_columns)
# The table used to carry a leading all-zero 'index' column (left over from
# reset_index on one-row frames); it is kept so the CSV schema is unchanged.
mwus.insert(0, 'index', 0)
mwus['Cohort - Variable'] = mwus['Cohort'] + ' – ' + mwus['Confounder/\nvariable']
# Fixed category order k = 10, 110, ..., 4910.
mwus['k'] = pd.Categorical(mwus['k'], range(10, 5000, 100))
# Explicit float cast so np.log10 never receives an object-dtype column.
mwus['-log(p)'] = -np.log10(mwus['mwu.pvalue'].astype(float))

In [18]:
# Persist the GRNBoost2 per-k MWU table (row index omitted) in the working directory.
mwus.to_csv(path_or_buf='mwus_conf_grnboost2.csv', index=False)