In [3]:
import pandas as pd
from glob import glob

In [4]:
## read the datasets in Nature Comm Paper: kong et al. https://www.nature.com/articles/s41467-022-31535-6
# https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-022-31535-6/MediaObjects/41467_2022_31535_MOESM1_ESM.pdf

dfd = pd.read_csv('./dataset_stat_nc.csv')
dfdt = dfd.groupby(['dataset','response']).size().to_frame(name='size').reset_index()
liu = dfd[dfd.dataset=='Liu'].set_index('sampleID')
pd.pivot_table(dfdt, 'size', 'response', 'dataset').T

response,nonresponder,responder
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
Auslander,34,3
Gide,42,49
IMvigor210,230,68
Kim,33,12
Liu,72,47
Prat,16,9
Riaz,39,10


In [5]:
sample_files = glob('./*/processed/samples.csv')
mRNA_files = glob('./*/processed/abundance.csv')

In [6]:
labels = []
genes = []
for sf, sm in zip(sample_files, mRNA_files):
    cohort = sf.split('/')[1].split('_')[1]
    dfs = pd.read_csv(sf,index_col = 0)
    dfm = pd.read_csv(sm,index_col = 0)
    label = dfs[['flag']]
    label.loc[:,'cohort'] = cohort
    genes.append(set(dfm.index))
    labels.append(label)
    print(cohort, dfm.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label.loc[:,'cohort'] = cohort


Hugo (25268, 26)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label.loc[:,'cohort'] = cohort
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label.loc[:,'cohort'] = cohort


Liu (20848, 119)
Allen (23749, 42)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label.loc[:,'cohort'] = cohort
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label.loc[:,'cohort'] = cohort


Riaz (20311, 49)
MGH (19070, 40)
Gide (59403, 73)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label.loc[:,'cohort'] = cohort


In [7]:
df_label = pd.concat(labels)

In [8]:
common_genes = list(set.intersection(*map(set, genes)))

In [9]:
mrnas = []
for sm in mRNA_files:
    dfm = pd.read_csv(sm,index_col = 0)
    dfm = dfm.loc[common_genes]
    print(dfm.shape)
    if dfm.shape[0] == 14950:
        sms = sm
    mrnas.append(dfm)

(14891, 26)
(14891, 119)
(14891, 42)
(14891, 49)
(14891, 40)
(14891, 73)


In [10]:
dfm_all = pd.concat(mrnas, axis=1)
dfm_all = dfm_all.T
dfm_all.index.name = 'samples'

In [11]:
df = df_label.join(dfm_all)
df.to_csv('./model_data/itrp.v1.allgene.csv')
df.groupby(['cohort', 'flag']).size().to_frame().unstack()

Unnamed: 0_level_0,0,0
flag,0.0,1.0
cohort,Unnamed: 1_level_2,Unnamed: 2_level_2
Allen,28,14
Gide,33,40
Hugo,12,14
Liu,72,47
MGH,25,15
Riaz,39,10


## select protein-encoding genes only

In [12]:
cols = df.columns[2:]
gene_name_type_map = pd.read_pickle('/n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/gene_name_type.map')
scols = cols[cols.map(gene_name_type_map) == 'protein_coding']
dfpc = df[df.columns[:2]].join(df[scols])
dfpc.to_csv('./model_data/itrp.v1.pc.csv')

In [14]:
dfpc

Unnamed: 0_level_0,flag,cohort,CTRL,PON1,RNF5,RFC2,CDK15,ADAMDEC1,ARGLU1,GTPBP10,...,FAM131A,ANKS1A,KDM2A,APEX2,CXXC5,RPN1,BARHL1,BCL6,RAET1E,CDH6
samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Pt1,0.0,Hugo,0.109097,0.000000,0.000012,3.592707,0.050994,0.961236,7.825837,0.820405,...,1.354392,4.610294,3.777608,3.770257,14.696738,30.904117,0.002313,2.296439,0.027466,0.235788
Pt2,1.0,Hugo,0.053454,27.513532,0.000053,3.231825,0.279514,0.413760,6.472230,1.272180,...,1.005918,1.219955,2.199371,2.330469,8.555862,21.217602,0.000000,2.773991,0.007708,0.059146
Pt4,1.0,Hugo,0.066133,0.002032,0.000012,4.664013,0.541774,0.299194,5.632450,0.726660,...,1.071782,3.801157,3.008310,2.443374,5.273865,20.231393,0.000000,1.540183,0.027585,0.045626
Pt5,1.0,Hugo,0.195291,0.000000,0.000084,17.744966,0.453280,0.360162,51.174967,3.372226,...,11.408208,48.722579,19.874556,29.450039,23.173350,185.316153,0.000000,8.107615,0.459669,1.393967
Pt6,1.0,Hugo,0.021505,0.002824,0.000007,3.087662,0.008995,0.041230,6.614850,1.227000,...,1.311774,1.840432,2.073268,2.915321,1.803413,16.781464,0.001071,1.326983,0.015075,0.041028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR2208977,0.0,Gide,1.085634,4.325787,11.317088,10.763576,0.834908,1.098756,113.909806,24.156945,...,1.806987,10.236076,77.619364,1.573069,4.574501,83.399820,0.000000,19.210528,0.390287,7.721778
ERR3262562,1.0,Gide,0.312987,0.360390,9.729909,7.407001,0.041077,0.614677,5.158011,10.733206,...,0.963436,17.143870,8.149921,1.433787,6.796733,19.203019,0.000000,7.702944,0.007727,0.267510
ERR3262563,0.0,Gide,0.366608,0.548477,12.918828,11.557856,0.041639,1.153355,19.053504,18.693833,...,2.152914,7.216115,18.579343,2.672765,5.048177,26.441587,0.003921,15.374756,0.268686,1.552509
ERR3262564,0.0,Gide,0.474652,1.336014,4.309364,12.488540,0.502847,2.632455,32.395951,15.592795,...,1.138833,5.652721,23.781937,2.089480,3.048816,34.473571,0.004292,16.355884,1.505195,1.749314


In [13]:
df

Unnamed: 0_level_0,flag,cohort,CTRL,PON1,RNF5,RFC2,CDK15,ADAMDEC1,ARGLU1,GTPBP10,...,FAM131A,ANKS1A,KDM2A,APEX2,CXXC5,RPN1,BARHL1,BCL6,RAET1E,CDH6
samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Pt1,0.0,Hugo,0.109097,0.000000,0.000012,3.592707,0.050994,0.961236,7.825837,0.820405,...,1.354392,4.610294,3.777608,3.770257,14.696738,30.904117,0.002313,2.296439,0.027466,0.235788
Pt2,1.0,Hugo,0.053454,27.513532,0.000053,3.231825,0.279514,0.413760,6.472230,1.272180,...,1.005918,1.219955,2.199371,2.330469,8.555862,21.217602,0.000000,2.773991,0.007708,0.059146
Pt4,1.0,Hugo,0.066133,0.002032,0.000012,4.664013,0.541774,0.299194,5.632450,0.726660,...,1.071782,3.801157,3.008310,2.443374,5.273865,20.231393,0.000000,1.540183,0.027585,0.045626
Pt5,1.0,Hugo,0.195291,0.000000,0.000084,17.744966,0.453280,0.360162,51.174967,3.372226,...,11.408208,48.722579,19.874556,29.450039,23.173350,185.316153,0.000000,8.107615,0.459669,1.393967
Pt6,1.0,Hugo,0.021505,0.002824,0.000007,3.087662,0.008995,0.041230,6.614850,1.227000,...,1.311774,1.840432,2.073268,2.915321,1.803413,16.781464,0.001071,1.326983,0.015075,0.041028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR2208977,0.0,Gide,1.085634,4.325787,11.317088,10.763576,0.834908,1.098756,113.909806,24.156945,...,1.806987,10.236076,77.619364,1.573069,4.574501,83.399820,0.000000,19.210528,0.390287,7.721778
ERR3262562,1.0,Gide,0.312987,0.360390,9.729909,7.407001,0.041077,0.614677,5.158011,10.733206,...,0.963436,17.143870,8.149921,1.433787,6.796733,19.203019,0.000000,7.702944,0.007727,0.267510
ERR3262563,0.0,Gide,0.366608,0.548477,12.918828,11.557856,0.041639,1.153355,19.053504,18.693833,...,2.152914,7.216115,18.579343,2.672765,5.048177,26.441587,0.003921,15.374756,0.268686,1.552509
ERR3262564,0.0,Gide,0.474652,1.336014,4.309364,12.488540,0.502847,2.632455,32.395951,15.592795,...,1.138833,5.652721,23.781937,2.089480,3.048816,34.473571,0.004292,16.355884,1.505195,1.749314
