In [None]:
# data_loader
    # read from model system sub-folders in data
    # write to structure and figure folders in output/
    # read from output/ to correction methods, write to results/

https://pnnl-compbio.github.io/coderdata/pages/usage

#### Download data

In [3]:
import coderdata as cd

In [None]:
# Fetch the model-system datasets used below; disabled prefixes stay commented out.
for dataset_prefix in (
    # 'beataml',
    'cell_line',
    'cptac',
    # 'hcmi',
):
    cd.download_data_by_prefix(dataset_prefix)

#### Set systems

In [7]:
%whos

Variable   Type      Data/Info
------------------------------
cd         module    <module 'coderdata' from <...>s/coderdata/__init__.py'>


In [21]:
import coderdata as cd

# One loader per model system: cell_line is system a, cptac is system b.
cell_line, cptac = (
    cd.DatasetLoader(system_name, data_directory = system_dir)
    for system_name, system_dir in (
        ('cell_line', '../data/cell_line/'),
        ('cptac', '../data/cptac/'),
    )
)

print('DatasetLoaders in RAM')
systems = 'cell-line+CPTAC'  # systems tag used in output file names

Processing Data...
Processing Data...
DatasetLoaders in RAM


#### Imports and functions

In [None]:
import glob
import pandas as pd
import umap
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

In [10]:
# Transpose a long-form X-file into wide form: samples as rows, biomarkers as columns.
def extract(df_lite):
    """Pivot a long-form table into a samples-by-biomarkers matrix.

    The third column of ``df_lite`` supplies the cell values. Rows of the
    result are keyed by ``improve_sample_id`` and columns by ``entrez_id``;
    repeated (sample, biomarker) pairs are collapsed to their mean.

    Args:
        df_lite (pandas.DataFrame): Long-form frame containing
            ``improve_sample_id`` and ``entrez_id`` columns, with the
            measurement in its third column.

    Returns:
        tuple: ``(elapsed, wide)`` where ``elapsed`` is the wall-clock time
        formatted as ``'<minutes> minutes'`` and ``wide`` is the pivoted
        DataFrame.
    """
    import time

    began = time.time()
    value_column = df_lite.columns[2]

    wide = df_lite.pivot_table(
        values=value_column,
        index='improve_sample_id',
        columns='entrez_id',
        aggfunc='mean',
    )

    elapsed_minutes = round((time.time() - began) / 60, 2)
    return str(elapsed_minutes) + ' minutes', wide

In [11]:
# Extract ids and biomarker values
def df_check(X_n):
    """Keep the first three columns of ``X_n`` and summarise them.

    Args:
        X_n (pandas.DataFrame): Table whose first three columns are kept;
            any further columns are dropped.

    Returns:
        tuple: ``(df_lite, size, na_count, inf_count)`` — the three-column
        slice, then its row count, its number of missing cells and its
        number of +/-inf cells, each as a string with thousands separators.
    """
    df_lite = X_n.iloc[:, :3]  # everything after the third column is cut
    n_rows = df_lite.shape[0]
    n_missing = df_lite.isna().sum().sum()
    n_infinite = df_lite.isin([np.inf, -np.inf]).sum().sum()
    return df_lite, f"{n_rows:,}", f"{n_missing:,}", f"{n_infinite:,}"

In [12]:
# dot_T = g(d_typ, dot_T.copy())
def _prefix_axis_labels(labels, prefix, name):
    """Build a string Index from numeric axis labels.

    Each label is converted with ``str``, cut at its first '.' (so 7.0
    becomes '7' and 7 stays '7'), and prepended with ``prefix``. The
    returned Index is named ``name``.
    """
    return pd.Index([prefix + str(label).split('.')[0] for label in labels], name=name)


def g(d_typ, df):
    """
    Relabel numeric column and index labels as prefixed strings.

    Columns become ``'entrz_' + d_typ + <id>`` (axis name ``entrez_id``) and
    index labels become ``'smpl_id_' + <id>`` (axis name
    ``improve_sample_id``). Every combination of integer and float axes is
    handled, including integer columns with a float index. A short message
    describing the detected dtypes is printed.

    Args:
        d_typ (str): Tag inserted into every column label, e.g. ``'tran_'``.
        df (pandas.DataFrame): The DataFrame to relabel. It is modified in
            place, so pass a copy to keep the original labels.

    Returns:
        pandas.DataFrame: ``df`` itself. It is returned unchanged when either
        axis is neither an integer nor a float dtype.
    """
    kinds = []
    for axis in (df.columns, df.index):
        # pandas' dtype predicates accept every integer/float width, unlike
        # comparing the dtype against a single string such as 'int'.
        if pd.api.types.is_float_dtype(axis.dtype):
            kinds.append('float')
        elif pd.api.types.is_integer_dtype(axis.dtype):
            kinds.append('int')
        else:
            kinds.append(None)
    col_kind, idx_kind = kinds

    if col_kind is None or idx_kind is None:
        print('non int or float dtype detected')
        return df

    messages = {
        ('float', 'float'): 'both float',
        ('float', 'int'): 'columns are float, index are int',
        ('int', 'float'): 'columns are int, index are float',
        ('int', 'int'): 'columns are int, index are int',
    }
    print(messages[(col_kind, idx_kind)])

    df.columns = _prefix_axis_labels(df.columns, 'entrz_' + d_typ, 'entrez_id')
    df.index = _prefix_axis_labels(df.index, 'smpl_id_', 'improve_sample_id')
    return df

In [13]:
def int_to_string(d_typ, dot_T):
    """Relabel integer column and index labels as prefixed strings, in place.

    Columns become ``'entrz_' + d_typ + <id>`` and index labels become
    ``'smpl_id_' + <id>``. The axes are named ``entrez_id`` and
    ``improve_sample_id``. The same (mutated) frame is returned.
    """
    dot_T.columns = ['entrz_' + d_typ + str(gene_id) for gene_id in dot_T.columns]
    dot_T.columns.name = 'entrez_id'

    dot_T.index = ['smpl_id_' + str(sample_id) for sample_id in dot_T.index]
    dot_T.index.name = 'improve_sample_id'
    return dot_T

In [14]:
def indx_int_colm_flt(d_typ, dot_T):
    """Relabel float columns and an integer index as prefixed strings, in place.

    Column labels are converted to text and cut at the first '.' before
    being prefixed with ``'entrz_' + d_typ``. Index labels are converted to
    text and prefixed with ``'smpl_id_'``. The axes are named ``entrez_id``
    and ``improve_sample_id``. The same (mutated) frame is returned.
    """
    dot_T.columns = [
        'entrz_' + d_typ + str(gene_id).split('.')[0]
        for gene_id in dot_T.columns
    ]
    dot_T.columns.name = 'entrez_id'

    dot_T.index = ['smpl_id_' + str(sample_id) for sample_id in dot_T.index]
    dot_T.index.name = 'improve_sample_id'
    return dot_T

In [15]:
def float_to_string(d_typ, dot_T):
    """Relabel float column and index labels as prefixed strings, in place.

    Labels on both axes are converted to text and cut at the first '.'.
    Columns are then prefixed with ``'entrz_' + d_typ`` and index labels with
    ``'smpl_id_'``. The axes are named ``entrez_id`` and
    ``improve_sample_id``. The same (mutated) frame is returned.
    """
    dot_T.columns = [
        'entrz_' + d_typ + str(gene_id).split('.')[0]
        for gene_id in dot_T.columns
    ]
    dot_T.columns.name = 'entrez_id'

    dot_T.index = [
        'smpl_id_' + str(sample_id).split('.')[0]
        for sample_id in dot_T.index
    ]
    dot_T.index.name = 'improve_sample_id'
    return dot_T

#### Samples, abstract systems to a and b

In [22]:
# System a is the cell-line collection, system b is CPTAC.
# sys_* is the hyphenated tag used in printouts; sys_*_lbl is the value written to the System column.
sys_a, sys_a_lbl = 'cell-line', 'cell_line'
sys_a_samp = cell_line.samples

sys_b, sys_b_lbl = 'cptac', 'CPTAC'
sys_b_samp = cptac.samples

#### Cancer_type view

In [None]:
# Most frequent cancer types in system a.
print(sys_a, 'is system a')
sys_a_samp['cancer_type'].value_counts().head(60)

In [None]:
# Most frequent cancer types in system b.
print(sys_b, 'is system b')
sys_b_samp['cancer_type'].value_counts().head(50)

#### Cancer_type toggles, cell line as a, CPTAC as b

In [None]:
# Lung adenocarcinoma: each system spells the cancer type differently.
cncr, cncr_lbl = 'lung-adeno', 'lung_adeno'
sys_a_samp_canc_n = sys_a_samp.loc[sys_a_samp['cancer_type'].eq('Lung Adenocarcinoma')]  # cell line
sys_b_samp_canc_n = sys_b_samp.loc[sys_b_samp['cancer_type'].eq('Lung adenocarcinoma')]  # CPTAC

In [None]:
# Pancreatic adenocarcinoma: each system spells the cancer type differently.
cncr, cncr_lbl = 'pancreatic-adeno', 'pancreatic_adeno'
sys_a_samp_canc_n = sys_a_samp.loc[sys_a_samp['cancer_type'].eq('Pancreatic Adenocarcinoma')]  # cell line
sys_b_samp_canc_n = sys_b_samp.loc[sys_b_samp['cancer_type'].eq('Pancreatic ductal adenocarcinoma')]  # CPTAC

In [None]:
# Head and neck squamous cell carcinoma: the two systems differ only in capitalisation.
cncr, cncr_lbl = 'head-neck', 'head_neck'
sys_a_samp_canc_n = sys_a_samp.loc[sys_a_samp['cancer_type'].eq('Head and Neck Squamous Cell Carcinoma')]  # cell line
sys_b_samp_canc_n = sys_b_samp.loc[sys_b_samp['cancer_type'].eq('Head and Neck squamous cell carcinoma')]  # CPTAC

In [None]:
# Colon adenocarcinoma: the two systems differ only in capitalisation.
cncr, cncr_lbl = 'colon-adeno', 'colon_adeno'
sys_a_samp_canc_n = sys_a_samp.loc[sys_a_samp['cancer_type'].eq('Colon Adenocarcinoma')]  # cell line
sys_b_samp_canc_n = sys_b_samp.loc[sys_b_samp['cancer_type'].eq('Colon adenocarcinoma')]  # CPTAC

In [104]:
# Glioblastoma: each system spells the cancer type differently.
cncr, cncr_lbl = 'glioblastoma', 'glioblastoma'
sys_a_samp_canc_n = sys_a_samp.loc[sys_a_samp['cancer_type'].eq('Glioblastoma')]  # cell line
sys_b_samp_canc_n = sys_b_samp.loc[sys_b_samp['cancer_type'].eq('Glioblastoma multiforme')]  # CPTAC

In [None]:
# Breast, invasive ductal (cell line) paired with CPTAC's single 'Breast carcinoma' type.
cncr, cncr_lbl = 'breast-ductal', 'breast_ductal'
sys_a_samp_canc_n = sys_a_samp.loc[sys_a_samp['cancer_type'].eq('Breast Invasive Ductal Carcinoma')]  # cell line
sys_b_samp_canc_n = sys_b_samp.loc[sys_b_samp['cancer_type'].eq('Breast carcinoma')]  # CPTAC

In [None]:
# Breast, invasive lobular (cell line) paired with CPTAC's single 'Breast carcinoma' type.
cncr, cncr_lbl = 'breast-lobular', 'breast_lobular'
sys_a_samp_canc_n = sys_a_samp.loc[sys_a_samp['cancer_type'].eq('Breast Invasive Lobular Carcinoma')]  # cell line
sys_b_samp_canc_n = sys_b_samp.loc[sys_b_samp['cancer_type'].eq('Breast carcinoma')]  # CPTAC

In [None]:
# Breast, invasive carcinoma NOS (cell line) paired with CPTAC's single 'Breast carcinoma' type.
cncr, cncr_lbl = 'breast-nos', 'breast_nos'
sys_a_samp_canc_n = sys_a_samp.loc[sys_a_samp['cancer_type'].eq('Breast Invasive Carcinoma, NOS')]  # cell line
sys_b_samp_canc_n = sys_b_samp.loc[sys_b_samp['cancer_type'].eq('Breast carcinoma')]  # CPTAC

In [139]:
# Renal clear cell carcinoma: each system spells the cancer type differently.
cncr, cncr_lbl = 'renal-clear-cell', 'renal_clear_cell'
sys_a_samp_canc_n = sys_a_samp.loc[sys_a_samp['cancer_type'].eq('Renal Clear Cell Carcinoma')]  # cell line
sys_b_samp_canc_n = sys_b_samp.loc[sys_b_samp['cancer_type'].eq('Clear cell renal cell carcinoma')]  # CPTAC

#### Continue

In [140]:
# Sample ids of the selected cancer type in each system.
print(cncr)
ids_sys_a = sys_a_samp_canc_n['improve_sample_id']  # cell line
ids_sys_b = sys_b_samp_canc_n['improve_sample_id']  # CPTAC

renal-clear-cell


#### Data extraction

In [141]:
# Transcriptomics: keep only the rows whose sample id belongs to the selected cancer type.
modality, moda = 'transcriptomics', 'tran_'  # file-name tag, column/index tag
mda_n_sys_a = cell_line.transcriptomics.loc[cell_line.transcriptomics['improve_sample_id'].isin(ids_sys_a)]  # cell line
mda_n_sys_b = cptac.transcriptomics.loc[cptac.transcriptomics['improve_sample_id'].isin(ids_sys_b)]  # CPTAC

In [None]:
# Proteomics: keep only the rows whose sample id belongs to the selected cancer type.
modality = 'proteomics' # to file name
moda = 'prot_' # to columns and index
# The selected ids are named ids_sys_a / ids_sys_b; ids_proj_a / ids_proj_b are never defined.
mda_n_sys_a = cell_line.proteomics[cell_line.proteomics.improve_sample_id.isin(ids_sys_a)] # cl
mda_n_sys_b = cptac.proteomics[cptac.proteomics.improve_sample_id.isin(ids_sys_b)] # cp

In [None]:
# Copy number: keep only the rows whose sample id belongs to the selected cancer type.
modality = 'copy-number' # to file name
moda = 'cpnm_' # to columns and index
# The selected ids are named ids_sys_a / ids_sys_b; ids_proj_a / ids_proj_b are never defined.
mda_n_sys_a = cell_line.copy_number[cell_line.copy_number.improve_sample_id.isin(ids_sys_a)] # cl
# System b is CPTAC, so it must be read from the cptac loader rather than cell_line.
mda_n_sys_b = cptac.copy_number[cptac.copy_number.improve_sample_id.isin(ids_sys_b)] # cp

In [None]:
# Mutations: keep only the rows whose sample id belongs to the selected cancer type.
modality = 'mutations' # to file name
moda = 'muta_' # to columns and index
# System a is the cell line and system b is CPTAC, matching the ids and the System labels
# attached later. The selected ids are named ids_sys_a / ids_sys_b.
mda_n_sys_a = cell_line.mutations[cell_line.mutations.improve_sample_id.isin(ids_sys_a)] # cl
mda_n_sys_b = cptac.mutations[cptac.mutations.improve_sample_id.isin(ids_sys_b)] # cp

#### Project A, cell line

In [142]:
# Summarise system a's long-form table: row count, missing cells, infinite cells.
df_lite, size, na_count, inf_count = df_check(mda_n_sys_a)
print(sys_a, '| sys a')
print(cncr, modality)
print(size, na_count, inf_count, sep='\n')

cell-line | sys a
renal-clear-cell transcriptomics
326,261
0
0


In [143]:
# Pivot system a's long-form table to wide form and report the elapsed time.
wall_clock, dot_T = extract(df_lite)
print(wall_clock)
dot_T.iloc[:3, :3]  # preview the top-left corner

0.0 minutes


entrez_id,1,2,9
improve_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15,0.176323,0.097611,1.778209
158,0.201634,0.669027,2.127633
249,0.111031,0.214125,2.140779


In [144]:
cncr

'renal-clear-cell'

In [145]:
moda

'tran_'

In [146]:
# Relabel columns and index with string tags; g receives a copy and its return value replaces dot_T.
dot_T = g(moda, dot_T.copy())
dot_T.iloc[:3, :3]  # preview the top-left corner

columns are int, index are int


entrez_id,entrz_tran_1,entrz_tran_2,entrz_tran_9
improve_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
smpl_id_15,0.176323,0.097611,1.778209
smpl_id_158,0.201634,0.669027,2.127633
smpl_id_249,0.111031,0.214125,2.140779


In [147]:
dot_T.shape

(17, 19176)

In [148]:
# Drop every biomarker column that has a missing value; rebind instead of mutating in place.
dot_T = dot_T.dropna(axis = 1)

In [149]:
dot_T.shape

(17, 19176)

In [150]:
a = dot_T # cell line

#### Project B, CPTAC

In [151]:
# Summarise system b's long-form table: row count, missing cells, infinite cells.
df_lite, size, na_count, inf_count = df_check(mda_n_sys_b)
print(sys_b, '| sys b')
print(cncr, modality)
print(size, na_count, inf_count, sep='\n')

cptac | sys b
renal-clear-cell transcriptomics
4,380,249
115,422
0


In [152]:
# Pivot system b's long-form table to wide form and report the elapsed time.
wall_clock, dot_T = extract(df_lite)
print(wall_clock)
dot_T.iloc[:3, :3]  # preview the top-left corner

0.02 minutes


entrez_id,1.0,2.0,3.0
improve_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2891.0,5.33,15.85,5.32
2892.0,4.96,15.87,4.98
2893.0,4.84,15.52,4.5


In [153]:
dot_T = g(moda, dot_T.copy())

both float


In [154]:
dot_T.iloc[:3, :3]

entrez_id,entrz_tran_1,entrz_tran_2,entrz_tran_3
improve_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
smpl_id_2891,5.33,15.85,5.32
smpl_id_2892,4.96,15.87,4.98
smpl_id_2893,4.84,15.52,4.5


In [155]:
dot_T.shape

(110, 37287)

In [156]:
# Drop every biomarker column that has a missing value; rebind instead of mutating in place.
dot_T = dot_T.dropna(axis = 1)

In [157]:
dot_T.shape

(110, 37287)

In [158]:
b = dot_T # cptac

#### Labels

In [159]:
# Check a and b are different

In [160]:
a.iloc[:3, :3]

entrez_id,entrz_tran_1,entrz_tran_2,entrz_tran_9
improve_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
smpl_id_15,0.176323,0.097611,1.778209
smpl_id_158,0.201634,0.669027,2.127633
smpl_id_249,0.111031,0.214125,2.140779


In [161]:
b.iloc[:3, :3]

entrez_id,entrz_tran_1,entrz_tran_2,entrz_tran_3
improve_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
smpl_id_2891,5.33,15.85,5.32
smpl_id_2892,4.96,15.87,4.98
smpl_id_2893,4.84,15.52,4.5


In [162]:
# Put the System and Cancer_type label columns at the front of each system's table.
# DataFrame.insert raises ValueError when the column already exists, so an existing
# label column is overwritten instead; this keeps the cell safe to re-run.
for labelled_frame, system_label in ((a, sys_a_lbl), (b, sys_b_lbl)):
    # Cancer_type goes in first so that System ends up as the leading column.
    for label_column, label_value in (('Cancer_type', cncr_lbl), ('System', system_label)):
        if label_column in labelled_frame.columns:
            labelled_frame[label_column] = label_value
        else:
            labelled_frame.insert(0, label_column, label_value)

In [163]:
# Stack both systems row-wise; join='inner' keeps only the columns present in both a and b.
ab = pd.concat([a, b], axis=0, join='inner')

In [None]:
ab

In [165]:
ab.System.value_counts()

System
CPTAC        110
cell_line     17
Name: count, dtype: int64

In [166]:
ab.Cancer_type.value_counts()

Cancer_type
renal_clear_cell    127
Name: count, dtype: int64

In [167]:
# Set target output dirs (relative to the notebook's working directory)
out_dbl = '../output/dbl-cncr/'  # destination of the two-cancer table
out_sng = '../output/sng-cncr'  # destination of the single-cancer table; no trailing slash, unlike the others
ump_out = '../output/umap/'

In [168]:
# out, u_o, cancer, data_type
out_dbl, out_sng, cncr, modality, ump_out

('../output/dbl-cncr/',
 '../output/sng-cncr',
 'renal-clear-cell',
 'transcriptomics',
 '../output/umap/')

In [169]:
# Write two-system, single cancer type to disk.
# out_sng already starts with '../output/', so it is joined directly rather than prefixed again.
sng_path = os.path.join(out_sng, cncr + '_' + modality + '_' + systems + '.tsv')
os.makedirs(out_sng, exist_ok=True)  # to_csv does not create missing directories
ab.to_csv(sng_path, sep = '\t')

In [170]:
# Read the single-cancer table back as a round-trip check.
# out_sng already starts with '../output/', so it is joined directly rather than prefixed again.
sng_chk = pd.read_csv(
    os.path.join(out_sng, cncr + '_' + modality + '_' + systems + '.tsv'),
    sep = '\t', index_col = 0)

In [None]:
print(cncr)
sng_chk

#### Dbl Cncr

In [172]:
print(cncr)
ab.iloc[:3, :3]

renal-clear-cell


entrez_id,System,Cancer_type,entrz_tran_1
improve_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
smpl_id_15,cell_line,renal_clear_cell,0.176323
smpl_id_158,cell_line,renal_clear_cell,0.201634
smpl_id_249,cell_line,renal_clear_cell,0.111031


In [173]:
ab.shape

(127, 19133)

In [None]:
#### ab hold

In [168]:
sng_lbl_disk_read_n = ab
print(cncr, moda)

lung-adeno tran_


#### Disk read Cancer_n

In [204]:
pwd

'/Users/karlberb/Documents/work/2024/bnch_mrk/mdl-sys-bnchmrk/data'

In [None]:
# output/sng_lab/lung_adeno_transcriptomics_cell_line_CPTAC.tsv

In [None]:
cncr_n = 'lung-adeno'
moda_n = 'transcriptomics'
sys_n = 'cell-line+CPTAC'

In [None]:
# Load a single-cancer table for the cancer named by cncr_n from disk.
# NOTE(review): this reads from '../output/sng-lbl/', while the single-cancer write
# earlier in the notebook targets out_sng ('../output/sng-cncr') — confirm the directory.
sng_lbl_disk_read_n = pd.read_csv( # single cancer   ------
    '../output/sng-lbl/'+cncr_n+'_'+moda_n+'_'+sys_n+'.tsv',
    sep = '\t', index_col = 0)

In [83]:
sng_lbl_disk_read_n.iloc[:3, :3]

Unnamed: 0_level_0,Labels,entrz_tran_1,entrz_tran_2
improve_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
smpl_id_11,cell_line,2.341986,0.070389
smpl_id_20,cell_line,4.560104,0.014355
smpl_id_28,cell_line,0.411426,0.214125


In [84]:
sng_lbl_disk_read_n.shape

(188, 19132)

In [85]:
# Tag the disk-read table with its cancer type. cncr_n is the hyphenated file-name tag,
# while the Cancer_type column of ab holds the underscore label (cncr_lbl), so the tag is
# converted to that form to keep both tables consistent.
cancer_label_n = cncr_n.replace('-', '_')
# insert raises ValueError if the column already exists; skip it so the cell can be re-run.
if 'Cancer_type' not in sng_lbl_disk_read_n.columns:
    sng_lbl_disk_read_n.insert(1, 'Cancer_type', cancer_label_n)

In [231]:
# Preview the disk-read table; `check` is not defined at this point in the notebook.
sng_lbl_disk_read_n.iloc[:3, :3]

Unnamed: 0_level_0,Labels,Cancer_type,entrz_tran_1
improve_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
smpl_id_11,cell_line,lung_adeno,2.341986
smpl_id_20,cell_line,lung_adeno,4.560104
smpl_id_28,cell_line,lung_adeno,0.411426


In [86]:
# Rename on the frame that is concatenated with ab below: that concat uses join='inner',
# so a column still called 'Labels' would not match ab's 'System' column and would be dropped.
sng_lbl_disk_read_n = sng_lbl_disk_read_n.rename(columns={'Labels': 'System'})
check = sng_lbl_disk_read_n  # keep the previous name available

In [None]:
# dbl_cncr is created by the pd.concat cell that follows; it is displayed after that cell.

In [206]:
dbl_cncr = pd.concat([ab, sng_lbl_disk_read_n], axis = 0, join = 'inner')

In [None]:
dbl_cncr

In [77]:
cancer

'head_neck'

In [210]:
c1 = 'pancreatic_adeno'
c2 = 'lung_adeno'

In [220]:
# NOTE(review): this hard-codes the first cancer tag used in the two-cancer file name below;
# confirm it matches the cancer type that ab was actually built from.
cncr = 'pancreatic-adeno'

In [221]:
cncr_n = 'lung-adeno'

In [222]:
systems, modality

('cell-line+CPTAC', 'transcriptomics')

In [223]:
# Write the two-cancer table to disk.
# out_dbl already starts with '../output/', so it is joined directly rather than prefixed again.
dbl_path = os.path.join(out_dbl, cncr + '+' + cncr_n + '_' + modality + '_' + systems + '.tsv')
os.makedirs(out_dbl, exist_ok=True)  # to_csv does not create missing directories
dbl_cncr.to_csv(dbl_path, sep = '\t')

In [225]:
# Read the two-cancer table back as a round-trip check.
# out_dbl already starts with '../output/', so it is joined directly rather than prefixed again.
dbl_chk = pd.read_csv(
    os.path.join(out_dbl, cncr + '+' + cncr_n + '_' + modality + '_' + systems + '.tsv'),
    sep = '\t', index_col = 0)

In [None]:
dbl_chk