In [None]:
# data_loader
    # read from data/model system sub-folders
    # write to /r7/cancer_modality_system.input-data.tsv
    # /home/groups/EllrottLab/mdl-sys-bnchmrk/data/r7

https://pnnl-compbio.github.io/coderdata/pages/usage

#### Download data

In [None]:
import coderdata as cd

In [None]:
# cd.download_data_by_prefix('beataml')
# cd.download_data_by_prefix('cell_line')
# cd.download_data_by_prefix('cptac')
# cd.download_data_by_prefix('hcmi')

##### Exacloud download

In [None]:
pip install coderdata

##### Exacloud upload

In [None]:
ls ../data/beataml/

#### Set systems

In [None]:
# Show the kernel's working directory. The relative '../data/...' paths used
# in this notebook assume it is run from mdl-sys-bnchmrk/code.
import os
os.getcwd()

In [None]:
%whos

In [None]:
import coderdata as cd
# One DatasetLoader per model system, each pointed at its own sub-folder of ../data/.
cell_line = cd.DatasetLoader('cell_line', data_directory = '../data/cell_line/') # 'cell_line' dataset
cptac = cd.DatasetLoader('cptac', data_directory = '../data/cptac/') # 'cptac' dataset
beataml = cd.DatasetLoader('beataml', data_directory = '../data/beataml/') # 'beataml' dataset
hcmi = cd.DatasetLoader('hcmi', data_directory = '../data/hcmi/') # 'hcmi' dataset

print('DatasetLoaders in RAM')
systems = 'cell-line+CPTAC' # default pairing tag; reassigned by the system-selection cells further down

#### Categorical checks

In [None]:
cell_line.transcriptomics

In [None]:
cell_line.proteomics

In [None]:
cell_line.mutations

In [None]:
cell_line.mutations.mutations.value_counts()

In [None]:
cell_line.mutations.variant_classification.value_counts()

In [None]:
cell_line.copy_number

In [None]:
cell_line.copy_number.copy_call.value_counts()

In [None]:
cptac.copy_number.copy_call.value_counts()

In [None]:
cptac.mutations.variant_classification.value_counts()

In [None]:
hcmi.mutations

In [None]:
# cptac.mutations.variant_classification.value_counts()
hcmi.mutations.variant_class.value_counts()

In [None]:
hcmi.copy_number.copy_call.value_counts()

In [None]:
beataml.mutations.variant_classification.value_counts()

In [None]:
beataml.copy_number # no copy number in beataml

In [None]:
beataml.copy_number.copy_call.value_counts()

#### Imports and functions

In [None]:
import glob
import pandas as pd
import umap
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

In [None]:
# Transpose the long-form X-file so that biomarkers become columns.
def extract(df_lite):
    """Pivot a long-form modality table into a samples-by-genes matrix.

    Args:
        df_lite (pandas.DataFrame): Long-form frame with 'improve_sample_id'
            and 'entrez_id' columns; the column at position 2 holds the
            values to spread out.

    Returns:
        tuple: ``(elapsed, wide)`` where ``elapsed`` is the wall-clock time of
        the pivot formatted as '<minutes> minutes' and ``wide`` is the pivoted
        frame, indexed by 'improve_sample_id' with one column per 'entrez_id'.
    """
    import time

    t0 = time.time()
    value_col = df_lite.columns[2]  # third column carries the marker values
    wide = df_lite.pivot_table(
        values=value_col,
        index='improve_sample_id',
        columns='entrez_id',
        aggfunc='mean',  # repeated (sample, gene) pairs are averaged
    )
    elapsed_minutes = (time.time() - t0) / 60
    return str(round(elapsed_minutes, 2)) + ' minutes', wide

In [None]:
# Extract ids and biomarker values
def df_check(X_n):
    """Keep the first three columns of a modality table and summarise them.

    Args:
        X_n (pandas.DataFrame): Modality table; everything after the third
            column is dropped.

    Returns:
        tuple: ``(df_lite, size, na_count, inf_count)`` - the three-column
        slice followed by its row count, its number of missing cells and its
        number of +/-infinity cells, each formatted as a string with
        thousands separators.
    """
    df_lite = X_n.iloc[:, :3]
    n_rows = df_lite.shape[0]
    n_missing = df_lite.isna().sum().sum()
    n_infinite = df_lite.isin([np.inf, -np.inf]).sum().sum()
    return df_lite, f"{n_rows:,}", f"{n_missing:,}", f"{n_infinite:,}"

In [None]:
# dot_T = g(d_typ, dot_T.copy())
def g(d_typ, df):
    """
    Relabel a pivoted DataFrame according to the dtypes of its columns and index.

    Prints which dtype combination was detected and hands the frame to the
    matching helper, which turns the labels into prefixed strings.

    Args:
        d_typ (str): Modality tag forwarded to the helper, which embeds it in
            every column label.
        df (pandas.DataFrame): The DataFrame to check. The helpers assign to
            ``df.columns`` / ``df.index``, so the frame is modified in place;
            pass a copy to keep the original untouched.

    Returns:
        pandas.DataFrame: The relabelled frame, or ``df`` unchanged when the
        columns or the index are neither integer nor float typed.
    """
    # Use pandas' dtype predicates instead of comparing against the strings
    # 'int' / 'float' / 'float64': those comparisons only match one specific
    # width, so e.g. int32 labels would fall through to the last branch.
    col_dtype, idx_dtype = df.columns.dtype, df.index.dtype
    cols_float = pd.api.types.is_float_dtype(col_dtype)
    cols_int = pd.api.types.is_integer_dtype(col_dtype)
    idx_float = pd.api.types.is_float_dtype(idx_dtype)
    idx_int = pd.api.types.is_integer_dtype(idx_dtype)

    if cols_float and idx_float:
        print('both float')
        df = float_to_string(d_typ, df)
    elif cols_float and idx_int:
        print('columns are float, index are int')
        df = indx_int_colm_flt(d_typ, df)
    elif cols_int and idx_float:
        print('columns are int, index are float, fail, write another function')
        df = indx_flt_colm_int(d_typ, df)
    elif cols_int and idx_int:
        print('columns are int, index are int')
        df = int_to_string(d_typ, df)
    else:
        # Labels are left as they are; the caller gets the same object back.
        print('non int or float dtype detected')
    return df

In [None]:
def int_to_string(d_typ, dot_T):
    """Turn integer column and index labels into prefixed strings.

    Columns become 'entrz_' + d_typ + <label>, index entries become
    'smpl_id_' + <label>. The frame is modified in place.

    Args:
        d_typ (str): Modality tag inserted into every column label.
        dot_T (pandas.DataFrame): Pivoted frame to relabel.

    Returns:
        pandas.DataFrame: The same ``dot_T`` object, with its columns named
        'entrez_id' and its index named 'improve_sample_id'.
    """
    gene_labels = ['entrz_' + d_typ + str(gene) for gene in dot_T.columns]
    sample_labels = ['smpl_id_' + str(sample) for sample in dot_T.index]

    dot_T.columns = gene_labels
    dot_T.columns.name = 'entrez_id'
    dot_T.index = sample_labels
    dot_T.index.name = 'improve_sample_id'
    return dot_T

In [None]:
def indx_int_colm_flt(d_typ, dot_T):
    """Relabel a frame with float column labels and integer index labels.

    Column labels are rendered as text and cut at the first '.', so 100.0
    becomes '100', then prefixed with 'entrz_' + d_typ. Index labels are
    prefixed with 'smpl_id_'. The frame is modified in place.

    Args:
        d_typ (str): Modality tag inserted into every column label.
        dot_T (pandas.DataFrame): Pivoted frame to relabel.

    Returns:
        pandas.DataFrame: The same ``dot_T`` object, with its columns named
        'entrez_id' and its index named 'improve_sample_id'.
    """
    gene_labels = []
    for gene in dot_T.columns:
        whole_part = str(gene).split('.')[0]  # drop the '.0' of a float label
        gene_labels.append('entrz_' + d_typ + whole_part)
    dot_T.columns = gene_labels
    dot_T.columns.name = 'entrez_id'

    dot_T.index = ['smpl_id_' + str(sample) for sample in dot_T.index]
    dot_T.index.name = 'improve_sample_id'
    return dot_T

In [None]:
def float_to_string(d_typ, dot_T):
    """Relabel a frame whose column and index labels are both floats.

    Every label is rendered as text and cut at the first '.', so 100.0
    becomes '100'. Columns are then prefixed with 'entrz_' + d_typ and index
    entries with 'smpl_id_'. The frame is modified in place.

    Args:
        d_typ (str): Modality tag inserted into every column label.
        dot_T (pandas.DataFrame): Pivoted frame to relabel.

    Returns:
        pandas.DataFrame: The same ``dot_T`` object, with its columns named
        'entrez_id' and its index named 'improve_sample_id'.
    """
    def whole_part(label):
        # Text before the first '.', i.e. the integer part of a float label.
        return str(label).split('.')[0]

    dot_T.columns = ['entrz_' + d_typ + whole_part(gene) for gene in dot_T.columns]
    dot_T.columns.name = 'entrez_id'

    dot_T.index = ['smpl_id_' + whole_part(sample) for sample in dot_T.index]
    dot_T.index.name = 'improve_sample_id'
    return dot_T

In [None]:
def indx_flt_colm_int(d_typ, dot_T):
    """Relabel a frame with integer column labels and float index labels.

    Columns become 'entrz_' + d_typ + <label>. Index labels are rendered as
    text, cut at the first '.', so 7.0 becomes '7', and prefixed with
    'smpl_id_'. The frame is modified in place.

    Args:
        d_typ (str): Modality tag inserted into every column label.
        dot_T (pandas.DataFrame): Pivoted frame to relabel.

    Returns:
        pandas.DataFrame: The same ``dot_T`` object, with its columns named
        'entrez_id' and its index named 'improve_sample_id'.
    """
    dot_T.columns = ['entrz_' + d_typ + str(gene) for gene in dot_T.columns]
    dot_T.columns.name = 'entrez_id'

    # The '.0' suffix lives on the float index labels, so the truncation must
    # be applied to the index. It was previously applied to the (already
    # prefixed) column labels, which left index labels such as 'smpl_id_7.0'.
    dot_T.index = ['smpl_id_' + str(sample).split('.')[0] for sample in dot_T.index]
    dot_T.index.name = 'improve_sample_id'
    return dot_T

#### Categorical extracts

In [None]:
# Copy number test added below

#### HCMI inspections

In [None]:
sys_b = 'hcmi' # TODO(review): switch hcmi to system a, keeping only cptac as system b?
sys_b_samp = hcmi.samples

In [None]:
sys_b_samp.model_type.value_counts()[:100]

In [None]:
sys_b_samp[sys_b_samp.model_type == 'Peripheral Whole Blood']

In [None]:
sys_b_samp_orgnd = sys_b_samp[sys_b_samp.model_type == '3D Organoid']

In [None]:
sys_b_samp_orgnd.cancer_type.value_counts()[:100]

In [None]:
sys_b_samp_orgnd.common_name.value_counts()[:5]

In [None]:
sys_b_samp_orgnd.common_name == 'Colon, NOS'

In [None]:
# Descriptive stat plots? - mult_dim_dscrptv

In [None]:
print(sys_b, 'is system b')
sys_b_samp.common_name.value_counts()[:100]

In [None]:
sys_a_samp = cell_line.samples

In [None]:
sys_a_samp = cptac.samples

In [None]:
sys_a_samp = cell_line.samples

### Samples, abstract systems to a and b

#### HCMI + CPTAC to a and b

In [None]:
# HCMI + CPTAC
systems = 'HCMI+CPTAC'

sys_a_samp = hcmi.samples
sys_a_samp = sys_a_samp[sys_a_samp.model_type == '3D Organoid']

sys_a = 'hcmi'
# sys_a_lbl = 'HCMI'

sys_b_samp = cptac.samples
sys_b = 'cptac'
# sys_b_lbl = 'CPTAC'

##### cell line + HCMI, -> SKIP <-

In [None]:
# Not correcting between two non-human cancer model systems

# cell line+HCMI
# systems = 'cell-line+HCMI'

# sys_a_samp = cell_line.samples
# sys_a = 'cell-line'
# sys_a_lbl = 'cell_line'

# sys_b_samp = hcmi.samples
# sys_b_samp = sys_b_samp[sys_b_samp.model_type == '3D Organoid'] # Subset to just organoids
#                                                                 # intentionally extract cell line samples for
#                                                                 # statistical power
# sys_b = 'hcmi'
# sys_b_lbl = 'HCMI'

#### Cell line + AML to a and b

In [None]:
# AML
systems = 'cell-line+BeatAML'

sys_a_samp = cell_line.samples
sys_a = 'cell-line'
# sys_a_lbl = 'cell_line'

sys_b_samp = beataml.samples
sys_b = 'beataml'
# sys_b_lbl = 'BeatAML'

In [None]:
# No AML in HCMI, only cell line correction possible

#### Cell line + CPTAC to a and b

In [None]:
# Cell line + CPTAC
systems = 'cell-line+CPTAC'

sys_a_samp = cell_line.samples
sys_a = 'cell-line'
sys_a_lbl = 'cell_line'

sys_b_samp = cptac.samples
sys_b = 'cptac'
sys_b_lbl = 'CPTAC'

#### Cancer_type view

In [None]:
print(sys_a, 'is system a')
sys_a_samp.cancer_type.value_counts()[:205]

In [None]:
len(sys_a_samp.cancer_type.value_counts())

In [None]:
print(sys_a, 'is system a')
sys_a_samp.common_name.value_counts()[:60] # HCMI has common names

In [None]:
sys_a_samp

In [None]:
print(sys_b, 'is system b')
sys_b_samp.cancer_type.value_counts()[:50]

In [None]:
# beatAML has common_name colmn
print(sys_b, 'is system b')
# sys_b_samp.cancer_type.value_counts()[:50]
sys_b_samp.common_name.value_counts()[:50]

In [None]:
# NOTE(review): sys_c and sys_c_samp are not assigned anywhere in the visible
# notebook, so this cell raises NameError until a third system is defined.
print(sys_c, 'is system c')
sys_c_samp.cancer_type.value_counts()[:2] # hold for 3-project combos

### Cancer_type toggles into systems A and B

#### HCMI + CPTAC

In [None]:
print('systems:', systems)
cncr = 'colon-nos-adeno'
# cncr_lbl = 'colon-nos-adeno'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.common_name == 'Colon, NOS'] # hcmi
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Colon adenocarcinoma'] # cptac
print(cncr)
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

In [None]:
sys_a_samp_canc_n.improve_sample_id

In [None]:
print('systems:', systems)
cncr = 'pancreatic-nos-ductal-ad'
# cncr_lbl = 'pancreatic'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.common_name == 'Pancreas, NOS'] # hcmi
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Pancreatic ductal adenocarcinoma'] # cptac
print(cncr)
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

#### BeatAML

In [None]:
print(systems)
cncr = 'AML'
# cncr_lbl = 'aml'
print(cncr)
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Acute Myeloid Leukemia'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'ACUTE MYELOID LEUKAEMIA'] # beatAML
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

#### Cell line + CPTAC, 5 cancers

In [None]:
# Lung adeno
print(systems)
cncr = 'lung-adeno'
print(cncr)
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Lung Adenocarcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Lung adenocarcinoma'] # cp
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

In [None]:
# Pancreatic adeno
print(systems)
cncr = 'pancreatic-adeno'
print(cncr)
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Pancreatic Adenocarcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Pancreatic ductal adenocarcinoma'] # cp
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

In [None]:
# Colon adeno
print(systems)
cncr = 'colon-adeno'
print(cncr)
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Colon Adenocarcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Colon adenocarcinoma'] # cp
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

In [None]:
# Glioblastoma
print(systems)
cncr = 'glioblastoma'
print(cncr)
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Glioblastoma']
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Glioblastoma multiforme'] # cp
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

In [None]:
# Renal clear cell
print(systems)
cncr = 'renal-clear-cell'
print(cncr)
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Renal Clear Cell Carcinoma']
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Clear cell renal cell carcinoma'] # cp
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

#### Cell line + CPTAC, BRCA

In [None]:
# Breast Ductal - 3 transcriptomic samples for cell line
print(systems)
cncr = 'breast-ductal'
print(cncr)
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Breast Invasive Ductal Carcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Breast carcinoma']
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

In [None]:
# Breast Lobular - # one cell line sample for prot, 8 samples trans (post-data type mapping)
print(systems)
cncr = 'breast-lobular'
print(cncr)
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Breast Invasive Lobular Carcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Breast carcinoma']
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

In [None]:
# Breast NOS - # two cell line samples for prot - 3 transcriptomic samples for cell line (post-data type mapping)
print(systems)
cncr = 'breast-nos'
print(cncr)
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Breast Invasive Carcinoma, NOS'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Breast carcinoma']
print('sys a:', sys_a, len(sys_a_samp_canc_n.improve_sample_id.unique()),
    '\nsys b:', sys_b, len(sys_b_samp_canc_n.improve_sample_id.unique()))

### ID extract

In [None]:
# def function:
print(cncr, sys_a, sys_b)
ids_sys_a = sys_a_samp_canc_n.improve_sample_id # 
ids_sys_b = sys_b_samp_canc_n.improve_sample_id #

In [None]:
ids_sys_a

### Data extraction

#### HCMI + CPTAC, data extraction

In [None]:
hcmi.transcriptomics

In [None]:
# Subset each system's transcriptomics table to the sample ids selected for the current cancer type.
print(cncr, systems)
modality = 'transcriptomics' # used in the output file name
print(modality)
moda = 'tran_' # tag embedded in the column labels
mda_n_sys_a = hcmi.transcriptomics[hcmi.transcriptomics.improve_sample_id.isin(ids_sys_a)] # hcmi is sys a
mda_n_sys_b = cptac.transcriptomics[cptac.transcriptomics.improve_sample_id.isin(ids_sys_b)] # cptac is sys b
print('sys a:', len(mda_n_sys_a.improve_sample_id.unique()))
print('sys b:', len(mda_n_sys_b.improve_sample_id.unique()))

##### prot hcmi + cptac

In [None]:
# Subset each system's proteomics table to the sample ids selected for the current cancer type.
print(cncr, systems)
modality = 'proteomics' # used in the output file name
print(modality)
moda = 'prot_' # tag embedded in the column labels
mda_n_sys_a = hcmi.proteomics[hcmi.proteomics.improve_sample_id.isin(ids_sys_a)] # hcmi is sys a
mda_n_sys_b = cptac.proteomics[cptac.proteomics.improve_sample_id.isin(ids_sys_b)] # cptac is sys b
print('sys a:', len(mda_n_sys_a.improve_sample_id.unique()))
print('sys b:', len(mda_n_sys_b.improve_sample_id.unique()))

In [None]:
cptac.proteomics

In [None]:
hcmi.proteomics

#### Cell line + BeatAML, data extraction

##### trans, cl + AML

In [None]:
print(cncr, systems)
modality = 'transcriptomics'
print(modality)
moda = 'tran_'
mda_n_sys_a = cell_line.transcriptomics[cell_line.transcriptomics.improve_sample_id.isin(ids_sys_a)] # cl
mda_n_sys_b = beataml.transcriptomics[beataml.transcriptomics.improve_sample_id.isin(ids_sys_b)] # baml
print('sys a:', len(mda_n_sys_a.improve_sample_id.unique()))
print('sys b:', len(mda_n_sys_b.improve_sample_id.unique()))

##### prot, cl + AML

In [None]:
print(cncr, systems)
modality = 'proteomics' # to file name
print(modality)
moda = 'prot_' # to columns and index
mda_n_sys_a = cell_line.proteomics[cell_line.proteomics.improve_sample_id.isin(ids_sys_a)] # cl
mda_n_sys_b = beataml.proteomics[beataml.proteomics.improve_sample_id.isin(ids_sys_b)] # beat aml
print('sys a:', len(mda_n_sys_a.improve_sample_id.unique()))
print('sys b:', len(mda_n_sys_b.improve_sample_id.unique()))

#### Cell line + CPTAC, data extraction

##### trns

In [None]:
print(cncr)
modality = 'transcriptomics' # to file name # BRCA bug?
print(modality)
moda = 'tran_' # to columns and index
mda_n_sys_a = cell_line.transcriptomics[cell_line.transcriptomics.improve_sample_id.isin(ids_sys_a)] # cl
mda_n_sys_b = cptac.transcriptomics[cptac.transcriptomics.improve_sample_id.isin(ids_sys_b)]
print('sys a:', len(mda_n_sys_a.improve_sample_id.unique()))
print('sys b:', len(mda_n_sys_b.improve_sample_id.unique()))

##### prot

In [None]:
print(cncr)
modality = 'proteomics' # to file name
print(modality)
moda = 'prot_' # to columns and index
mda_n_sys_a = cell_line.proteomics[cell_line.proteomics.improve_sample_id.isin(ids_sys_a)] # cl
mda_n_sys_b = cptac.proteomics[cptac.proteomics.improve_sample_id.isin(ids_sys_b)]
print('sys a:', len(mda_n_sys_a.improve_sample_id.unique()))
print('sys b:', len(mda_n_sys_b.improve_sample_id.unique()))

##### cpnm

In [None]:
# NOTE(review): `r` is not assigned anywhere in the visible notebook, so the
# next line raises NameError on a fresh kernel; out_dir is also reassigned in
# the "Disk write" section before it is used. Confirm the intended value.
out_dir = '../results/input-data-cat/'+r+'/'+r+'-fls/'
modality = 'copy-number' # to file name
moda = 'cpnm_' # to columns and index
# Subset each system's copy-number table to the selected sample ids.
mda_n_sys_a= cell_line.copy_number[cell_line.copy_number.improve_sample_id.isin(ids_sys_a)]
mda_n_sys_b= cptac.copy_number[cptac.copy_number.improve_sample_id.isin(ids_sys_b)]

##### muta

In [None]:
modality = 'mutations' # to file name
moda = 'muta_' # to columns and index
mda_n_sys_a = cell_line.mutations[cell_line.mutations.improve_sample_id.isin(ids_sys_a)]
mda_n_sys_b = cptac.mutations[cptac.mutations.improve_sample_id.isin(ids_sys_b)]

### Inspect / devel

In [None]:
mda_n_sys_a

In [None]:
mda_n_sys_b

In [None]:
X_n = mda_n_sys_a
df_lite = X_n.iloc[:, :3] # cut the last two columns, source and study
size = f"{df_lite.shape[0]:,}"
na_count = f"{df_lite.isna().sum().sum():,}"
inf_count = f"{df_lite.isin([np.inf, -np.inf]).sum().sum():,}"
# return df_lite, size, na_count, inf_count
print(df_lite, size, na_count, inf_count)

In [None]:
df_lite

In [None]:
data_type = df_lite.columns[2]

In [None]:
data_type

In [None]:
dot_T = df_lite.pivot_table(
    index='improve_sample_id',
    columns='entrez_id',
    values=data_type,
    aggfunc='mean'             # average duplicate values
)

In [None]:
dot_T

In [None]:
moda

In [None]:
g(moda, dot_T.copy())
# columns are int, index are float, fail, write another function

### Extraction to disk script

In [None]:
# --- System a: summarise, pivot to samples x genes, relabel, drop incomplete genes ---
df_lite, size, na_count, inf_count = df_check(mda_n_sys_a)
print(sys_a, '| sys a')
print(cncr, modality)
print('len: ', size)
print('NaNs: ', na_count)
print('Infs: ', inf_count)

wall_clock, dot_T = extract(df_lite)
dot_T = g(moda, dot_T.copy())
dot_T.dropna(axis = 1, inplace = True) # drop every gene column with a missing value
a = dot_T
print(len(a))
print(' ')

# --- System b: same steps ---
df_lite, size, na_count, inf_count = df_check(mda_n_sys_b)
print(sys_b, '| sys b')
print(cncr, modality)
print('len: ', size)
print('NaNs: ', na_count)
print('Infs: ', inf_count)

wall_clock, dot_T = extract(df_lite)
dot_T = g(moda, dot_T.copy())
dot_T.dropna(axis = 1, inplace = True) # drop every gene column with a missing value
b = dot_T
print(len(b))

# Prepend the label columns; 'System' ends up first, 'Cancer_type' second.
a.insert(0, 'Cancer_type', cncr)
b.insert(0, 'Cancer_type', cncr)
a.insert(0, 'System', sys_a)
b.insert(0, 'System', sys_b)

# Stack both systems; join='inner' keeps only the columns present in both frames.
ab = pd.concat([a, b], axis=0, join='inner')

### Disk write

In [None]:
out_dir = '../data/r8/'

In [None]:
out_dir, cncr, modality, systems

In [None]:
ab.System.value_counts()

In [None]:
ab.Cancer_type.value_counts()

In [None]:
ab

In [None]:
# Write the two-system, single-cancer-type table to disk as TSV.
os.makedirs(out_dir, exist_ok=True) # create the output folder if it is missing
ab.to_csv(
    # os.path.join avoids the doubled '/' when out_dir already ends with one
    os.path.join(out_dir, cncr+'_'+modality+'_'+systems+'.tsv'),
    sep = '\t')

In [None]:
# Read the file back to check what was written; the first column holds the sample ids.
chk_ab = pd.read_csv(
    # os.path.join avoids the doubled '/' when out_dir already ends with one
    os.path.join(out_dir, cncr+'_'+modality+'_'+systems+'.tsv'),
    sep = '\t', index_col = 0)

In [None]:
print(out_dir, cncr, modality, systems)
chk_ab

In [None]:
cncr, modality, systems

In [None]:
# Make binary and multi-class files in UniFile

#### Experimental record

##### Check proteomics negatives

In [None]:
prot_neg_fls = glob.glob('../strctrd/one_cncr/*proteomics_cell-line+CPTAC.tsv')

In [None]:
prot_neg_fls

In [None]:
# Print the smallest measurement in each saved proteomics table.
for tsv_path in prot_neg_fls:
    chk_ab = pd.read_csv(tsv_path,
        sep = '\t', index_col = 0)
    print(os.path.basename(tsv_path)) # file name, independent of directory depth
    # Skip the first two columns (presumably the System / Cancer_type labels
    # written by this notebook - confirm for files produced elsewhere).
    print(chk_ab.iloc[:, 2:].min().min())
    print(' ')

In [None]:
chk_ab.iloc[:, 2:].min().min()

In [None]:
print(cncr)
ab.iloc[:3, :3]

In [None]:
ab.shape

### Devel

#### Extract sample ids

In [None]:
print(cncr, sys_a, sys_b)
ids_sys_a = sys_a_samp_canc_n.improve_sample_id # 
ids_sys_b = sys_b_samp_canc_n.improve_sample_id # 

#### Data extraction, AML

#### Project A

In [None]:
# System a: summarise, pivot to samples x genes, relabel, drop incomplete genes.
df_lite, size, na_count, inf_count = df_check(mda_n_sys_a)
print(sys_a, '| sys a')
print(cncr, modality)
print('len: ', size)
print('NaNs: ', na_count)
print('Infs: ', inf_count)

wall_clock, dot_T = extract(df_lite)
dot_T = g(moda, dot_T.copy())
dot_T.dropna(axis = 1, inplace = True) # drop every gene column with a missing value
# TODO: add zero check
a = dot_T # system a

#### Project B

In [None]:
# System b: summarise, pivot to samples x genes, relabel, drop incomplete genes.
df_lite, size, na_count, inf_count = df_check(mda_n_sys_b)
print(sys_b, '| sys b')
print(cncr, modality)
print('len: ', size)
print('NaNs: ', na_count)
print('Infs: ', inf_count)

wall_clock, dot_T = extract(df_lite)
dot_T = g(moda, dot_T.copy())
dot_T.dropna(axis = 1, inplace = True) # drop every gene column with a missing value
b = dot_T # system b

#### Labels

##### Check a and b are different

In [None]:
a.iloc[:3, :3]

In [None]:
b.iloc[:3, :3]

##### Continue

In [None]:
# Prepend the label columns. cncr_lbl is never assigned (all of its
# assignments are commented out) and sys_a_lbl / sys_b_lbl exist in only one
# configuration cell, so use cncr / sys_a / sys_b, as the
# "Extraction to disk script" cell does.
a.insert(0, 'Cancer_type', cncr)
b.insert(0, 'Cancer_type', cncr)
a.insert(0, 'System', sys_a)
b.insert(0, 'System', sys_b)

In [None]:
ab = pd.concat([a, b], axis=0, join='inner')

In [None]:
print(cncr, modality)
ab

In [None]:
ab.shape