#### Set systems

In [None]:
%whos

In [None]:
import coderdata as cd
# Build one DatasetLoader per dataset, each pointed at its own local data
# directory. The trailing letters (a-d) are the author's shorthand for the
# systems; later cells refer to "system a" / "system b".
cell_line = cd.DatasetLoader('cell_line', data_directory = '../data/cell_line/') # a
cptac = cd.DatasetLoader('cptac', data_directory = '../data/cptac/') # b
beataml = cd.DatasetLoader('beataml', data_directory = '../data/beataml/') # c
hcmi = cd.DatasetLoader('hcmi', data_directory = '../data/hcmi/') # d

print('DatasetLoaders in RAM')
# Tag naming the pair of systems being compared; later cells append it to the
# names of the files they write.
systems = 'cell-line+CPTAC'

In [None]:
# target output dir: strctrd/one_cncr

#### Semi-interactive devel

In [None]:
import glob
import pandas as pd
import umap
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import glob

##### Functions

In [None]:
# Transpose long-form X-file so that biomarkers become columns
def extract(df_lite):
    """Pivot a long-form modality table into a samples-by-genes matrix.

    The third column of ``df_lite`` is used as the measurement column. Rows
    of the result are keyed by ``improve_sample_id`` and columns by
    ``entrez_id``; repeated (sample, gene) pairs are collapsed to their mean.

    Returns:
        tuple: ``(elapsed, wide)`` where ``elapsed`` is the wall-clock time of
        the pivot formatted as ``'<minutes> minutes'`` and ``wide`` is the
        pivoted DataFrame.
    """
    import time

    t0 = time.time()
    value_col = df_lite.columns[2]  # measurement values sit in the third column

    wide = df_lite.pivot_table(
        values=value_col,
        index='improve_sample_id',
        columns='entrez_id',
        aggfunc='mean',  # duplicates are averaged
    )

    elapsed_minutes = round((time.time() - t0) / 60, 2)
    return f"{elapsed_minutes} minutes", wide

In [None]:
# Extract ids and biomarker values
def df_check(X_n):
    """Trim a modality table to its first three columns and summarise it.

    Returns:
        tuple: ``(df_lite, size, na_count, inf_count)`` -- the three-column
        slice, followed by its row count, its total NaN count and its total
        +/-inf count, each rendered as a string with thousands separators.
    """
    # Keep ids and values only; the trailing columns (source, study) are dropped.
    first_three = X_n.iloc[:, :3]

    n_rows = first_three.shape[0]
    n_missing = first_three.isna().sum().sum()
    n_infinite = first_three.isin([np.inf, -np.inf]).sum().sum()

    return first_three, f"{n_rows:,}", f"{n_missing:,}", f"{n_infinite:,}"

In [None]:
# dot_T = g(d_typ, dot_T.copy())
def g(d_typ, df):
    """
    Relabel the columns and index of a pivoted table according to their dtypes.

    The dtypes of ``df.columns`` and ``df.index`` decide which relabelling
    helper is applied; a short message naming the detected combination is
    printed on every call.

    Args:
        d_typ (str): Modality tag handed through to the relabelling helper.
        df (pandas.DataFrame): The table whose column and index labels are
            inspected. The helpers relabel it in place, so pass a copy if the
            original must stay untouched.

    Returns:
        pandas.DataFrame: The relabelled table, or ``df`` unchanged when no
        helper exists for the detected dtype combination (int columns with a
        float index, or any non-numeric labels).
    """
    # Dtype predicates instead of string comparison: `== 'int'` only matches
    # the platform's default integer width and `== 'float64'` misses other
    # float widths, which would silently skip the relabelling.
    from pandas.api.types import is_float_dtype, is_integer_dtype

    cols_float = is_float_dtype(df.columns.dtype)
    cols_int = is_integer_dtype(df.columns.dtype)
    idx_float = is_float_dtype(df.index.dtype)
    idx_int = is_integer_dtype(df.index.dtype)

    if cols_float and idx_float:
        print('both float')
        df = float_to_string(d_typ, df)
    elif cols_float and idx_int:
        print('columns are float, index are int')
        df = indx_int_colm_flt(d_typ, df)
    elif cols_int and idx_float:
        # No helper exists yet for this combination; the table is returned as-is.
        print('columns are int, index are float, fail, write another function')
        # fourth function
    elif cols_int and idx_int:
        print('columns are int, index are int')
        df = int_to_string(d_typ, df)
    else:
        print('non int or float dtype detected')
    return df

In [None]:
def int_to_string(d_typ, dot_T):
    """Turn integer column and index labels into prefixed strings.

    Columns become ``'entrz_' + d_typ + <label>`` under the axis name
    ``entrez_id``; index entries become ``'smpl_id_' + <label>`` under the
    axis name ``improve_sample_id``. ``dot_T`` is modified in place and
    returned.
    """
    # Gene axis
    dot_T.columns = ['entrz_' + d_typ + str(gene) for gene in dot_T.columns]
    dot_T.columns.name = 'entrez_id'

    # Sample axis
    dot_T.index = ['smpl_id_' + str(sample) for sample in dot_T.index]
    dot_T.index.name = 'improve_sample_id'

    return dot_T

In [None]:
def indx_int_colm_flt(d_typ, dot_T):
    """Relabel a table whose columns are floats and whose index is integer.

    Column labels are stringified, cut at the first ``'.'`` (so ``7157.0``
    becomes ``'7157'``) and prefixed with ``'entrz_' + d_typ``; index labels
    are stringified and prefixed with ``'smpl_id_'``. Axis names are set to
    ``entrez_id`` and ``improve_sample_id``. ``dot_T`` is modified in place
    and returned.
    """
    # Gene axis: drop the fractional part of the float label before prefixing.
    dot_T.columns = [
        'entrz_' + d_typ + str(gene).split('.')[0] for gene in dot_T.columns
    ]
    dot_T.columns.name = 'entrez_id'

    # Sample axis: integer labels need no trimming.
    dot_T.index = ['smpl_id_' + str(sample) for sample in dot_T.index]
    dot_T.index.name = 'improve_sample_id'

    return dot_T

In [None]:
def float_to_string(d_typ, dot_T):
    """Relabel a table whose columns and index are both floats.

    Every label is stringified and cut at the first ``'.'`` (so ``7157.0``
    becomes ``'7157'``). Columns are then prefixed with ``'entrz_' + d_typ``
    and index entries with ``'smpl_id_'``; the axis names are set to
    ``entrez_id`` and ``improve_sample_id``. ``dot_T`` is modified in place
    and returned.
    """
    # Gene axis
    dot_T.columns = [
        'entrz_' + d_typ + str(gene).split('.')[0] for gene in dot_T.columns
    ]
    dot_T.columns.name = 'entrez_id'

    # Sample axis
    dot_T.index = [
        'smpl_id_' + str(sample).split('.')[0] for sample in dot_T.index
    ]
    dot_T.index.name = 'improve_sample_id'

    return dot_T

##### Systems toggles

In [None]:
systems = 'cell-line+CPTAC'

In [None]:
systems = 'cell-line+BeatAML'

In [None]:
systems = 'cell-line+HCMI'

##### Samples, abstract systems to a and b

In [None]:
# Cell line and CPTAC
sys_a_samp = cell_line.samples
sys_a = 'cell-line'
sys_a_lbl = 'cell_line'

sys_b_samp = cptac.samples
sys_b = 'cptac'
sys_b_lbl = 'CPTAC'

In [None]:
# AML
sys_a_samp = cell_line.samples
sys_a = 'cell-line'
sys_a_lbl = 'cell_line'

sys_b_samp = beataml.samples
sys_b = 'beataml'
sys_b_lbl = 'BeatAML'

##### Cancer_type view

In [None]:
print(sys_a, 'is system a')
sys_a_samp.cancer_type.value_counts()[:60]

In [None]:
print(sys_b, 'is system b')
sys_b_samp.cancer_type.value_counts()[:50]

In [None]:
# System c is BeatAML. It is bound here because the "abstract systems" cells
# above only assign systems a and b, so sys_c / sys_c_samp would otherwise be
# undefined when this cell runs.
sys_c_samp = beataml.samples
sys_c = 'beataml'
sys_c_lbl = 'BeatAML'

print(sys_c, 'is system c')
sys_c_samp.cancer_type.value_counts()[:2] # expect one cancer type for beataml

#### Cancer_type toggles, cell line as a, CPTAC as b

In [None]:
##### single cancers to /strctrd/one_cncr

In [None]:
out_one, cncr, modality, systems

In [None]:
ls ../strctrd/one_cncr/

In [None]:
for file in glob.glob('../strctrd/one_cncr/*'):
    one_chk = pd.read_csv(file,
                    sep = '\t', index_col = 0)

    break

In [None]:
one_chk

In [None]:
# Lung adeno
cncr = 'lung-adeno'
cncr_lbl = 'lung_adeno'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Lung Adenocarcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Lung adenocarcinoma'] # cp

In [None]:
# Pancreatic adeno
cncr = 'pancreatic-adeno'
cncr_lbl = 'pancreatic_adeno'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Pancreatic Adenocarcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Pancreatic ductal adenocarcinoma'] # cp

In [None]:
# Head and neck
cncr = 'head-neck'
cncr_lbl = 'head_neck'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Head and Neck Squamous Cell Carcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Head and Neck squamous cell carcinoma'] # cp

In [None]:
# build non-brca cell line to cptac for production run
# Four parallel lists, one entry per cancer type:
#   names  - hyphenated tag used in output file names
#   labels - underscored tag written into the Cancer_type column
#   a_list - cancer_type spelling used by the cell line samples
#   b_list - cancer_type spelling used by the CPTAC samples
names = ['lung-adeno', 'pancreatic-adeno', 'head-neck', 'colon-adeno', 'glioblastoma', 'renal-clear-cell']
# 'renal_clear_cell' was previously misspelled 'renal-clear_cell', which did not
# match the label used by the single-cancer renal toggle cell.
labels = ['lung_adeno', 'pancreatic_adeno', 'head_neck', 'colon_adeno', 'glioblastoma', 'renal_clear_cell']
a_list = ['Lung Adenocarcinoma',
          'Pancreatic Adenocarcinoma',
          'Head and Neck Squamous Cell Carcinoma',
          'Colon Adenocarcinoma',
          'Glioblastoma',
          'Renal Clear Cell Carcinoma']
b_list = ['Lung adenocarcinoma',
          'Pancreatic ductal adenocarcinoma',
          'Head and Neck squamous cell carcinoma',
          'Colon adenocarcinoma',
          'Glioblastoma multiforme',
          'Clear cell renal cell carcinoma']

In [None]:
# Colon adeno
cncr = 'colon-adeno'
cncr_lbl = 'colon_adeno'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Colon Adenocarcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Colon adenocarcinoma'] # cp

In [None]:
# Glioblastoma
cncr = 'glioblastoma'
cncr_lbl = 'glioblastoma'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Glioblastoma']
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Glioblastoma multiforme'] # cp

In [None]:
# Renal clear cell
cncr = 'renal-clear-cell'
cncr_lbl = 'renal_clear_cell'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Renal Clear Cell Carcinoma']
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Clear cell renal cell carcinoma'] # cp

#### BRCA

In [None]:
# Breast Ductal
cncr = 'breast-ductal'
cncr_lbl = 'breast_ductal'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Breast Invasive Ductal Carcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Breast carcinoma']

In [None]:
# Breast Lobular
cncr = 'breast-lobular'
cncr_lbl = 'breast_lobular'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Breast Invasive Lobular Carcinoma'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Breast carcinoma']

In [None]:
# Breast NOS
cncr = 'breast-nos'
cncr_lbl = 'breast_nos'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Breast Invasive Carcinoma, NOS'] # cl
sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == 'Breast carcinoma']

##### brca single cancers to /strctrd/one_cncr

In [None]:
pwd

In [None]:
ls

In [None]:
# strctr_one.py
# Script to build one_cncr files
    # Script for building strctrd/two_cncr files
    # will read from strctrd/one_cncr files
# pwd: /mdl-sys-bnchmrk/code

# Imports <-------------------------------- Devel toggle
# import coderdata as cd
# import glob
# import pandas as pd
# import umap
# import numpy as np
# import os
# import matplotlib.pyplot as plt
# from matplotlib.lines import Line2D

# pwd
# mdl-sys-bnchmrk/code

# ls
# strctr.py

# Import functions
import strctr

systems = 'cell-line+CPTAC'

# Read data (construct loader objects) <--- Devel toggle
# cell_line = cd.DatasetLoader(
#   'cell_line', data_directory = '../data/cell_line/') # a 
# cptac = cd.DatasetLoader(
#   'cptac', data_directory = '../data/cptac/') # b
print('Data loader modules built')

# Cell lines are system A
sys_a_samp = cell_line.samples
sys_a = 'cell-line'
sys_a_lbl = 'cell_line'

# CPTAC is system B
sys_b_samp = cptac.samples
sys_b = 'cptac'
sys_b_lbl = 'CPTAC'

# BeatAML is system B
# sys_b_samp = beataml.samples
# sys_b = 'beat-aml'
# sys_b_lbl = 'BeatAML'

# Transcriptomics data modality extraction
modality = 'transcriptomics' # to file name
moda = 'tran_' # to columns and index

# Proteomics data modality extraction
# modality = 'proteomics' # to file name
# moda = 'prot_' # to columns and index

# Copy number and Mutations
# Data modality extractions insertion point

# BRCA discrete labels <---------------------------
# names = ['breast-ductal', 'breast-lobular', 'breast-nos']
# labels = ['breast_ductal', 'breast_lobular', 'breast_nos']
# a_list = ['Breast Invasive Ductal Carcinoma',
#           'Breast Invasive Lobular Carcinoma',
#           'Breast Invasive Carcinoma, NOS']
# b_list = ['Breast carcinoma',
#           'Breast carcinoma',
#           'Breast carcinoma']

# BRCA cell line unified labels <------------------
# names = ['breast-ductal', 'breast-lobular', 'breast-nos']
# labels = ['breast_ductal', 'breast_lobular', 'breast_nos']
# a_list = ['Breast Invasive Ductal Carcinoma']
# b_list = ['Breast carcinoma']

# BeatAML direct mapping to cell line <------------
# names = ['']
# labels = ['']
# a_list = ['']
# b_list = ['']

# Cell line and CPTAC production run <------------
# Four parallel lists, one entry per cancer type:
#   names  - hyphenated tag used in output file names
#   labels - underscored tag written into the Cancer_type column
#   a_list - cancer_type spelling used by system A (cell line) samples
#   b_list - cancer_type spelling used by system B (CPTAC) samples
names = ['lung-adeno',
         'pancreatic-adeno',
         'head-neck',
         'colon-adeno',
         'glioblastoma',
         'renal-clear-cell']
# 'renal_clear_cell' was previously misspelled 'renal-clear_cell', which broke
# the underscore convention of every other label.
labels = ['lung_adeno',
          'pancreatic_adeno',
          'head_neck',
          'colon_adeno',
          'glioblastoma',
          'renal_clear_cell']
a_list = ['Lung Adenocarcinoma',
          'Pancreatic Adenocarcinoma',
          'Head and Neck Squamous Cell Carcinoma',
          'Colon Adenocarcinoma',
          'Glioblastoma',
          'Renal Clear Cell Carcinoma']
b_list = ['Lung adenocarcinoma',
          'Pancreatic ductal adenocarcinoma',
          'Head and Neck squamous cell carcinoma',
          'Colon adenocarcinoma',
          'Glioblastoma multiforme',
          'Clear cell renal cell carcinoma']

import os  # imported here because this cell's own import list is commented out as a devel toggle

# The four lists are read in lockstep; a length mismatch would pair a cancer
# with the wrong label or filter, so stop before anything is written.
if not (len(names) == len(labels) == len(a_list) == len(b_list)):
    raise ValueError('names, labels, a_list and b_list must have equal lengths')

# Resolve the modality tables once, from the `modality` toggle, so the data
# extracted always matches the modality written into the output file name
# ('copy-number' maps to the `copy_number` attribute).
mod_attr = modality.replace('-', '_')
mda_sys_a = getattr(cell_line, mod_attr)  # cl
mda_sys_b = getattr(cptac, mod_attr)      # cp
# NOTE(review): the loaders are still hard-coded to cell_line / cptac; switching
# system B to BeatAML in the toggles above also needs `cptac` replaced here.

# Output directory for two-system, single-cancer tables.
out_one = '../strctrd/one_cncr/'
os.makedirs(out_one, exist_ok=True)  # to_csv does not create missing directories

for cncr, cncr_lbl, type_a, type_b in zip(names, labels, a_list, b_list):
    # Samples of this cancer type in each system (the spelling differs per system).
    sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == type_a]
    sys_b_samp_canc_n = sys_b_samp[sys_b_samp.cancer_type == type_b]

    ids_sys_a = sys_a_samp_canc_n.improve_sample_id # cl
    ids_sys_b = sys_b_samp_canc_n.improve_sample_id # cp

    mda_n_sys_a = mda_sys_a[mda_sys_a.improve_sample_id.isin(ids_sys_a)] # cl
    mda_n_sys_b = mda_sys_b[mda_sys_b.improve_sample_id.isin(ids_sys_b)] # cp
    # break

    # Same check -> pivot -> relabel -> label pipeline for both systems.
    tables = []
    for sys_name, sys_tag, sys_lbl, mda_n in (
        (sys_a, '| sys a', sys_a_lbl, mda_n_sys_a),
        (sys_b, '| sys b', sys_b_lbl, mda_n_sys_b),
    ):
        df_lite, size, na_count, inf_count = df_check(mda_n)
        # df_lite, size, na_count, inf_count = strctr.df_check(mda_n)
        print(sys_name, sys_tag)
        print(cncr, modality)
        print('len: ', size)
        print('NaNs: ', na_count)
        print('Infs: ', inf_count)

        wall_clock, dot_T = extract(df_lite)
        dot_T = g(moda, dot_T.copy())
        # Drop every biomarker column that has a missing value in any sample.
        dot_T.dropna(axis = 1, inplace = True)

        # Inserted at position 0 in this order so the table starts with
        # System, Cancer_type, <biomarkers...>.
        dot_T.insert(0, 'Cancer_type', cncr_lbl)
        dot_T.insert(0, 'System', sys_lbl)
        tables.append(dot_T)

    a, b = tables  # a: cell line, b: cptac

    # join='inner' keeps only the columns present in both systems.
    ab = pd.concat([a, b], axis=0, join='inner')
    print(ab.System.value_counts())
    print(ab.Cancer_type.value_counts())

    print('disk break')
    # break
    # Write two-system, single cancer type to disk <--- Devel toggle
    # out_one already holds the full relative directory, so it is joined
    # directly instead of being prefixed with '../strctrd/' a second time.
    ab.to_csv(
        os.path.join(out_one, cncr + '_' + modality + '_' + systems + '.tsv'),
        sep = '\t')
    # break
print('One-cancer production run complete')

In [None]:
# disk write check

In [None]:
ls ../strctrd/one_cncr/

In [None]:
for file in glob.glob('../strctrd/one_cncr/*'):
    one_chk = pd.read_csv(file,
                    sep = '\t', index_col = 0)

    break

In [None]:
one_chk

##### Merged BRCA as single cancer

##### itertools, brca, two cancer

In [None]:
from itertools import combinations

#### Cancer_type toggles, cell line as a, BeatAML as c

In [None]:
# devel, c = b
# NOTE(review): development placeholder. The tags say beat-aml, but both
# filters below use the renal clear cell spellings from the renal toggle cell,
# so this does not select AML samples yet.
# Requires sys_c_samp (the BeatAML samples table) to have been bound by an
# earlier cell; the result is stored in sys_c_samp_canc_n, whereas the
# "Extract sample ids" cells read sys_b_samp_canc_n -- confirm which is intended.
cncr = 'beat-aml'
cncr_lbl = 'beat_aml'
sys_a_samp_canc_n = sys_a_samp[sys_a_samp.cancer_type == 'Renal Clear Cell Carcinoma']
sys_c_samp_canc_n = sys_c_samp[sys_c_samp.cancer_type == 'Clear cell renal cell carcinoma']

In [None]:
# beat aml as single cancer


#### Extract sample ids

In [None]:
print(cncr)
ids_sys_a = sys_a_samp_canc_n.improve_sample_id # cl
ids_sys_b = sys_b_samp_canc_n.improve_sample_id # cp

In [None]:
print(cncr)
ids_sys_a = sys_a_samp_canc_n.improve_sample_id # cl
ids_sys_b = sys_b_samp_canc_n.improve_sample_id # beataml

#### Data extraction

In [None]:
modality = 'transcriptomics' # to file name
moda = 'tran_' # to columns and index
mda_n_sys_a = cell_line.transcriptomics[cell_line.transcriptomics.improve_sample_id.isin(ids_sys_a)] # cl
mda_n_sys_b = cptac.transcriptomics[cptac.transcriptomics.improve_sample_id.isin(ids_sys_b)]

In [None]:
# Proteomics rows for the selected samples of each system.
# Uses ids_sys_a / ids_sys_b from the "Extract sample ids" cells; the former
# ids_proj_a / ids_proj_b names are not defined anywhere in this notebook.
modality = 'proteomics' # to file name
moda = 'prot_' # to columns and index
mda_n_sys_a = cell_line.proteomics[cell_line.proteomics.improve_sample_id.isin(ids_sys_a)] # cl
mda_n_sys_b = cptac.proteomics[cptac.proteomics.improve_sample_id.isin(ids_sys_b)] # cp

In [None]:
# Copy-number rows for the selected samples of each system.
# Uses ids_sys_a / ids_sys_b from the "Extract sample ids" cells; the former
# ids_proj_a / ids_proj_b names are not defined anywhere in this notebook.
modality = 'copy-number' # to file name
moda = 'cpnm_' # to columns and index
mda_n_sys_a = cell_line.copy_number[cell_line.copy_number.improve_sample_id.isin(ids_sys_a)] # cl
# System b must come from the cptac loader; it previously re-read
# cell_line.copy_number, which cannot contain the CPTAC sample ids.
# NOTE(review): assumes cptac exposes copy_number like cell_line does -- confirm.
mda_n_sys_b = cptac.copy_number[cptac.copy_number.improve_sample_id.isin(ids_sys_b)] # cp

In [None]:
# Mutation rows for the selected samples of each system.
# Uses ids_sys_a / ids_sys_b from the "Extract sample ids" cells; the former
# ids_proj_a / ids_proj_b names are not defined anywhere in this notebook.
# The loaders were also swapped: system a is the cell line and system b is
# CPTAC, matching every other modality cell.
modality = 'mutations' # to file name
moda = 'muta_' # to columns and index
mda_n_sys_a = cell_line.mutations[cell_line.mutations.improve_sample_id.isin(ids_sys_a)] # cl
mda_n_sys_b = cptac.mutations[cptac.mutations.improve_sample_id.isin(ids_sys_b)] # cp

#### Project A, cell line

In [None]:
df_lite, size, na_count, inf_count = df_check(mda_n_sys_a)
print(sys_a, '| sys a')
print(cncr, modality)
print(size)
print(na_count)
print(inf_count)

wall_clock, dot_T = extract(df_lite)
dot_T = g(moda, dot_T.copy())
dot_T.dropna(axis = 1, inplace = True)
a = dot_T # cell line

##### a devel

In [None]:
wall_clock, dot_T = extract(df_lite)
print(wall_clock)
dot_T.iloc[:3, :3]

In [None]:
cncr

In [None]:
moda

In [None]:
# dot_T = g(d_typ, dot_T.copy())
dot_T = g(moda, dot_T.copy())
dot_T.iloc[:3, :3]

In [None]:
dot_T.shape

In [None]:
dot_T.dropna(axis = 1, inplace = True)

In [None]:
dot_T.shape

In [None]:
a = dot_T # cell line

#### Project B, CPTAC

In [None]:
df_lite, size, na_count, inf_count = df_check(mda_n_sys_b)
print(sys_b, '| sys b')
print(cncr, modality)
print('len: ', size)
print('NaNs: ', na_count)
print('Infs: ', inf_count)

wall_clock, dot_T = extract(df_lite)
dot_T = g(moda, dot_T.copy())
dot_T.dropna(axis = 1, inplace = True)
b = dot_T # cptac

##### b devel

In [None]:
wall_clock, dot_T = extract(df_lite)
print(wall_clock)
dot_T.iloc[:3, :3]

In [None]:
dot_T = g(moda, dot_T.copy())

In [None]:
dot_T.iloc[:3, :3]

In [None]:
dot_T.shape

In [None]:
dot_T.dropna(axis = 1, inplace = True)

In [None]:
dot_T.shape

In [None]:
b = dot_T # cptac

#### Labels

##### Check a and b are different

In [None]:
a.iloc[:3, :3]

In [None]:
b.iloc[:3, :3]

##### Continue

In [None]:
a.insert(0, 'Cancer_type', cncr_lbl)
b.insert(0, 'Cancer_type', cncr_lbl)
a.insert(0, 'System', sys_a_lbl)
b.insert(0, 'System', sys_b_lbl)

In [None]:
ab = pd.concat([a, b], axis=0, join='inner')

In [None]:
ab

In [None]:
ab.System.value_counts()

In [None]:
ab.Cancer_type.value_counts()

In [None]:
# Set target output dirs
out_dbl = '../output/dbl-cncr/'
out_sng = '../output/sng-cncr'
ump_out = '../output/umap/'

In [None]:
# out, u_o, cancer, data_type
out_dbl, out_sng, cncr, modality, ump_out

In [None]:
# Write two-system, single cancer type to disk
# out_sng already holds the full relative directory ('../output/sng-cncr'),
# so join it directly instead of prefixing '../output/' a second time.
ab.to_csv(
    os.path.join(out_sng, cncr+'_'+modality+'_'+systems+'.tsv'),
    sep = '\t')

In [None]:
# Read back the single-cancer table as a disk-write check.
# out_sng already holds the full relative directory ('../output/sng-cncr'),
# so join it directly instead of prefixing '../output/' a second time.
sng_chk = pd.read_csv(os.path.join(out_sng, cncr+'_'+modality+'_'+systems+'.tsv'),
                    sep = '\t', index_col = 0)

In [None]:
print(cncr)
sng_chk

#### Dbl Cncr

In [None]:
print(cncr)
ab.iloc[:3, :3]

In [None]:
ab.shape

#### Disk read Cancer_n

In [None]:
# Interactive zone for batch correction experiments

In [None]:
# Template read-in path
# output/sng_cncr/lung_adeno_transcriptomics_cell_line_CPTAC.tsv

#### Experimental record

In [None]:
# toggle zone 1

In [None]:
systems = 'cell-line+CPTAC'

In [None]:
cncr_n = 'renal-clear-cell'
moda_n = 'transcriptomics'
sys_n = 'cell-line+CPTAC'

In [None]:
cncr_n = 'pancreatic-adeno'
moda_n = 'transcriptomics'
sys_n = 'cell-line+CPTAC'

In [None]:
cncr_n = 'lung-adeno'
moda_n = 'transcriptomics'
sys_n = 'cell-line+CPTAC'

In [None]:
cncr_n = 'glioblastoma'
moda_n = 'transcriptomics'
sys_n = 'cell-line+CPTAC'

In [None]:
cncr_n = 'colon-adeno'
moda_n = 'transcriptomics'
sys_n = 'cell-line+CPTAC'

In [None]:
# toggle zone 2

In [None]:
c1 = pd.read_csv( # single cancer 1
    '../output/sng-cncr/'+cncr_n+'_'+moda_n+'_'+sys_n+'.tsv',
    sep = '\t', index_col = 0)

In [None]:
c1.iloc[:3, :3]

In [None]:
# Second single-cancer table for the double-cancer merge.
# NOTE(review): the path expression is identical to the one used for c1, so
# cncr_n must be re-set to the second cancer type (toggle zone 1) before this
# cell runs; otherwise c2 is a re-read of the same file as c1.
c2 = pd.read_csv( # single cancer 2
    '../output/sng-cncr/'+cncr_n+'_'+moda_n+'_'+sys_n+'.tsv',
    sep = '\t', index_col = 0)

In [None]:
c2.iloc[:3, :3]

In [None]:
print(moda_n)
dbl_cncr = pd.concat([c1, c2], axis = 0, join = 'inner')
dbl_cncr.shape

In [None]:
dbl_cncr.iloc[:3, :3]

In [None]:
dbl_cncr.iloc[-3:, :3]

In [None]:
# two sets of numbers for each label type

In [None]:
dbl_cncr.System.value_counts()

In [None]:
dbl_cncr.Cancer_type.value_counts()

In [None]:
out_dbl, cncr, cncr_n, moda_n, systems

In [None]:
ls ../output/dbl-cncr/

In [None]:
# out_dbl already holds the full relative directory ('../output/dbl-cncr/'),
# so join it directly instead of prefixing '../output/' a second time.
dbl_cncr.to_csv(os.path.join(out_dbl, cncr+'+'+cncr_n+'_'+moda_n+'_'+systems+'.tsv'),
                    sep = '\t')

In [None]:
# Read back the double-cancer table as a disk-write check.
# out_dbl already holds the full relative directory ('../output/dbl-cncr/'),
# so join it directly instead of prefixing '../output/' a second time.
dbl_chk = pd.read_csv(os.path.join(out_dbl, cncr+'+'+cncr_n+'_'+moda_n+'_'+systems+'.tsv'),
                    sep = '\t', index_col = 0)

In [None]:
dbl_chk