In [None]:
# uni-file
# Build combined tissue file for VAE tests
    # Within single data modality, start with transcriptomics
    # Within single model system to Human project
        # Start with cell line to CPTAC

In [None]:
%whos

In [None]:
import glob as glob
import os

import pandas as pd

### V7, multicat version

In [None]:
# check v5 files

In [None]:
ls ../data/uni-files/*.tsv

In [None]:
# Gather every uni-file TSV path; sorting in place keeps positional lookups repeatable.
pths = glob.glob('../data/uni-files/*.tsv')
pths.sort()

In [None]:
pths

In [None]:
pths[1]

In [None]:
# Load the uni-file at position 1 of the sorted path list as a tab-separated
# table, using its first column as the row index.
# NOTE(review): the index is positional, and later cells of this notebook write
# more .tsv files into ../data/uni-files/, so which file sits at position 1
# depends on what has already been written there -- confirm the intended file.
df_n = pd.read_csv(
        pths[1], sep = '\t', index_col = 0)

In [None]:
df_n

In [None]:
df_n.Cancer_type.value_counts()

In [None]:
df_n.System.value_counts()

In [None]:
# add beataml as cancer type?

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

In [None]:
# Axis labels and bar heights passed to the 3D bar chart.
# Each column of `sample_counts` lines up with an entry of `systems`,
# each row with an entry of `cancers`.
systems = ['Cell line', 'CPTAC', 'HCMI']
cancers = ['Lung', 'Brain', 'Breast', 'Pancreas']
sample_counts = np.array([
    [5, 3, 1],        # Lung
    [20, 15, 10],     # Brain
    [50, 40, 30],     # Breast
    [100, 80, 60],    # Pancreas
])

In [None]:
three_d(systems, cancers, sample_counts, title = 'Transcriptomics')

In [None]:
def three_d(categories, levels, values, title):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    
    xpos, ypos = np.meshgrid(np.arange(values.shape[1]), np.arange(values.shape[0]))
    xpos = xpos.flatten()
    ypos = ypos.flatten()
    zpos = np.zeros(values.shape).flatten()
    
    dx = 0.8 * np.ones_like(zpos)
    dy = dx.copy()
    dz = values.flatten()

    # dx = 0.6 * np.ones_like(zpos)  # Adjust the value to control the width
    # dy = dx.copy()
    
    # Initial bottom values for stacking
    bottom = np.zeros_like(dz)
    
    for i in range(len(dz)):
        ax.bar3d(xpos[i], ypos[i], bottom[i], dx[i], dy[i], dz[i], color='skyblue', zsort='average')
        bottom[i] += dz[i]  # Update bottom for the next layer
    
    ax.set_xlabel('Systems', fontweight='bold')
    ax.set_ylabel('Cancer types', fontweight='bold')
    ax.set_zlabel('Sample counts', fontweight='bold')
    ax.set_xticks(np.arange(len(categories)) + 0.4)  # Center the bars
    ax.set_yticks(np.arange(len(levels)) + 0.4)
    ax.set_xticklabels(categories)
    ax.set_yticklabels(levels)
    ax.view_init(elev=15, azim=-60) # Adjust to remove overlab
    plt.title(title, y = .9)
    plt.savefig('plots/3d_test_2.png')
    plt.show()

In [None]:
# Toy labels and a 3 x 3 matrix of bar heights for the stand-alone 3D bar plot.
categories = ['A', 'B', 'C']
levels = ['Level 1', 'Level 2', 'Level 3']
values = np.array(
    [[5, 3, 1],
     [6, 2, 4],
     [3, 5, 2]]
)

In [None]:
# Stand-alone 3D bar chart of `values`: one bar per matrix cell, with the
# column index on x, the row index on y and the cell value as bar height.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

xpos, ypos = np.meshgrid(np.arange(values.shape[1]), np.arange(values.shape[0]))
xpos = xpos.flatten()
ypos = ypos.flatten()
zpos = np.zeros(values.shape).flatten()  # every bar stands on z = 0

dx = 0.8 * np.ones_like(zpos)  # bar footprint along x
dy = dx.copy()                 # bar footprint along y
dz = values.flatten()          # bar heights

# Each bar is added with its own bar3d call; no bars are stacked on top of
# one another, so the base is always zpos.
for i in range(len(dz)):
    ax.bar3d(xpos[i], ypos[i], zpos[i], dx[i], dy[i], dz[i], color='skyblue', zsort='average')

ax.set_xlabel('Categories')
ax.set_ylabel('Levels')
ax.set_zlabel('Values')
ax.set_xticks(np.arange(len(categories)) + 0.4)  # middle of each 0.8-wide bar
ax.set_yticks(np.arange(len(levels)) + 0.4)
ax.set_xticklabels(categories)
ax.set_yticklabels(levels)

# savefig does not create directories, so make sure the target folder exists.
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/3d_test_1.png')
plt.show()

### Original builds, version 5:

In [None]:
ls ../results/input-data/

In [None]:
# `r` is the build label that appears in both the folder and the sub-folder name;
# `pths_n` is the sorted list of everything inside that build's "-fls" folder.
r = 'r5'
pths_n = sorted(glob.glob(f'../results/input-data/{r}/{r}-fls/*'))

In [None]:
# All r5 input files whose name contains "_proteomics_", and how many there are.
# Sorted because glob returns matches in filesystem-dependent order.
prot_pths = sorted(glob.glob('../results/input-data/r5/r5-fls/*_proteomics_*'))
len(prot_pths)

In [None]:
# All r5 input files whose name contains "_transcriptomics_", and how many there are.
# Sorted because glob returns matches in filesystem-dependent order.
tran_pths = sorted(glob.glob('../results/input-data/r5/r5-fls/*_transcriptomics_*'))
len(tran_pths)

### Cell line + CPTAC

In [None]:
# r5 transcriptomics files whose name marks them as cell-line + CPTAC.
# Sorted because glob returns matches in filesystem-dependent order, which
# would otherwise make the row order of the stacked table vary between machines.
tran_pths_cl_cp = sorted(glob.glob(
    '../results/input-data/r5/r5-fls/*_transcriptomics_cell-line+CPTAC*'))
len(tran_pths_cl_cp)

In [None]:
# Read every cell-line + CPTAC transcriptomics table, report its file name and
# per-System sample counts, then stack all tables row-wise into `tall_stack`.
frames = []
for pth_n in tran_pths_cl_cp:
    print(pth_n.split('/')[-1])
    df_n = pd.read_csv(
        pth_n, sep = '\t', index_col = 0)
    print(df_n.System.value_counts())
    print('')
    frames.append(df_n)

# Concatenate once instead of growing the frame inside the loop. The list is
# reversed so the most recently read table comes first, matching a stack built
# by prepending each table. Columns are unioned (outer join), so genes missing
# from a table show up as NaN. With no input files the result is an empty frame.
tall_stack = pd.concat(frames[::-1], axis = 0) if frames else pd.DataFrame()

In [None]:
tall_stack.System.value_counts()

In [None]:
tall_stack.Cancer_type.value_counts()

In [None]:
tall_stack.isna().sum().sum()

In [None]:
# Write the stacked table to a tab-separated file in the uni-files folder.
# NOTE(review): `tall_stack` is rebuilt under the same name by every stacking
# section of this notebook, so run this cell right after the cell-line + CPTAC
# transcriptomics loop; otherwise a different stack is written to this path.
tall_stack.to_csv(
    '../data/uni-files/trans_cl_cp_r5.tsv', sep = '\t')

### CPTAC+HCMI

In [None]:
# r5 transcriptomics files whose name marks them as CPTAC + HCMI.
# Sorted because glob returns matches in filesystem-dependent order, which
# would otherwise make the row order of the stacked table vary between machines.
tran_pths_hc_cp = sorted(glob.glob(
    '../results/input-data/r5/r5-fls/*_transcriptomics_CPTAC+HCMI*'))
len(tran_pths_hc_cp)

In [None]:
# Read every CPTAC + HCMI transcriptomics table, report its file name, shape
# and per-System sample counts, then stack all tables row-wise into `tall_stack`.
frames = []
for pth_n in tran_pths_hc_cp:
    print(pth_n.split('/')[-1])
    df_n = pd.read_csv(
        pth_n, sep = '\t', index_col = 0)
    print(df_n.shape)
    print(df_n.System.value_counts())
    print(' ')
    frames.append(df_n)

# Concatenate once instead of growing the frame inside the loop. The list is
# reversed so the most recently read table comes first, matching a stack built
# by prepending each table. Columns are unioned (outer join), so genes missing
# from a table show up as NaN. With no input files the result is an empty frame.
tall_stack = pd.concat(frames[::-1], axis = 0) if frames else pd.DataFrame()

In [None]:
tall_stack.System.value_counts()

In [None]:
tall_stack.Cancer_type.value_counts()

In [None]:
tall_stack.isna().sum().sum()

In [None]:
# Write the stacked table to a tab-separated file in the uni-files folder.
# NOTE(review): `tall_stack` is rebuilt under the same name by every stacking
# section of this notebook, so run this cell right after the CPTAC + HCMI
# transcriptomics loop; otherwise a different stack is written to this path.
tall_stack.to_csv(
    '../data/uni-files/trans_hc_cp_r5.tsv', sep = '\t')

## Proteomics

### Cell line + CPTAC

In [None]:
# r5 proteomics files whose name marks them as cell-line + CPTAC.
# Sorted because glob returns matches in filesystem-dependent order, which
# would otherwise make the row order of the stacked table vary between machines.
prot_pths_cl_cp = sorted(glob.glob(
    '../results/input-data/r5/r5-fls/*_proteomics_cell-line+CPTAC*'))
len(prot_pths_cl_cp)

In [None]:
# Read every cell-line + CPTAC proteomics table, report its file name, shape
# and per-System sample counts, then stack all tables row-wise into `tall_stack`.
frames = []
for pth_n in prot_pths_cl_cp:
    print(pth_n.split('/')[-1])
    df_n = pd.read_csv(
        pth_n, sep = '\t', index_col = 0)
    print(df_n.shape)
    print(df_n.System.value_counts())
    print(' ')
    frames.append(df_n)

# Concatenate once instead of growing the frame inside the loop. The list is
# reversed so the most recently read table comes first, matching a stack built
# by prepending each table. With no input files the result is an empty frame.
tall_stack = pd.concat(frames[::-1], axis = 0) if frames else pd.DataFrame()

# Keep only columns without any missing value: this removes columns absent
# from at least one table as well as columns holding NaN inside a table.
tall_stack = tall_stack.dropna(axis=1)

In [None]:
tall_stack.shape

In [None]:
tall_stack.System.value_counts()

In [None]:
tall_stack.Cancer_type.value_counts()

In [None]:
tall_stack.isna().sum().sum()

In [None]:
# Write the stacked table to a tab-separated file in the uni-files folder.
# NOTE(review): `tall_stack` is rebuilt under the same name by every stacking
# section of this notebook, so run this cell right after the cell-line + CPTAC
# proteomics loop; otherwise a different stack is written to this path.
tall_stack.to_csv(
    '../data/uni-files/prot_cl_cp_r5.tsv', sep = '\t')

### CPTAC+HCMI

In [None]:
# r5 proteomics files whose name marks them as CPTAC + HCMI.
# Sorted because glob returns matches in filesystem-dependent order; a fixed
# order keeps anything built from this list repeatable between machines.
prot_pths_hc_cp = sorted(glob.glob(
    '../results/input-data/r5/r5-fls/*_proteomics_CPTAC+HCMI*'))
len(prot_pths_hc_cp)