In [29]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn.decomposition import PCA
import scanpy as sc
import math

import pandas as pd

from matplotlib import animation, rc
from IPython.display import HTML

import os
import sys

# Root directory of the input/output data. Defaults to the original local
# location; set the CCLE_DATA_PATH environment variable to run the notebook
# on another machine without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# the cells of this notebook build file paths with `DATA_PATH + '...'`.
DATA_PATH = os.path.join(
    os.environ.get('CCLE_DATA_PATH', '/home/risitop/OneDrive/Documents/PHD/y1/data/'),
    ''
)
sys.path.append(DATA_PATH)

In [30]:
# Switch matplotlib to its built-in 'dark_background' style sheet.
plt.style.use('dark_background')

In [31]:
# Load the gene-by-component weight table. The file is read as tab-separated
# text (despite its .xls extension) and its first column is used as the index.
# NOTE(review): the displayed frame ends with an empty 'Unnamed: 16' column,
# presumably produced by a trailing tab on each line of the file -- confirm.
df_gene_weights = pd.read_csv(DATA_PATH + 'CCLE/JHU011_UPPER_AERODIGESTIVE_TRACT_ica_S.xls', sep='\t', index_col=0)
df_gene_weights

Unnamed: 0_level_0,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,Unnamed: 16
PROBE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
CDK1,-0.9190,5.0419,8.2869,-0.9463,0.5935,-0.0419,0.0775,-1.5286,-1.7286,8.2389,-1.3952,0.6260,-0.3849,0.8650,-0.8569,
UBE2C,0.6697,3.1003,9.5936,-1.6848,-1.2334,-0.0712,-0.1914,4.6683,-0.5717,4.0906,-0.4803,1.1264,-0.3727,0.8197,-0.8002,
CCNB1,-0.4128,-3.9112,11.4137,0.4421,1.2527,1.1820,-1.3029,6.0085,-1.2580,0.3552,-0.9335,0.2340,1.4414,0.2826,3.3749,
AURKB,-1.3091,4.9938,8.9267,-0.5047,1.0750,0.2742,0.9673,3.2298,-0.3919,4.0472,-2.2181,-1.2844,-0.6276,0.5016,0.7448,
AURKA,-0.1496,-2.8967,11.9240,0.1731,0.9337,0.2881,0.4520,-0.9910,0.2682,-0.3779,0.3459,0.6775,1.6482,0.4719,3.1775,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LGALS3,-0.1466,-0.9214,-0.0991,-0.6781,-0.1304,-1.3243,1.0802,0.2069,-0.8436,0.8988,-1.2667,1.4437,0.0509,0.4350,0.8060,
DOK1,0.4799,0.1539,-0.3843,-0.4367,-0.2834,-0.4136,0.1613,-0.0583,0.2711,-0.1829,-0.7776,-0.4143,0.2355,0.1569,-0.1020,
MVB12A,-0.6149,-0.3588,-0.5584,-0.6734,1.2381,0.1803,0.2249,-0.0821,-0.0896,0.5334,0.3950,-0.5984,0.1611,-1.5826,-1.0250,
HRCT1,-0.9159,-0.2209,-0.4397,-0.1543,0.2590,0.2297,0.0160,0.4183,1.3329,0.5509,0.0661,0.1099,1.0450,-0.1599,0.0919,


In [32]:
# Load the expression dataset from its .h5ad file with scanpy and display its
# summary (file name suggests preprocessed CPM values -- TODO confirm).
adata = sc.read_h5ad(DATA_PATH + 'CCLE/CPM_data_pp.h5ad')
adata

AnnData object with n_obs × n_vars = 53513 × 22590
    obs: 'Cell_line', 'Cancer_type', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'batch'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

In [33]:
def batch_computation(X, f, k=10):
    """Apply ``f`` to consecutive row-slices of ``X`` and stack the results.

    Each slice is processed independently, so anything ``f`` computes from its
    input (for instance a column mean) is computed per batch, not over all of
    ``X``.

    Parameters
    ----------
    X : 2-D array-like
        Must expose ``shape`` and support ``X[a:b, :]`` slicing.
    f : callable
        Called once per row-slice; the returned arrays are stacked row-wise.
    k : int, default 10
        Target number of full-size batches. The batch size is ``n // k`` rows
        (at least 1); leftover rows are processed in trailing batches.

    Returns
    -------
    numpy.ndarray
        ``np.vstack`` of every ``f(batch)``, covering all rows of ``X``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer, got %r' % (k,))

    n = X.shape[0]
    # At least one row per batch, otherwise n < k would yield only empty slices.
    step = max(1, n // k)

    # Walk over the rows until all of them are consumed instead of assuming
    # that k + 1 batches are enough (they are not when n % k > n // k).
    starts = list(range(0, n, step))
    if not starts:
        # Empty input: still call f once so it defines the shape of the result.
        starts = [0]

    # Collect the pieces and stack once; stacking inside the loop would copy
    # the accumulated result at every iteration.
    results = []
    for i, start in enumerate(starts):
        print('Computing batch %i...' % (i + 1))
        results.append(f(X[start:start + step, :]))
    return np.vstack(results)

def compute_ICs(adata, df_S):
    """Project every observation of ``adata`` onto the components in ``df_S``.

    Parameters
    ----------
    adata : AnnData-like
        Must expose ``var_names``, ``obs_names``, ``adata[:, genes]``
        sub-setting and a dense ``.X`` matrix (observations x genes).
    df_S : pandas.DataFrame
        Gene weights, indexed by gene name, one column per component.
        The LAST column is always discarded (see comment below).

    Returns
    -------
    pandas.DataFrame
        One row per observation (indexed by observation name) and one column
        ``IC1``, ``IC2``, ... per retained component.
    """
    print('Selecting relevant genes...', end=' ')
    IC_genes = [g for g in list(df_S.index) if g in adata.var_names]
    print('%i genes retained.' % len(IC_genes))

    # The trailing column is dropped unconditionally: in this notebook the
    # loaded weight table ends with an empty 'Unnamed: 16' column.
    # NOTE(review): a table without such a filler column would lose its last
    # real component here -- confirm for other inputs.
    weights = df_S.loc[IC_genes, :].values[:, :-1]

    def one_step(X):
        # log(1 + x) transform, gene-wise centring, then projection.
        X = np.log(X + 1)
        # NOTE(review): the mean is taken over the rows of the current batch
        # only, so the result depends on how batch_computation splits the
        # data -- confirm this is intended rather than a global centring.
        X = X - X.mean(axis=0)
        return X @ weights

    adata_red = adata[:, IC_genes]
    X_ICs = batch_computation(adata_red.X, one_step, k=10)

    # Observation names are bytes or str depending on how the file was read.
    # Decode only bytes: calling .str.decode on str names does not yield
    # usable labels.
    obs_names = adata.obs_names
    if len(obs_names) > 0 and isinstance(obs_names[0], bytes):
        obs_names = obs_names.str.decode('utf-8')

    return pd.DataFrame(
        data=X_ICs,
        index=obs_names,
        columns=['IC%i' % (i + 1) for i in range(X_ICs.shape[1])]
    )

In [34]:
# Compute the per-cell component scores from the loaded gene weights.
df = compute_ICs(adata, df_gene_weights)

Selecting relevant genes... 10000 genes retained.
Computing batch 1...
Computing batch 2...
Computing batch 3...
Computing batch 4...
Computing batch 5...
Computing batch 6...
Computing batch 7...
Computing batch 8...
Computing batch 9...
Computing batch 10...
Computing batch 11...


In [36]:
# Add the cell-line label of every cell and index the table by cell name.
# Both are copied positionally from adata, so the rows of df must still be in
# the same order as adata's observations.
df = (
    df
    .assign(Cell_line=list(adata.obs['Cell_line']))
    .set_index(pd.Index(list(adata.obs_names), name='Cells'))
)
df

Unnamed: 0_level_0,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,Cell_line
Cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AAAGATGCAGTTAACC-1-18-0,-459.273019,-99.719000,505.566067,-519.114922,169.847575,424.536166,-299.121371,116.057499,-106.598212,77.995203,5.576831,90.426192,-84.181949,-155.222429,92.378606,JHH7_LIVER
AACACGTAGACAGACC-1-18-0,-478.774515,178.630990,421.437251,-533.562986,284.366121,214.053055,-677.596930,368.076459,-172.676563,-75.932147,84.548938,59.592393,3.305337,-321.780319,180.195851,JHH7_LIVER
AACTCAGAGCGACGTA-1-18-0,-1051.309756,556.437037,258.953507,-797.287721,583.002547,420.020011,-776.948234,362.052093,-281.485070,51.031295,70.308815,-59.425816,-110.605379,-369.345908,94.166193,JHH7_LIVER
AACTCAGAGGTGCTAG-1-18-0,-321.972861,-121.400228,404.064765,-569.280165,104.051543,221.087551,-142.273235,227.717972,-74.066529,65.799318,-22.533060,-14.224675,-187.831258,-241.859425,-5.026453,JHH7_LIVER
AACTTTCAGATAGGAG-1-18-0,-665.343220,-11.258566,638.515797,-850.381385,116.533049,159.868563,-370.795667,287.936557,-94.601118,62.628048,35.066629,-16.878444,-351.609332,-96.789859,126.303109,JHH7_LIVER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTCTCCTTCAGCGACC-12-10-21,322.325904,352.770239,60.981889,651.414622,65.253577,-720.220860,464.040302,35.644125,127.001891,317.120433,219.426361,230.793803,173.595164,484.667149,-218.424113,HS729_SOFT_TISSUE
TTGCCGTGTAACGTTC-12-10-21,265.691919,100.153499,108.466144,567.538490,-16.043168,-661.138942,690.405381,-39.719779,124.096445,214.083750,145.419761,220.937859,138.363664,544.678195,-138.847494,HS729_SOFT_TISSUE
TTGTAGGCACATCCAA-12-10-21,-519.169840,396.303187,-208.636064,-322.383344,299.332509,18.559136,-123.652939,-159.495100,-188.274307,364.818837,-0.348277,-44.838243,-145.566006,-63.403952,-33.393872,HS729_SOFT_TISSUE
TTGTAGGTCGCCATAA-12-10-21,444.738501,-773.156501,682.459386,439.485426,-171.622208,-10.604976,75.673812,349.194274,201.423027,-533.556870,-57.012437,60.871715,280.090545,75.046452,-10.435954,HS729_SOFT_TISSUE


In [37]:
# Persist the component scores (with their 'Cells' index and the 'Cell_line'
# column) as a CSV file in the data directory.
df.to_csv(DATA_PATH + "all_tumors_pp_ICs.csv")