In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scvelo as sv
import anndata
import scanpy as sc
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
# Root folder of the local datasets. File paths in this notebook are built by
# plain string concatenation (DATA_PATH + 'CCLE/...'), so the trailing slash
# is required.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
DATA_PATH = '/home/risitop/OneDrive/Documents/PHD/y1/data/'

In [14]:
def get_nd_array(arr):
    """Return ``arr`` in dense form.

    Objects exposing a ``toarray()`` method (e.g. scipy sparse matrices) are
    densified through it. Anything else is returned unchanged, without a copy.
    """
    # The previous test, `if str(type(arr)):`, was always true because the
    # string form of a type is never empty, so the `toarray()` branch could
    # never run. Duck-typing on `toarray` restores the intended conversion.
    if hasattr(arr, 'toarray'):
        return arr.toarray()
    return arr

def smooth_matrix_by_pooling(matrix, indices):
    """Replace each row of ``matrix`` by the mean of a group of its rows.

    Parameters
    ----------
    matrix : 2-D array supporting ``matrix[rows, :]`` indexing
        Input values. It is never written to.
    indices : sequence of row-index sequences
        ``indices[i]`` lists the rows of ``matrix`` that are averaged to
        produce output row ``i``.

    Returns
    -------
    A new matrix of the same shape whose first ``len(indices)`` rows hold the
    pooled means. Boolean and integer inputs are promoted to float64 so the
    means are not truncated; other dtypes are preserved.
    """
    dtype = getattr(matrix, 'dtype', None)
    if dtype is not None and dtype.kind in 'biu':
        # Writing a fractional mean into an integer buffer would silently
        # round it towards zero, so pool into a float copy instead.
        matrix_pooled = matrix.astype(np.float64)
    else:
        matrix_pooled = matrix.copy()
    # Always read from the untouched input so already-pooled rows never feed
    # into later averages.
    for i, neighbours in enumerate(indices):
        matrix_pooled[i, :] = np.mean(matrix[neighbours, :], axis=0)
    return matrix_pooled

def smooth_adata_by_pooling(adata, X_embed, n_neighbors=10):
    """Return a copy of ``adata`` with ``X`` and the count layers pooled over neighbours.

    A ``NearestNeighbors`` model is fitted on the rows of ``X_embed`` and then
    queried with those same rows. The resulting index table is handed to
    ``smooth_matrix_by_pooling`` for ``adata.X`` and for whichever of the
    layers 'matrix', 'spliced' and 'unspliced' exist in ``adata``.

    Parameters
    ----------
    adata : AnnData
        Source object; only the returned copy is assigned to.
    X_embed : array, one row per observation
        Coordinates in which neighbours are searched.
    n_neighbors : int
        Number of neighbours requested from the search.
    """
    print('Smoothing data...')
    knn = NearestNeighbors(n_neighbors=n_neighbors)
    knn.fit(X_embed)
    # Only the neighbour indices are needed; distances are discarded.
    _, neighbour_idx = knn.kneighbors(X_embed)

    pooled = adata.copy()
    pooled.X = smooth_matrix_by_pooling(get_nd_array(adata.X), neighbour_idx)
    for layer in ('matrix', 'spliced', 'unspliced'):
        if layer not in adata.layers:
            continue
        dense_layer = get_nd_array(adata.layers[layer])
        pooled.layers[layer] = smooth_matrix_by_pooling(dense_layer, neighbour_idx)
    return pooled

def preprocessing_without_pooling(adata, log_transform, normalize_totals, top_variable_genes, n_pcs):
    """Normalise, log-transform, select variable genes and run PCA on a copy of ``adata``.

    Parameters
    ----------
    adata : AnnData
        Input data. It is copied first, so the caller's object is not modified.
    log_transform : bool
        Apply ``sc.pp.log1p`` when true.
    normalize_totals : bool
        Apply ``sc.pp.normalize_total`` with ``target_sum=10000`` when true.
    top_variable_genes : int
        When positive, keep at most this many columns of ``X`` with the
        largest variance, dropping columns whose variance is not positive.
        A non-positive value disables the selection.
    n_pcs : int
        Number of components passed to ``sc.tl.pca``.

    Returns
    -------
    AnnData
        The processed copy, with the PCA results written by ``sc.tl.pca``.
    """
    adata = adata.copy()
    # Total-count normalisation is meant for counts, so it has to run before
    # the log transform. It previously ran after it and rescaled log values.
    if normalize_totals:
        sc.pp.normalize_total(adata, target_sum=10000)
    if log_transform:
        sc.pp.log1p(adata)
    if top_variable_genes > 0:
        X = adata.X
        if hasattr(X, 'toarray'):
            # np.var does not handle scipy sparse input; densify for the
            # variance computation only.
            X = X.toarray()
        variances = np.asarray(np.var(X, axis=0)).ravel()
        # Gene indices in descending variance order, cut to the requested size.
        ind_genes = np.flip(np.argsort(variances))[:top_variable_genes]
        # Drop constant genes. In a descending sort they form the tail, so
        # this matches cutting the list at the first zero variance.
        ind_genes = ind_genes[variances[ind_genes] > 0]
        # adata[:, idx] is only a view; materialise it before sc.tl.pca
        # writes its results into the object.
        adata = adata[:, ind_genes].copy()
    sc.tl.pca(adata, n_comps=n_pcs)
    return adata

def pooling_procedure(adata, adata_pp, n_neighbors=10, n_pcs=30):
    """Pool ``adata`` over neighbours found in the PCA space of ``adata_pp``.

    Parameters
    ----------
    adata : AnnData
        Object whose values are pooled; a copy is passed on, so it is not
        modified.
    adata_pp : AnnData
        Object providing ``obsm['X_pca']``, the coordinates used for the
        neighbour search.
    n_neighbors : int
        Neighbourhood size. A value <= 0 disables pooling, in which case
        ``adata_pp`` itself is returned.
    n_pcs : int
        Accepted for signature compatibility; not used by this function.
    """
    if n_neighbors <= 0:
        return adata_pp

    print('Pooling data...')
    return smooth_adata_by_pooling(
        adata.copy(),
        adata_pp.obsm['X_pca'],
        n_neighbors=n_neighbors,
    )

def u900_recipe(adata, top_variable_genes=10000, normalize_totals=False,
                log_transform=True, n_neighbors_pooling=10, n_pcs=30):
    """Run the full preprocessing recipe on a copy of ``adata``.

    Steps:
      1. flag genes whose name starts with 'MT-' and compute QC metrics;
      2. preprocess (normalise / log / gene selection / PCA) to obtain the
         PCA space used for the neighbour search;
      3. pool the un-preprocessed data over neighbours in that space;
      4. preprocess the pooled data.

    Parameters
    ----------
    adata : AnnData
        Input data; it is copied first and never modified.
    top_variable_genes, normalize_totals, log_transform, n_pcs
        Forwarded to ``preprocessing_without_pooling``.
    n_neighbors_pooling : int
        Neighbourhood size for pooling. A value <= 0 skips steps 3 and 4 and
        returns the result of step 2.

    Returns
    -------
    AnnData
        The preprocessed (and, if enabled, pooled) data.
    """
    adata = adata.copy()

    # Mitochondrial QC. Despite the log message, nothing is filtered out
    # here: the genes are only flagged and the QC metrics computed.
    print('Filtering with mitochondria count...')
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    print('Preprocessing data...')
    adata_pp = preprocessing_without_pooling(adata, log_transform, normalize_totals, top_variable_genes, n_pcs)

    if n_neighbors_pooling <= 0:
        # Pooling disabled: adata_pp is already the final result. Running the
        # preprocessing again (as happened before) would apply the log
        # transform and normalisation a second time.
        return adata_pp

    # Pool the un-preprocessed values using neighbours from adata_pp's PCA,
    # then preprocess the pooled values.
    adata_pooled = pooling_procedure(adata, adata_pp, n_neighbors=n_neighbors_pooling, n_pcs=n_pcs)
    return preprocessing_without_pooling(adata_pooled, log_transform, normalize_totals, top_variable_genes, n_pcs)

In [16]:
# Load CCLE/Head_and_Neck_Cancer.h5ad and run the u900 recipe with its default
# parameters; `adata` is rebound to the processed result.
adata = sc.read_h5ad(DATA_PATH + 'CCLE/Head_and_Neck_Cancer.h5ad')
adata = u900_recipe(adata)
# Bare last expression so the notebook displays the AnnData summary.
adata

Filtering with mitochondria count...
Preprocessing data...
Pooling data...
Smoothing data...


AnnData object with n_obs × n_vars = 7102 × 10000
    obs: 'CellLine', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [13]:
# Save the processed AnnData as a gzip-compressed .h5ad file.
# NOTE(review): this cell's execution count (13) is lower than that of the
# cell that builds `adata` (16), so the file on disk may predate the current
# `adata`; re-run the notebook top to bottom before relying on it.
adata.write(DATA_PATH + 'CCLE/Head_and_Neck_Cancer_pp.h5ad', compression='gzip')

PREPROCESSING PARAMETERS:
Already_Log_Transformed= False
Normalize_Totals= False
number_of_pcs= 30
n_neighbours_for_pooling= 10
top_variable_genes= 10000


AnnData object with n_obs × n_vars = 7102 × 10000
    obs: 'CellLine', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'