### We want to compute DEGs between the ctrl and the perturbations and build a similarity matrix. Then, compare it to the reactome similarities and see whether it is any better than the VAE_tree or not.

In [12]:
import scanpy as scp
import pandas as pd
import numpy as np
import catboost as cb
from tqdm import tqdm
from scipy import sparse
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from collections import Counter
import lightgbm as lgb
import matplotlib.pyplot as plt
import umap
from sklearn.model_selection import StratifiedKFold

import ot
from sklearn.decomposition import PCA
from IPython.display import clear_output
from sklearn.preprocessing import QuantileTransformer

### Magics

In [13]:
GENE_PER_CELL_BINNING = False
N_BINS = 1000
N_ITER = 50
TOP_N_GENES = 500

### Prepare data

In [14]:
adata = scp.read_h5ad('./data/Norman_2019/norman_umi_go/perturb_processed.h5ad')

In [15]:
## Following the scGPT paper, we bin the genes within cell. 

def bin_nonzero_values(arr, num_bins):
    # Filter out non-zero values
    nonzero_vals = arr[arr != 0]
    
    # Calculate bin edges
    bin_edges = np.linspace(nonzero_vals.min(), nonzero_vals.max(), num_bins)
    
    # Bin the values
    binned_values = np.zeros_like(arr)
    binned_nonzero = np.digitize(nonzero_vals, bin_edges)
    binned_values[arr != 0] = binned_nonzero
    
    return binned_values

# Example usage
arr = np.random.randint(low=0, high=100, size=100)
num_bins = 3
binned_values = bin_nonzero_values(arr, num_bins)
print(set(binned_values))

{1, 2, 3}


In [16]:
scp.pp.normalize_total(adata, exclude_highly_expressed=True)
scp.pp.log1p(adata)
scp.pp.highly_variable_genes(adata, n_top_genes=TOP_N_GENES,subset=True)

In [17]:
if GENE_PER_CELL_BINNING:
    tempy = adata.X.toarray()
    
    for c in tqdm(range(adata.X.shape[0])):
        tempy[c,:] = bin_nonzero_values(tempy[c,:], N_BINS)
    
    adata.X = sparse.csr_matrix(tempy)
    del tempy

In [18]:
adata.obs['condition_code'] = np.nan
def f(x):
    if 'ctrl' in x and '+' in x:
        return x.replace('ctrl','').replace('+','')
    elif 'ctrl' in x:
        return 'ctrl'
    else:
        return np.nan
adata.obs['condition_code'] = adata.obs.condition.apply(lambda x: f(x)).values

In [19]:
group1 = "ctrl"
p_value_threshold = 0.0001  # Set your desired p-value threshold here

res_mtx = pd.DataFrame(index = adata.obs.condition_code.unique(), columns = adata.var.index)

for group2 in tqdm(adata.obs.condition_code.unique()):
    if group2 is not np.nan:
        scp.tl.rank_genes_groups(adata, groupby="condition_code", groups=[group1, group2], method='wilcoxon')
        
        # Access the results
        result = adata.uns["rank_genes_groups"]
        
        # Filter genes based on p-value threshold
        mask = result['pvals'][group2] < p_value_threshold
        filtered_genes = result['names'][group2][mask]
        filtered_pvals = result['pvals_adj'][group2][mask]
        
        # Sort filtered genes by p-value
        sorted_indices = np.argsort(filtered_pvals)
        sorted_genes = filtered_genes[sorted_indices]
        res_mtx.loc[group2, sorted_genes[:20]] = 1
        # Print the top 20 differentially expressed genes (or all if less than 20) for each group
        # print(f"Top differentially expressed genes in {group2} (p-value < {p_value_threshold}):")
        # for i, gene in enumerate(sorted_genes[:20]):
        #     print(f"{i+1}. {gene} (p-value: {filtered_pvals[sorted_indices[i]]:.6f})")
        # print("\n")
        

100%|██████████| 107/107 [09:22<00:00,  5.25s/it]


In [20]:
res_mtx.to_csv('./little_data/DEGs_binary_intermediate_top20_500genes.csv')

In [21]:
sim_mtx = pd.DataFrame(index = res_mtx.index, columns = res_mtx.index)
for gene1 in tqdm(sim_mtx.index):
    for gene2 in sim_mtx.columns:
        t1 = res_mtx.loc[gene1,:]
        t1 = t1.dropna().index.tolist()
        t2 = res_mtx.loc[gene2,:]
        t2 = t2.dropna().index.tolist()
        sim_mtx.loc[gene1, gene2] = len(set(t1).intersection(set(t2)))
sim_mtx.fillna(0, inplace=True)

100%|██████████| 107/107 [00:02<00:00, 38.70it/s]


In [22]:
sim_mtx.loc[sim_mtx.index.dropna(), sim_mtx.columns.dropna()].to_csv('./little_data/DEGs_sim_mtx_top20_500genes.csv')