 ##  UMI-12 Preprocessing and Clustering Pipeline

In [27]:
import scanpy as sc        
import numpy as np                 

# Ground truth: the UMI-12 count matrix.
adata_12 = sc.read_h5ad("/data/dagyeman/cellranger/bam_file_analysis/1k_PBMCs/ub_objects/adata_matrices/adata_12.h5ad")

# Copy taken before the in-place preprocessing below modifies adata_12.
# NOTE(review): `adata` is not used by any cell visible here -- drop it if nothing else needs it.
adata = adata_12.copy()

# Seed passed explicitly to every stochastic step. 0 is also scanpy's own
# default, so pinning it documents the choice without changing the result.
RANDOM_STATE = 0

# --- Preprocessing (all steps modify adata_12 in place) ---
sc.pp.normalize_total(adata_12, target_sum=1e4)   # scale every cell to 10,000 total counts
sc.pp.log1p(adata_12)                             # log(x + 1) of the normalised counts
# Flag the 2000 most variable genes. 2000 is an explicit choice here:
# scanpy's n_top_genes defaults to None (mean/dispersion cutoffs are used instead).
sc.pp.highly_variable_genes(adata_12, n_top_genes=2000)

# Freeze the log-normalised matrix in .raw; sc.tl.rank_genes_groups reads .raw
# by default when it is present.
adata_12.raw = adata_12

# --- Dimensionality reduction & clustering ---
sc.pp.pca(adata_12, random_state=RANDOM_STATE)        # PCA (scanpy restricts it to the flagged HVGs by default)
sc.pp.neighbors(adata_12, random_state=RANDOM_STATE)  # nearest-neighbour graph in PCA space
# Leiden clustering. resolution=0.6 is a deliberate choice giving coarser
# clusters than scanpy's default of 1.
sc.tl.leiden(adata_12, resolution=0.6, random_state=RANDOM_STATE)

# Keep the per-cell cluster labels; the other matrices borrow these labels.
reference_labels = adata_12.obs['leiden'].copy()
print(reference_labels.value_counts())  # number of cells in each cluster

# Reference DE gene sets: every cluster against all remaining cells.
# The Wilcoxon rank-sum test is a non-parametric test comparing two independent
# samples; it is widely used in single-cell RNA-seq to find genes that are
# differentially expressed between clusters or conditions.
sc.tl.rank_genes_groups(adata_12, groupby='leiden', method='wilcoxon')

leiden
0     304
1     212
2     144
3     139
4     129
5      93
6      58
7      56
8      44
9      29
10     16
11      8
Name: count, dtype: int64


In [22]:
### Naive adata_5

# NOTE(review): the variable is named adata_5 but the file loaded is
# adata_3.h5ad -- confirm which UMI length this cell is meant to analyse.
adata_5 = sc.read_h5ad("/data/dagyeman/cellranger/bam_file_analysis/1k_PBMCs/ub_objects/adata_matrices/adata_3.h5ad")


# Preprocess exactly like the UMI-12 reference so the DE results are comparable.
sc.pp.normalize_total(adata_5, target_sum=1e4)          # scale every cell to 10,000 total counts
sc.pp.log1p(adata_5)                                    # log(x + 1)
sc.pp.highly_variable_genes(adata_5, n_top_genes=2000)  # flag the 2000 most variable genes
adata_5.raw = adata_5                                   # freeze the log-normalised matrix for DE testing

# Add labels from U[12]. The transfer only makes sense if every cell of this
# matrix was clustered in adata_12, so verify that first and fail with a clear
# message (same exception type the bare .loc lookup would raise).
missing_barcodes = adata_5.obs_names.difference(adata_12.obs_names)
if len(missing_barcodes) > 0:
    raise KeyError(
        f"{len(missing_barcodes)} cell barcode(s) in adata_5 are absent from adata_12 "
        f"(e.g. {list(missing_barcodes[:5])}); cannot transfer Leiden labels"
    )
adata_5.obs['leiden'] = adata_12.obs['leiden'].loc[adata_5.obs_names]


# DE gene sets on the naive matrix, grouped by the UMI-12 clusters.
# For each gene a Wilcoxon rank-sum test checks whether expression in the target
# cluster differs from all other clusters combined. The results are stored in
# `adata_5.uns['rank_genes_groups']`.
sc.tl.rank_genes_groups(adata_5, groupby='leiden', method='wilcoxon')

In [23]:
### Predicted adata_5


# Load predicted counts (after applying the nonuniform model).
# NOTE(review): the variable is named adata_5_pred but the file loaded is
# inv_nonunif_matrices/adata_3.h5ad -- confirm which UMI length is intended.
adata_5_pred = sc.read_h5ad("/data/dagyeman/cellranger/bam_file_analysis/1k_PBMCs/ub_objects/inv_nonunif_matrices/adata_3.h5ad")

### Pre-processing, identical to the reference and naive matrices
sc.pp.normalize_total(adata_5_pred, target_sum=1e4)                     # normalize per cell
sc.pp.log1p(adata_5_pred)                                               # log-transform
sc.pp.highly_variable_genes(adata_5_pred, n_top_genes=2000)            # flag top 2000 HVGs
adata_5_pred.raw = adata_5_pred                                         # store preprocessed data for DE analysis

### Add clustering labels from U[12]. This relies on the predicted matrix using
### cell names that all exist in adata_12; check that instead of assuming it.
missing_barcodes = adata_5_pred.obs_names.difference(adata_12.obs_names)
if len(missing_barcodes) > 0:
    raise KeyError(
        f"{len(missing_barcodes)} cell barcode(s) in adata_5_pred are absent from adata_12 "
        f"(e.g. {list(missing_barcodes[:5])}); cannot transfer Leiden labels"
    )
adata_5_pred.obs['leiden'] = adata_12.obs['leiden'].loc[adata_5_pred.obs_names]

# Run differential expression, evaluating how well the predicted counts recover
# DE genes between the true (UMI-12) clusters.
sc.tl.rank_genes_groups(adata_5_pred, groupby='leiden', method='wilcoxon')


In [24]:
# Compare the DE results of a single cluster across the three matrices.
CLUSTER_ID = '1'     # Leiden cluster to inspect
N_TOP_GENES = 100    # size of the top-ranked gene set that is compared

# DE table for the chosen cluster from each dataset
de_12 = sc.get.rank_genes_groups_df(adata_12, group=CLUSTER_ID)         # True counts (UMI-12)
de_5 = sc.get.rank_genes_groups_df(adata_5, group=CLUSTER_ID)           # Naive counts
de_5_pred = sc.get.rank_genes_groups_df(adata_5_pred, group=CLUSTER_ID) # Predicted counts

# Names of the first N_TOP_GENES rows of each table (the tables are ordered by
# descending score, so these are the highest-scoring genes for the cluster).
top_12 = set(de_12['names'].head(N_TOP_GENES))           # ground truth
top_5 = set(de_5['names'].head(N_TOP_GENES))             # naive
top_5_pred = set(de_5_pred['names'].head(N_TOP_GENES))   # predicted


# Number of ground-truth top genes recovered by each alternative matrix
print(f"Overlap (naive): {len(top_12 & top_5)} / {N_TOP_GENES}")
print(f"Overlap (pred):  {len(top_12 & top_5_pred)} / {N_TOP_GENES}")




Overlap (naive): 99 / 100
Overlap (pred):  99 / 100


In [26]:
# Top-gene overlap with the UMI-12 ground truth, reported for every cluster.
N_TOP_GENES = 100    # size of the top-ranked gene set that is compared

# Iterate over the categorical's categories rather than .unique(): .unique()
# yields clusters in order of first appearance among the cells, which scrambles
# the report (1, 3, 4, 0, ...), whereas the categories scanpy stores for Leiden
# labels are naturally sorted (0, 1, 2, ...).
for cluster_id in adata_12.obs['leiden'].cat.categories:
    # NOTE: de_12 / de_5 / de_5_pred are notebook-level names, so after the
    # loop they hold the tables of the last cluster visited.
    de_12 = sc.get.rank_genes_groups_df(adata_12, group=cluster_id)
    de_5 = sc.get.rank_genes_groups_df(adata_5, group=cluster_id)
    de_5_pred = sc.get.rank_genes_groups_df(adata_5_pred, group=cluster_id)

    # Highest-ranked gene names per dataset
    top_12 = set(de_12['names'].head(N_TOP_GENES))
    top_5 = set(de_5['names'].head(N_TOP_GENES))
    top_5_pred = set(de_5_pred['names'].head(N_TOP_GENES))

    print(f"Cluster {cluster_id} - Overlap (naive): {len(top_12 & top_5)} / {N_TOP_GENES}")
    print(f"Cluster {cluster_id} - Overlap (pred):  {len(top_12 & top_5_pred)} / {N_TOP_GENES}")

Cluster 1 - Overlap (naive): 99 / 100
Cluster 1 - Overlap (pred):  99 / 100
Cluster 3 - Overlap (naive): 98 / 100
Cluster 3 - Overlap (pred):  99 / 100
Cluster 4 - Overlap (naive): 97 / 100
Cluster 4 - Overlap (pred):  99 / 100
Cluster 0 - Overlap (naive): 98 / 100
Cluster 0 - Overlap (pred):  99 / 100
Cluster 6 - Overlap (naive): 96 / 100
Cluster 6 - Overlap (pred):  95 / 100
Cluster 2 - Overlap (naive): 94 / 100
Cluster 2 - Overlap (pred):  98 / 100
Cluster 5 - Overlap (naive): 99 / 100
Cluster 5 - Overlap (pred):  99 / 100
Cluster 7 - Overlap (naive): 98 / 100
Cluster 7 - Overlap (pred):  100 / 100
Cluster 10 - Overlap (naive): 99 / 100
Cluster 10 - Overlap (pred):  100 / 100
Cluster 8 - Overlap (naive): 99 / 100
Cluster 8 - Overlap (pred):  99 / 100
Cluster 9 - Overlap (naive): 94 / 100
Cluster 9 - Overlap (pred):  97 / 100
Cluster 11 - Overlap (naive): 99 / 100
Cluster 11 - Overlap (pred):  98 / 100


In [14]:
# Print the naive DE table currently bound to `de_5`.
# NOTE(review): `de_5` is assigned both by the single-cluster cell and inside
# the per-cluster loop, so the cluster shown depends on which of those cells ran
# last; this cell's execution count (14) is also lower than the cells above it.
print(de_5)    

                names     scores  logfoldchanges          pvals      pvals_adj
0              S100A8  22.770594        7.442870  8.972436e-115  2.532560e-110
1              S100A9  22.554201        7.228644  1.221251e-112  1.723551e-108
2             S100A12  22.224096        6.141180  2.008828e-109  1.890040e-105
3                VCAN  21.909056        5.888480  2.129525e-106  1.502699e-102
4                 LYZ  21.757582        6.467295  5.855677e-105  3.305647e-101
...               ...        ...             ...            ...            ...
28221  CORO1B;PTPRCAP -16.292517       -6.399858   1.115447e-59   2.141810e-57
28222           RPS27 -16.699842       -1.392397   1.314124e-62   2.897849e-60
28223             EVL -16.782686       -3.604930   3.267173e-63   7.497497e-61
28224           RPS19 -17.051584       -1.612374   3.401817e-65   8.809145e-63
28225           KLF12 -17.401737       -6.347754   8.003854e-68   2.403370e-65

[28226 rows x 5 columns]
