In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import seaborn as sns
import scipy.io
import matplotlib.pyplot as plt
import os
import scvelo as scv
import harmonypy as hm
from matplotlib.pyplot import rc_context
import re
import scvi

Global seed set to 0


In [2]:
datapath = r"/nfs_master/prakrithi/abhay/testis_allfiles/" # make sure to put the 'r' in front

In [3]:
results_dir = '/nfs_master/prakrithi/abhay/testis_scripts/results_Nov25/script1_preprocessing/'

In [None]:
adata = sc.read_h5ad(datapath+'GSE112013_SRR6860519_Donor1_rep1.h5ad')

In [None]:
adata

# Doublet Removal

In [None]:
sc.pp.filter_genes(adata, min_cells = 10)

In [None]:
sc.pp.highly_variable_genes(adata, n_top_genes = 2000, subset = True, flavor = 'seurat_v3')

#### Predicting doublets using scvi model

In [None]:
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata)
vae.train()

#### Predicting doublets using solo model

In [None]:
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()

In [None]:
df = solo.predict()
df['prediction'] = solo.predict(soft = False)

#removes the -0 
df.index = df.index.map(lambda x: x[:-2])
df

In [None]:
df.groupby('prediction').count()

In [None]:
df['dif'] = df.doublet - df.singlet
df

In [None]:
sns.displot(df[df.prediction == 'doublet'], x = 'dif')
plt.show(sns)

In [None]:
doublets = df[(df.prediction == 'doublet') & (df.dif > 1)]
doublets

In [None]:
adata = sc.read_h5ad(datapath+'GSE112013_SRR6860519_Donor1_rep1.h5ad')


In [None]:
adata.obs['doublet'] = adata.obs.index.isin(doublets.index)

In [None]:
adata = adata[~adata.obs.doublet]

In [None]:
adata

# Preprocessing

In [None]:
adata.var['mt'] = adata.var.index.str.startswith('MT-')

In [None]:
adata.var

In [46]:
#list of ribosomal genes from Broad Institute
ribo_url = "http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=KEGG_RIBOSOME&fileType=txt"

In [47]:
ribo_genes = pd.read_table(ribo_url, skiprows=2, header = None)
ribo_genes


Unnamed: 0,0
0,FAU
1,MRPL13
2,RPL10
3,RPL10A
4,RPL10L
...,...
83,RPS9
84,RPSA
85,RSL24D1
86,RSL24D1P11


In [None]:
adata.var['ribo'] = adata.var_names.isin(ribo_genes[0].values)

In [None]:
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ribo'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Shows how many cells each gene was found in.
sc.pp.filter_genes(adata, min_cells=3)
adata.var.sort_values('n_cells_by_counts')


In [None]:
# Shows how many genes each cell expresses
sc.pp.filter_cells(adata, min_genes = 200)
adata.obs.sort_values('n_genes_by_counts')


In [None]:
adata.obs.sort_values('pct_counts_mt')

In [None]:
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo'], 
             jitter=0.4, multi_panel=True)

In [None]:
upper_lim = np.quantile(adata.obs.n_genes_by_counts.values, .98)
upper_lim

In [None]:
adata = adata[adata.obs.n_genes_by_counts < upper_lim]
adata.obs

In [None]:
adata = adata[adata.obs.pct_counts_mt < 20]
adata

In [None]:
#adata = adata[adata.obs.pct_counts_ribo < 2]

In [None]:
adata

In [None]:
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo'], 
             jitter=0.4, multi_panel=True)

# Normalization

In [None]:
adata.X.sum(axis=1)

In [None]:
sc.pp.normalize_total(adata, target_sum=1e4) #normalize every cell to 10,000 UMI

In [None]:
adata.X.sum(axis = 1)

In [None]:
sc.pp.log1p(adata) #change to log counts

In [None]:
adata.X.sum(axis = 1)

#### Freeze the data as it is now 

In [None]:
adata.raw = adata

# Clustering

In [None]:
sc.pp.highly_variable_genes(adata, n_top_genes = 2000)

In [None]:
adata.var

In [None]:
sc.pl.highly_variable_genes(adata)

In [None]:
adata = adata[:, adata.var.highly_variable]

In [None]:
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt', 'pct_counts_ribo'])

In [None]:
# Normalize each gene to it's unit variance
sc.pp.scale(adata, max_value=10)

In [None]:
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
sc.pl.pca_variance_ratio(adata, log=True, n_pcs = 50)

In [None]:
sc.pp.neighbors(adata, n_pcs = 30)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.pl.umap(adata)

In [None]:
sc.tl.leiden(adata, resolution = 0.5)
sc.pl.umap(adata, color=['leiden'])

# Integration

In [58]:
#list of ribosomal genes from Broad Institute
ribo_url = "http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=KEGG_RIBOSOME&fileType=txt"
ribo_genes = pd.read_table(ribo_url, skiprows=2, header = None)
ribo_genes

Unnamed: 0,0
0,FAU
1,MRPL13
2,RPL10
3,RPL10A
4,RPL10L
...,...
83,RPS9
84,RPSA
85,RSL24D1
86,RSL24D1P11


In [59]:
def pp(csv_path):
    adata = sc.read_h5ad(csv_path)
    sc.pp.filter_genes(adata, min_cells = 10)
    sc.pp.highly_variable_genes(adata, n_top_genes = 2000, subset = True, flavor = 'seurat_v3')
    scvi.model.SCVI.setup_anndata(adata)
    vae = scvi.model.SCVI(adata)
    vae.train()
    solo = scvi.external.SOLO.from_scvi_model(vae)
    solo.train()
    df = solo.predict()
    df['prediction'] = solo.predict(soft = False)
    df.index = df.index.map(lambda x: x[:-2])
    df['dif'] = df.doublet - df.singlet
    doublets = df[(df.prediction == 'doublet') & (df.dif > 1)]
    
    adata = sc.read_h5ad(csv_path)
    adata.obs['Sample'] = csv_path.split('.')[0] #'raw_counts/GSM5226574_C51ctr_raw_counts.csv'
    
    adata.obs['doublet'] = adata.obs.index.isin(doublets.index)
    adata = adata[~adata.obs.doublet]
    
    
    sc.pp.filter_cells(adata, min_genes=200) #get rid of cells with fewer than 200 genes
    #sc.pp.filter_genes(adata, min_cells=3) #get rid of genes that are found in fewer than 3 cells
    adata.var['mt'] = adata.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
    adata.var['ribo'] = adata.var_names.isin(ribo_genes[0].values)
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ribo'], percent_top=None, log1p=False, inplace=True)
    upper_lim = np.quantile(adata.obs.n_genes_by_counts.values, .98)
    adata = adata[adata.obs.n_genes_by_counts < upper_lim]
    adata = adata[adata.obs.pct_counts_mt < 20]
    #adata = adata[adata.obs.pct_counts_ribo < 2]

    return adata

In [60]:
datapath = r"/nfs_master/prakrithi/abhay/testis_allfiles/" # make sure to put the 'r' in front

In [61]:
os.chdir(datapath)

In [62]:
out = []
for file in os.listdir():
    out.append(pp(file))

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [00:36<00:00, 10.98it/s, loss=528, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|██████████████████████████| 400/400 [00:33<00:00, 11.91it/s, loss=0.428, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [04:19<00:00,  1.54it/s, loss=852, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 396/400:  99%|█████████████████████████▋| 396/400 [04:16<00:02,  1.55it/s, loss=0.244, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.213. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [03:03<00:00,  2.17it/s, loss=356, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 364/400:  91%|███████████████████████▋  | 364/400 [02:36<00:15,  2.33it/s, loss=0.371, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.338. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [02:08<00:00,  3.12it/s, loss=401, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 282/400:  70%|██████████████████▎       | 282/400 [01:31<00:38,  3.08it/s, loss=0.398, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.364. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [04:20<00:00,  1.54it/s, loss=835, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 244/400:  61%|███████████████▊          | 244/400 [02:37<01:40,  1.54it/s, loss=0.183, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.177. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [04:17<00:00,  1.55it/s, loss=537, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 270/400:  68%|█████████████████▌        | 270/400 [02:55<01:24,  1.54it/s, loss=0.225, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.213. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|██████████████████████████| 400/400 [00:23<00:00, 17.10it/s, loss=1e+03, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 203/400:  51%|█████████████▏            | 203/400 [00:11<00:10, 17.99it/s, loss=0.384, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.445. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [04:13<00:00,  1.58it/s, loss=880, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 313/400:  78%|████████████████████▎     | 313/400 [03:21<00:55,  1.55it/s, loss=0.244, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.214. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [04:18<00:00,  1.55it/s, loss=302, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 315/400:  79%|████████████████████▍     | 315/400 [03:23<00:54,  1.55it/s, loss=0.384, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.366. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [04:22<00:00,  1.52it/s, loss=467, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 145/400:  36%|█████████▍                | 145/400 [01:33<02:44,  1.55it/s, loss=0.276, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.247. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [02:41<00:00,  2.47it/s, loss=381, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 333/400:  83%|█████████████████████▋    | 333/400 [02:10<00:26,  2.55it/s, loss=0.391, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.379. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [04:21<00:00,  1.53it/s, loss=263, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 265/400:  66%|█████████████████▏        | 265/400 [02:53<01:28,  1.53it/s, loss=0.255, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.244. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [00:42<00:00,  9.40it/s, loss=646, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|██████████████████████████| 400/400 [00:40<00:00,  9.78it/s, loss=0.385, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [03:15<00:00,  2.05it/s, loss=372, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 267/400:  67%|█████████████████▎        | 267/400 [02:12<01:05,  2.02it/s, loss=0.293, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.254. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [03:02<00:00,  2.19it/s, loss=715, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 244/400:  61%|███████████████▊          | 244/400 [01:49<01:10,  2.23it/s, loss=0.245, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.215. Signaling Trainer to stop.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████| 400/400 [03:23<00:00,  1.96it/s, loss=367, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|██████████████████████████| 400/400 [03:22<00:00,  1.98it/s, loss=0.375, v_num=1]


In [64]:
out

[View of AnnData object with n_obs × n_vars = 965 × 58389
     obs: 'Sample', 'doublet', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
     var: 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts',
 View of AnnData object with n_obs × n_vars = 8954 × 58389
     obs: 'Sample', 'doublet', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
     var: 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts',
 View of AnnData object with n_obs × n_vars = 6009 × 58389
     obs: 'Sample', 'doublet', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
     var: 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts',
 View of AnnData object with n_obs × n_vars = 4311 × 58389


In [65]:
adata = sc.concat(out)

In [67]:
adata.X

<99542x58389 sparse matrix of type '<class 'numpy.float32'>'
	with 155437391 stored elements in Compressed Sparse Row format>

In [68]:
sc.pp.filter_genes(adata, min_cells = 10)

In [69]:
adata

AnnData object with n_obs × n_vars = 99542 × 43238
    obs: 'Sample', 'doublet', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
    var: 'n_cells'

In [70]:
adata.write_h5ad('testis_combined.h5ad')

In [72]:
adata = sc.read_h5ad('testis_combined.h5ad')
adata

AnnData object with n_obs × n_vars = 99542 × 43238
    obs: 'Sample', 'doublet', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
    var: 'n_cells'

In [76]:
adata.obs.groupby('Sample').count()

Unnamed: 0_level_0,doublet,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GSE112013_SRR6860519_Donor1_rep1,8600,8600,8600,8600,8600,8600,8600,8600
GSE112013_SRR6860520_Donor1_rep2,6654,6654,6654,6654,6654,6654,6654,6654
GSE112013_SRR6860521_Donor2_rep1,4311,4311,4311,4311,4311,4311,4311,4311
GSE112013_SRR6860522_Donor2_rep2,5047,5047,5047,5047,5047,5047,5047,5047
GSE112013_SRR6860523_Donor3_rep1,6009,6009,6009,6009,6009,6009,6009,6009
GSE112013_SRR6860524_Donor3_rep2,8420,8420,8420,8420,8420,8420,8420,8420
GSE153947_SRR12164933_Normal_1,9026,9026,9026,9026,9026,9026,9026,9026
GSE153947_SRR12164934_Normal_2,9084,9084,9084,9084,9084,9084,9084,9084
GSE153947_SRR12164935_Normal_3,8954,8954,8954,8954,8954,8954,8954,8954
GSE153947_SRR12164936_Crypto_1,6071,6071,6071,6071,6071,6071,6071,6071


In [77]:
#saving the raw data, scVI will use the counts layer
adata.layers['counts'] = adata.X.copy()

In [78]:
#normalize the concatenated data, will be used by other functions
sc.pp.normalize_total(adata, target_sum = 1e4)
sc.pp.log1p(adata)
adata.raw = adata

In [79]:
adata.obs.head()

Unnamed: 0,Sample,doublet,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo
AAACGGGGTCTGGAGA-1,GSE154535_SRR12234213_Donor_OA,False,460,460,1021.0,0.0,0.0,12.0,1.175318
AAAGATGTCCGATATG-1,GSE154535_SRR12234213_Donor_OA,False,460,460,900.0,0.0,0.0,86.0,9.555556
AAAGCAACAGGGTTAG-1,GSE154535_SRR12234213_Donor_OA,False,5877,5877,25996.0,0.0,0.0,1536.0,5.908601
AAAGCAAGTAACGACG-1,GSE154535_SRR12234213_Donor_OA,False,311,311,434.0,0.0,0.0,14.0,3.225806
AAAGCAAGTGAGTGAC-1,GSE154535_SRR12234213_Donor_OA,False,5672,5672,24980.0,0.0,0.0,1628.0,6.517214


In [80]:
#Batch correction for combined data
scvi.model.SCVI.setup_anndata(adata, layer = "counts",
                             categorical_covariate_keys=["Sample"],
                             continuous_covariate_keys=['pct_counts_mt', 'total_counts', 'pct_counts_ribo'])


In [81]:
model = scvi.model.SCVI(adata)

In [82]:
model.train() #may take a while without GPU

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 80/80: 100%|████████████████████████████| 80/80 [17:59<00:00, 13.49s/it, loss=4.8e+03, v_num=1]


In [83]:
adata.obsm['X_scVI'] = model.get_latent_representation()

In [84]:
adata.layers['scvi_normalized'] = model.get_normalized_expression(library_size = 1e4)

In [86]:
sc.pp.neighbors(adata, use_rep = 'X_scVI')

2022-11-29 19:21:59.052004: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-29 19:21:59.786924: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /apps/compiler/openmpi-3.6.1/lib64::/usr/local/cuda-10.1/lib64:/home/prakrithi/miniconda3/envs/scRNA_new/bin/:/home/prakrithi/miniconda3/envs/scRNA_new/lib/
2022-11-29 19:21:59.787326: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /apps/compiler/openmpi-3.6.1/lib64::/usr/local/cuda-10.1/lib64:/home/prakrithi/miniconda3/envs/scRNA_new/bin/:/home/prakrithi/miniconda3/en

In [87]:
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution = 0.5)

In [90]:
sc.pl.umap(adata, color = ['leiden', 'Sample'], frameon = False)
plt.savefig('unannotated-umap.png', dpi = 300)