In [1]:
import scanpy as sc
import scvi
import os
import seaborn as sns
import pandas as pd
import numpy as np


Global seed set to 0


In [2]:
# adata = sc.read_h5ad('/nfs_master/prakrithi/abhay/testis_allfiles/GSE112013_SRR6860519_Donor1_rep1.h5ad')

In [3]:
ribo_url = "http://software.broadinstitute.org/gsea/msigdb/download_geneset.jsp?geneSetName=KEGG_RIBOSOME&fileType=txt"


In [4]:
ribo_genes = pd.read_table(ribo_url, skiprows=2, header = None)

In [5]:
def pp(data_path):
    adata = sc.read_h5ad(data_path)
    sc.pp.filter_genes(adata, min_cells = 10)
    sc.pp.highly_variable_genes(adata, n_top_genes = 2000, subset = True, flavor = 'seurat_v3')
    scvi.model.SCVI.setup_anndata(adata)
    vae = scvi.model.SCVI(adata)
    vae.train()
    solo = scvi.external.SOLO.from_scvi_model(vae)
    solo.train()
    df = solo.predict()
    df['prediction'] = solo.predict(soft = False)
    df.index = df.index.map(lambda x: x[:-2])
    df['dif'] = df.doublet - df.singlet
    doublets = df[(df.prediction == 'doublet') & (df.dif > 1)]
    
    adata = sc.read_h5ad(data_path)
    adata.obs['Sample'] = data_path.split('.')[0] #'raw_counts/GSM5226574_C51ctr_raw_counts.csv'
    
    adata.obs['doublet'] = adata.obs.index.isin(doublets.index)
    adata = adata[~adata.obs.doublet]
    
    
    sc.pp.filter_cells(adata, min_genes=200) #get rid of cells with fewer than 200 genes
    #sc.pp.filter_genes(adata, min_cells=3) #get rid of genes that are found in fewer than 3 cells
    adata.var['mt'] = adata.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
    adata.var['ribo'] = adata.var_names.isin(ribo_genes[0].values)
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ribo'], percent_top=None, log1p=False, inplace=True)
    upper_lim = np.quantile(adata.obs.n_genes_by_counts.values, .98)
    adata = adata[adata.obs.n_genes_by_counts < upper_lim]
    adata = adata[adata.obs.pct_counts_mt < 20]
    adata = adata[adata.obs.pct_counts_ribo < 2]

    return adata

In [6]:
os.listdir('/nfs_master/prakrithi/abhay/testis_allfiles/')

['GSE154535_SRR12234213_Donor_OA.h5ad',
 'GSE153947_SRR12164935_Normal_3.h5ad',
 'GSE112013_SRR6860523_Donor3_rep1.h5ad',
 'GSE112013_SRR6860521_Donor2_rep1.h5ad',
 'GSE153947_SRR12164933_Normal_1.h5ad',
 'GSE153947_SRR12164938_Crypto_3.h5ad',
 'GSE154535_SRR12234211_Donor2_iNOA.h5ad',
 'GSE153947_SRR12164934_Normal_2.h5ad',
 'GSE112013_SRR6860524_Donor3_rep2.h5ad',
 'GSE154535_SRR12234212_Donor3_iNOA.h5ad',
 'GSE112013_SRR6860522_Donor2_rep2.h5ad',
 'GSE112013_SRR6860519_Donor1_rep1.h5ad',
 'GSE154535_SRR12234210_Donor1_iNOA.h5ad',
 'GSE112013_SRR6860520_Donor1_rep2.h5ad',
 'GSE153947_SRR12164936_Crypto_1.h5ad',
 'GSE153947_SRR12164937_Crypto_2.h5ad']

In [7]:
#will take 2-3 hours on HPC

path = '/nfs_master/prakrithi/abhay/testis_allfiles/'
out = []

for file in os.listdir(path):
    out.append(pp(path + file))

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [00:38<00:00, 10.48it/s, loss=527, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 131/400:  33%|████████████████████▎                                         | 131/400 [00:11<00:22, 11.75it/s, loss=0.468, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.485. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [04:29<00:00,  1.49it/s, loss=872, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 358/400:  90%|███████████████████████████████████████████████████████▍      | 358/400 [03:56<00:27,  1.52it/s, loss=0.253, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.221. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [03:02<00:00,  2.19it/s, loss=336, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 333/400:  83%|████████████████████████████████████████████████████▍          | 333/400 [02:29<00:30,  2.22it/s, loss=0.35, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.341. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [02:20<00:00,  2.85it/s, loss=410, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 289/400:  72%|████████████████████████████████████████████▊                 | 289/400 [01:38<00:37,  2.94it/s, loss=0.404, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.371. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [04:37<00:00,  1.44it/s, loss=841, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 246/400:  62%|██████████████████████████████████████▏                       | 246/400 [02:43<01:42,  1.51it/s, loss=0.183, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.171. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [04:39<00:00,  1.43it/s, loss=550, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 274/400:  68%|██████████████████████████████████████████▍                   | 274/400 [03:07<01:26,  1.46it/s, loss=0.241, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.216. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|███████████████████████████████████████████████████████████| 400/400 [00:26<00:00, 15.36it/s, loss=1.01e+03, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 208/400:  52%|████████████████████████████████▏                             | 208/400 [00:12<00:11, 16.97it/s, loss=0.378, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.433. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [04:52<00:00,  1.37it/s, loss=870, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 279/400:  70%|███████████████████████████████████████████▏                  | 279/400 [03:15<01:24,  1.43it/s, loss=0.231, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.207. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [05:10<00:00,  1.29it/s, loss=287, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 272/400:  68%|██████████████████████████████████████████▏                   | 272/400 [03:16<01:32,  1.38it/s, loss=0.379, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.366. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [04:54<00:00,  1.36it/s, loss=448, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 171/400:  43%|██████████████████████████▌                                   | 171/400 [01:55<02:34,  1.49it/s, loss=0.271, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.235. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [02:48<00:00,  2.38it/s, loss=386, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|██████████████████████████████████████████████████████████████| 400/400 [02:46<00:00,  2.40it/s, loss=0.374, v_num=1]


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [04:57<00:00,  1.34it/s, loss=269, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 282/400:  70%|███████████████████████████████████████████▋                  | 282/400 [03:20<01:23,  1.41it/s, loss=0.251, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.249. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [00:48<00:00,  8.19it/s, loss=652, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 306/400:  76%|███████████████████████████████████████████████▍              | 306/400 [00:35<00:10,  8.70it/s, loss=0.401, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.422. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [03:45<00:00,  1.77it/s, loss=370, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 130/400:  32%|████████████████████▍                                          | 130/400 [01:07<02:20,  1.92it/s, loss=0.31, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.270. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [03:13<00:00,  2.07it/s, loss=711, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 356/400:  89%|███████████████████████████████████████████████████████▏      | 356/400 [02:45<00:20,  2.15it/s, loss=0.219, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.210. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 400/400: 100%|████████████████████████████████████████████████████████████████| 400/400 [05:08<00:00,  1.30it/s, loss=357, v_num=1]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                            


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 351/400:  88%|██████████████████████████████████████████████████████▍       | 351/400 [03:16<00:27,  1.79it/s, loss=0.379, v_num=1]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.349. Signaling Trainer to stop.


  adata.obs['n_genes'] = number


In [20]:
out[10]


View of AnnData object with n_obs × n_vars = 1010 × 58389
    obs: 'Sample', 'doublet', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
    var: 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

In [21]:
adata = sc.concat(out)

  utils.warn_names_duplicates("obs")


In [22]:
adata

AnnData object with n_obs × n_vars = 12183 × 58389
    obs: 'Sample', 'doublet', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'

In [23]:
adata.obs

Unnamed: 0,Sample,doublet,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,pct_counts_ribo
AAACGGGGTCTGGAGA-1,/nfs_master/prakrithi/abhay/testis_allfiles/GS...,False,460,460,1021.0,0.0,0.0,12.0,1.175318
AACACGTGTCGAACAG-1,/nfs_master/prakrithi/abhay/testis_allfiles/GS...,False,862,862,1407.0,0.0,0.0,25.0,1.776830
AACCGCGAGGAGTACC-1,/nfs_master/prakrithi/abhay/testis_allfiles/GS...,False,840,840,2247.0,0.0,0.0,22.0,0.979083
AACCGCGCAGACAAAT-1,/nfs_master/prakrithi/abhay/testis_allfiles/GS...,False,863,863,1367.0,0.0,0.0,22.0,1.609364
AACGTTGTCCTATGTT-1,/nfs_master/prakrithi/abhay/testis_allfiles/GS...,False,386,386,733.0,0.0,0.0,10.0,1.364256
...,...,...,...,...,...,...,...,...,...
TTCTTAGGTGGAAAGA-1,/nfs_master/prakrithi/abhay/testis_allfiles/GS...,False,1239,1239,4738.0,0.0,0.0,60.0,1.266357
TTGAACGGTAGCGCTC-1,/nfs_master/prakrithi/abhay/testis_allfiles/GS...,False,432,432,810.0,0.0,0.0,14.0,1.728395
TTGGAACCATTGTGCA-1,/nfs_master/prakrithi/abhay/testis_allfiles/GS...,False,591,591,1635.0,0.0,0.0,21.0,1.284404
TTGGCAAGTTCCAACA-1,/nfs_master/prakrithi/abhay/testis_allfiles/GS...,False,260,260,1184.0,0.0,0.0,18.0,1.520270
