In [3]:
import scrublet as scr
import scipy.io
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import csv
import scanpy as sc

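# Prerequisite (run once in a shell from the Cell Ranger output folder): decompress
# the 10x files so matrix.mtx, features.tsv and barcodes.tsv exist uncompressed:
#   gzip -dk out_*/filtered_feature_bc_matrix/*.gz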

In [10]:
# Read the sample sheet, then show the first Sanger sample ID as a quick check
# that the file and the expected column are present.
sample_sheet_path = "../../../data/raw/scRNAseq/4918stdy_organoids.csv"
samples = pd.read_csv(sample_sheet_path)
samples.loc[0, "SANGER SAMPLE ID"]

'4918STDY8615259'

In [12]:
# Run Scrublet doublet detection on every sample listed in the sample sheet.
# For each sample this:
#   * loads the filtered 10x count matrix (transposed after reading),
#   * scores doublets with Scrublet,
#   * saves the doublet-score histogram as a PDF,
#   * writes a per-barcode table (score + call) as CSV for R/Seurat,
#   * records the automatically chosen score threshold in `doublet_threshold`
#     (one entry per sample, in sample-sheet order).
data_root = '../../../data/raw/scRNAseq/scRNASEQ_DATA/Pediatric_organoids/'
fig_dir = '../../../figs/scRNAseq/doublets_organoid/'
table_dir = '../../../output/'

# Create the destination folders up front so a missing directory cannot make
# savefig/to_csv fail only after the expensive Scrublet run has finished.
os.makedirs(fig_dir, exist_ok=True)
os.makedirs(table_dir, exist_ok=True)

doublet_threshold = []

# Iterate over the ID values themselves rather than integer labels, so the loop
# does not depend on the sample sheet having a default 0..n-1 index.
for sample_id in samples["SANGER SAMPLE ID"]:
    input_dir = data_root + 'out_' + sample_id + '/filtered_feature_bc_matrix/'

    # Transposed so that rows match the barcodes file and columns match the
    # feature list (see the shape printout below).
    counts_matrix = scipy.io.mmread(input_dir + 'matrix.mtx').T.tocsc()
    genes = np.array(scr.load_genes(input_dir + 'features.tsv', delimiter='\t', column=1))

    print('Counts matrix shape: {} rows, {} columns'.format(counts_matrix.shape[0], counts_matrix.shape[1]))
    print('Number of feature in feature list: {}'.format(len(genes)))

    scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)

    # NOTE(review): earlier runs of this cell reported a detected doublet rate
    # of 0.0% against 6% expected; inspect the saved histograms and consider
    # setting the threshold manually if the automatic one looks misplaced.
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30,
    )

    # Save the score histogram, then close the figure: without the close every
    # sample leaves an open figure behind and memory grows with the sample count.
    fig, _ = scrub.plot_histogram()
    fig.savefig(fig_dir + 'doublet_histogram_' + sample_id + '.pdf')
    plt.close(fig)

    # Save thresholds to report.
    doublet_threshold.append(scrub.threshold_)

    # Save output for R/Seurat: one row per barcode, in the order of barcodes.tsv.
    barcodes = pd.read_csv(input_dir + 'barcodes.tsv', sep='\t', header=None)

    df = pd.DataFrame({
        'index': barcodes[0],
        'doublet_score': doublet_scores,
        'predicted_doublet': predicted_doublets,
    })
    df.to_csv(table_dir + 'scrublet_output_table_' + sample_id + '.csv', index=False)

Counts matrix shape: 5737 rows, 33538 columns
Number of feature in feature list: 33538
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.40
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 6.0%
	Estimated  = 0.0%
Elapsed time: 5.5 seconds
Counts matrix shape: 3864 rows, 33538 columns
Number of feature in feature list: 33538
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.47
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.0%
Overall doublet rate:
	Expected   = 6.0%
	Estimated  = 0.0%
Elapsed time: 3.2 seconds
Counts matrix shape: 3697 rows, 33538 columns
Number of feature in feature list: 33538
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet s

In [14]:
# Write the thresholds collected by the Scrublet loop to CSV: one row per
# sample in the order they were appended, with pandas' default integer index.
threshold_csv = "../../../output/doublet_threshold_organoid.csv"
doublet_threshold_df = pd.DataFrame(doublet_threshold)
doublet_threshold_df.to_csv(threshold_csv)