In [None]:
%matplotlib inline
import scrublet as scr
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import os
from anndata import AnnData
import scanpy.api as sc
import seaborn as sns

## Run scrublet using log counts & scale


In [None]:
# Run configuration.
# sample_name: used to build the working directory and every input/output file name.
# neotic_ratio: multiplicative factor applied to the expected doublet rate.
sample_name = "JYH_857_1_2"
neotic_ratio = 0.5

In [None]:
# Load the sample's AnnData object from <cwd>/<sample_name>/<sample_name>.adata.h5ad.
wd = os.path.join(os.getcwd(), sample_name)
adata = sc.read_h5ad(
    filename=os.path.join(wd, '{}.adata.h5ad'.format(sample_name)))

# Read the cell identifiers to exclude, one per line, from <sample_name>.filtered.txt.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
# NOTE(review): the variable name suggests these are low-FRiP cells -- confirm upstream.
with open(os.path.join(wd, '{0}.filtered.txt'.format(sample_name))) as handle:
    low_frip = handle.read().splitlines()

# Keep only the cells whose obs index is NOT in the exclusion list.
adata = adata[~adata.obs.index.isin(low_frip), :].copy()
adata


1. Use the most variable genomic bins as the "genes" passed to scrublet (consistent with the clustering steps).
2. Convert the stored log-transformed values back to counts (`expm1`) before running scrublet.

In [None]:
# Expected doublet rate: 0.01 per 1000 cells, scaled by neotic_ratio.
n_cells = adata.shape[0]
expected_doublet_th = n_cells / 1000 * .01 * neotic_ratio
expected_doublet_th

In [None]:
# Build the matrix handed to scrublet: take adata.raw, keep only the features
# that are also present in adata.var_names, and undo the log1p transform.
_raw_copy = adata.raw.copy()
_kept_features = _raw_copy.var.index.isin(adata.var_names.tolist())
counts_matris_2 = _raw_copy[:, _kept_features].X.expm1()

# Drop the temporaries so the raw copy does not linger in kernel memory.
del _raw_copy, _kept_features

In [None]:
# Initialise scrublet on the count matrix with the expected doublet rate.
scrub = scr.Scrublet(
    counts_matris_2,
    expected_doublet_rate=expected_doublet_th,
)

# Scoring options, collected in one place for readability.
# NOTE(review): min_gene_variability_pctl=0 presumably disables scrublet's own
# variable-feature filter (features were already selected upstream) -- confirm.
scrub_options = dict(
    distance_metric='cosine',
    mean_center=False,
    n_prin_comps=50,
    log_transform=True,
    min_gene_variability_pctl=0,
)
doublet_scores, predicted_doublets = scrub.scrub_doublets(**scrub_options)
scrub.plot_histogram()

In [None]:
# Call doublets directly by threshold: the cutoff is the score quantile that
# leaves a fraction `expected_doublet_th` of the cells above it.
score_cutoff = np.quantile(doublet_scores, 1 - expected_doublet_th)
predicted_doublets = scrub.call_doublets(threshold=score_cutoff)

# Report the number and the fraction of cells called as doublets.
n_called = sum(predicted_doublets)
print(n_called)
print(n_called / len(predicted_doublets))

In [None]:
import pandas as pd

# Attach the scrublet results and a log10-scaled read-depth column to the
# per-cell metadata.
adata.obs['doublet_score'] = doublet_scores
adata.obs['doublet'] = predicted_doublets
adata.obs['log10_unique_usable_reads'] = np.log10(
    adata.obs['unique_usable_reads'] + 1)

# Independent copies of the predicted doublets and of the remaining cells.
adata_doub = adata[predicted_doublets].copy()
adata_nondoub = adata[~predicted_doublets].copy()

# Per-cluster cell counts: doublets, non-doublets, and all cells.
doub_sum = pd.concat([
    adata_doub.obs.leiden.value_counts(),
    adata_nondoub.obs.leiden.value_counts(),
    adata.obs.leiden.value_counts()
], axis=1)
doub_sum.columns = ['doublet', 'non_doublet', 'init']

# A cluster that is missing from one of the subsets is aligned as NaN by
# concat; it really means "zero cells", so store it as an integer 0.
doub_sum = doub_sum.fillna(0).astype(int)

# Percentage of each cluster called as doublet, rounded to a whole number.
# The column name 'pertage' is kept as-is because the plotting cells filter on it.
doub_sum['pertage'] = (doub_sum['doublet'] / doub_sum['init'] * 100).round()
doub_sum

In [None]:
# ---- Figure 1: UMAPs of all cells (first five panels) and the stacked
# ---- per-cluster doublet / non-doublet counts (sixth panel).
fig, axs = plt.subplots(2, 3, figsize=(12, 8))
to_plot = [
    'log10_unique_usable_reads', 'frac_promoters_used', 'doublet_score',
    'doublet', 'leiden'
]
for i, ax in enumerate(axs.reshape(-1)):
    if i == 5:
        # Last panel: counts only, without the 'pertage' and 'init' columns.
        count_cols = ~doub_sum.columns.isin(['pertage', 'init'])
        doub_sum.loc[:, count_cols].plot.bar(stacked=True, ax=ax)
        continue
    panel_cmap = 'Blues' if i == 0 else 'Reds'
    sc.pl.umap(adata,
               color=to_plot[i],
               cmap=panel_cmap,
               size=9,
               ax=ax,
               show=False,
               legend_loc='on data')

plt.tight_layout()
plt.show()

# ---- Figure 2: per-cluster box plots of the QC metrics.
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)
to_plot = [
    'log10_usable_counts', 'frac_reads_in_peaks', 'frac_reads_in_promoters',
    'frac_promoters_used', 'doublet_score', 'frac_duplicated_reads'
]
for ax, metric in zip(axs.reshape(-1), to_plot):
    sns.boxplot(x='leiden', y=metric, data=adata.obs, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(metric)

plt.tight_layout()
plt.show()

# ---- Figure 3: the same QC metrics drawn on the UMAP.
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)
for i, ax in enumerate(axs.reshape(-1)):
    panel_cmap = 'Blues' if i == 0 else 'Reds'
    sc.pl.umap(adata,
               color=to_plot[i],
               cmap=panel_cmap,
               size=9,
               ax=ax,
               show=False,
               legend_loc='on data')

plt.tight_layout()
plt.show()

In [None]:
# QC figures restricted to the cells NOT called as doublets (adata_nondoub).

# Re-assign the annotation columns so this cell can be re-run on its own.
adata.obs['doublet_score'] = doublet_scores
adata.obs['doublet'] = predicted_doublets
adata.obs['log10_unique_usable_reads'] = np.log10(
    adata.obs['unique_usable_reads'] + 1)

# One handle for the subset drawn in every figure of this cell. Previously the
# third figure plotted the full `adata` object, so it simply repeated the
# all-cells figure instead of showing the non-doublets.
adata_shown = adata_nondoub

# ---- Figure 1: UMAPs (first five panels) and the stacked per-cluster
# ---- doublet / non-doublet counts (sixth panel).
fig, axs = plt.subplots(2, 3, figsize=(12, 8))
to_plot = [
    'log10_unique_usable_reads', 'frac_promoters_used', 'doublet_score',
    'doublet', 'leiden'
]
for i, ax in enumerate(axs.reshape(-1)):
    if i == 5:
        doub_sum.loc[:, ~doub_sum.columns.isin(['pertage', 'init'])].plot.bar(
            stacked=True, ax=ax)
    else:
        sc.pl.umap(adata_shown,
                   color=to_plot[i],
                   cmap='Blues' if i == 0 else 'Reds',
                   size=9,
                   ax=ax,
                   show=False,
                   legend_loc='on data')

plt.tight_layout()
plt.show()

# ---- Figure 2: per-cluster box plots of the QC metrics.
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)
to_plot = [
    'log10_usable_counts', 'frac_reads_in_peaks', 'frac_reads_in_promoters',
    'frac_promoters_used', 'doublet_score', 'frac_duplicated_reads'
]
for ax, metric in zip(axs.reshape(-1), to_plot):
    sns.boxplot(x='leiden', y=metric, data=adata_shown.obs, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(metric)

plt.tight_layout()
plt.show()

# ---- Figure 3: the same QC metrics drawn on the UMAP of the same subset.
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)
for i, ax in enumerate(axs.reshape(-1)):
    sc.pl.umap(adata_shown,
               color=to_plot[i],
               cmap='Blues' if i == 0 else 'Reds',
               size=9,
               ax=ax,
               show=False,
               legend_loc='on data')

plt.tight_layout()
plt.show()

In [None]:
# QC figures restricted to the cells called as doublets (adata_doub).

# Re-assign the annotation columns so this cell can be re-run on its own.
adata.obs['doublet_score'] = doublet_scores
adata.obs['doublet'] = predicted_doublets
adata.obs['log10_unique_usable_reads'] = np.log10(
    adata.obs['unique_usable_reads'] + 1)

# One handle for the subset drawn in every figure of this cell. Previously the
# third figure plotted the full `adata` object, so it simply repeated the
# all-cells figure instead of showing the doublets.
adata_shown = adata_doub

# ---- Figure 1: UMAPs (first five panels) and the stacked per-cluster
# ---- doublet / non-doublet counts (sixth panel).
fig, axs = plt.subplots(2, 3, figsize=(12, 8))
to_plot = [
    'log10_unique_usable_reads', 'frac_promoters_used', 'doublet_score',
    'doublet', 'leiden'
]
for i, ax in enumerate(axs.reshape(-1)):
    if i == 5:
        doub_sum.loc[:, ~doub_sum.columns.isin(['pertage', 'init'])].plot.bar(
            stacked=True, ax=ax)
    else:
        sc.pl.umap(adata_shown,
                   color=to_plot[i],
                   cmap='Blues' if i == 0 else 'Reds',
                   size=9,
                   ax=ax,
                   show=False,
                   legend_loc='on data')

plt.tight_layout()
plt.show()

# ---- Figure 2: per-cluster box plots of the QC metrics.
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)
to_plot = [
    'log10_usable_counts', 'frac_reads_in_peaks', 'frac_reads_in_promoters',
    'frac_promoters_used', 'doublet_score', 'frac_duplicated_reads'
]
for ax, metric in zip(axs.reshape(-1), to_plot):
    sns.boxplot(x='leiden', y=metric, data=adata_shown.obs, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(metric)

plt.tight_layout()
plt.show()

# ---- Figure 3: the same QC metrics drawn on the UMAP of the same subset.
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)
for i, ax in enumerate(axs.reshape(-1)):
    sc.pl.umap(adata_shown,
               color=to_plot[i],
               cmap='Blues' if i == 0 else 'Reds',
               size=9,
               ax=ax,
               show=False,
               legend_loc='on data')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise relationships between the QC metrics, coloured by doublet call.
# The call is stored as a categorical column so it can be used as the hue.
adata.obs['doublet'] = predicted_doublets
adata.obs['doublet'] = adata.obs['doublet'].astype('category')

pairplot_columns = [
    'log10_unique_usable_reads', 'frac_promoters_used', 'doublet_score',
    'doublet'
]
g = sns.pairplot(adata.obs[pairplot_columns], hue='doublet')

In [None]:
# Read depth vs. promoter usage, coloured by doublet call.
# The call is stored as an integer column so it can be mapped onto the colormap.
adata.obs['doublet'] = predicted_doublets.astype('int')

scatter_options = dict(
    x='log10_unique_usable_reads',
    y='frac_promoters_used',
    c='doublet',
    colormap='viridis',
    alpha=.25,
)
adata.obs.plot.scatter(**scatter_options)

In [None]:

# Compute a UMAP of scrublet's own manifold and register it on the Scrublet
# object under the name 'UMAP'.
print('Running UMAP...')
umap_coords = scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3)
scrub.set_embedding('UMAP', umap_coords)

# Optional, slower embeddings -- uncomment to run.
# print('Running tSNE...')
# scrub.set_embedding('tSNE', scr.get_tsne(scrub.manifold_obs_, angle=0.9))
# print('Running ForceAtlas2...')
# scrub.set_embedding('FA', scr.get_force_layout(scrub.manifold_obs_, n_neighbors=5, n_iter=1000))

print('Done.')
scrub.plot_embedding('UMAP', order_points=True);

In [None]:
# Save the per-cell doublet call and score, indexed like adata.obs, as a
# comma-separated file at <wd>/<sample_name>.doublet_result.txt.
result_columns = ['predicted_doublets', 'doublet_scores']
result_data = {
    'predicted_doublets': predicted_doublets,
    'doublet_scores': doublet_scores,
}
doublet_result = pd.DataFrame(result_data,
                              columns=result_columns,
                              index=adata.obs.index)

result_path = os.path.join(wd, '{0}.doublet_result.txt'.format(sample_name))
doublet_result.to_csv(result_path)