In [5]:
#%pip install scanpy matplotlib-venn harmonypy #leidenalg

In [8]:
import scanpy as sc
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import doubletdetection
from scipy.stats import median_abs_deviation as mad
import numpy as np

In [9]:
import warnings 
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

# Iterative QC

## Loading of the datasets

In [3]:
path_filtered_files_h5 = '/home/ybeaumatin/Documents/Data processing pipeline/scRNA/Data/h5_files_filtered'

In [4]:
# At this point `adatas` holds *file names* (strings), not loaded objects:
# every entry of the input folder whose name ends with '.h5', in the order
# os.listdir returns them.
adatas = [
    fname
    for fname in os.listdir(path_filtered_files_h5)
    if fname.endswith('.h5')
]

In [5]:
def load_it(adata):
    """Read one filtered 10x ``.h5`` file and tag its cells with sample metadata.

    Despite its name, ``adata`` is the *file name* of an entry inside
    ``path_filtered_files_h5``, not a loaded object. The sample label is the
    6th underscore-separated token of that name; the dpi label is the 7th
    token truncated at its first dot.

    Returns the object read by ``sc.read_10x_h5`` with three added ``obs``
    columns ('Sample', 'dpi', 'Id' = '<Sample>_<dpi>') and with
    '-<Sample>_<dpi>' appended to every entry of the ``obs`` index.
    """
    tokens = adata.split('_')
    samp = tokens[5]
    dpi = tokens[6].split('.')[0]

    loaded = sc.read_10x_h5(f"{path_filtered_files_h5}/{adata}")

    # Per-cell metadata columns.
    loaded.obs['Sample'] = samp
    loaded.obs['dpi'] = dpi
    loaded.obs['Id'] = loaded.obs['Sample'] + '_' + loaded.obs['dpi']

    # Suffix the index so it stays distinguishable across samples.
    loaded.obs.index = loaded.obs.index + f"-{samp}_{dpi}"

    return loaded

In [6]:
adatas = [load_it(ad) for ad in adatas]

In [7]:
adatas

[AnnData object with n_obs × n_vars = 9137 × 18087
     obs: 'Sample', 'dpi', 'Id'
     var: 'gene_ids', 'feature_types', 'genome',
 AnnData object with n_obs × n_vars = 10542 × 18087
     obs: 'Sample', 'dpi', 'Id'
     var: 'gene_ids', 'feature_types', 'genome',
 AnnData object with n_obs × n_vars = 8525 × 18087
     obs: 'Sample', 'dpi', 'Id'
     var: 'gene_ids', 'feature_types', 'genome',
 AnnData object with n_obs × n_vars = 11004 × 18087
     obs: 'Sample', 'dpi', 'Id'
     var: 'gene_ids', 'feature_types', 'genome']

## QC evaluation

In [8]:
for ad in adatas:
    ad.var_names_make_unique()

In [9]:
def qc(adata):
    """Filter near-empty cells and attach per-cell QC metrics to ``adata.obs``.

    Everything is applied to ``adata`` in place; the same object is returned.

    1. Drop cells with fewer than 200 detected genes.
    2. Flag gene families in ``adata.var``:
       'mt'   - names starting with "MT-";
       'ribo' - names starting with "RPS" or "RPL";
       'hb'   - names matching the regex ``^HB[^(P)]`` ("HB" followed by any
                character other than "(", "P" or ")").
    3. Run ``sc.pp.calculate_qc_metrics`` with those three families as
       ``qc_vars`` (top-20 gene percentage, log1p columns enabled).
    4. Drop the raw and log1p total-count columns of the three families from
       ``adata.obs``; the ``pct_counts_*`` columns are kept.
    """
    sc.pp.filter_cells(adata, min_genes=200)

    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    # Several prefixes must be passed as ONE tuple: a second positional
    # argument of ``.str.startswith`` is the ``na`` fill value, not another
    # prefix, so ``startswith("RPS", "RPL")`` only ever matched "RPS".
    adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
    # ``.str.startswith`` compares literally, so a regex pattern never
    # matched any gene; ``.str.contains`` evaluates it as a regular expression.
    adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")

    sc.pp.calculate_qc_metrics(
        adata,
        qc_vars=["mt", "ribo", "hb"],
        inplace=True,
        percent_top=[20],
        log1p=True,
    )

    remove = ['total_counts_mt', 'log1p_total_counts_mt', 'total_counts_ribo',
              'log1p_total_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb']

    # errors="ignore" mirrors the original "keep everything not listed" filter,
    # which did not fail when one of the columns was absent.
    adata.obs = adata.obs.drop(columns=remove, errors="ignore")
    return adata

In [10]:
adatas = [qc(ad) for ad in adatas]

In [14]:
adatas[0].obs

Unnamed: 0,Sample,dpi,Id,n_genes,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,pct_counts_mt,pct_counts_ribo,pct_counts_hb
AAACAAGCAGCTCGCTATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,4730,4730,8.461892,14274.0,9.566265,18.775396,0.175144,0.0,0.0
AAACCAATCACCTAATATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,8415,8415,9.037890,54307.0,10.902427,19.159593,0.252270,0.0,0.0
AAACCAATCATTATGCATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,5073,5073,8.531885,18919.0,9.847975,29.494159,0.190285,0.0,0.0
AAACCAATCATTGCATATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,5705,5705,8.649274,20530.0,9.929691,16.585485,0.998539,0.0,0.0
AAACCAATCCAAATTCATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,5398,5398,8.593969,14307.0,9.568574,6.304606,0.146781,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGCGGTAACTACCATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,2631,2631,7.875499,4842.0,8.485290,15.386204,0.309789,0.0,0.0
TTTGGCGGTAAGCGTAATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,4769,4769,8.470102,15426.0,9.643874,15.493323,0.187994,0.0,0.0
TTTGGCGGTCTGTGATATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,3147,3147,8.054523,7126.0,8.871646,22.537188,0.266629,0.0,0.0
TTTGGCGGTTGGATGAATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,4975,4975,8.512382,15833.0,9.669915,15.208741,0.025264,0.0,0.0


In [15]:
os.makedirs('/home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/qc_data', exist_ok = True)

In [16]:
# Write each QC'ed sample to its own .h5ad file, named after the sample's 'Id'.
for ad in adatas:
    # load_it() assigns one 'Id' value to all cells of a sample, so the first
    # row is representative. `.iloc[0]` makes the positional lookup explicit:
    # `['Id'][0]` on a string-indexed Series relies on a deprecated fallback
    # that treats the integer key as a position.
    unique_id = ad.obs['Id'].iloc[0]
    file_path = os.path.join('/home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/qc_data', f'adata_{unique_id}.h5ad')
    ad.write(file_path)
    print(f"Saved: {file_path}")

Saved: /home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/qc_data/adata_Inf_J6.h5ad
Saved: /home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/qc_data/adata_Mock_J3.h5ad
Saved: /home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/qc_data/adata_Mock_J6.h5ad
Saved: /home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/qc_data/adata_Inf_J3.h5ad


## Doublet detection with external software

In [None]:
#scDblFinder, run in RStudio

In [None]:
#Use the file QC R in the folder R files

In [10]:
path_scDbFinder_data = "/home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/scDbFinder_dataR"

In [11]:
adatas = [sc.read_h5ad(path_scDbFinder_data + '/' + x) for x in os.listdir(path_scDbFinder_data)]

In [12]:
adatas

[AnnData object with n_obs × n_vars = 9128 × 18087
     obs: 'Sample', 'dpi', 'Id', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet_score_scDbFinder', 'doublet_class_scDbFinder'
     var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts',
 AnnData object with n_obs × n_vars = 8525 × 18087
     obs: 'Sample', 'dpi', 'Id', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet_score_scDbFinder', 'doublet_class_scDbFinder'
     var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p

In [13]:
adatas[0].obs

Unnamed: 0,Sample,dpi,Id,n_genes,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,pct_counts_mt,pct_counts_ribo,pct_counts_hb,doublet_score_scDbFinder,doublet_class_scDbFinder
AAACAAGCAGCTCGCTATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,4730.0,4730,8.461892,14274.0,9.566265,18.775396,0.175144,0.0,0.0,0.029705,singlet
AAACCAATCACCTAATATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,8415.0,8415,9.037890,54307.0,10.902427,19.159593,0.252270,0.0,0.0,0.999955,doublet
AAACCAATCATTATGCATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,5073.0,5073,8.531885,18919.0,9.847975,29.494159,0.190285,0.0,0.0,0.002290,singlet
AAACCAATCATTGCATATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,5705.0,5705,8.649274,20530.0,9.929691,16.585485,0.998539,0.0,0.0,0.994010,doublet
AAACCAATCCAAATTCATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,5398.0,5398,8.593969,14307.0,9.568574,6.304606,0.146781,0.0,0.0,0.015603,singlet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGCGGTAACTACCATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,2631.0,2631,7.875499,4842.0,8.485290,15.386204,0.309789,0.0,0.0,0.000367,singlet
TTTGGCGGTAAGCGTAATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,4769.0,4769,8.470102,15426.0,9.643874,15.493323,0.187994,0.0,0.0,0.004287,singlet
TTTGGCGGTCTGTGATATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,3147.0,3147,8.054523,7126.0,8.871646,22.537188,0.266629,0.0,0.0,0.012853,singlet
TTTGGCGGTTGGATGAATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,4975.0,4975,8.512382,15833.0,9.669915,15.208741,0.025264,0.0,0.0,0.002952,singlet


## Outliers annotation

In [14]:
def mad_outlier(adata, metric, nmads, upper_only = False):
    """Flag cells whose ``metric`` lies too far from the median.

    Distance is measured in median absolute deviations (``mad``). A cell is
    flagged when ``adata.obs[metric]`` is more than ``nmads`` MADs above the
    median or, unless ``upper_only`` is true, more than ``nmads`` MADs below
    it.

    Returns a boolean mask with one entry per row of ``adata.obs``
    (True = outlier).
    """
    values = adata.obs[metric]
    center = np.median(values)
    spread = nmads * mad(values)

    too_high = values > center + spread
    if upper_only:
        return too_high

    too_low = values < center - spread
    return too_low | too_high

In [15]:
# Shared doubletdetection classifier instance. pp() (defined below) calls
# clf.fit(adata.X).predict(...) and clf.doublet_score() on it for each sample,
# so the same object is reused across samples.
clf = doubletdetection.BoostClassifier(
    n_iters=10,
    clustering_algorithm="louvain",
    standard_scaling=True,
    pseudocount=0.1,
    n_jobs=-1)

In [16]:
def pp(adata):
    """Remove low-quality cells from one sample and annotate doublets.

    1. Keep cells whose ``pct_counts_mt`` is below 6.
    2. Among those, drop every cell flagged by ``mad_outlier`` on any of
       'log1p_total_counts', 'log1p_n_genes_by_counts',
       'pct_counts_in_top_20_genes' (5 MADs, both tails) or 'pct_counts_mt'
       (3 MADs, upper tail only).
    3. Fit the module-level ``clf`` on the remaining ``.X`` and store its
       predictions in ``obs['doublet_dbd']`` and its scores in
       ``obs['doublet_score_dbd']``.

    Doublets are only annotated here, not removed.

    Returns a new, independent AnnData (the input is not modified) with
    ``uns['cells_removed']`` = number of MAD outliers dropped in step 2
    (cells dropped by the step-1 threshold are not counted) and
    ``uns['doublets_removed_dbd']`` = sum of ``obs['doublet_dbd']``.
    """
    # A view is enough here: mad_outlier only reads from .obs.
    mt_filtered = adata[adata.obs.pct_counts_mt < 6]

    # "|" states the intended logical OR explicitly (the masks are boolean).
    is_outlier = (
        mad_outlier(mt_filtered, 'log1p_total_counts', 5)
        | mad_outlier(mt_filtered, 'log1p_n_genes_by_counts', 5)
        | mad_outlier(mt_filtered, 'pct_counts_in_top_20_genes', 5)
        | mad_outlier(mt_filtered, 'pct_counts_mt', 3, upper_only=True)
    )

    # Materialise the subset before writing to it: assigning to .uns/.obs of
    # a view forces an implicit copy (and a warning); copy once, explicitly.
    adata = mt_filtered[~is_outlier].copy()

    adata.uns['cells_removed'] = int(is_outlier.sum())

    doublets = clf.fit(adata.X).predict(p_thresh=1e-16, voter_thresh=0.5)
    doublet_score = clf.doublet_score()

    adata.obs["doublet_dbd"] = doublets
    adata.obs["doublet_score_dbd"] = doublet_score

    adata.uns['doublets_removed_dbd'] = adata.obs.doublet_dbd.sum()
    #adata = adata[adata.obs.doublet_dbd == 0]
    #adata = adata[adata.obs.doublet_class_scDbFinder == 0]

    return adata

In [17]:
adatas = [pp(ad) for ad in adatas]

  0%|          | 0/10 [00:00<?, ?it/s]

AttributeError: 'csr_matrix' object has no attribute 'A'

In [11]:
# Per-sample summary: number of cells, value of uns['cells_removed'],
# value of uns['doublets_removed_dbd'], and the scDbFinder class counts.
for adata in adatas:
    scdbf_counts = adata.obs.doublet_class_scDbFinder.value_counts()
    print(len(adata), adata.uns['cells_removed'], adata.uns['doublets_removed_dbd'], scdbf_counts)

7287 1838 973.0 doublet_class_scDbFinder
singlet    6007
doublet    1280
Name: count, dtype: int64
6805 1712 863.0 doublet_class_scDbFinder
singlet    5398
doublet    1407
Name: count, dtype: int64
8699 2291 1198.0 doublet_class_scDbFinder
singlet    7142
doublet    1557
Name: count, dtype: int64
8057 2459 1189.0 doublet_class_scDbFinder
singlet    6753
doublet    1304
Name: count, dtype: int64


In [20]:
adatas[0][adatas[0].obs.doublet_class_scDbFinder == 'singlet'].obs

Unnamed: 0,Sample,dpi,Id,n_genes,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,pct_counts_mt,pct_counts_ribo,pct_counts_hb,doublet_score_scDbFinder,doublet_class_scDbFinder,doublet_dbd,doublet_score_dbd
AAACAAGCAGCTCGCTATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,4730.0,4730,8.461892,14274.0,9.566265,18.775396,0.175144,0.0,0.0,0.029705,singlet,0.0,8.614676e-14
AAACCAATCATTATGCATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,5073.0,5073,8.531885,18919.0,9.847975,29.494159,0.190285,0.0,0.0,0.002290,singlet,0.0,2.221546e-08
AAACCAATCCAAATTCATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,5398.0,5398,8.593969,14307.0,9.568574,6.304606,0.146781,0.0,0.0,0.015603,singlet,0.0,4.507599e-12
AAACCAATCGAAATGCATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,5514.0,5514,8.615227,20776.0,9.941602,24.234694,0.096265,0.0,0.0,0.026639,singlet,0.0,5.325626e-11
AAACCAATCTGGAACGATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,4395.0,4395,8.388450,11835.0,9.378901,17.769328,0.430925,0.0,0.0,0.108566,singlet,0.0,7.174440e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGCGGTAACTACCATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,2631.0,2631,7.875499,4842.0,8.485290,15.386204,0.309789,0.0,0.0,0.000367,singlet,0.0,1.185642e-20
TTTGGCGGTAAGCGTAATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,4769.0,4769,8.470102,15426.0,9.643874,15.493323,0.187994,0.0,0.0,0.004287,singlet,0.0,1.044962e-09
TTTGGCGGTCTGTGATATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,3147.0,3147,8.054523,7126.0,8.871646,22.537188,0.266629,0.0,0.0,0.012853,singlet,0.0,4.640930e-12
TTTGGCGGTTGGATGAATGTTGAC-1-Inf_J6,Inf,J6,Inf_J6,4975.0,4975,8.512382,15833.0,9.669915,15.208741,0.025264,0.0,0.0,0.002952,singlet,0.0,1.707393e-11


In [21]:
#def filter_doublets(adata):           -->    removed to keep the outliers 
    #adata = adata[adata.obs.doublet_class_scDbFinder == 'singlet']
    #return adata

In [22]:
#adatas = [filter_doublets(ad) for ad in adatas]

In [23]:
adatas

[View of AnnData object with n_obs × n_vars = 6007 × 18087
     obs: 'Sample', 'dpi', 'Id', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet_score_scDbFinder', 'doublet_class_scDbFinder', 'doublet_dbd', 'doublet_score_dbd'
     var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
     uns: 'cells_removed', 'doublets_removed_dbd',
 View of AnnData object with n_obs × n_vars = 5398 × 18087
     obs: 'Sample', 'dpi', 'Id', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet_score_scDbFinder', 'doublet_class_scDbFinder', 'doublet_dbd', 'doublet_score_dbd'
     var: 'gene_ids', 'feature_type

In [24]:
os.makedirs('/home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output_iteration/processed_data', exist_ok = True)

In [25]:
# Write each processed sample to its own .h5ad file, named after the first
# value of its 'Id' column.
for ad in adatas:
    # `.iloc[0]` makes the positional lookup explicit: `['Id'][0]` on a
    # string-indexed Series relies on a deprecated fallback that treats the
    # integer key as a position.
    unique_id = ad.obs['Id'].iloc[0]
    file_path = os.path.join('/home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output_iteration/processed_data', f'adata_{unique_id}.h5ad')
    ad.write(file_path)
    print(f"Saved: {file_path}")

Saved: /home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/processed_data/adata_Inf_J6.h5ad
Saved: /home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/processed_data/adata_Mock_J6.h5ad
Saved: /home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/processed_data/adata_Inf_J3.h5ad
Saved: /home/ybeaumatin/Documents/Data processing pipeline/scRNA/Output/processed_data/adata_Mock_J3.h5ad


## Integration