In [None]:
import scanpy as sc
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import doubletdetection
from scipy.stats import median_abs_deviation as mad
import numpy as np

In [None]:
import anndata2ri
import logging

import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro

# Raise the rpy2 callback logger to ERROR so lower-level messages are not logged.
rcb.logger.setLevel(logging.ERROR)
# Activate the pandas and anndata2ri conversion layers for rpy2.
ro.pandas2ri.activate()
anndata2ri.activate()

# Load the rpy2 IPython extension; the notebook's R cells use the %%R cell magic.
%load_ext rpy2.ipython

In [None]:
%%R
library(Seurat)
library(scater)
library(scDblFinder)
library(BiocParallel)

In [None]:
import warnings 
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

In [None]:
# Directory that is scanned for the filtered `.h5` input files.
# NOTE(review): absolute, machine-specific path — must be adjusted on any other machine.
path_filtered_files_h5 = 'C:/Users/test/Documents/yohan/Data processing pipeline/scRNA/Data/h5_files_filtered'

In [None]:
# File names (not yet loaded objects) of every `.h5` file in the input directory.
# os.listdir returns entries in arbitrary order, so sort them to keep the sample
# order identical from one run / machine to the next.
adatas = sorted(
    file_name
    for file_name in os.listdir(path_filtered_files_h5)
    if file_name.endswith('.h5')
)

In [None]:
def load_it(adata):
    """Read one filtered 10x ``.h5`` file and tag its cells with a sample id.

    Parameters
    ----------
    adata : str
        File name (not a full path) inside ``path_filtered_files_h5``. The
        name is split on ``'_'``; field 5 is taken as the sample and field 6,
        up to its first ``'.'``, as the dpi value (0-based fields).

    Returns
    -------
    The object returned by ``sc.read_10x_h5`` with an ``Id`` column
    (``"<sample>_<dpi>"``) in ``obs`` and ``"-<sample>_<dpi>"`` appended to
    every ``obs`` index label.

    Raises
    ------
    IndexError
        If the file name has fewer than seven ``'_'``-separated fields.
    """
    name_fields = adata.split('_')
    samp = name_fields[5]
    dpi = name_fields[6].split('.')[0]
    sample_id = samp + '_' + dpi

    loaded = sc.read_10x_h5(path_filtered_files_h5 + '/' + adata)
    loaded.obs['Id'] = sample_id
    # The suffix keeps cell labels distinguishable between samples.
    loaded.obs.index = loaded.obs.index + '-' + sample_id

    # Only the combined id is kept; separate 'Sample' / 'dpi' columns are dropped.
    dropped = ('Sample', 'dpi')
    loaded.obs = loaded.obs[[col for col in loaded.obs.columns if col not in dropped]]
    return loaded

In [None]:
# Replace each file name in `adatas` with the object loaded from it.
adatas = list(map(load_it, adatas))

In [None]:
# Display the list of loaded objects as the cell output.
adatas

In [None]:
# Make the variable names unique in every loaded sample (modifies each object in place).
for sample in adatas:
    sample.var_names_make_unique()

In [None]:
def qc(adata):
    """Filter near-empty cells, flag gene families and compute QC metrics.

    The object is modified in place and also returned.

    Steps:
      1. ``sc.pp.filter_cells`` with ``min_genes=200``.
      2. Boolean ``var`` columns ``mt`` (names starting with ``MT-``),
         ``ribo`` (names starting with ``RPS`` or ``RPL``) and ``hb``
         (names matching the regex ``^HB[^(P)]``).
      3. ``sc.pp.calculate_qc_metrics`` for those three gene sets.
      4. The raw / log1p per-family total-count columns are removed from
         ``obs`` if present.

    Parameters
    ----------
    adata
        Object exposing ``var_names``, ``var`` and ``obs`` as used below
        (the AnnData objects produced by ``load_it``).

    Returns
    -------
    The same ``adata`` object.
    """
    sc.pp.filter_cells(adata, min_genes=200)

    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    # Both prefixes must be passed as ONE tuple: a second positional argument
    # is interpreted as the `na` fill value, not as another prefix.
    adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
    # `startswith` compares literally, so the regex has to go through `contains`.
    adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")

    sc.pp.calculate_qc_metrics(
        adata,
        qc_vars=["mt", "ribo", "hb"],
        inplace=True,
        percent_top=[20],
        log1p=True,
    )

    remove = ['total_counts_mt', 'log1p_total_counts_mt', 'total_counts_ribo',
              'log1p_total_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb']

    adata.obs = adata.obs[[x for x in adata.obs.columns if x not in remove]]
    return adata

In [None]:
# Run the QC step on every sample and keep the returned objects.
adatas = list(map(qc, adatas))

In [None]:
# One transposed data matrix per sample; this list is the input of the R cell.
data_mat = [sample.X.T for sample in adatas]

In [None]:
# scDblFinder

In [None]:
%%R -i data_mat -o doublet_scores_list -o doublet_classes_list

# Run scDblFinder on every matrix of `data_mat` (passed in with -i) and collect,
# per matrix, the doublet scores and the doublet classes. Both result lists are
# handed back to Python (-o).
doublet_scores_list <- list()
doublet_classes_list <- list()


for (i in seq_along(data_mat)) {
  current_data <- data_mat[[i]]
  # Re-seed before each call so every matrix starts from the same RNG state.
  set.seed(123)
  # Wrap the matrix as the `counts` assay of a SingleCellExperiment.
  sce <- scDblFinder(
    SingleCellExperiment(
      list(counts = current_data)
    )
  )
  # Results are stored at position i, i.e. in the same order as `data_mat`.
  doublet_scores_list[[i]] <- sce$scDblFinder.score
  doublet_classes_list[[i]] <- sce$scDblFinder.class
}

In [None]:
# Attach the scDblFinder results to each sample; results are matched to samples
# by position in the lists.
for i, ad in enumerate(adatas):
    ad.obs["scDblFinder_score"] = doublet_scores_list[i]
    ad.obs["scDblFinder_class"] = doublet_classes_list[i]
    # The obs index holds string labels, so take the first row by position
    # with .iloc instead of relying on the integer-key fallback of [].
    print(ad.obs['Id'].iloc[0])
    # Inside a loop a bare expression is not displayed; print the counts explicitly.
    print(ad.obs.scDblFinder_class.value_counts())

In [None]:
def pp(adata):
    """Remove low-quality cells, then annotate doublets with ``clf``.

    Steps, in order:
      1. Keep cells whose ``pct_counts_mt`` is below 6.
      2. Drop every cell that ``mad_outlier`` flags on at least one of:
         ``log1p_total_counts`` (5), ``log1p_n_genes_by_counts`` (5),
         ``pct_counts_in_top_20_genes`` (5), ``pct_counts_mt``
         (3, ``upper_only=True``).
      3. Fit ``clf`` on the remaining matrix and store its predictions and
         scores in ``obs``.

    Doublets are only annotated here, not removed.

    NOTE(review): ``mad_outlier`` and ``clf`` are not defined in the visible
    cells; both must already exist in the kernel namespace when this runs —
    confirm where they come from. ``mad_outlier`` is assumed to return a
    boolean mask aligned with the cells of the object it is given.

    Parameters
    ----------
    adata
        Sample whose ``obs`` holds the QC columns listed above.

    Returns
    -------
    A new, independent object (the input is not modified) with
    ``obs['doublet_dbd']``, ``obs['doublet_score_dbd']``,
    ``uns['cells_removed']`` (number of cells dropped in step 2) and
    ``uns['doublets_removed_dbd']`` (sum of ``doublet_dbd``).
    """
    # Read-only view for now; the materialised copy is taken after filtering.
    adata = adata[adata.obs.pct_counts_mt < 6]

    # A cell is an outlier if ANY metric flags it, hence logical OR.
    bool_vector = (
        mad_outlier(adata, 'log1p_total_counts', 5)
        | mad_outlier(adata, 'log1p_n_genes_by_counts', 5)
        | mad_outlier(adata, 'pct_counts_in_top_20_genes', 5)
        | mad_outlier(adata, 'pct_counts_mt', 3, upper_only=True)
    )
    # Copy explicitly so the uns/obs writes below go to an independent object
    # instead of a view of the caller's data.
    adata = adata[~bool_vector].copy()

    adata.uns['cells_removed'] = int(sum(bool_vector))

    doublets = clf.fit(adata.X).predict(p_thresh=1e-16, voter_thresh=0.6)
    doublet_score = clf.doublet_score()

    adata.obs["doublet_dbd"] = doublets
    adata.obs["doublet_score_dbd"] = doublet_score

    # Despite the key name, this is the number of cells flagged as doublets;
    # they stay in the returned object.
    adata.uns['doublets_removed_dbd'] = adata.obs.doublet_dbd.sum()

    return adata

In [None]:
# Apply the filtering / doublet-annotation step to every sample.
adatas = list(map(pp, adatas))

In [None]:
# Section: choose the final doublet prediction, using either a machine-learning method or a weighted method

In [None]:
# Per sample: length of the object, number of outlier cells removed, number of flagged doublets.
for processed in adatas:
    summary = (
        len(processed),
        processed.uns['cells_removed'],
        processed.uns['doublets_removed_dbd'],
    )
    print(*summary)

In [None]:
# Output directory for processed data; created up front, no error if it already exists.
processed_data_dir = 'C:/Users/test/Documents/yohan/Data processing pipeline/scRNA/Output/processed_data'
os.makedirs(processed_data_dir, exist_ok=True)

In [None]:
# Write every processed sample to the processed-data output directory.
# The target is given explicitly (and created if missing) so the files do not
# end up in a "processed_data" folder relative to the current working directory.
processed_data_dir = 'C:/Users/test/Documents/yohan/Data processing pipeline/scRNA/Output/processed_data'
os.makedirs(processed_data_dir, exist_ok=True)

for ad in adatas:
    # Every cell of a sample carries the same Id; take the first one by position.
    unique_id = ad.obs['Id'].iloc[0]
    file_path = os.path.join(processed_data_dir, f'adata_{unique_id}.h5ad')
    ad.write(file_path)
    print(f"Saved: {file_path}")