# Preprocessing of RNAseq data

In [2]:
import scanpy as sc
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
#import doubletdetection
from scipy.stats import median_abs_deviation as mad
import numpy as np

In [3]:
import warnings 
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

# Creation of the anndata frames

##### I'm using the filtered output from the 10X Genomics pipeline to generate a list of AnnData objects, one for each condition.

In [4]:
# Folder that holds the filtered 10x Genomics .h5 files (one per condition).
# NOTE(review): absolute, machine-specific path; adjust before running on another computer.
path_filtered_files_h5 = 'C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Data/h5_files_filtered'

In [5]:
# Names of the .h5 files to load. os.listdir returns entries in arbitrary
# order, so they are sorted to keep the sample order identical between runs.
adatas = sorted(
    file_name
    for file_name in os.listdir(path_filtered_files_h5)
    if file_name.endswith('.h5')
)

In [6]:
def load_it(adata):
    """Load one filtered 10x .h5 file and tag its cells with their origin.

    Parameters
    ----------
    adata : str
        File name (not a full path) inside ``path_filtered_files_h5``. The
        sample label and the dpi label are taken from the 6th and 7th
        underscore-separated fields of the name; the dpi field is cut at its
        first ``.`` (which drops the extension).

    Returns
    -------
    AnnData
        The object returned by ``sc.read_10x_h5`` with ``Sample``, ``dpi`` and
        ``Id`` (``<Sample>_<dpi>``) columns added to ``.obs`` and
        ``-<Sample>_<dpi>`` appended to every entry of the ``.obs`` index.

    Raises
    ------
    ValueError
        If the file name has fewer than seven underscore-separated fields.
    """
    # Keep the file name under its own name instead of rebinding the parameter
    # to the loaded object.
    file_name = adata
    fields = file_name.split('_')
    if len(fields) < 7:
        raise ValueError(
            f"Cannot read sample and dpi from file name {file_name!r}: "
            "expected at least 7 underscore-separated fields."
        )
    samp = fields[5]
    dpi = fields[6].split('.')[0]
    sample_id = samp + '_' + dpi

    loaded = sc.read_10x_h5(os.path.join(path_filtered_files_h5, file_name))
    loaded.obs['Sample'] = samp
    loaded.obs['dpi'] = dpi
    loaded.obs['Id'] = sample_id
    # Suffix the index with the sample id; presumably this keeps barcodes
    # unique if the samples are combined later.
    loaded.obs.index = loaded.obs.index + '-' + sample_id

    return loaded

In [7]:
# Replace every file name in the list with the AnnData object loaded from it.
adatas = [load_it(h5_name) for h5_name in adatas]

In [8]:
adatas

[AnnData object with n_obs × n_vars = 11004 × 18087
     obs: 'Sample', 'dpi', 'Id'
     var: 'gene_ids', 'feature_types', 'genome',
 AnnData object with n_obs × n_vars = 9137 × 18087
     obs: 'Sample', 'dpi', 'Id'
     var: 'gene_ids', 'feature_types', 'genome',
 AnnData object with n_obs × n_vars = 10542 × 18087
     obs: 'Sample', 'dpi', 'Id'
     var: 'gene_ids', 'feature_types', 'genome',
 AnnData object with n_obs × n_vars = 8525 × 18087
     obs: 'Sample', 'dpi', 'Id'
     var: 'gene_ids', 'feature_types', 'genome']

In [9]:
# Make the variable (gene) names of every sample unique; this modifies each object in place.
for sample_adata in adatas:
    sample_adata.var_names_make_unique()

In [10]:
def qc(adata):
    """Drop near-empty cells and annotate per-cell QC metrics.

    The object is modified in place and also returned.

    Steps:
      1. Remove cells with fewer than 200 detected genes.
      2. Flag genes in ``.var`` as ``mt`` (name starts with ``MT-``), ``ribo``
         (name starts with ``RPS`` or ``RPL``) or ``hb`` (name matches the
         regex ``^HB[^(P)]``).
      3. Run ``sc.pp.calculate_qc_metrics`` for those three gene sets, with
         the share of counts in the top 20 genes and log1p metrics.
      4. Drop the raw and log1p total-count columns of the three gene sets
         from ``.obs``; the ``pct_counts_*`` columns are kept.

    Parameters
    ----------
    adata : AnnData
        Count matrix of one sample.

    Returns
    -------
    AnnData
        The same object, filtered and annotated.
    """
    sc.pp.filter_cells(adata, min_genes=200)

    # Tag mitochondrial, ribosomal and haemoglobin genes by name.
    # NOTE(review): the upper-case prefixes assume human-style gene symbols; confirm for this genome.
    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    # Alternatives must be passed as ONE tuple: the second positional argument
    # of str.startswith is `na`, so startswith("RPS", "RPL") never matched RPL.
    adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
    # The pattern is a regular expression, so it needs str.contains;
    # str.startswith would look for the literal text "^HB[^(P)]".
    adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")

    sc.pp.calculate_qc_metrics(
        adata,
        qc_vars=["mt", "ribo", "hb"],
        inplace=True,
        percent_top=[20],
        log1p=True,
    )

    # Drop the per-gene-set count columns that are not used downstream.
    remove = ['total_counts_mt', 'log1p_total_counts_mt', 'total_counts_ribo',
              'log1p_total_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb']
    adata.obs = adata.obs.drop(columns=remove, errors="ignore")

    return adata

In [11]:
# Run the QC annotation on every sample.
adatas = [qc(sample_adata) for sample_adata in adatas]

In [12]:
adatas[0].obs

Unnamed: 0,Sample,dpi,Id,n_genes,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,pct_counts_mt,pct_counts_ribo,pct_counts_hb
AAACAAGCAAGAACAAAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4997,4997,8.516793,15760.0,9.665294,16.935279,0.158629,0.0,0.0
AAACAAGCACAATGGCAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,7298,7298,8.895493,36604.0,10.507940,22.724292,0.519069,0.0,0.0
AAACAAGCACACTAAGAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4357,4357,8.379769,14550.0,9.585415,26.054983,0.041237,0.0,0.0
AAACAAGCACCGTTTGAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,6425,6425,8.768108,37675.0,10.536778,39.777040,0.172528,0.0,0.0
AAACAAGCACTTCGATAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4802,4802,8.476996,12024.0,9.394743,9.971723,0.024950,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGAGTCCTTATTAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4338,4338,8.375399,12116.0,9.402365,21.153846,0.198085,0.0,0.0
TTTGTGAGTCGAAGTAAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,7765,7765,8.957511,43462.0,10.679666,13.416318,0.147255,0.0,0.0
TTTGTGAGTGCTGATTAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,5193,5193,8.555259,16969.0,9.739202,16.052802,0.070717,0.0,0.0
TTTGTGAGTGTCCAATAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,5683,5683,8.645410,17963.0,9.796125,20.503257,0.011134,0.0,0.0


In [13]:
os.makedirs('C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/qc_data', exist_ok = True)

In [15]:
# Write every QC-annotated sample to its own .h5ad file, named after its Id.
for ad in adatas:
    # All cells of one object share the same Id, so the first one is enough.
    # .iloc[0] is explicit positional access; ['Id'][0] on a string-labelled
    # Series relies on a deprecated integer fallback in pandas.
    unique_id = ad.obs['Id'].iloc[0]
    file_path = os.path.join('C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/qc_data', f'adata_{unique_id}.h5ad')
    ad.write(file_path)
    print(f"Saved: {file_path}")

Saved: C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/qc_data\adata_Inf_J3.h5ad
Saved: C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/qc_data\adata_Inf_J6.h5ad
Saved: C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/qc_data\adata_Mock_J3.h5ad
Saved: C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/qc_data\adata_Mock_J6.h5ad


# scDblFinder in RStudio

In [None]:
# Use the QC R file located in the "R files" folder

In [None]:
# This step must be run in RStudio, not in this notebook

# Remove the identified doublets and dead cells

In [4]:
# Folder with the .h5ad files read back in below; presumably written by the R doublet-detection step (confirm).
# NOTE(review): absolute, machine-specific path; adjust before running on another computer.
path_scDbFinder_data = "C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/scDbFinder_dataR"

In [5]:
# Read every .h5ad file in the folder. The names are sorted because
# os.listdir returns them in arbitrary order, and other file types are
# skipped so a stray file does not make read_h5ad fail.
adatas = [
    sc.read_h5ad(os.path.join(path_scDbFinder_data, file_name))
    for file_name in sorted(os.listdir(path_scDbFinder_data))
    if file_name.endswith('.h5ad')
]

In [6]:
adatas

[AnnData object with n_obs × n_vars = 10997 × 18087
     obs: 'Sample', 'dpi', 'Id', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet_score', 'doublet_class'
     var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts',
 AnnData object with n_obs × n_vars = 9128 × 18087
     obs: 'Sample', 'dpi', 'Id', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet_score', 'doublet_class'
     var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts',
 AnnData object with n_obs 

In [7]:
adatas[0].obs

Unnamed: 0,Sample,dpi,Id,n_genes,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,pct_counts_mt,pct_counts_ribo,pct_counts_hb,doublet_score,doublet_class
AAACAAGCAAGAACAAAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4997.0,4997.0,8.516793,15760.0,9.665294,16.935279,0.158629,0.0,0.0,0.004295,singlet
AAACAAGCACAATGGCAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,7298.0,7298.0,8.895493,36604.0,10.507940,22.724292,0.519069,0.0,0.0,0.999850,doublet
AAACAAGCACACTAAGAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4357.0,4357.0,8.379769,14550.0,9.585415,26.054983,0.041237,0.0,0.0,0.002755,singlet
AAACAAGCACCGTTTGAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,6425.0,6425.0,8.768108,37675.0,10.536778,39.777040,0.172528,0.0,0.0,0.149686,singlet
AAACAAGCACTTCGATAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4802.0,4802.0,8.476996,12024.0,9.394743,9.971723,0.024950,0.0,0.0,0.000124,singlet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGAGTCCTTATTAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4338.0,4338.0,8.375399,12116.0,9.402365,21.153846,0.198085,0.0,0.0,0.004089,singlet
TTTGTGAGTCGAAGTAAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,7765.0,7765.0,8.957511,43462.0,10.679666,13.416318,0.147255,0.0,0.0,0.999401,doublet
TTTGTGAGTGCTGATTAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,5193.0,5193.0,8.555259,16969.0,9.739202,16.052802,0.070717,0.0,0.0,0.000517,singlet
TTTGTGAGTGTCCAATAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,5683.0,5683.0,8.645410,17963.0,9.796125,20.503257,0.011134,0.0,0.0,0.024200,singlet


In [8]:
def mad_outlier(adata, metric, nmads, upper_only = False):
    """Flag cells whose ``metric`` is far from the median, measured in MADs.

    Parameters
    ----------
    adata : object with an ``.obs`` table
        ``adata.obs[metric]`` supplies the values to test.
    metric : str
        Name of the ``.obs`` column.
    nmads : number
        Allowed distance from the median, in median absolute deviations.
    upper_only : bool, default False
        If True, only values above ``median + nmads * MAD`` are flagged;
        otherwise values below ``median - nmads * MAD`` are flagged as well.

    Returns
    -------
    Boolean mask aligned with ``adata.obs[metric]``; True marks an outlier.
    """
    values = adata.obs[metric]
    center = np.median(values)
    spread = nmads * mad(values)

    too_high = values > center + spread
    if upper_only:
        return too_high

    too_low = values < center - spread
    return too_low | too_high

In [9]:
#clf = doubletdetection.BoostClassifier(
#    n_iters=10,
#    clustering_algorithm="louvain",
#    standard_scaling=True,
#    pseudocount=0.1,
#    n_jobs=-1)

In [10]:
def pp(adata):
    """Remove low-quality cells from one sample.

    Two filters are applied in order:

    1. Hard cutoff: keep only cells with ``pct_counts_mt`` below 6.
    2. MAD-based outliers, computed on the cells that passed step 1. A cell is
       dropped if it is more than 5 MADs from the median of
       ``log1p_total_counts``, ``log1p_n_genes_by_counts`` or
       ``pct_counts_in_top_20_genes``, or more than 3 MADs above the median
       of ``pct_counts_mt``.

    Doublets are not removed by this function.

    Parameters
    ----------
    adata : AnnData
        Sample whose ``.obs`` holds the QC columns named above.

    Returns
    -------
    AnnData
        A new object with the retained cells. ``.uns['cells_removed']`` holds
        the number of cells dropped in step 2 only; cells dropped by the hard
        cutoff in step 1 are not counted.
    """
    adata = adata[adata.obs.pct_counts_mt < 6]

    # Combine the masks with | (logical OR) instead of arithmetic + on booleans.
    is_outlier = (
        mad_outlier(adata, 'log1p_total_counts', 5)
        | mad_outlier(adata, 'log1p_n_genes_by_counts', 5)
        | mad_outlier(adata, 'pct_counts_in_top_20_genes', 5)
        | mad_outlier(adata, 'pct_counts_mt', 3, upper_only = True)
    )

    # Copy explicitly: the subset is a view, and writing to .uns on a view
    # would otherwise trigger an implicit copy.
    adata = adata[~is_outlier].copy()
    adata.uns['cells_removed'] = int(is_outlier.sum())

    return adata

In [11]:
# Apply the cell-quality filter to every sample.
adatas = [pp(sample_adata) for sample_adata in adatas]

In [14]:
# For each sample: number of cells left, followed by the singlet/doublet counts.
for sample_adata in adatas:
    n_cells = len(sample_adata)
    class_counts = sample_adata.obs.doublet_class.value_counts()
    print(n_cells, class_counts)

8699 doublet_class
singlet    7260
doublet    1439
Name: count, dtype: int64
7287 doublet_class
singlet    6213
doublet    1074
Name: count, dtype: int64
8057 doublet_class
singlet    6847
doublet    1210
Name: count, dtype: int64
6805 doublet_class
singlet    5863
doublet     942
Name: count, dtype: int64


In [15]:
adatas[0][adatas[0].obs.doublet_class == 'singlet'].obs

Unnamed: 0,Sample,dpi,Id,n_genes,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,pct_counts_mt,pct_counts_ribo,pct_counts_hb,doublet_score,doublet_class
AAACAAGCAAGAACAAAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4997.0,4997.0,8.516793,15760.0,9.665294,16.935279,0.158629,0.0,0.0,0.004295,singlet
AAACAAGCACACTAAGAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4357.0,4357.0,8.379769,14550.0,9.585415,26.054983,0.041237,0.0,0.0,0.002755,singlet
AAACAAGCACCGTTTGAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,6425.0,6425.0,8.768108,37675.0,10.536778,39.777040,0.172528,0.0,0.0,0.149686,singlet
AAACAAGCACTTCGATAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4802.0,4802.0,8.476996,12024.0,9.394743,9.971723,0.024950,0.0,0.0,0.000124,singlet
AAACAAGCATGGTCAAAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4699.0,4699.0,8.455318,15655.0,9.658609,23.027787,0.229958,0.0,0.0,0.000208,singlet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGCGGTGATCCTGAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,3204.0,3204.0,8.072467,6884.0,8.837100,15.862870,0.203370,0.0,0.0,0.000439,singlet
TTTGTGAGTCCTTATTAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,4338.0,4338.0,8.375399,12116.0,9.402365,21.153846,0.198085,0.0,0.0,0.004089,singlet
TTTGTGAGTGCTGATTAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,5193.0,5193.0,8.555259,16969.0,9.739202,16.052802,0.070717,0.0,0.0,0.000517,singlet
TTTGTGAGTGTCCAATAGTAGGCT-1-Inf_J3,Inf,J3,Inf_J3,5683.0,5683.0,8.645410,17963.0,9.796125,20.503257,0.011134,0.0,0.0,0.024200,singlet


In [16]:
def filter_doublets(adata):
    """Keep only the cells whose ``doublet_class`` in ``.obs`` equals 'singlet'.

    Returns the subset produced by indexing ``adata`` with the boolean mask.
    """
    is_singlet = adata.obs["doublet_class"] == 'singlet'
    return adata[is_singlet]

In [17]:
# Drop the cells flagged as doublets from every sample.
adatas = [filter_doublets(sample_adata) for sample_adata in adatas]

In [18]:
adatas

[View of AnnData object with n_obs × n_vars = 7260 × 18087
     obs: 'Sample', 'dpi', 'Id', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet_score', 'doublet_class'
     var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
     uns: 'cells_removed',
 View of AnnData object with n_obs × n_vars = 6213 × 18087
     obs: 'Sample', 'dpi', 'Id', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet_score', 'doublet_class'
     var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_t

In [19]:
os.makedirs('C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/processed_data', exist_ok = True)

In [21]:
# Write every filtered sample to its own .h5ad file, named after its Id.
for ad in adatas:
    # All cells of one object share the same Id, so the first one is enough.
    # .iloc[0] is explicit positional access; ['Id'][0] on a string-labelled
    # Series relies on a deprecated integer fallback in pandas.
    unique_id = ad.obs['Id'].iloc[0]
    file_path = os.path.join('C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/processed_data', f'adata_{unique_id}.h5ad')
    ad.write(file_path)
    print(f"Saved: {file_path}")

Saved: C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/processed_data\adata_Inf_J3.h5ad
Saved: C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/processed_data\adata_Inf_J6.h5ad
Saved: C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/processed_data\adata_Mock_J3.h5ad
Saved: C:/Users/MDV/Documents/Pipeline folder Yohan/scRNA/Output/processed_data\adata_Mock_J6.h5ad
