## A molecular single-cell lung atlas of lethal COVID-19
##### reference: https://www.nature.com/articles/s41586-021-03569-1
##### platform: Illumina NovaSeq 6000 and snRNA-seq using a droplet-based platform (10x Genomics)
##### overall design: Single-nuclei RNA sequencing of 116,314 cells from 20 frozen lungs obtained from 19 COVID-19 decedents and seven control patients.
##### control: 7 patients vs disease: 19 covid-19 patients

In [2]:
import scanpy as sc
import scvi

import os
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

import seaborn as sns


## set the global random seed used by scvi-tools (the SCVI and SOLO models trained below)
scvi.settings.seed = 0

  self.seed = seed
  self.dl_pin_memory_gpu_training = (
  from .autonotebook import tqdm as notebook_tqdm
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)
Global seed set to 0


# Pre-process all the samples

In [3]:
## directory holding the per-sample raw-count files; the processed .h5ad files are written here too.
## NOTE: keep the trailing '/' -- output paths below are built by plain string concatenation.
data_dir = '/home/bonny/Documents/bioinformatics_workspace/datasets/single_cell_pipline/covid-19/'

#### Function to pre-process all the samples

In [4]:
## Function to pre-process all the samples

def _predict_doublets(input_csv: str):
    """Return the obs names (barcodes) of cells that SOLO calls doublets.

    The counts are loaded from ``input_csv`` and reduced to the 2000 most
    variable genes before an SCVI model, and then a SOLO model on top of it,
    are trained. A cell is reported only when SOLO's hard call is 'doublet'
    AND its doublet score exceeds its singlet score by more than 1.

    This lives in its own function because the gene filtering / HVG
    subsetting is destructive: the AnnData used here is thrown away and the
    caller reloads the full matrix for the actual pre-processing.
    """
    ## load .csv file (transposed after loading; presumably the file is genes x cells -- confirm)
    hvg_adata = sc.read_csv(input_csv).T

    ## filter genes : at least in 10 cells
    sc.pp.filter_genes(hvg_adata, min_cells=10)

    ## get highly variable 2000 genes
    sc.pp.highly_variable_genes(hvg_adata, n_top_genes=2000, subset=True, flavor='seurat_v3')

    ## load and train SCVI model to identify 'doublets'
    scvi.model.SCVI.setup_anndata(hvg_adata)
    vae = scvi.model.SCVI(hvg_adata)
    vae.train()

    ## Train solo model to predict "doublets". It needs the trained scvi model to be passed.
    solo = scvi.external.SOLO.from_scvi_model(vae)
    solo.train()

    scores = solo.predict()
    scores['prediction'] = solo.predict(soft=False)
    scores['diff'] = scores.doublet - scores.singlet
    confident = (scores['prediction'] == 'doublet') & (scores['diff'] > 1)

    ## NOTE(review): the returned index is matched against adata.obs.index by the
    ## caller; confirm that the installed scvi-tools version does not alter the
    ## barcodes in the index returned by SOLO.predict().
    return scores[confident].index


def sample_preprocessing(input_csv: str, sample: str, *, min_genes: int = 200,
                         n_genes_quantile: float = 0.98, max_pct_mt: float = 20,
                         max_pct_ribosomal: float = 2):
    """Load one sample, drop predicted doublets and apply per-cell QC filters.

    Parameters
    ----------
    input_csv
        Path to the sample's raw-count CSV (read with ``sc.read_csv`` and transposed).
    sample
        Sample name; stored for every cell in ``adata.obs['sample']``.
    min_genes
        Cells with fewer detected genes than this are removed.
    n_genes_quantile
        Cells whose ``n_genes_by_counts`` is at or above this quantile are removed.
    max_pct_mt
        Cells are kept only if ``pct_counts_mt`` is below this value.
    max_pct_ribosomal
        Cells are kept only if ``pct_counts_ribosomal`` is below this value.

    Returns
    -------
    The filtered AnnData as an independent object (not a view), with the QC
    columns added by ``sc.pp.calculate_qc_metrics``.
    """
    ## 1) find doublets
    doublet_barcodes = _predict_doublets(input_csv)

    ## 2) pre-processing

    ## reload the full matrix (step 1 subsetted its copy to 2000 genes) and assign samples
    adata = sc.read_csv(input_csv).T
    adata.obs['sample'] = sample

    ## assign doublets and drop them; .copy() turns the slice into a real object so
    ## the in-place operations below do not have to implicitly copy a view
    adata.obs['doublet'] = adata.obs.index.isin(doublet_barcodes)
    adata = adata[~adata.obs.doublet].copy()

    ## filter cells with too few detected genes
    sc.pp.filter_cells(adata, min_genes=min_genes)

    ## label 'mitochondrial' and 'ribosomal' genes by gene-name prefix
    ## (an explicit ribosomal gene list could be matched with .isin() instead)
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    adata.var['ribosomal'] = adata.var_names.str.startswith(("RPS", "RPL"))

    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ribosomal'], percent_top=None, log1p=False, inplace=True)

    ## the three cut-offs are independent per-cell conditions, so one combined mask
    ## selects the same cells as applying them one after another
    upper_limit = np.quantile(adata.obs.n_genes_by_counts.values, n_genes_quantile)
    keep = (
        (adata.obs.n_genes_by_counts < upper_limit)
        & (adata.obs.pct_counts_mt < max_pct_mt)
        & (adata.obs.pct_counts_ribosomal < max_pct_ribosomal)
    )

    return adata[keep].copy()



In [5]:
## create sample_name : sample_path dictionary
## The sample name is the second '_'-separated field of the file name,
## e.g. 'GSM5226574_C51ctr_raw_counts.csv.gz' -> 'C51ctr'.
sample_dict = dict()
for fi in sorted(os.listdir(data_dir)):
    if os.path.splitext(fi)[-1] != '.gz':
        continue
    name_fields = fi.split('_')
    if len(name_fields) < 2:
        ## a .gz file without a sample field in its name: skip it instead of
        ## failing with an IndexError
        continue
    ## os.path.join works whether or not data_dir ends with a path separator
    sample_dict[name_fields[1]] = os.path.join(data_dir, fi)

In [6]:
sample_dict

{'C51ctr': '/home/bonny/Documents/bioinformatics_workspace/datasets/single_cell_pipline/covid-19/GSM5226574_C51ctr_raw_counts.csv.gz',
 'C52ctr': '/home/bonny/Documents/bioinformatics_workspace/datasets/single_cell_pipline/covid-19/GSM5226575_C52ctr_raw_counts.csv.gz',
 'C53ctr': '/home/bonny/Documents/bioinformatics_workspace/datasets/single_cell_pipline/covid-19/GSM5226576_C53ctr_raw_counts.csv.gz',
 'C54ctr': '/home/bonny/Documents/bioinformatics_workspace/datasets/single_cell_pipline/covid-19/GSM5226577_C54ctr_raw_counts.csv.gz',
 'C55ctr': '/home/bonny/Documents/bioinformatics_workspace/datasets/single_cell_pipline/covid-19/GSM5226578_C55ctr_raw_counts.csv.gz',
 'C56ctr': '/home/bonny/Documents/bioinformatics_workspace/datasets/single_cell_pipline/covid-19/GSM5226579_C56ctr_raw_counts.csv.gz',
 'C57ctr': '/home/bonny/Documents/bioinformatics_workspace/datasets/single_cell_pipline/covid-19/GSM5226580_C57ctr_raw_counts.csv.gz',
 'L01cov': '/home/bonny/Documents/bioinformatics_worksp

In [7]:
sorted( sample_dict.keys() )

['C51ctr',
 'C52ctr',
 'C53ctr',
 'C54ctr',
 'C55ctr',
 'C56ctr',
 'C57ctr',
 'L01cov',
 'L03cov',
 'L04cov',
 'L04covaddon',
 'L05cov',
 'L06cov',
 'L07cov',
 'L08cov',
 'L09cov',
 'L10cov',
 'L11cov',
 'L12cov',
 'L13cov',
 'L15cov',
 'L16cov',
 'L17cov',
 'L18cov',
 'L19cov',
 'L21cov',
 'L22cov']

In [9]:
## names of the samples to process in this run; the loop below skips every sample not listed here
s = ['L22cov']

In [10]:
## pre-process every selected sample and save the result next to the raw data as '<sample>.h5ad'
for sample, sample_csv in sample_dict.items():
    if sample not in s:
        continue

    print(sample, sample_csv)
    print()

    adata = sample_preprocessing(sample_csv, sample)
    h5ad_path = data_dir + f'{sample}.h5ad'
    adata.write_h5ad(h5ad_path)

    ## visual separator between samples in the cell output
    print('='*100)
    print()

L22cov /home/bonny/Documents/bioinformatics_workspace/datasets/single_cell_pipline/covid-19/GSM5226600_L22cov_raw_counts.csv.gz



No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 400/400: 100%|███████████████████████████████████████████████████████████████████████████████| 400/400 [09:26<00:00,  1.53s/it, v_num=1, train_loss_step=192, train_loss_epoch=340]

`Trainer.fit` stopped: `max_epochs=400` reached.


Epoch 400/400: 100%|███████████████████████████████████████████████████████████████████████████████| 400/400 [09:26<00:00,  1.42s/it, v_num=1, train_loss_step=192, train_loss_epoch=340]
[34mINFO    [0m Creating doublets, preparing SOLO model.                                                                  


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 376/400:  94%|██████████████████████████████████████████████████████████████████████▌    | 376/400 [02:38<00:10,  2.37it/s, v_num=1, train_loss_step=0.418, train_loss_epoch=0.317]
Monitored metric validation_loss did not improve in the last 30 records. Best score: 0.303. Signaling Trainer to stop.


  adata.obs['n_genes'] = number
  df[key] = c



