## Loading libraries

In [5]:
import anndata as ad
import seaborn as sns
from matplotlib import pyplot as plt
import scvi
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.stats import median_abs_deviation
import scanpy as sc
import os

<h2>Loading data</h2>

In [None]:
# Directory that holds one sub-folder per sample
sample_path = 'Samples'
# Folders between the sample name and the files (if there are none, put '')
middle_path = '/filtered_feature_bc_matrix'

# Every sub-directory of sample_path is treated as one sample. Sorting makes
# the sample order (and so the cell order after merging) reproducible, since
# os.listdir returns entries in arbitrary order.
sample_names = sorted(
    foldername
    for foldername in os.listdir(sample_path)
    if os.path.isdir(os.path.join(sample_path, foldername))
)
print(sample_names)

# For each sample, build an AnnData from its matrix, barcodes and features
# files and collect it in sample_list.
sample_list = []

for name in sample_names:
    # Build every path from sample_path: the paths used to hard-code
    # 'samples/', which does not match 'Samples' on case-sensitive file systems.
    sample_dir = f'{sample_path}/{name}{middle_path}'

    # Getting anndata (transposed to obs X vars)
    sample = sc.read(f'{sample_dir}/matrix.mtx.gz', cache=True).T

    # Getting obs: the first column of barcodes.tsv.gz becomes the cell index
    obs = pd.read_csv(f'{sample_dir}/barcodes.tsv.gz', sep='\t',
                      header=None, index_col=0)
    obs.index.name = 'barcode'
    sample.obs = obs

    # Adding metadata: the folder name is stored as the patient and its last
    # character (upper-cased) as the condition label
    sample.obs['Patient'] = name
    sample.obs['Condition'] = name[-1].upper()

    # Getting vars: the second column of features.tsv.gz becomes the gene index
    var = pd.read_csv(f'{sample_dir}/features.tsv.gz', sep='\t',
                      header=None, index_col=1)
    var.index.name = 'genes'
    sample.var = var
    sample.var_names_make_unique(join="-")

    sample_list.append(sample)

# Merge all samples into the single `adata` object that every following cell
# works on (it was never created before). `keys` + `index_unique` append the
# sample name to each barcode so barcodes shared between samples stay unique.
# The default inner join keeps the genes present in every sample.
adata = ad.concat(sample_list, keys=sample_names, index_unique='_')

## Removing doublets with SOLO (scVI)

In [None]:
# Drop low-content barcodes before doublet detection: keep cells with at least
# 500 total counts and at least 200 detected genes.
sc.pp.filter_cells(adata, min_counts=500)
sc.pp.filter_cells(adata, min_genes=200)

# Keep an untouched copy of the count matrix; the HVG selection below and the
# scVI models read it from this layer while X gets normalized.
adata.layers['counts'] = adata.X.copy()

sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Select the 3000 most variable genes per batch and subset adata to them.
# The batch column must exist in adata.obs: the loading cell only creates
# 'Patient' and 'Condition' (there is no 'Author' column), and the scVI
# integration further down also uses 'Patient' as its batch key.
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=3000,
                            layer='counts', subset=True, batch_key='Patient')

In [None]:
# Register the raw-count layer with scVI, then fit the model that the SOLO
# cell below is built from.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
)
model = scvi.model.SCVI(adata)
model.train(max_epochs=100)

In [None]:
# Build the SOLO doublet classifier on top of the trained scVI model and fit
# it with its default training settings.
solo = scvi.external.SOLO.from_scvi_model(scvi_model=model)
solo.train()

In [None]:
# Score every cell with the trained SOLO classifier: one row per cell with a
# 'doublet' and a 'singlet' score column.
solo_df = solo.predict()

# The scores used to be computed and then discarded, so no doublet was ever
# removed. Call each cell by its highest-scoring class and keep singlets only.
solo_df['prediction'] = solo_df[['doublet', 'singlet']].idxmax(axis=1)

# NOTE(review): rows of solo_df are assumed to follow the cell order of adata
# (hence the positional assignment) — confirm for the installed scvi-tools.
adata.obs['doublet'] = (solo_df['prediction'] == 'doublet').to_numpy()
print(adata.obs['doublet'].value_counts())

adata = adata[~adata.obs['doublet']].copy()

## QC

In [None]:
# X was already normalized and log-transformed, and adata already subset to
# its 3000 highly variable genes, in the preprocessing cell of the doublet
# section. Running normalize_total/log1p again would transform X twice, and
# asking for the top 4000 of 3000 genes cannot select anything, so neither
# step is repeated here.

# Flag gene families by name so calculate_qc_metrics reports their share of
# counts per cell.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
adata.var['ribo'] = adata.var_names.str.startswith(('RPS', 'RPL'))
# str.startswith compares literally, so the regular expression never matched
# and 'hb' was always False; str.contains evaluates it as a regex.
adata.var['hb'] = adata.var_names.str.contains(r'^HB[^(P)]')

# Compute the metrics from the raw counts layer rather than from the
# normalized, log-transformed X.
# NOTE(review): adata only holds the highly variable genes at this point, so
# totals and percentages are relative to that subset — run QC before the HVG
# subsetting if whole-transcriptome metrics are needed.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt', 'ribo', 'hb'],
    inplace=True,
    percent_top=[20],
    log1p=True,
    layer='counts',
)

In [None]:
# Violin plots (one panel per metric) of the per-cell QC metrics.
sc.pl.violin(
    adata,
    keys=[
        'n_genes_by_counts',
        'total_counts',
        'pct_counts_ribo',
        'pct_counts_hb',
        'pct_counts_mt',
    ],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
def is_outlier(adata, metric: str, nmads: int):
    """Flag cells whose `metric` lies more than `nmads` MADs from the median.

    Parameters
    ----------
    adata
        Object whose ``obs`` table holds the column ``metric``.
    metric
        Name of the ``adata.obs`` column to test.
    nmads
        Number of median absolute deviations tolerated on either side of the
        median.

    Returns
    -------
    Boolean mask that is True where the value is strictly below
    ``median - nmads * MAD`` or strictly above ``median + nmads * MAD``.
    """
    values = adata.obs[metric]
    center = np.median(values)
    tolerance = nmads * median_abs_deviation(values)

    too_low = values < center - tolerance
    too_high = values > center + tolerance
    return too_low | too_high

# General QC outliers: more than 5 MADs away from the median on any of the
# count-depth, gene-number or top-20-genes metrics.
depth_flag = is_outlier(adata, "log1p_total_counts", 5)
genes_flag = is_outlier(adata, "log1p_n_genes_by_counts", 5)
top20_flag = is_outlier(adata, "pct_counts_in_top_20_genes", 5)
adata.obs["outlier"] = depth_flag | genes_flag | top20_flag

# Mitochondrial outliers: more than 3 MADs from the median, or above a hard
# cut-off of 8 % mitochondrial counts.
mt_mad_flag = is_outlier(adata, "pct_counts_mt", 3)
mt_high_flag = adata.obs["pct_counts_mt"] > 8
adata.obs["mt_outlier"] = mt_mad_flag | mt_high_flag

# Keep only the cells that carry neither flag.
flagged = adata.obs["outlier"] | adata.obs["mt_outlier"]
adata = adata[~flagged].copy()

In [None]:
# Plot the 20 genes with the highest expression in the filtered cells.
sc.pl.highest_expr_genes(
    adata,
    n_top=20,
)

In [None]:
# Put the raw counts back into X and save the cleaned object to disk.
raw_counts = adata.layers['counts']
adata.X = raw_counts.copy()
adata.write_h5ad('raw_clean_adata.h5ad')

## scVI Integration

In [None]:
# Register adata for the integration model: raw counts come from the 'counts'
# layer and 'Patient' is the batch covariate.
scvi.model.SCVI.setup_anndata(
    adata,
    layer='counts',
    batch_key='Patient',
)

In [None]:
# Architecture settings passed to the scVI model constructor.
arches_params = {
    'use_layer_norm': 'both',
    'use_batch_norm': 'none',
    'encode_covariates': True,
    'dropout_rate': 0.2,
    'n_layers': 2,
}

# Build the integration model on adata and train it with default settings.
vae = scvi.model.SCVI(adata, **arches_params)
vae.train()

In [None]:
# Store the scVI latent representation (dimensional reduction) per cell.
adata.obsm["X_scVI"] = vae.get_latent_representation()

# Store scVI-normalized expression scaled to a library size of 1e4.
adata.layers['scvi_normalized'] = vae.get_normalized_expression(
    library_size=1e4,
)

# Neighbor graph on the scVI latent space, then the UMAP embedding.
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.umap(adata)

## UMAP visualization

In [None]:
# Side-by-side UMAPs colored by condition and by patient.
sc.pl.umap(
    adata,
    color=['Condition', 'Patient'],
    frameon=False,
    ncols=2,
)

In [None]:
# Save the integrated object, including the embedding and scVI layer.
adata.write_h5ad(filename="adataV1.h5ad")