## Loading libraries

In [None]:
import anndata as ad
import seaborn as sns
from matplotlib import pyplot as plt
import scvi
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.stats import median_abs_deviation
import scanpy as sc
import os

Global seed set to 0


<h2>Loading data</h2>

<p style="color:orange; font-size:20px">Reading all the samples in the respective sample path.</p>

In [None]:
# Peek at one sample's feature table, indexed by its second column
# (presumably the gene symbols — confirm against the 10x file layout).
temp = pd.read_table('Samples/p018n/filtered_feature_bc_matrix/features.tsv.gz', names=[0,1,2], index_col=1)
# Show rows 1 through 9 of the table.
print(temp[1:10])

In [None]:
# Show the first index label of the feature table as a one-element array.
print(temp.index.values[0:1])

In [None]:
# Dir to the samples
sample_path = 'Samples'
# Folders between the sample name and the files (if doesn't exist, put '')
middle_path = '/filtered_feature_bc_matrix'

# Every sub-directory of sample_path is treated as one sample.
sample_names = [
    foldername
    for foldername in os.listdir(sample_path)
    if os.path.isdir(os.path.join(sample_path, foldername))
]
print(sample_names)

# For each sample, read the directory to a list of samples.
sample_list = []

for name in sample_names:
    # Build every path from sample_path so the files are read from the same
    # directory that was listed above (a hard-coded 'samples/' breaks on
    # case-sensitive filesystems).
    sample_dir = f'{sample_path}/{name}{middle_path}'

    # Getting anndata (transposed to obs X vars)
    sample = sc.read(f'{sample_dir}/matrix.mtx.gz', cache=True).T

    # Getting obs: the single column of barcodes.tsv.gz becomes the index, so
    # observations are named by barcode instead of by row number.
    obs = pd.read_table(
        f'{sample_dir}/barcodes.tsv.gz', sep='\t', header=None, index_col=0
    )
    obs.index.name = 'barcode'
    sample.obs = obs

    # Adding metadata: the folder name identifies the patient and its last
    # character (upper-cased) is used as the condition label.
    sample.obs['Patient'] = name
    sample.obs['Condition'] = name[-1].upper()

    # Getting vars: index the features by their second column.
    var = pd.read_table(
        f'{sample_dir}/features.tsv.gz', sep='\t', header=None, index_col=1
    )
    var.index.name = 'genes'
    sample.var = var
    # Repeated names in the index get a "-<n>" suffix.
    sample.var_names_make_unique(join="-")

    sample_list.append(sample)
sample_list

In [None]:
# Stack all per-sample AnnData objects into a single object.
adata = ad.concat(sample_list)
# Drop the reference to the per-sample list now that it has been combined.
del sample_list
# Observation names can repeat across samples; make them unique with a "-" joined suffix.
adata.obs_names_make_unique(join="-")
# Tag every cell with the dataset of origin.
adata.obs['author'] = 'Peng_2019'
adata

In [None]:
# Show the first ten variable (gene) names of the combined object.
print(adata.var_names[0:10])

## QC - Removing doublets with SOLO (scVI)

<p style="color:orange; font-size:20px">Making the model.</p>

In [None]:
# Keep a copy of the current matrix in layers['counts'] before X is transformed.
adata.layers['counts'] = adata.X.copy()
# Normalize each cell to a total of 1e4, then log1p-transform.
# The return values are discarded, so both calls are relied on to update adata itself.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Select the 3000 most variable genes from the 'counts' layer and subset adata to them.
# NOTE(review): 'author' holds the single value 'Peng_2019' for every cell, so this
# batch_key does not separate any batches — confirm whether 'Patient' was intended.
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=3000, layer='counts',subset=True, batch_key="author")

In [None]:
# Display the AnnData summary after the highly-variable-gene subsetting.
adata

In [None]:
# scVI models raw counts, but adata.X was normalized and log-transformed
# earlier in this notebook; register the untouched copy stored in
# adata.layers['counts'] as the model input instead of X.
scvi.model.SCVI.setup_anndata(adata, layer="counts")
# Build the model on the registered data and train it with default settings.
model = scvi.model.SCVI(adata)
model.train()

<p style="color:orange; font-size:20px">Creating and training SOLO model.</p>

In [None]:
# Build the SOLO doublet model from the trained scVI model, then train it.
solo = scvi.external.SOLO.from_scvi_model(model)
solo.train()

<p style="color:orange; font-size:20px">We then extract a prediction dataframe. Because SOLO appends 2 characters to each barcode, we strip them so the index has the same format as in the AnnData object.</p>

In [None]:
# Prediction table from SOLO (presumably per-class scores — confirm against the scvi-tools docs).
df = solo.predict()
# Add the hard label for each cell as a 'prediction' column.
df['prediction'] = solo.predict(soft = False)
# Drop the last two characters of every index label so they match the AnnData barcodes.
df.index = df.index.map(lambda x: x[:-2])
df

In [None]:
# Count the rows that fall under each predicted label.
df.groupby('prediction').count()

In [None]:
# Attach the SOLO label to each cell; the Series is matched to adata.obs by index label.
adata.obs['doublet_prediction'] = df.prediction
adata.obs

<p style="color:orange; font-size:20px">Saving data for Seurat workflow.</p>

In [None]:
# Write the per-cell metadata (including the doublet labels) to CSV.
adata.obs.to_csv("D:/Scanpy/metadata_workflow_peng.csv")

In [None]:
# Reload per-cell metadata from CSV, using the first column as the index,
# and replace adata.obs with it.
# NOTE(review): this reads from the "Workflow_A" sub-folder, whereas the to_csv call
# in this notebook writes to "D:/Scanpy/metadata_workflow_peng.csv" — confirm the
# file is moved or edited in between.
metadata = pd.read_csv("D:/Scanpy/Workflow_A/metadata_workflow_peng.csv", index_col = 0)
adata.obs = metadata
adata.obs

In [None]:
# Keep only the cells labelled as singlets. This notebook stores the SOLO
# label as 'doublet_prediction', while reloaded metadata may carry it as
# 'doublet_pred'; prefer 'doublet_pred' when present, otherwise fall back.
doublet_col = (
    'doublet_pred' if 'doublet_pred' in adata.obs.columns else 'doublet_prediction'
)
adata = adata[adata.obs[doublet_col] == 'singlet'].copy()

## Filtering low quality cells

<p style="color:orange; font-size:20px">Calculating QC metrics.</p>

In [None]:
# mitochondrial genes: names beginning with "MT-"
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes: names beginning with "RPS" or "RPL"
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes: names beginning with "HB" followed by any character other than
# "(", "P" or ")" (inside the character class the parentheses are literal).
adata.var["hb"] = adata.var_names.str.contains(("^HB[^(P)]"))

In [None]:
# Compute per-cell and per-gene QC metrics in place, including the share of
# counts in the mt / ribo / hb gene sets, the share of counts in the top 20
# genes, and log1p-transformed versions of the metrics.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=["mt", "ribo", "hb"],
    percent_top=[20],
    log1p=True,
    inplace=True,
)
adata

In [None]:
# Histogram of total counts per cell (100 bins, no KDE overlay).
plot1 = sns.displot(adata.obs["total_counts"], bins=100, kde=False)
# Scatter of total counts against the number of genes detected per cell.
plot3 = sc.pl.scatter(adata, "total_counts", "n_genes_by_counts")

In [None]:
# Violin plots of four QC metrics, drawn as separate panels.
sc.pl.violin(
    adata,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_ribo', 'pct_counts_hb'],
    jitter=0.4,
    multi_panel=True,
)

<p style="color:orange; font-size:20px">Automatic threshold (outlier detection) with MAD.</p>

In [None]:
def is_outlier(adata, metric: str, nmads: int):
    """Flag observations whose ``metric`` is far from the median.

    Args:
        adata: Object exposing an ``obs`` table that contains column ``metric``.
        metric: Name of the ``obs`` column to test.
        nmads: Number of median absolute deviations allowed on either side
            of the median before a value counts as an outlier.

    Returns:
        Boolean Series aligned with ``adata.obs``; True where the value lies
        strictly below ``median - nmads * MAD`` or strictly above
        ``median + nmads * MAD``.
    """
    values = adata.obs[metric]
    center = np.median(values)
    spread = nmads * median_abs_deviation(values)
    too_low = values < center - spread
    too_high = values > center + spread
    return too_low | too_high

In [None]:
# A cell is an outlier if any of the three metrics lies more than 5 MADs from its median.
adata.obs["outlier"] = (
    is_outlier(adata, "log1p_total_counts", 5)
    | is_outlier(adata, "log1p_n_genes_by_counts", 5)
    | is_outlier(adata, "pct_counts_in_top_20_genes", 5)
)

# Show how many cells were flagged versus kept.
adata.obs.outlier.value_counts()

In [None]:
# Report the cell count, drop the cells flagged as outliers, then report again.
print(f"Total number of cells: {adata.n_obs}")
adata = adata[(~adata.obs.outlier)].copy()

print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

In [None]:
# Plot the 20 most highly expressed genes among the remaining cells.
sc.pl.highest_expr_genes(adata, n_top=20)