## Loading libraries

In [12]:
import anndata as ad
import scvi
import numpy as np
import pandas as pd
from scipy import sparse
import scanpy as sc
import os

<h2>Loading data</h2>

<p style="color:orange; font-size:20px">Reading all the samples in the respective sample path.</p>

In [13]:
# Dir to the samples
sample_path = 'Samples'
# Folders between the sample name and the files (if doesn't exist, put '')
middle_path = '/filtered_feature_bc_matrix'

sample_names = []
for foldername in os.listdir(sample_path):
    if os.path.isdir(os.path.join(sample_path, foldername)):
        sample_names.append(foldername)

# For each sample, read the directory to a list of samples.
        
sample_list = []

for name in sample_names:
    # Getting anndata (transposed to obs X vars)
    path = f'samples/{name}{middle_path}/matrix.mtx.gz'
    sample = sc.read(path, cache=True).T
    
    # Getting obs
    path = f'samples/{name}{middle_path}/barcodes.tsv.gz'
    obs = pd.read_csv(path, sep='\t', header=None)
    obs.index.name = 'barcode'
    sample.obs = obs
    
    # Adding metadata
    sample.obs['Patient'] = name
    sample.obs['Condition'] = name[-1].upper()
    
    # Getting vars
    path = f"samples/{name}{middle_path}/features.tsv.gz"
    var = pd.read_csv(path, sep='\t', header=None)
    var.index.name = 'genes'
    sample.vars = var
    
    sample_list.append(sample)
sample_list

[AnnData object with n_obs × n_vars = 12183 × 33538
     obs: 0, 'Patient', 'Condition',
 AnnData object with n_obs × n_vars = 14770 × 33538
     obs: 0, 'Patient', 'Condition',
 AnnData object with n_obs × n_vars = 1547 × 33538
     obs: 0, 'Patient', 'Condition',
 AnnData object with n_obs × n_vars = 1557 × 33538
     obs: 0, 'Patient', 'Condition']

In [14]:
adata = ad.concat(sample_list)
del sample_list

adata.var_names_make_unique()
adata.obs_names_make_unique()
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 30057 × 33538
    obs: 0, 'Patient', 'Condition'

In [22]:
adata.obs['author'] = 'Peng_2019'

## QC - Removing doublets with SOLO (scVI)

<p style="color:orange; font-size:20px">Making the model.</p>

In [15]:
adata.layers['counts'] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [16]:
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=3000, layer='counts',subset=True, batch_key="author")

In [17]:
adata.shape

(30057, 3000)

In [18]:
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata)
vae.train()

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                                                  
[34mINFO    [0m No label_key inputted, assuming all cells have same label                                                 
[34mINFO    [0m Using data from adata.X                                                                                   
[34mINFO    [0m Successfully registered anndata object containing [1;36m30057[0m cells, [1;36m3000[0m vars, [1;36m1[0m batches, [1;36m1[0m labels, and [1;36m0[0m      
         proteins. Also registered [1;36m0[0m extra categorical covariates and [1;36m0[0m extra continuous covariates.               
[34mINFO    [0m Please do not further modify adata until model is trained.                                                


GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Epoch 1/266:   0%|                                                                             | 0/266 [00:00<?, ?it/s]



Epoch 2/266:   0%|▏                                               | 1/266 [00:21<1:36:18, 21.81s/it, loss=561, v_num=1]



Epoch 266/266: 100%|████████████████████████████████████████████| 266/266 [1:40:53<00:00, 22.76s/it, loss=487, v_num=1]


<p style="color:orange; font-size:20px">Creating and training SOLO model.</p>

In [19]:
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()



[34mINFO    [0m Creating doublets, preparing SOLO model.                                                                  



See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


AttributeError: Can only use .str accessor with string values!

<p style="color:orange; font-size:20px">We then extract a predition dataframe. But as SOLO adds 2 characters in the barcode, we remove it for it to be the same format as Anndata.</p>

In [None]:
df = solo.predict()
df['prediction'] = solo.predict(soft = False)

df.index = df.index.map(lambda x: x[:-2])

df

In [None]:
df. groupby('prediction').count()

In [None]:
adata.obs['doublet_prediction'] = df.prediction
adata.obs

<p style="color:orange; font-size:20px">Savind data for Seurat workflow.</p>

In [None]:
adata.obs.to_csv("D:/Scanpy/metadata_workflow_peng.csv")

In [None]:
metadata = pd.read_csv("D:/Scanpy/Workflow_A/metadata_workflow_peng.csv", index_col = 0)
adata.obs = metadata
adata.obs

In [None]:
adata = adata[adata.obs.doublet_pred == 'singlet'].copy()

<p style="color:orange; font-size:20px">Calculating QC metrics.</p>

In [None]:
# mitochondrial genes
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes.
adata.var["hb"] = adata.var_names.str.contains(("^HB[^(P)]"))

<p style="color:orange; font-size:20px">Savind data for Seurat workflow.</p>

In [None]:
sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)
adata

In [None]:
plot1 = sns.displot(adata.obs["total_counts"], bins=100, kde=False)
plot2 = sc.pl.violin(adata, "pct_counts_mt")
plot3 = sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt")