# Import libraries.

In [6]:
import numpy as np
import matplotlib.pyplot as plt 
import scanpy as sc
import scvi
import pandas as pd

# Load data.

In [7]:
full_adata = sc.read_h5ad('human_6_8_12and19_merged_final_cleaned.h5ad')
cm_adata = full_adata[full_adata.obs.new_manual_annotation.isin(['Ventricular_myocytes', 'Atrial_myocytes'])]
cm_adata

View of AnnData object with n_obs × n_vars = 3748 × 2000
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'Sample', 'read_depth', 'new_manual_annotation', 'time', 'EXPT'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'
    obsm: 'X_harmony', 'X_pca', 'X_umap'
    varm: 'HARMONY', 'PCs'

In [8]:
adata = cm_adata.raw.to_adata()
adata

AnnData object with n_obs × n_vars = 3748 × 68120
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'Sample', 'read_depth', 'new_manual_annotation', 'time', 'EXPT'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'
    obsm: 'X_harmony', 'X_pca', 'X_umap'

# Doublet removal.

In [9]:
sc.pp.filter_genes(adata, min_cells = 10)

In [10]:
adata.layers['raw'] = pd.DataFrame.sparse.from_spmatrix(adata.X)

adata.layers['raw'].columns = adata.var.index
adata.layers['raw'].index = adata.obs.index

sc.pp.highly_variable_genes(adata=adata, n_top_genes = 3000, layer = 'raw', flavor = 'seurat')

In [None]:
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata)
vae.train()

In [None]:
solo = scvi.external.SOLO.from_scvi_model(vae)
solo.train()

In [None]:
df = solo.predict()
df['prediction'] = solo.predict(soft = False)

df.index = df.index.map(lambda x: x[:-2])

df

In [None]:
df.groupby('prediction').count()

There are only a few predicted doublets, so any doublets present will not have any significant impact on the outcomes of the model. Thus, doublets will not be filtered out.

# Preprocessing

In [11]:
sc.pp.filter_genes(adata, min_cells = 5)
adata

AnnData object with n_obs × n_vars = 3748 × 15687
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'Sample', 'read_depth', 'new_manual_annotation', 'time', 'EXPT'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg'
    obsm: 'X_harmony', 'X_pca', 'X_umap'
    layers: 'raw'

In [12]:
adata = adata[:, adata.var.highly_variable]
adata

View of AnnData object with n_obs × n_vars = 3748 × 3000
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'Sample', 'read_depth', 'new_manual_annotation', 'time', 'EXPT'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg'
    obsm: 'X_harmony', 'X_pca', 'X_umap'
    layers: 'raw'

In [13]:
adata.write_h5ad('preprocessed_human_6_8_12and19_merged_final_cleaned.h5ad')

  df[key] = c
  df[key] = c
  df[key] = c
