In [None]:

from pathlib import Path

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import squidpy as sq

import anndata as ad

import pandas as pd

import scvi

# Emit scanpy's logging header at the top of the notebook so the run records
# the software environment it was executed with.
sc.logging.print_header()

In [None]:

# Load the NanoString export (expression matrix, per-cell metadata and FOV
# positions) into a single AnnData object.
# NOTE(review): path="" presumably resolves the three files relative to the
# current working directory — confirm where the notebook is launched from.
nanostring_files = dict(
    counts_file="exprMat_file.csv",
    meta_file="metadata_file.csv",
    fov_file="fov_positions_file.csv",
)
adata = sq.read.nanostring(path="", **nanostring_files)

In [None]:
# Show the AnnData summary right after loading.
display(adata)

In [None]:
def _genotype_for_fov(fov_number):
    """Map a numeric FOV id to its genotype label.

    FOVs 1-35 (inclusive) are labelled 'WT', FOVs 36-70 (inclusive) are
    labelled 'MT', and anything else falls back to 'unknown'.
    """
    if 1 <= fov_number <= 35:
        return 'WT'
    if 36 <= fov_number <= 70:
        return 'MT'
    return 'unknown'


# Derive a per-cell genotype column from the FOV each cell belongs to.
adata.obs['genotype'] = pd.to_numeric(adata.obs['fov']).apply(_genotype_for_fov)

In [None]:
# Re-display the AnnData summary to confirm the new 'genotype' obs column.
display(adata)

In [None]:

# Drop control features (names starting with 'SystemControl' or 'Negative')
# so only the remaining targets are kept for analysis.
# NOTE(review): a later cell flags features starting with "NegPrb" for QC;
# confirm which prefix this dataset actually uses for its negative probes.
control_prefixes = ('SystemControl', 'Negative')
filtered_var_names = [
    name for name in adata.var_names
    if not name.startswith(control_prefixes)
]

# Subsetting an AnnData returns a view; .copy() materializes it so the
# in-place edits made by later cells (adding var columns, QC metrics,
# filtering) operate on a real object instead of implicitly converting a view.
adata = adata[:, filtered_var_names].copy()

print(adata.var_names)

In [None]:
# Flag features whose name starts with "NegPrb" and compute QC metrics with
# that flag as a qc_var; the "total_counts_NegPrb" obs column read in a later
# cell comes from this entry.
# NOTE(review): an earlier cell already removes features starting with
# "Negative"/"SystemControl". If this dataset names its negative probes
# "Negative*", nothing matches "NegPrb" here and the negative-probe fraction
# computed later will always be 0 — confirm the probe naming.
adata.var["NegPrb"] = adata.var_names.str.startswith("NegPrb")
sc.pp.calculate_qc_metrics(adata, qc_vars=["NegPrb"], inplace=True)

In [None]:
# Lift pandas' column display limit so wide tables render every column.
pd.set_option("display.max_columns", None)

In [None]:
# Percentage of all detected counts that come from features flagged "NegPrb".
negprb_counts = adata.obs["total_counts_NegPrb"].sum()
all_counts = adata.obs["total_counts"].sum()
negprb_counts / all_counts * 100

In [None]:

# QC overview before filtering: per-cell totals, per-cell detected features,
# and total counts summed per FOV.
fig, axs = plt.subplots(1, 3, figsize=(15, 4))

# Select the column *before* aggregating. Summing the whole obs frame would
# aggregate every column, including non-numeric ones such as 'genotype',
# which is wasteful and, depending on the pandas version, can raise or
# concatenate strings.
counts_per_fov = adata.obs.groupby("fov")["total_counts"].sum()

axs[0].set_title("Total transcripts per cell")
sns.histplot(
    adata.obs["total_counts"],
    kde=False,
    ax=axs[0],
)

axs[1].set_title("Unique transcripts per cell")
sns.histplot(
    adata.obs["n_genes_by_counts"],
    kde=False,
    ax=axs[1],
)

axs[2].set_title("Transcripts per FOV")
sns.histplot(
    counts_per_fov,
    kde=False,
    ax=axs[2],
)

In [None]:

# QC thresholds: minimum total counts per cell, minimum detected genes per
# cell, and minimum number of cells a gene must be detected in.
MIN_COUNTS = 200
MIN_GENES = 120
MIN_CELLS = 500

print('Total number of cells: {:d}'.format(adata.n_obs))

sc.pp.filter_cells(adata, min_counts=MIN_COUNTS)
print('Number of cells after min count filter: {:d}'.format(adata.n_obs))

sc.pp.filter_cells(adata, min_genes=MIN_GENES)
print('Number of cells after gene filter: {:d}'.format(adata.n_obs))

# filter_genes removes genes (columns), not cells, so report n_vars here;
# printing n_obs would just repeat the previous cell count under a
# misleading label.
sc.pp.filter_genes(adata, min_cells=MIN_CELLS)
print('Number of genes after min cells filter: {:d}'.format(adata.n_vars))



In [None]:

# Same QC overview as before, redrawn after the cell/gene filters so the
# effect of the thresholds can be compared.
fig, axs = plt.subplots(1, 3, figsize=(15, 4))

# Select the column *before* aggregating. Summing the whole obs frame would
# aggregate every column, including non-numeric ones such as 'genotype',
# which is wasteful and, depending on the pandas version, can raise or
# concatenate strings.
counts_per_fov = adata.obs.groupby("fov")["total_counts"].sum()

axs[0].set_title("Total transcripts per cell")
sns.histplot(
    adata.obs["total_counts"],
    kde=False,
    ax=axs[0],
)

axs[1].set_title("Unique transcripts per cell")
sns.histplot(
    adata.obs["n_genes_by_counts"],
    kde=False,
    ax=axs[1],
)

axs[2].set_title("Transcripts per FOV")
sns.histplot(
    counts_per_fov,
    kde=False,
    ax=axs[2],
)

In [None]:
# Keep a copy of the not-yet-normalized matrix in a 'counts' layer before X
# is normalized in the next cell; the scVI setup later reads this layer.
adata.layers["counts"] = adata.X.copy()

In [None]:
# Normalize each cell's total counts (scanpy's default target, since no
# target_sum is passed), then log1p-transform X. Both steps modify adata.X
# in place, so this cell must not be re-run on already-normalized data.
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)

In [None]:
# flavor='seurat_v3' expects raw counts, but adata.X has already been
# normalized and log1p-transformed by this point. Point the selection at the
# 'counts' layer saved before normalization so it runs on the data the
# method is designed for.
sc.pp.highly_variable_genes(
    adata,
    flavor='seurat_v3',
    n_top_genes=300,
    layer='counts',
)
print('\n','Number of highly variable genes: {:d}'.format(np.sum(adata.var['highly_variable'])))

In [None]:
# Diagnostic plot of the highly-variable-gene selection.
sc.pl.highly_variable_genes(adata)

In [None]:
# Inspect the per-gene annotation table (QC metrics plus the
# highly-variable-gene columns added above).
adata.var

In [None]:
# PCA restricted to the genes flagged 'highly_variable': 50 components,
# ARPACK solver.
# NOTE(review): newer scanpy releases deprecate use_highly_variable in favour
# of mask_var — confirm against the installed scanpy version.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')

In [None]:
# Sanity check on the stored embedding; with n_comps=50 above the second
# dimension is expected to be 50.
print(adata.obsm['X_pca'].shape)

In [None]:
# PCA scatter coloured by total counts, to see whether library size drives
# the leading components.
sc.pl.pca(adata, color='total_counts')

In [None]:
# Same colouring across three component pairs (1 vs 2, 2 vs 3, 1 vs 3).
sc.pl.pca(adata, color='total_counts', components=['1,2', '2,3', '1,3'])

In [None]:
# Build the neighbour graph with scanpy defaults.
# NOTE(review): no use_rep is given, so this presumably uses the X_pca
# embedding computed above — confirm.
sc.pp.neighbors(adata)

In [None]:
# Inspect the 'connectivities' matrix stored in adata.obsp after the
# neighbours step.
adata.obsp['connectivities']

In [None]:
# Compute the UMAP embedding from the current neighbour graph.
sc.tl.umap(adata)

In [None]:
# UMAP coloured by total counts, to check whether library size structures the
# embedding.
sc.pl.umap(adata, color='total_counts')

In [None]:
# Leiden clustering at resolution 0.5; labels are stored under
# adata.obs['leiden_r0.5'].
sc.tl.leiden(adata, resolution=0.5, key_added='leiden_r0.5')

In [None]:
# Side-by-side UMAPs coloured by genotype and by the Leiden clusters.
sc.pl.umap(adata, color=['genotype', 'leiden_r0.5'], ncols=2, frameon=False)

In [None]:
# Register the AnnData with scVI: the model reads the 'counts' layer (saved
# before normalization) and uses 'genotype' as the batch covariate.
# NOTE(review): 'genotype' (WT/MT) looks like the biological condition of
# interest; using it as batch_key asks the model to correct for it, which may
# remove genuine genotype differences from the latent space — confirm this is
# intended (e.g. genotype is confounded with slide/run).
scvi.model.SCVI.setup_anndata(
    adata,
    layer='counts',
    batch_key='genotype',
)


In [None]:
# Fix scvi-tools' global seed before the model is created so weight
# initialisation and training are reproducible across Restart & Run All.
scvi.settings.seed = 0

# Build the scVI model on the AnnData registered in the previous cell.
model = scvi.model.SCVI(adata)

In [None]:
# Train with scvi-tools' default settings (no epochs or accelerator are
# specified here).
model.train()

In [None]:
# Persist the trained model; overwrite=True replaces any existing save at
# this path.
model.save('./models/scVI_model', overwrite=True)

In [None]:
# Write the AnnData to disk alongside the saved model.
# NOTE(review): this runs before 'X_scVI' and 'scvi_normalized' are added in
# the following cells, so the written file will not contain them — confirm
# that is intended, or move the write after those cells.
adata.write(filename='./adata_scvimodel.h5ad')

In [None]:
# Store the scVI latent representation in obsm; the neighbour graph below is
# rebuilt on this embedding via use_rep="X_scVI".
adata.obsm['X_scVI'] = model.get_latent_representation()

In [None]:
# Keep the model's normalized expression (requested at library_size=1e4) as a
# separate layer so adata.X and the 'counts' layer stay untouched.
adata.layers['scvi_normalized'] = model.get_normalized_expression(library_size=1e4)

In [None]:
# Show the AnnData summary, which should now list the scVI outputs added above.
adata

In [None]:
# Rebuild the neighbour graph on the scVI latent space (30 neighbours), then
# recompute the UMAP and colour it by genotype. No key_added is passed, so
# this replaces the graph and UMAP computed earlier in the notebook.
sc.pp.neighbors(adata, n_neighbors=30, use_rep="X_scVI")
sc.tl.umap(adata, min_dist=0.2)
sc.pl.umap(adata, color='genotype')

In [None]:
# Cluster on the scVI-based neighbour graph (resolution 0.5, labels stored in
# adata.obs['leiden_scVI']) and show genotype and clusters side by side.
sc.tl.leiden(adata, key_added="leiden_scVI", resolution=0.5)

scvi_umap_colors = ['genotype', 'leiden_scVI']
sc.pl.umap(adata, color=scvi_umap_colors, ncols=2, frameon=False)