In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from matplotlib import pyplot as plt

import scanpyHelpers as scH

In [None]:
# --- Run configuration: directories, species labels, input and output paths ---
writeDir = "write"
tissue = "Muscle"

# Species labels; they are also used as the `species` keys when merging.
species1 = "human"
species2 = "mouse"

# QC'd inputs: one file for species1, two files for species2.
adataFile1 = f"{writeDir}/{tissue}Human_DeM_QC.h5ad"
adataFile2a = f"{writeDir}/{tissue}Mouse_QC.h5ad"
adataFile2b = f"{writeDir}/{tissue}Mouse_DeM_QC.h5ad"

# Outputs: the merged object right after concatenation (_QC) and the final
# object written at the end of the notebook.
mergedAdataFileQC = f"{writeDir}/{tissue}{species1}{species2}_QC.h5ad"
mergedAdataFile = f"{writeDir}/{tissue}{species1}{species2}.h5ad"

# Homology table passed to scH.getOverlapGenes.
homologyFile = "data/hgncHM_121.csv"

In [None]:
# Load the comma-separated homology table and display it.
homology = pd.read_csv(homologyFile)
homology

In [None]:
# Read the species1 dataset and keep a copy of the loaded X matrix in the
# "counts" layer, because X itself is normalised in the next cell.
# NOTE(review): presumably X holds raw counts at this point — confirm against
# the notebook that produced the *_QC.h5ad file.
adata1 = sc.read(adataFile1)
adata1.layers["counts"] = adata1.X.copy()
adata1

In [None]:
# Normalise every cell of adata1 to a total of 1e4, then log1p-transform X.
sc.pp.normalize_total(adata1, target_sum=1e4)  # exclude_highly_expressed=True is left disabled
sc.pp.log1p(adata1)

In [None]:
# Load the two species2 datasets.
# adata2a gets the placeholder annotation "U" for every cell so that both
# inputs expose a `cell_annotation` column (presumably this dataset has no
# curated annotation — confirm).
adata2a = sc.read(adataFile2a)
adata2a.obs["cell_annotation"] = "U"
adata2b = sc.read(adataFile2b)

# `keys` must follow the order of the list being concatenated: adata2a comes
# from the plain *Mouse_QC file and adata2b from the *Mouse_DeM_QC file, so
# the "DeM" label belongs to the second entry (the original order was swapped).
adata2 = ad.concat([adata2a, adata2b], label="dataset", keys=["Rb", "DeM"])

# Keep a copy of the un-normalised X matrix; X is normalised in the next cell.
adata2.layers["counts"] = adata2.X.copy()
adata2

In [None]:
# Normalise every cell of adata2 to a total of 1e4, then log1p-transform X
# (same settings as used for adata1).
sc.pp.normalize_total(adata2, target_sum=1e4)  # exclude_highly_expressed=True is left disabled
sc.pp.log1p(adata2)

In [None]:
# Hand the gene names of both datasets (as object arrays) to the helper, which
# returns one gene-name array per dataset based on the homology table.
varNames1 = np.array(adata1.var_names, dtype=object)
varNames2 = np.array(adata2.var_names, dtype=object)
genes1, genes2 = scH.getOverlapGenes(
    varNames1,
    varNames2,
    homology,
    species1,
    species2,
)

In [None]:
# Report: number of genes per dataset and number of gene names shared by both.
nSharedGenes = len(set(genes1) & set(genes2))
print(len(genes1), len(genes2), nSharedGenes)

In [None]:
# Replace the gene names of both datasets with the names returned by
# scH.getOverlapGenes, so the two objects can be concatenated on gene name.
# NOTE(review): assumes genes1/genes2 keep the order and length of the
# original var_names — confirm in scanpyHelpers.
adata1.var_names = genes1
adata2.var_names = genes2

In [None]:
# Merge both species on the union of genes; genes missing from one dataset are
# filled with 0, and each cell is tagged with its species of origin.
adata = ad.concat(
    [adata1, adata2],
    join="outer",
    label="species",
    keys=[species1, species2],
    fill_value=0,
)
# Cell barcodes may repeat across datasets; make the obs index unique.
adata.obs_names_make_unique()

In [None]:
# Keep only the two per-cell columns used in the rest of the notebook.
keptObsColumns = ["species", "cell_annotation"]
adata.obs = adata.obs.loc[:, keptObsColumns]

In [None]:
# Display the summary of the merged AnnData object.
adata

In [None]:
# Show where the merged object is about to be written.
mergedAdataFileQC

In [None]:
# Save the merged object before gene selection, PCA and clustering.
adata.write(mergedAdataFileQC)

In [None]:
# obs column treated as the batch: passed to highly_variable_genes and used to
# colour the UMAP further down.
batchKey = "species"

In [None]:
# First highly-variable-gene pass with default thresholds; the next cell reads
# the `means` and `dispersions_norm` columns from adata.var to choose cut-offs.
sc.pp.highly_variable_genes(adata, batch_key=batchKey)

In [None]:
# Cut-offs for highly-variable-gene (HVG) selection; drawn as dashed lines on
# the histograms below and then passed to sc.pp.highly_variable_genes.
minMean = 0.05
maxMean = 2.9
minDisp = 0.25

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: distribution of log gene means. Values at or below exp(-14) are
# dropped so the log is well defined; Series boolean indexing removes those
# rows instead of padding them with NaN.
means = adata.var["means"]
means = means[means > np.exp(-14)]
axs[0].hist(np.log(means), bins=100)
axs[0].axvline(np.log(minMean), color='k', linestyle='dashed', linewidth=1)
axs[0].axvline(np.log(maxMean), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
axs[0].set_xlabel('log(means)')  # the x-axis is log-transformed
axs[0].set_ylabel('counts')

# Right panel: distribution of log normalised dispersions (only values above
# exp(-8), i.e. strictly positive, can be shown on a log axis).
dispNorm = adata.var["dispersions_norm"]
dispNorm = dispNorm[dispNorm > np.exp(-8)]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('log(dispersions_norm)')  # the x-axis is log-transformed
axs[1].set_ylabel('counts')

# Re-run the HVG selection with the chosen cut-offs and report how many genes
# are selected overall and how many are selected in every batch.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean, batch_key=batchKey)
print(adata.var["highly_variable"].sum())
print(adata.var["highly_variable_intersection"].sum())

In [None]:
# scanpy's diagnostic plot for the highly-variable-gene selection made above.
sc.pl.highly_variable_genes(adata)

In [None]:
# Display the AnnData summary, now including the HVG columns in .var.
adata

In [None]:
# Flag mitochondrial genes so they can be excluded from the HVG set below.
# The merged object holds genes from both species: human symbols use the
# "MT-" prefix and mouse symbols use "mt-". Any mouse gene that the homology
# mapping did not rename would be missed by a case-sensitive test, so the
# prefix is matched case-insensitively.
adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')

In [None]:
# Keep the full-gene, log-normalised object in .raw before subsetting genes.
adata.raw = adata

In [None]:
# Restrict the object to highly variable, non-mitochondrial genes.
# Slicing an AnnData returns a view; .copy() materialises it so that the
# in-place tools below (pca, neighbors, umap, leiden) write into a real object
# instead of triggering an implicit view-to-copy conversion.
keepGenes = adata.var.highly_variable & ~adata.var.mt
adata = adata[:, keepGenes].copy()

In [None]:
#sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
#sc.pp.scale(adata, max_value=5)

In [None]:
# PCA with 100 components on the selected genes.
# Note: the regress_out and scale cells above are commented out, so PCA runs
# on the log-normalised values without scaling.
sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')

In [None]:
# Variance ratio of the 100 PCs on a log scale.
sc.pl.pca_variance_ratio(adata, n_pcs = 100, log=True)

In [None]:
# Same variance-ratio plot on a linear scale.
sc.pl.pca_variance_ratio(adata, n_pcs = 100)

In [None]:
# PCA scatter plot coloured by the per-cell annotation.
sc.pl.pca(adata, color=["cell_annotation"])

In [None]:
# Neighbourhood graph: 20 neighbours per cell, computed on the first 60 PCs.
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)

In [None]:
# UMAP embedding computed from the neighbourhood graph.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 0.2; the cluster ids produced here are the
# keys of the hand-written leidenDict further down.
sc.tl.leiden(adata, resolution=0.2)

In [None]:
# Display the AnnData summary after PCA, neighbours, UMAP and clustering.
adata

In [None]:
# UMAP coloured by Leiden cluster, cell annotation and batch (species), laid
# out two panels per row.
sc.pl.umap(adata, color=["leiden","cell_annotation",batchKey],ncols=2)

In [None]:
from collections import Counter

In [None]:
#for i in adata.obs.leiden.cat.categories:
#    print(i)
#    print(Counter(adata.obs[adata.obs.leiden==i].cell_annotation))
#    print(Counter(adata.obs[adata.obs.leiden==i].species))
#    print()

In [None]:
# Hand-curated mapping from Leiden cluster id to a coarse cell-type label.
# The ids are specific to the clustering computed above (resolution 0.2); if
# the clustering changes, this table has to be revisited.
leidenDict = {'0':"Endo",
              '1':"Fibro",
              '2':"Fibro",
              '3':"Fibro",
              '4':"Endo",
              '5':"Fibro",
              '6':"Fibro",
              '7':"MuSCs",
              '8':"Smooth",
              '9':"MuSCs",
              '10':"Adipo",
              '11':"Macro",
              '12':"B/T/NK",
              '13':"Smooth",
              '14':"Skel",
              '15':"Neural",
              '16':"MuSCs",
              '17':"B/T/NK",
              '18':"B/T/NK",
              '19':"Macro",
              '20':"Skel",
              '21':"Smooth",
              '22':"Endo",}

# Validate before mapping: report every cluster that has no label at once,
# instead of failing on the first missing key with a bare KeyError.
unmappedClusters = sorted(set(adata.obs.leiden) - set(leidenDict))
if unmappedClusters:
    raise KeyError(
        f"leidenDict has no label for Leiden cluster(s): {unmappedClusters}"
    )

adata.obs["simple"] = [leidenDict[c] for c in adata.obs.leiden]

In [None]:
# UMAP coloured by the coarse cell-type label and by species, with the legend
# drawn on top of the data.
sc.pl.umap(adata, color=["simple","species"],legend_loc="on data")

In [None]:
# Show where the final object is about to be written.
mergedAdataFile

In [None]:
# Save the final object (HVG subset with PCA, UMAP, clusters and labels).
adata.write(mergedAdataFile)