In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from matplotlib import pyplot as plt

import scanpyHelpers as scH

In [None]:
# Output directory and tissue prefix that every path below is built from.
writeDir = "write"
tissue = "pdac"

# Field names carried by each per-dataset record in dataDicts.
dictLab = ["species","adataName"]

# One record per dataset key: its species and the path stem of its AnnData
# files (getAdata appends "_QC.h5ad" and ".h5ad" to the stem).
dataDicts = {
    "mou": {"species": "mouse", "adataName": f"{writeDir}/{tissue}Mouse"},
    "veh": {"species": "mouse", "adataName": f"{writeDir}/{tissue}MouseVeh"},
    "pdx": {"species": "human", "adataName": f"{writeDir}/{tissue}PdxVeh"},
    "hum": {"species": "human", "adataName": f"{writeDir}/{tissue}Human"},
    # Disabled datasets, kept for reference:
    # "met": {"species": "human", "adataName": f"{writeDir}/{tissue}HumanMet"},
    # "org": {"species": "human", "adataName": f"{writeDir}/{tissue}Organoid"},
    "hwa": {"species": "human", "adataName": f"{writeDir}/{tissue}Hwang"},
    "lin": {"species": "human", "adataName": f"{writeDir}/{tissue}Lin"},
}

# Path of the merged object saved right after concatenation, and the default
# path for the processed merged object.
mergedAdataFileQC = f"{writeDir}/{tissue}mouseHumanVehHwangLin_QC.h5ad"
mergedAdataFile = f"{writeDir}/{tissue}mouseHumanVehHwangLin.h5ad"

# Comma-separated homology table loaded in the next cell.
homologyFile = "data/hgncHM_121.csv"

In [None]:
# Load the comma-separated homology table and display it.
homology = pd.read_csv(homologyFile)
homology

In [None]:
def getAdata(adataFile, neededObs):
    """Load a dataset's QC-stage AnnData restricted to its processed cells.

    Reads ``<adataFile>_QC.h5ad`` and ``<adataFile>.h5ad``, plots the UMAP of
    the processed object coloured by ``cellState``, keeps only the cells whose
    names appear in the processed object, copies the ``neededObs`` columns over
    from it, and stores a copy of ``X`` (as loaded) in ``layers["counts"]``.

    Parameters
    ----------
    adataFile : str
        Path stem shared by the two files (without ``_QC.h5ad`` / ``.h5ad``).
    neededObs : list of str
        ``obs`` columns to take from the processed object.

    Returns
    -------
    AnnData
        The QC-stage object, subset and annotated as described above.
    """
    adataQC = sc.read(adataFile+"_QC.h5ad")
    adataPost = sc.read(adataFile+".h5ad")
    sc.pl.umap(adataPost, color=["cellState"])
    # Materialise the subset explicitly: assigning obs columns on a view makes
    # anndata copy behind the scenes and emit an implicit-modification warning.
    adata = adataQC[adataPost.obs_names, :].copy()
    adata.obs[neededObs] = adataPost.obs[neededObs]
    adata.layers["counts"] = adata.X.copy()
    return adata

def getNormAdata(adataFile, neededObs):
    """Return the object built by getAdata after total-count normalisation
    (target sum 1e4) followed by a log1p transform."""
    normalized = getAdata(adataFile, neededObs)
    sc.pp.normalize_total(normalized, target_sum=1e4)
    sc.pp.log1p(normalized)
    return normalized

In [None]:
# Load and normalise every dataset, recording each one's gene names together
# with its species for the homology step.
geneSpecDict, adatas = [], []
for datakey, dataInfo in dataDicts.items():
    print(datakey)
    adata = getNormAdata(dataInfo["adataName"], ["cellState","decoupler"])
    adatas.append(adata)
    geneSpecDict.append(
        dict(genes=np.array(adata.var_names.copy()), species=dataInfo["species"])
    )

geneSpecDict

In [None]:
# Hand the homology table and the per-dataset gene lists to the helper.
# NOTE(review): the return value is only displayed, and the next cell copies
# geneSpecDict[i]["genes"] back onto each AnnData, so the helper presumably
# rewrites those gene arrays in place — confirm in scanpyHelpers.
scH.getOverlapGenesMulti(homology, geneSpecDict)

In [None]:
# Write each dataset's gene names from geneSpecDict back onto its AnnData;
# the two lists were filled in the same order.
for datasetAdata, genesDict in zip(adatas, geneSpecDict):
    datasetAdata.var_names = genesDict["genes"]

In [None]:
# Dataset keys; the concat cell below uses them as the `dataset` labels.
dataDicts.keys()

In [None]:
# Stack all datasets cell-wise. The outer join keeps genes found in any
# dataset (missing entries filled with 0) and each cell is tagged with its
# dataset key in obs["dataset"].
adata = ad.concat(
    adatas,
    join="outer",
    label="dataset",
    keys=list(dataDicts),
    fill_value=0,
)
adata.obs_names_make_unique()

In [None]:
# Summary of the concatenated AnnData.
adata

In [None]:
# Gene-level annotations of the merged object.
adata.var

In [None]:
# Keep only the obs columns used downstream; .copy() so the new column below
# is added to an independent frame rather than a slice of the old one.
obsColumns = ['batch', 'mouse', 'dataset', "cellState", "decoupler"]
adata.obs = adata.obs[obsColumns].copy()
# Take each cell's species from its dataset's record in dataDicts. Testing
# `lab == "mou"` alone labelled the "veh" cells, registered as mouse, as human.
speciesByDataset = {key: info["species"] for key, info in dataDicts.items()}
adata.obs["species"] = [speciesByDataset[lab] for lab in adata.obs["dataset"]]

In [None]:
# Path the merged object is written to in the next cell.
mergedAdataFileQC

In [None]:
# Save the merged object; the per-combination cells reload it from this path.
adata.write_h5ad(mergedAdataFileQC)

In [None]:
# Summary of the merged object, now including the trimmed obs columns and species.
adata

In [None]:
# Named dataset combinations; each value lists the `dataset` labels to keep.
combos = dict([
    ("mouseHumanVehHwangLin", ["mou", "veh", "pdx", "hum", "hwa", "lin"]),
    # ("MouseHumanHwangLin", ["mou", "hum", "hwa", "lin"]),
    ("HumanPDXHwangLin", ["hum", "pdx", "hwa", "lin"]),
    ("MouseHuman", ["mou", "hum"]),
    ("MouseHwang", ["mou", "hwa"]),
    ("MouseLin", ["mou", "lin"]),
    ("MousePDX", ["mou", "pdx"]),
    ("MouseVeh", ["mou", "veh"]),
    ("VehHuman", ["veh", "hum"]),
    ("VehPDX", ["veh", "pdx"]),
    ("VehHwang", ["veh", "hwa"]),
    ("VehLin", ["veh", "lin"]),
])

# Combination selected for the step-by-step cells further down.
name = "MouseLin"
combo = combos[name]

In [None]:
# Batch version of the pipeline below: subset the merged object to one
# combination, select highly variable genes, then PCA -> neighbours -> UMAP ->
# Leiden and plot the embedding.
batchKey = "dataset"
cellTypeKey = "decoupler"
# Dedicated loop variables: reusing `name` / `combo` here would silently
# replace the combination selected in the cell above, which later cells read.
for comboName, comboKeys in combos.items():
    comboAdataFile = f'{writeDir}/{tissue}{comboName}.h5ad'
    adata = sc.read(mergedAdataFileQC)
    # Copy the subset so the steps below annotate a real object, not a view.
    adata = adata[adata.obs["dataset"].isin(comboKeys)].copy()
    sc.pp.highly_variable_genes(adata, batch_key=batchKey, flavor="seurat_v3", n_top_genes=2000, layer="counts")
    # The merged object mixes mouse and human datasets, so match the
    # mitochondrial prefix case-insensitively ("mt-" as well as "MT-").
    adata.var['mt'] = adata.var_names.str.lower().str.startswith('mt-')
    adata.raw = adata
    keepGenes = adata.var["highly_variable"] & ~adata.var["mt"]
    adata = adata[:, keepGenes].copy()
    sc.tl.pca(adata, n_comps=100, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=0.2)
    sc.pl.umap(adata, color=["leiden", cellTypeKey, batchKey, "species"], ncols=4)
    # adata.write(comboAdataFile)  # enable to persist this combination's result
    # Only the first combination is processed; remove the break to run them all.
    break

In [None]:
# Summary of the object left behind by the loop above.
adata

In [None]:
# Reload the merged object and keep only the datasets of the selected
# combination (`name` / `combo`); the result is later saved to mergedAdataFile.
adata = sc.read(mergedAdataFileQC)
mergedAdataFile = f'{writeDir}/{tissue}{name}.h5ad'
# .copy() so the following cells annotate an independent object rather than a
# view of the full merged data (which anndata would copy implicitly, with a warning).
adata = adata[adata.obs["dataset"].isin(combo)].copy()

In [None]:
# obs columns used as the batch label and as the cell-type annotation.
batchKey, cellTypeKey = "dataset", "decoupler"

In [None]:
# First highly-variable-gene pass with default thresholds, batched by `batchKey`.
# The histogram cell below reads adata.var["means"] and
# adata.var["dispersions_norm"], presumably the columns this call fills in —
# confirm against the scanpy documentation.
sc.pp.highly_variable_genes(adata, batch_key=batchKey)

In [None]:
# Thresholds applied in the highly-variable-gene selection at the end of this cell.
minMean = 0.1
maxMean = 2.7
minDisp = 0.2

# Values at or below these floors are left out of the log-scale histograms.
meanFloor = np.exp(-14)
dispFloor = np.exp(-8)

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Filter the Series with a boolean mask so that no NaN placeholders (which
# DataFrame masking would produce) are handed to hist().
means = adata.var["means"]
means = means[means > meanFloor]
axs[0].hist(np.log(means), bins=100)
axs[0].axvline(np.log(minMean), color='k', linestyle='dashed', linewidth=1)
axs[0].axvline(np.log(maxMean), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
# The x axis shows log-transformed values, so label it accordingly.
axs[0].set_xlabel('log(means)')
axs[0].set_ylabel('counts')

dispNorm = adata.var["dispersions_norm"]
dispNorm = dispNorm[dispNorm > dispFloor]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('log(dispersions_norm)')
axs[1].set_ylabel('counts')

# Re-run the selection with the thresholds drawn above and report how many
# genes are flagged overall and in the per-batch intersection column.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean, batch_key=batchKey)
print(adata.var["highly_variable"].sum())
print(adata.var["highly_variable_intersection"].sum())

In [None]:
# Scanpy's diagnostic plot for the highly-variable-gene selection.
sc.pl.highly_variable_genes(adata)

In [None]:
# Annotation rows of the genes flagged as highly variable.
adata.var.loc[adata.var["highly_variable"]]

In [None]:
# Flag mitochondrial genes. The merged object mixes mouse and human datasets,
# so match the prefix case-insensitively ("mt-" as well as "MT-").
# NOTE(review): which casing survives depends on the homology renaming done
# by scanpyHelpers — confirm; matching both is the safe superset.
adata.var['mt'] = adata.var_names.str.lower().str.startswith('mt-')

In [None]:
# Keep the current object (all genes, before the gene subset in the next cell) in .raw.
adata.raw = adata

In [None]:
# Keep highly variable, non-mitochondrial genes. .copy() turns the subset into
# an independent object so PCA and the later steps do not write into a view.
keepGenes = adata.var["highly_variable"] & ~adata.var["mt"]
adata = adata[:, keepGenes].copy()

In [None]:
#sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
#sc.pp.scale(adata, max_value=5)

In [None]:
# PCA with 100 components using the ARPACK solver, on the gene-filtered object.
sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')

In [None]:
# Variance ratio of the 100 components on a log scale, used to choose n_pcs below.
sc.pl.pca_variance_ratio(adata, n_pcs = 100, log=True)

In [None]:
# Same variance-ratio plot on a linear scale.
sc.pl.pca_variance_ratio(adata, n_pcs = 100)

In [None]:
# PCA scatter coloured by the cell-type annotation and by species.
sc.pl.pca(adata, color=[cellTypeKey,"species"])

In [None]:
# Neighbour graph with 20 neighbours, restricted to 60 principal components.
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)

In [None]:
# UMAP embedding with scanpy's default settings.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 0.2; the UMAP cell below colours by "leiden".
sc.tl.leiden(adata, resolution=0.2)

In [None]:
# Summary of the object after PCA, neighbours, UMAP and Leiden.
adata

In [None]:
# UMAP coloured by cluster, cell-type annotation, dataset and species (two panels per row).
sc.pl.umap(adata, color=["leiden",cellTypeKey,batchKey,"species"],ncols=2)

In [None]:
# Summary of the processed object before it is saved.
adata

In [None]:
# Path the processed object is written to below.
mergedAdataFile

In [None]:
# Second Leiden run at the same resolution, stored under obs["leid"].
# NOTE(review): this repeats the earlier clustering under a different key —
# confirm that both columns are needed downstream.
sc.tl.leiden(adata, resolution=0.2, key_added="leid")

In [None]:
# Save the processed object for the selected combination.
adata.write_h5ad(mergedAdataFile)

In [None]:
#adata = sc.read(mergedAdataFile)

In [None]:
# Dataset keys available for the per-dataset UMAP checks below.
dataDicts.keys()

In [None]:
# Reload the processed "mou" object and plot its UMAP by mouse, annotation and Ptprc.
adata = sc.read(f'{dataDicts["mou"]["adataName"]}.h5ad')
sc.pl.umap(
    adata,
    color=["mouse", "decoupler", "Ptprc"],
)

In [None]:
# Reload the processed "hum" object and plot its UMAP by sample, cell type, annotation and PTPRC.
adata = sc.read(f'{dataDicts["hum"]["adataName"]}.h5ad')
sc.pl.umap(
    adata,
    color=["samples", "celltypes", "decoupler", "PTPRC"],
)

In [None]:
# The "met" entry is commented out of dataDicts, so indexing it directly
# raises KeyError on a full run; only plot when the dataset is enabled.
if "met" in dataDicts:
    adata = sc.read(dataDicts["met"]["adataName"]+".h5ad")
    sc.pl.umap(adata, color=["donor_ID","decoupler","PTPRC"])
else:
    print('Dataset "met" is not enabled in dataDicts; skipping its UMAP.')

In [None]:
# The "org" entry is commented out of dataDicts, so indexing it directly
# raises KeyError on a full run; only plot when the dataset is enabled.
if "org" in dataDicts:
    adata = sc.read(dataDicts["org"]["adataName"]+".h5ad")
    sc.pl.umap(adata, color=["donor_ID","decoupler","PTPRC"])
else:
    print('Dataset "org" is not enabled in dataDicts; skipping its UMAP.')

In [None]:
# Reload the processed "hwa" object and plot its UMAP by sample, annotation and PTPRC.
adata = sc.read(f'{dataDicts["hwa"]["adataName"]}.h5ad')
sc.pl.umap(
    adata,
    color=["sample", "decoupler", "PTPRC"],
)

In [None]:
# Reload the processed "lin" object and plot its UMAP by sample, annotation and PTPRC.
adata = sc.read(f'{dataDicts["lin"]["adataName"]}.h5ad')
sc.pl.umap(
    adata,
    color=["sample", "decoupler", "PTPRC"],
)