In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from matplotlib import pyplot as plt
import scipy as si
import scanpyHelpers as scH

In [None]:
# Output directory and tissue prefix shared by every file name built below.
writeDir = "write"
tissue = "pdac"

# Field names of each per-dataset record stored in `dataDicts`.
dictLab = ["species","adataName"]

# One row per dataset: (short key, species, file-name suffix).
# Row order is the insertion order of `dataDicts`, which the loading and
# concatenation cells iterate over.
datasetSpecs = [
    ("mou", "mouse", "Mouse"),
    ("veh", "mouse", "MouseVeh"),
    ("pdx", "human", "PdxVeh"),
    ("hum", "human", "Human"),
    #("met", "human", "HumanMet"),
    #("org", "human", "Organoid"),
    ("hwa", "human", "Hwang"),
    ("lin", "human", "Lin"),
]
dataDicts = {
    key: dict(zip(dictLab, [species, f'{writeDir}/{tissue}{suffix}']))
    for key, species, suffix in datasetSpecs
}

# Paths of the merged objects; both share one stem and differ only in the "_QC" tag.
mergedStem = f'{writeDir}/{tissue}mouseHumanVehHwangLin'
mergedAdataFileQC = f'{mergedStem}_QC.h5ad'
mergedAdataFile = f'{mergedStem}.h5ad'

# Comma-separated homology table (read in the next cell).
homologyFile = "data/hgncHM_121.csv"

In [None]:
# Load the homology table; later cells read its "human" and "mouse" columns.
homology = pd.read_csv(homologyFile, sep=",")
homology

In [None]:
# Signature gene lists, one signature per column.
sigFile = "data/PDAC/basClaSigGenes.csv"
sigGenes = pd.read_csv(sigFile)

# Human symbol -> mouse symbol lookup built once from the homology table.
# setdefault keeps the FIRST row for a repeated human symbol, which is the row
# a list.index() search over the "human" column would have returned.
humanToMouse = {}
for humanGene, mouseGene in zip(homology["human"], homology["mouse"]):
    humanToMouse.setdefault(humanGene, mouseGene)

# Only these three signatures are used downstream.
sigGenes = sigGenes[["scBasal","scClassical","EMT"]].copy()

# Rewrite every gene that has a homology entry as "<mouse>/<human>";
# entries without one (including missing values) are left untouched.
for sigName in sigGenes.columns:
    sigGenes[sigName] = [
        f'{humanToMouse[gene]}/{gene}' if gene in humanToMouse else gene
        for gene in sigGenes[sigName]
    ]
sigGenes

In [None]:
def getAdata(adataFile, neededObs):
    """Load the "_QC" AnnData restricted to the cells kept in the processed file.

    Parameters
    ----------
    adataFile
        Path stem; ``adataFile + "_QC.h5ad"`` and ``adataFile + ".h5ad"`` are read.
    neededObs
        Column name(s) of ``.obs`` to copy over from the processed file.
        An empty list copies nothing.

    Returns
    -------
    AnnData
        The cells of the "_QC" file whose names also occur in the processed
        file, in the order they have in the "_QC" file, with the requested
        ``.obs`` columns added and a copy of ``.X`` stored in
        ``layers["counts"]``.
    """
    adataQC = sc.read_h5ad(adataFile+"_QC.h5ad")
    adataPost = sc.read_h5ad(adataFile+".h5ad")
    # A boolean mask keeps the on-disk cell order. Indexing with a list built
    # from a set of names would order the cells by set iteration, which is not
    # stable between interpreter runs.
    inBoth = adataQC.obs_names.isin(adataPost.obs_names)
    # Materialise the subset so the assignments below act on a real object
    # rather than on a view of adataQC.
    adata = adataQC[inBoth, :].copy()
    # pandas aligns the right-hand side on the cell names (the .obs index).
    adata.obs[neededObs] = adataPost.obs[neededObs].copy()
    adata.layers["counts"] = adata.X.copy()
    return adata

def getNormAdata(adataFile, neededObs):
    """Return ``getAdata(adataFile, neededObs)`` after normalisation and log1p.

    Both arguments are forwarded unchanged to ``getAdata``. ``.X`` of the
    returned object is passed through ``sc.pp.normalize_total`` with a target
    sum of 1e4 and then through ``sc.pp.log1p``.
    """
    normalized = getAdata(adataFile, neededObs)
    # Per-cell total-count normalisation followed by the log(1 + x) transform.
    sc.pp.normalize_total(normalized, target_sum=1e4)
    sc.pp.log1p(normalized)
    return normalized

In [None]:
# Load and normalise every dataset listed in `dataDicts`. For each one keep
# the AnnData itself plus a record of its gene names and species.
# Note that `geneSpecDict` is a list of dicts, one entry per dataset.
geneSpecDict = []
adatas = []
for datakey, dataInfo in dataDicts.items():
    print(datakey)
    adata = getNormAdata(dataInfo["adataName"], [])
    geneRecord = {
        "genes": np.array(adata.var_names.copy()),
        "species": dataInfo["species"],
    }
    geneSpecDict.append(geneRecord)
    adatas.append(adata)

geneSpecDict

In [None]:
# Called with the homology table and the per-dataset gene records. The next
# cell reads each record's "genes" array back and installs it as var_names.
# NOTE(review): presumably this rewrites the "genes" arrays in place to shared,
# homology-based names — confirm in scanpyHelpers.
scH.getOverlapGenesMulti(homology, geneSpecDict)

In [None]:
# Install each record's gene-name array as the var_names of the matching
# AnnData (the two lists were filled in lockstep, so positions correspond).
for datasetAdata, genesDict in zip(adatas, geneSpecDict):
    datasetAdata.var_names = genesDict["genes"]

In [None]:
# Stack all datasets into one object. Genes are joined by union (absent values
# filled with 0) and each cell's source key is recorded in obs["dataset"].
datasetKeys = list(dataDicts)
adata = ad.concat(
    adatas,
    join="outer",
    label="dataset",
    keys=datasetKeys,
    fill_value=0,
)
adata.obs_names_make_unique()

In [None]:
# Display the summary of the concatenated object.
adata

In [None]:
# Keep only the three annotation columns that are used from here on.
obsCols = adata.obs[["sample","Classification","dataset"]].copy()

# One label per cell, in order of preference:
#   1. the leading part of `Classification` (split on "-" when the value
#      contains one, otherwise on "_"),
#   2. the `sample` value,
#   3. the `dataset` key.
# Iterating the columns directly avoids one .loc row lookup per cell.
labels = np.empty(adata.n_obs, dtype="object")
for i, (sample, classif, dataset) in enumerate(
        zip(obsCols["sample"], obsCols["Classification"], obsCols["dataset"])):
    if not pd.isna(classif):
        spliton = "-" if "-" in classif else "_"
        label = classif.split(spliton)[0]
    elif not pd.isna(sample):
        label = sample
    else:
        label = dataset
    labels[i] = label

adata.obs = obsCols
# Take the species from the dataset table instead of treating everything
# except "mou" as human: `dataDicts` declares "veh" as a mouse dataset too.
adata.obs["species"] = [dataDicts[key]["species"] for key in adata.obs["dataset"]]
adata.obs["label"] = labels
# Drop every cell whose label contains "MET"; copy so later cells work on a
# real object instead of a view.
adata = adata[np.logical_not(["MET" in lab for lab in adata.obs["label"]])].copy()
adata.obs

In [None]:
# Save the merged, labelled object; later cells reload it from this path.
adata.write(mergedAdataFileQC)

In [None]:
# Tally the cells per value of obs["dataset"].
# `Counter` is imported here and reused by the following cell.
from collections import Counter
Counter(adata.obs.dataset)

In [None]:
# Tally the cells per value of obs["Classification"].
Counter(adata.obs.Classification)

In [None]:
# Bare list literal: running the cell only displays it. The same values are
# assigned to `patientLabels` in the next cell.
['U2','U9','U11','U15','U3','U12','U18','U14','U6','U16','U13','U4','U7','P10','U5','U10']

In [None]:
# obs["dataset"] keys that are each combined with the "mou" cells further down.
datasetLabels = "hwa hum lin veh pdx".split()
# obs["label"] values that are each combined with the "mou" cells further down.
patientLabels = "U2 U9 U11 U15 U3 U12 U18 U14 U6 U16 U13 U4 U7 P10 U5 U10".split()

# List every pairing, one per line: datasets first, then the individual labels.
for pairedLabel in datasetLabels + patientLabels:
    print(pairedLabel, "mou")

In [None]:
# Reload the merged object from `mergedAdataFileQC` and display its summary.
adata = sc.read_h5ad(mergedAdataFileQC)
adata

In [None]:
# obs column used as batch key for gene selection, and obs column that
# receives the GMM-based class assigned in the cells below.
batchKey, cellTypeKey = "dataset", "gmmClass"

In [None]:
# Cluster and score the cells whose label is "mou" on their own. The result
# (`adataM`, plus its counts in `mouseRawCounts`) is what every other dataset
# or label is later paired with.
plabel = "mou"
adata = sc.read_h5ad(mergedAdataFileQC)
# .copy() materialises the subset, so the in-place steps below modify a real
# object rather than a view of the full matrix.
adata = adata[adata.obs["label"] == plabel].copy()
sc.pp.highly_variable_genes(adata, batch_key=batchKey, flavor="seurat", n_top_genes=2500)
adata.var['mt'] = adata.var_names.str.startswith('mt-')
# Keep the full gene set in .raw and hold on to the counts layer separately;
# it is re-attached by hand wherever .raw.to_adata() is used later.
adata.raw = adata
mouseRawCounts = adata.layers["counts"]

# Embed and cluster on highly variable genes, excluding those flagged 'mt'.
adata = adata[:, np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))].copy()
sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=0.2)

# Score the signatures, then classify the cells from those scores.
scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(adata, sigGenes, labelOfSigGenes= sigGenes.columns, ogLabel="leiden", translate=False)
gmmClass = scH.gmmScoreGeneSig(adata.obs[scoreNames],show=False)
# "Negative" is suffixed with the label so it stays distinct per subset.
adata.obs[cellTypeKey] = [ f"{x}_{plabel}" if(x == "Negative") else x for x in gmmClass]

sc.pl.umap(adata, color=["leiden",cellTypeKey,batchKey,"species"]+scoreNames,ncols=4)
adataM = adata

In [None]:
# Rebuild a full-gene object from the .raw slot of the mouse object and
# re-attach the counts layer saved in `mouseRawCounts`.
adataMo = adataM.raw.to_adata()
adataMo.obs_names_make_unique()
adataMo.layers["counts"] = mouseRawCounts
# The concatenation result is only displayed, never stored.
# NOTE(review): looks like a scratch check that the rebuilt object
# concatenates cleanly — confirm it is still needed.
ad.concat([adataMo,adataMo], join="outer", fill_value=0)

In [None]:
# ---------------------------------------------------------------------------
# 1. Cluster and score every dataset in `datasetLabels` on its own.
# ---------------------------------------------------------------------------
dAdatas = np.empty(len(datasetLabels),dtype="object")
dRawCounts = np.empty(len(datasetLabels),dtype="object")

# Read the merged file once; every iteration works on its own copy of a subset.
mergedQC = sc.read_h5ad(mergedAdataFileQC)

for i,dlabel in enumerate(datasetLabels):
    # .copy() so the in-place steps below do not act on a view of mergedQC.
    adata = mergedQC[mergedQC.obs["dataset"] == dlabel].copy()
    sc.pp.highly_variable_genes(adata, batch_key=batchKey, flavor="seurat", n_top_genes=2500)
    adata.var['mt'] = adata.var_names.str.startswith('mt-')
    # Full gene set goes to .raw; the counts layer is kept separately and
    # re-attached by hand after .raw.to_adata() below.
    adata.raw = adata
    dRawCounts[i] = adata.layers["counts"]

    adata = adata[:, np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))].copy()
    sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=0.2)

    scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(adata, sigGenes, labelOfSigGenes= sigGenes.columns, ogLabel="leiden", translate=False)
    gmmClass = scH.gmmScoreGeneSig(adata.obs[scoreNames],show=False)
    # "Negative" is suffixed with the dataset key so it stays distinct per dataset.
    adata.obs[cellTypeKey] = [ f"{x}_{dlabel}" if(x == "Negative") else x for x in gmmClass]

    sc.pl.umap(adata, color=["leiden",cellTypeKey,batchKey,"species"]+scoreNames,ncols=4)
    dAdatas[i] = adata

# ---------------------------------------------------------------------------
# 2. One object holding the mouse cells plus all datasets (written out later).
# ---------------------------------------------------------------------------
# Built from however many datasets there are instead of hard-coded indices.
adataA = ad.concat([adataM.raw.to_adata()] + [dAdata.raw.to_adata() for dAdata in dAdatas],
                   join="outer", fill_value=0)
# Same block order as the concat above, so the rows line up.
adataA.layers["counts"] = si.sparse.vstack([mouseRawCounts]+list(dRawCounts))
sc.pp.highly_variable_genes(adataA, batch_key=batchKey, flavor="seurat", n_top_genes=2500)
# Flag genes from adataA's OWN gene names. `adata` here is the last per-dataset
# object, already reduced to its variable genes, so its names do not match
# adataA.var.
adataA.var['mt'] = adataA.var_names.str.startswith('mt-')
adataA.raw = adataA
adataA = adataA[:, np.logical_and(adataA.var.highly_variable, np.logical_not(adataA.var.mt))].copy()

# ---------------------------------------------------------------------------
# 3. Pair the mouse cells with each dataset, embed, cluster and save.
# ---------------------------------------------------------------------------
for i,dAdata in enumerate(dAdatas):
    adataMo = adataM.raw.to_adata()
    adataMo.layers["counts"] = mouseRawCounts

    adataDa = dAdata.raw.to_adata()
    adataDa.layers["counts"] = dRawCounts[i]
    adata = ad.concat([adataMo,adataDa], join="outer", fill_value=0)

    sc.pp.highly_variable_genes(adata, batch_key=batchKey, flavor="seurat", n_top_genes=2500)
    adata.var['mt'] = adata.var_names.str.startswith('mt-')
    adata.raw = adata
    adata = adata[:, np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))].copy()
    sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=0.2)
    sc.pl.umap(adata, color=["leiden",cellTypeKey,batchKey,"species"],ncols=4)
    # The file name uses the part of the label before the first "_".
    labelName = datasetLabels[i]
    if("_" in labelName):
        labelName = labelName.split("_")[0]
    adata.write(f"write/pdacAdatas/PdacMouseHuman{labelName}.h5ad")
    # The per-dataset object is replaced by the paired (mouse + dataset) one.
    dAdatas[i] = adata
    print(adata)

In [None]:
# ---------------------------------------------------------------------------
# 1. Cluster and score the cells of every label in `patientLabels` on their own.
# ---------------------------------------------------------------------------
pAdatas = np.empty(len(patientLabels),dtype="object")
pRawCounts = np.empty(len(patientLabels),dtype="object")

# Read the merged file once rather than once per label; every iteration works
# on its own copy of a subset.
mergedQC = sc.read_h5ad(mergedAdataFileQC)

for i,plabel in enumerate(patientLabels):
    # .copy() so the in-place steps below do not act on a view of mergedQC.
    adata = mergedQC[mergedQC.obs["label"] == plabel].copy()
    sc.pp.highly_variable_genes(adata, batch_key=batchKey, flavor="seurat", n_top_genes=2500)
    adata.var['mt'] = adata.var_names.str.startswith('mt-')
    # Full gene set goes to .raw; the counts layer is kept separately and
    # re-attached by hand after .raw.to_adata() below.
    adata.raw = adata
    pRawCounts[i] = adata.layers["counts"]

    adata = adata[:, np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))].copy()
    sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=0.2)

    scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(adata, sigGenes, labelOfSigGenes= sigGenes.columns, ogLabel="leiden", translate=False)
    gmmClass = scH.gmmScoreGeneSig(adata.obs[scoreNames],show=False)
    # "Negative" is suffixed with the label so it stays distinct per label.
    adata.obs[cellTypeKey] = [ f"{x}_{plabel}" if(x == "Negative") else x for x in gmmClass]

    sc.pl.umap(adata, color=["leiden",cellTypeKey,batchKey,"species"]+scoreNames,ncols=4)
    pAdatas[i] = adata

# ---------------------------------------------------------------------------
# 2. Pair the mouse cells with each label's cells, embed, cluster and save.
# ---------------------------------------------------------------------------
for i,pAdata in enumerate(pAdatas):
    adataMo = adataM.raw.to_adata()
    adataMo.layers["counts"] = mouseRawCounts

    adataPa = pAdata.raw.to_adata()
    adataPa.layers["counts"] = pRawCounts[i]
    adata = ad.concat([adataMo,adataPa], join="outer", fill_value=0)

    sc.pp.highly_variable_genes(adata, batch_key=batchKey, flavor="seurat", n_top_genes=2500)
    adata.var['mt'] = adata.var_names.str.startswith('mt-')
    adata.raw = adata
    adata = adata[:, np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))].copy()
    sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=0.2)
    sc.pl.umap(adata, color=["leiden",cellTypeKey,batchKey,"species"],ncols=4)
    # The file name uses the part of the label before the first "_".
    labelName = patientLabels[i]
    if("_" in labelName):
        labelName = labelName.split("_")[0]
    adata.write(f"write/pdacAdatas/PdacMouseHuman{labelName}.h5ad")
    # The per-label object is replaced by the paired (mouse + label) one.
    pAdatas[i] = adata
    print(adata)

In [None]:
# Save the combined (mouse + all datasets) object.
adataA.write("write/pdacAdatas/PdacMouseHumanALL.h5ad")

In [None]:
# Print a CSV table with one row per saved object. Every row shares the same
# settings and differs only in the file name.
print("filename,batchSize,numEpoch,learningRate,inLayerDims,lastLayer,inDiscLayer,reconCoef,klCoef,discCoef,batchName,cellName,res")
sharedSettings = f"128,128,1e-5,1024-128,10,5,15,1e-1,1,{batchKey},{cellTypeKey},4e-1"
# Row order: the combined object, then the datasets, then the individual labels.
for runLabel in ["ALL"] + datasetLabels + patientLabels:
    print(f"PdacMouseHuman{runLabel},{sharedSettings}")