In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from matplotlib import pyplot as plt
import scipy as si
import scanpyHelpers as scH

# Input data

In [None]:
# Output directory and tissue prefix used to build every file name below.
writeDir="write/"
tissue="luad"

# Each dataset key maps to {"species": ..., "adataName": <path prefix>}.
# getAdata() appends "_QC.h5ad" / ".h5ad" to the prefix.
dictLab = ["species","adataName"]
dataDicts = {
    "mou":dict(zip(dictLab, ["mouse",f'{writeDir}/{tissue}Mouse_shKras'])),
    #"mLg":dict(zip(dictLab, ["mouse",f'{writeDir}/lungMouse'])),
    "pdx":dict(zip(dictLab, ["human",f'{writeDir}/{tissue}Pdx'])),
    "dos":dict(zip(dictLab, ["human",f'{writeDir}/{tissue}Dost'])),
    "bis":dict(zip(dictLab, ["human",f'{writeDir}/{tissue}Bischoff'])),
    "kim":dict(zip(dictLab, ["human",f'{writeDir}/{tissue}Kim'])),
    "mda":dict(zip(dictLab, ["human",f'{writeDir}/{tissue}MDAkras'])),
    #"hLg":dict(zip(dictLab, ["human",f'{writeDir}/lungHuman'])),
}  # the closing brace used to live inside the commented-out "hLg" line, leaving the dict unclosed

# Destinations for the merged (all datasets) object.
mergedAdataFileQC = f'{writeDir}/{tissue}MousePdxDostBischoffKimMda_QC.h5ad'
mergedAdataFile = f'{writeDir}/{tissue}MousePdxDostBischoffKimMda.h5ad'

# CSV with (at least) "human" and "mouse" gene-symbol columns.
homologyFile = "data/hgncHM_121.csv"

## signature and homology

In [None]:
# Read the comma-separated homolog table; later cells use its
# "human" and "mouse" columns.
homology = pd.read_csv(homologyFile, sep=",")
homology

In [None]:
# Load the human signature table and build two translated copies:
#   sigGenesHomolo - "<mouse>/<human>" for every gene with a homolog entry
#   sigGenesM      - the mouse symbol alone for every gene with a homolog entry
# Genes without an entry in homology["human"] (and empty cells) are left as-is.
sigFile = "data/LUAD/humanLUADgeneSig.csv"
sigGenes = pd.read_csv(sigFile)
sigGenesHomolo=sigGenes.copy()
sigGenesM=sigGenes.copy()

# One-pass human -> mouse lookup. setdefault keeps the FIRST row for a repeated
# human symbol, matching what list.index() returned in the previous
# implementation, without a linear scan per signature gene.
humanToMouse = {}
for humanGene, mouseGene in zip(homology["human"], homology["mouse"]):
    humanToMouse.setdefault(humanGene, mouseGene)

for i in sigGenes.index:
    for j in sigGenes.columns:
        gene = sigGenes.loc[i,j]
        # Only string symbols can be translated; this also skips NaN padding
        # in ragged signature columns.
        if isinstance(gene, str) and gene in humanToMouse:
            mouseGene = humanToMouse[gene]
            sigGenesHomolo.loc[i,j] = f'{mouseGene}/{gene}'
            sigGenesM.loc[i,j] = f'{mouseGene}'
sigGenesHomolo

In [None]:
# Display the mouse-symbol version of the signature table.
sigGenesM

## helper functions

In [None]:
def getAdata(adataFile, neededObs):
    """Load one dataset's QC AnnData and attach annotations from its processed twin.

    Parameters
    ----------
    adataFile : str
        Path prefix; ``<adataFile>_QC.h5ad`` and ``<adataFile>.h5ad`` are read.
    neededObs : list of str
        ``obs`` columns to copy from the processed file onto the QC object.

    Returns
    -------
    AnnData
        The QC object restricted to cells whose names also occur in the
        processed file, with the ``neededObs`` columns added and ``X``
        duplicated into ``layers["counts"]``.
    """
    adata = sc.read_h5ad(adataFile+"_QC.h5ad")
    adataPost = sc.read_h5ad(adataFile+".h5ad")
    # Keep the QC file's own cell order. The previous set-intersection produced
    # an order that depends on string hashing and therefore changed between
    # kernel restarts, making every downstream step non-reproducible.
    keep = adata.obs_names.isin(adataPost.obs_names)
    # .copy() materialises the subset so the obs/layers assignments below
    # write to a real object rather than to a view.
    adata = adata[keep,:].copy()
    # Column assignment aligns on the obs index (cell names).
    adata.obs[neededObs] = adataPost.obs[neededObs].copy()
    adata.layers["counts"] = adata.X.copy()
    return adata

def getNormAdata(adataFile, neededObs):
    """Return getAdata()'s result, log-normalised to 1e4 counts per cell if needed.

    The first cell is used as a probe: if ``sum(exp(X[0]) - 1)`` is
    (approximately) 1e4, ``X`` is treated as already normalised and
    log1p-transformed and is left untouched. Otherwise ``normalize_total``
    and ``log1p`` are applied in place.

    Parameters
    ----------
    adataFile : str
        Path prefix forwarded to ``getAdata``.
    neededObs : list of str
        ``obs`` columns forwarded to ``getAdata``.

    Returns
    -------
    AnnData
    """
    adata = getAdata(adataFile, neededObs)

    firstRow = adata.X[0,:]
    # Sparse rows expose toarray(); dense rows are used directly. This replaces
    # a bare `except:` that doubled as the sparse/dense switch and would have
    # hidden any unrelated error.
    if hasattr(firstRow, "toarray"):
        firstRow = firstRow.toarray()
    normSum = float(np.expm1(np.asarray(firstRow, dtype=np.float64)).sum())

    # Compare with a tolerance: after a float32 log1p/exp round trip the sum is
    # essentially never *exactly* 1e4, so the old `!=` test re-normalised data
    # that was already normalised.
    if not np.isclose(normSum, 1e4, rtol=1e-3):
        print("norm")
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
    return adata

# read each file

In [None]:
# obs column names written into the training-config rows printed at the end
# of the notebook. batchKey was commented out but is still referenced there,
# which raised NameError; "dataset" is the label column created by ad.concat.
batchKey = "dataset"
cellTypeKey = "gmmClass"

In [None]:
# One signature table per dataset, in dataDicts order: mouse datasets get the
# mouse-symbol table, everything else the human one. Deriving this from the
# "species" field keeps the list in step with dataDicts when entries are
# added or re-enabled (a hand-written list silently went out of step).
sigGenesL = [sigGenesM if info["species"] == "mouse" else sigGenes
             for info in dataDicts.values()]
dataDicts.keys()

In [None]:
# Per-dataset processing. For every entry of dataDicts:
#   1. load + (if needed) normalise the data,
#   2. select highly variable genes, then run PCA / neighbours / UMAP / leiden,
#   3. score the signature genes and assign a GMM-based class per cell,
#   4. restore the full gene set and the saved "counts" layer, then collect it.
# geneSpecDict records each dataset's gene names and species for the homology
# overlap step that follows.
geneSpecDict = []
adatas = []
for i,datakey in enumerate(dataDicts.keys()):
    print(datakey)
    adata = getNormAdata(dataDicts[datakey]["adataName"], ["cellState"])
    adata.obs["cs"] = adata.obs.cellState.copy()
    geneSpecDict.append({"genes":np.array(adata.var_names.copy()),"species":dataDicts[datakey]["species"]})
    # Held aside because the HVG subset below drops most genes from layers.
    rawCounts = adata.layers["counts"]
    sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2500)
    # Snapshot the full-gene object so it can be restored after clustering.
    adata.raw = adata
    # .copy(): the tools below write into obsm/obsp/uns; doing that on a view
    # forces an implicit copy and a warning on every dataset.
    adata = adata[:, adata.var.highly_variable].copy()
    sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=0.2)

    # sigGenesL[i] is the species-appropriate signature table; the column
    # labels always come from the human table.
    scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(adata, sigGenesL[i], labelOfSigGenes= sigGenes.columns, ogLabel="leiden", translate=False)
    gmmClass = scH.gmmScoreGeneSig(adata.obs[scoreNames],show=False)
    # Suffix "Negative" with the dataset key so it stays dataset-specific
    # after the datasets are concatenated.
    adata.obs[cellTypeKey] = [ f"{x}_{datakey}" if(x == "Negative") else x for x in gmmClass]

    sc.pl.umap(adata, color=["leiden","cellState","cs",cellTypeKey]+scoreNames,ncols=4)

    # Back to the full gene set; obs annotations computed above are kept.
    adata = adata.raw.to_adata()
    adata.layers["counts"] = rawCounts
    adatas.append(adata)

geneSpecDict

In [None]:
# Reconcile gene names across datasets using the homology table.
# NOTE(review): the return value is only displayed, while the next cell reads
# geneSpecDict[...]["genes"] back into var_names - so this helper presumably
# rewrites those arrays in place. Confirm in scanpyHelpers.
scH.getOverlapGenesMulti(homology, geneSpecDict)

In [None]:
# Copy each dataset's gene-name array from geneSpecDict onto the AnnData at
# the same position in `adatas`.
for idx in range(len(geneSpecDict)):
    adatas[idx].var_names = geneSpecDict[idx]["genes"]

In [None]:
# Stack all datasets into one object. The union of genes is kept
# (join="outer") with absent entries filled with 0, and each cell's source
# key is recorded in obs["dataset"].
adata = ad.concat(
    adatas,
    join="outer",
    label="dataset",
    keys=dataDicts.keys(),
    fill_value=0,
)
# Cell names can repeat across datasets; make them unique.
adata.obs_names_make_unique()

In [None]:
# Display the summary of the concatenated object.
adata

In [None]:
# Build one patient/sample label per cell, by priority:
#   1. "Classification" present -> its first token, split on "-" if the value
#      contains one, otherwise on "_";
#   2. else "sample" present    -> the sample value;
#   3. else                     -> the dataset key.
# Columns are walked in parallel instead of doing one .loc row lookup per
# cell, which was very slow on a merged object.
obsFull = adata.obs
labels = np.empty(adata.n_obs, dtype="object")
for i, (sample, classif, label) in enumerate(
        zip(obsFull["sample"], obsFull["Classification"], obsFull["dataset"])):
    if(not pd.isna(classif)):
        spliton = "-" if "-" in classif else "_"
        label=classif.split(spliton)[0]
    elif(not pd.isna(sample)):
        label=sample
    labels[i] = label

# Keep only the shared annotation columns. .copy() so the column assignments
# below modify an independent frame rather than a slice of the old obs.
# Note: this drops "sample"/"Classification", so the cell cannot be re-run
# without re-creating `adata`.
adata.obs = obsFull[['dataset',"cellState","cs","gmmClass"]].copy()
adata.obs["species"] = ["mouse" if lab=="mou" else "human" for lab in adata.obs.dataset]
adata.obs["label"] = labels
adata.obs

In [None]:
# List the distinct per-cell labels produced above.
np.unique(labels)

In [None]:
# Cells still labelled "mda" get a label taken from their own cell name:
# the text before the first "_", then before the first "-".
labs = adata.obs["label"].values.copy()
for idx, (cellName, lab) in enumerate(zip(adata.obs_names, labs)):
    if lab != "mda":
        continue
    prefix = cellName.split("_")[0]
    labs[idx] = prefix.split("-")[0]

adata.obs["label"] = labs

In [None]:
# Save the merged, labelled object to the QC output path.
adata.write(mergedAdataFileQC)

In [None]:
# Human dataset keys, and the individual patient/sample labels.
datasetLabels = ["pdx", "dos", "bis", "kim", "mda"]
patientLabels = [
    'JHU55A', 'LX278', 'LX29', 'LX29A', 'LX369', 'LX55A',
    'P0006', 'P0008', 'P0018', 'P0019', 'P0020', 'P0025', 'P0028', 'P0030',
    'P0031', 'P0034', 'P10T', 'P14T', 'P2',
    'Ru210', 'Ru318', 'Ru465', 'Ru699', 'Ru890',
    'dos', 'mou',
    'p018_T', 'p019_T', 'p023_T', 'p024_T', 'p027_T', 'p030_T', 'p031_T',
    'p032_T', 'p033_T', 'p034_T',
]

# Print every label paired with the mouse key: datasets first, then patients.
for dlabel in datasetLabels:
    print(dlabel, "mou")
for plabel in patientLabels:
    print(plabel, "mou")

In [None]:
# Echo the path of the merged QC file.
mergedAdataFileQC

In [None]:
# Display the per-cell annotation table of the merged object.
adata.obs

In [None]:
# Read the merged file back from disk into a separate variable and show its
# summary (presumably a round-trip check of the write - `adataT` is not used
# again in the visible cells).
adataT = sc.read_h5ad(mergedAdataFileQC)
adataT

In [None]:
# Export the merged object and its subsets into write/luadAdatas:
#   mhLUAD_all   - every cell
#   mhLUAD_npdx  - everything except the pdx dataset
#   mhLUAD_<key> - one human dataset together with the mouse dataset
outDir = "write/luadAdatas"
# Create the target directory up front so a fresh checkout does not fail
# on the first write.
os.makedirs(outDir, exist_ok=True)

datasetCol = adata.obs.dataset
adata.write(f"{outDir}/mhLUAD_all.h5ad")
adata[np.logical_not(datasetCol=="pdx"),:].write(f"{outDir}/mhLUAD_npdx.h5ad")
# Same file order as the original hand-written sequence.
for humanKey in ["bis", "pdx", "dos", "kim", "mda"]:
    pairMask = np.logical_or(datasetCol==humanKey, datasetCol=="mou")
    adata[pairMask,:].write(f"{outDir}/mhLUAD_{humanKey}.h5ad")


In [None]:
# Write the same set of files (all cells, all-but-pdx, and each human
# dataset paired with mouse) into the scVAE resources directory.
dirLoc = "../scCompare/scVAE_ABC/resources"
sourceDataset = adata.obs.dataset

adata.write(f"{dirLoc}/mhLUAD_all.h5ad")

withoutPdx = np.logical_not(sourceDataset == "pdx")
adata[withoutPdx, :].write(f"{dirLoc}/mhLUAD_npdx.h5ad")

for pairedKey in ("bis", "pdx", "dos", "kim", "mda"):
    withMouse = np.logical_or(sourceDataset == pairedKey, sourceDataset == "mou")
    adata[withMouse, :].write(f"{dirLoc}/mhLUAD_{pairedKey}.h5ad")


In [None]:
# Show the category levels of the "dataset" column.
adata.obs.dataset.cat.categories


In [None]:
# Placeholder loop over the human dataset keys. Its body consisted only of
# comments, which Python rejects ("expected an indented block"); `pass` keeps
# the cell runnable until the plotting code is restored.
for i,labelName in enumerate(datasetLabels):
    # TODO: reload each per-dataset file and plot its UMAP. The path below
    # uses an older naming scheme (the export cells write mhLUAD_<key>.h5ad),
    # and assigning to `adata` here would overwrite the merged object.
    #adata = sc.read_h5ad(f"write/luadAdatas/LuadMouseHuman{labelName}.h5ad")
    #sc.pl.umap(adata, color=["cellState","gmmClass"])
    pass

In [None]:
from sklearn.metrics import confusion_matrix

# Count how the cellState annotation lines up with the GMM-derived class.
# cellState is passed as the first (reference) argument and gmmClass as the
# second. The result is an unlabelled array.
# NOTE(review): no explicit `labels=` is given, so row/column order is decided
# by sklearn - confirm the ordering before reading values off the matrix.
confusion_matrix(adata.obs.cellState, adata.obs.gmmClass)#, labels=[])

In [None]:
# Print a CSV run-configuration table: one header line, then one row per
# exported file. Only the file name varies between rows; the batch and
# cell-type columns come from batchKey / cellTypeKey.
afiles = ["mhLUAD_all","mhLUAD_npdx","mhLUAD_bis","mhLUAD_pdx","mhLUAD_dos","mhLUAD_kim","mhLUAD_mda"]
print("filename,batchSize,numEpoch,learningRate,inLayerDims,lastLayer,inDiscLayer,reconCoef,klCoef,discCoef,batchName,cellName,res")
rowTail = f"256,128,1e-3,1024-128,12,6,5,1e-1,10,{batchKey},{cellTypeKey},4e-1"
for af in afiles:
    print(f"{af},{rowTail}")