In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.io
import scanpy.external as scex
import sklearn.metrics
import seaborn as sns
from matplotlib import pyplot as plt

import sys
sys.path.insert(1, '../mhCompTiss/')

import scanpyHelpers as scH

In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

Set up scanpy output properties (logging verbosity and figure defaults)

In [None]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
%matplotlib inline

Set scanpy output files

In [None]:
# Output locations for this notebook; every .h5ad path is built from writeDir + fileName.
writeDir = "write/"

fileName = "pdacMouseVeh"

resultsFile = writeDir + fileName + '.h5ad'       # final output: written after the tumor-only re-clustering and again after labelling
resultsFileTotal = writeDir + fileName + 'wImm.h5ad'       # all clustered cells, written before the tumor-only subsetting
resultsFileQC = writeDir + fileName + '_QC.h5ad'  # post QC (pre-analysis); read as this notebook's input

Set figure parameters

In [None]:
sc.set_figure_params(scanpy=True, dpi=100, dpi_save=150, fontsize=10, format='png')
sc.settings.figdir = "figures/" + fileName + "/"
figName = fileName

In [None]:
adata = sc.read_h5ad(resultsFileQC)
adata

In [None]:
# Histogram (log-scaled counts) of every marker column of adata.obs listed in geneMark.
geneMark = ['BFP', 'CLuc', 'CreER', 'DTR', 'EGFP', 'GLuc', 'luciferase', 'mScarlet', 'tdTomato']
plotDim = 5

# True ceiling division so every marker gets a panel for any list length;
# squeeze=False keeps axs two-dimensional even when there is only one row.
nRows = max(1, -(-len(geneMark) // plotDim))
fig, axs = plt.subplots(nRows, plotDim, figsize=(10, 5), squeeze=False)

# axs.flat walks the grid row by row, i.e. panel i sits at (i // plotDim, i % plotDim).
for ax, gene in zip(axs.flat, geneMark):
    ax.hist(adata.obs[gene], bins=100, log=True)
    ax.set_title(gene)

# Hide the panels left over when len(geneMark) is not a multiple of plotDim.
for ax in axs.flat[len(geneMark):]:
    ax.set_visible(False)

In [None]:
adata.obs.GMM.cat.categories

In [None]:
#sc.pp.normalize_total(adata,target_sum=1e4)

In [None]:
#sc.pp.log1p(adata)

In [None]:
adata.uns['log1p']['base'] = None

In [None]:
sc.pp.highly_variable_genes(adata)#, flavor = "seurat", n_top_genes=2000)

In [None]:
# Cut-offs for highly-variable-gene selection; the dashed lines below show where
# they fall on the log-scaled distributions computed by the previous
# sc.pp.highly_variable_genes call.
minMean = 0.05
maxMean = 2.8
minDisp = 0.7

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Filter the Series itself so values at or below the floor are dropped outright;
# masking the one-column DataFrame would keep them as NaN and feed NaN to hist().
means = adata.var["means"]
means = means[means > np.exp(-14)]
axs[0].hist(np.log(means), bins=100)
axs[0].axvline(np.log(minMean), color='k', linestyle='dashed', linewidth=1)
axs[0].axvline(np.log(maxMean), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
axs[0].set_xlabel('log(means)')  # the plotted values are log-transformed
axs[0].set_ylabel('counts')

# Non-positive (and NaN) normalised dispersions fail the comparison and are
# excluded, which also keeps np.log away from invalid input.
dispNorm = adata.var["dispersions_norm"]
dispNorm = dispNorm[dispNorm > np.exp(-5)]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('log(dispersions)')  # the plotted values are log-transformed
axs[1].set_ylabel('counts')

# Re-run the selection with the chosen cut-offs and report how many genes pass.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean)
print(sum(adata.var.highly_variable))

In [None]:
sc.pl.highly_variable_genes(adata)

In [None]:
#for inGene in adata.var[-8:].index.tolist():
#    adata.var.loc[inGene,"highly_variable"] = False

In [None]:
adata.var['mt'] = adata.var_names.str.startswith('mt-') 

In [None]:
adata.raw = adata

In [None]:
#adata = adata[:, adata.var.highly_variable]
# Keep genes that are highly variable and not flagged as mitochondrial.
# .copy() turns the slice into a real AnnData: the following steps (PCA,
# neighbors, UMAP, leiden, new obs columns) all write into the object, and
# writing into a view forces an implicit copy with a warning.
keepGenes = np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))
adata = adata[:, keepGenes].copy()

In [None]:
#sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
#sc.pp.scale(adata, max_value=10)

In [None]:
sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')

In [None]:
sc.pl.pca_variance_ratio(adata, n_pcs = 100, log=True)

In [None]:
sc.pl.pca_variance_ratio(adata, n_pcs = 100)

In [None]:
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=50)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.tl.leiden(adata, resolution=0.15)

In [None]:
sc.pl.umap(adata, color=["leiden","Classification","Epcam","Ptprc","Lum","Dcn","Vim"],ncols=4)

In [None]:
sc.pl.pca(adata, color=["leiden","Classification","Epcam","Ptprc","Lum","Dcn","Vim"],ncols=4)

In [None]:
sc.pl.umap(adata, color=['CreER', 'DTR', 'EGFP', 'mScarlet', 'tdTomato'],ncols=3, cmap="bwr")

In [None]:
# Turn the reporter columns into +/- labels and flag the leiden clusters that are
# treated as tumor (an earlier choice of clusters was '2', "8", "9").
tdTomatoHigh = adata.obs.tdTomato > 0.5
egfpHigh = adata.obs.EGFP > 5
inTumorCluster = adata.obs.leiden.isin(['0', "3"])

adata.obs["tdT+"] = np.where(tdTomatoHigh, "tdT+", "tdT-").tolist()
adata.obs["EGFP+"] = np.where(egfpHigh, "EGFP+", "EGFP-").tolist()
adata.obs["tumor"] = np.where(inTumorCluster, "tum", "other").tolist()

In [None]:
sc.pl.umap(adata, color=["tumor","tdTomato","Ptprc","tdT+"],ncols=4)

In [None]:
sum(adata.obs.tdTomato>0)

In [None]:
sum(adata.obs.tumor=="tum")

In [None]:
sc.pl.pca(adata, color=["tdTomato","Ptprc","Dcn","tumor","tdT+"],ncols=3)

In [None]:
adata.write(resultsFileTotal)
#adata = sc.read(resultsFileTotal)


In [None]:
# Restrict to the cells labelled "tum" above, then rebuild an AnnData from .raw.
# .raw was assigned before the highly-variable-gene subsetting, so the second
# pass below starts again from the un-subsetted gene set.
adata = adata[adata.obs.tumor=="tum"]
#adata = adata[adata.obs["tdT+"]=="tdT+"]
adata = adata.raw.to_adata()

In [None]:
sc.pp.highly_variable_genes(adata)#, flavor = "seurat", n_top_genes=2000)

In [None]:
# Cut-offs for highly-variable-gene selection on the tumor-only subset; the dashed
# lines below show where they fall on the log-scaled distributions computed by
# the previous sc.pp.highly_variable_genes call.
minMean = 0.05
maxMean = 2.7
minDisp = 0.5

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Filter the Series itself so values at or below the floor are dropped outright;
# masking the one-column DataFrame would keep them as NaN and feed NaN to hist().
means = adata.var["means"]
means = means[means > np.exp(-14)]
axs[0].hist(np.log(means), bins=100)
axs[0].axvline(np.log(minMean), color='k', linestyle='dashed', linewidth=1)
axs[0].axvline(np.log(maxMean), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
axs[0].set_xlabel('log(means)')  # the plotted values are log-transformed
axs[0].set_ylabel('counts')

# Non-positive (and NaN) normalised dispersions fail the comparison and are
# excluded, which also keeps np.log away from invalid input.
dispNorm = adata.var["dispersions_norm"]
dispNorm = dispNorm[dispNorm > np.exp(-5)]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('log(dispersions)')  # the plotted values are log-transformed
axs[1].set_ylabel('counts')

# Re-run the selection with the chosen cut-offs and report how many genes pass.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean)
print(sum(adata.var.highly_variable))


In [None]:
sc.pl.highly_variable_genes(adata)

In [None]:
#for inGene in adata.var[-8:].index.tolist():
#    adata.var.loc[inGene,"highly_variable"] = False

In [None]:
adata.var['mt'] = adata.var_names.str.startswith('mt-') 

In [None]:
adata.raw = adata

In [None]:
#adata = adata[:, adata.var.highly_variable]
# Keep genes that are highly variable and not flagged as mitochondrial.
# .copy() turns the slice into a real AnnData: the following steps (PCA,
# neighbors, UMAP, leiden) all write into the object, and writing into a view
# forces an implicit copy with a warning.
keepGenes = np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))
adata = adata[:, keepGenes].copy()

In [None]:
#sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
#sc.pp.scale(adata, max_value=10)

In [None]:
sc.tl.pca(adata, n_comps = 200, svd_solver='arpack')

In [None]:
sc.pl.pca_variance_ratio(adata, n_pcs = 200, log=True)

In [None]:
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=["leiden","GMM"],ncols=3)

In [None]:
sc.pl.pca(adata, color=["leiden","GMM"],ncols=3, dimensions= [(0, 1)],annotate_var_explained=True)

In [None]:
sc.pl.pca(adata, color=["leiden","GMM"],ncols=3, dimensions= [(2, 1)],annotate_var_explained=True)

In [None]:
sc.pl.umap(adata, color=['EGFP+', 'tdT+','EGFP', 'tdTomato',"DTR"],ncols=2)#, cmap="bwr")

In [None]:
sc.pl.pca(adata, color=['EGFP+', 'tdT+',"leiden","GMM"],ncols=3)

In [None]:
sc.pl.umap(adata, color=["n_genes","pct_counts_mt","total_counts"])

In [None]:
#sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', use_raw=False)
#sc.tl.filter_rank_genes_groups(adata, groupby="leiden", use_raw=False,
#                                   key_added='rank_genes_groups_filtered', 
#                                   min_in_group_fraction=0.25, min_fold_change=1, max_out_group_fraction=0.5, compare_abs=False)
#sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
#pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(15)

In [None]:
#sc.tl.dendrogram(adata,groupby="leiden", n_pcs=50)
#sc.pl.rank_genes_groups_dotplot(adata,n_genes=3)#,key="rank_genes_groups_filtered")

In [None]:
resultsFile

In [None]:
adata

In [None]:
adata.write(resultsFile)
adata

In [None]:
def plotScores(scoreMat, figCols = 4, cutoff = 1.5):
    """Plot a histogram of the z-scored values of every column of ``scoreMat``.

    Parameters
    ----------
    scoreMat : pandas.DataFrame
        One column per score; each column is standardised independently
        (mean subtracted, divided by its standard deviation) before plotting.
    figCols : int
        Number of panels per row of the figure grid.
    cutoff : float
        Position (in z-score units) of the dashed vertical reference line
        drawn on every panel.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    scoreCols = list(scoreMat.columns)
    # True ceiling division: enough rows for every column, for any figCols.
    nRows = max(1, -(-len(scoreCols) // figCols))
    # squeeze=False keeps axs two-dimensional even for a single row or column,
    # so the grid can always be walked the same way.
    fig, axs = plt.subplots(nRows, figCols, figsize=(10, 10), squeeze=False)

    # axs.flat walks the grid row by row: panel i sits at (i // figCols, i % figCols).
    for ax, scoreCol in zip(axs.flat, scoreCols):
        scoreData = scoreMat[scoreCol]
        ax.hist(((scoreData-np.mean(scoreData))/np.std(scoreData)), bins=100)
        ax.axvline(cutoff, color='k', linestyle='dashed', linewidth=1)
        ax.set_title(scoreCol)

    # Hide panels that have no score to show.
    for ax in axs.flat[len(scoreCols):]:
        ax.set_visible(False)

    fig.show()
    
def scoreGeneSig(adata, geneSig, translate = False, toGenes=None, fromGenes=None):
    """Score every gene signature in ``geneSig`` on ``adata`` with ``sc.tl.score_genes``.

    For each column ``<name>`` of ``geneSig`` the score is stored under
    ``"<name>Score"`` (the ``score_name`` passed to ``sc.tl.score_genes``).

    Parameters
    ----------
    adata : AnnData
        Object handed to ``sc.tl.score_genes``; modified in place by that call.
    geneSig : pandas.DataFrame
        One signature per column; missing entries (NaN) are ignored.
    translate : bool
        If True, replace each gene found in ``fromGenes`` by the entry of
        ``toGenes`` at the same position. Genes without a match are kept as is.
    toGenes, fromGenes : sequence, optional
        Parallel sequences defining the translation. When a gene occurs more
        than once in ``fromGenes`` the first occurrence wins.

    Raises
    ------
    ValueError
        If ``translate`` is True but ``toGenes`` or ``fromGenes`` is missing.
    """
    geneMap = None
    if translate:
        if toGenes is None or fromGenes is None:
            raise ValueError("translate=True requires both fromGenes and toGenes")
        # Build the lookup once instead of scanning fromGenes for every gene;
        # setdefault keeps the first match, like taking the first np.where hit.
        geneMap = {}
        for srcGene, dstGene in zip(fromGenes, toGenes):
            geneMap.setdefault(srcGene, dstGene)

    for j, sigName in enumerate(geneSig.columns):
        # Work on a plain list: after dropna() the Series index can have gaps,
        # so assigning by integer would address labels, not positions.
        clustGenes = geneSig.iloc[:, j].dropna().tolist()
        if geneMap is not None:
            clustGenes = [geneMap.get(gene, gene) for gene in clustGenes]
        sc.tl.score_genes(adata, clustGenes, score_name=f"{sigName}Score")

def getNewLabels(adata, ogLabels, scoreNames, labelDict, ogLabel="leiden"):
    """Assign each cluster the label of the score it is most enriched for.

    For every score column the per-cluster fraction of cells whose score lies
    above the mean of that score over all cells is computed (rounded to four
    decimals). Each cluster then takes the label of the score with the largest
    fraction, or ``"inter"`` when no cell of the cluster is above the mean for
    any score. The per-cell result is written to ``adata.obs["cellState"]``.

    Parameters
    ----------
    adata : AnnData
        Only ``adata.obs`` is read; ``adata.obs["cellState"]`` is (over)written.
    ogLabels : sequence
        Cluster labels to evaluate, e.g. the categories of ``adata.obs[ogLabel]``.
    scoreNames : sequence of str
        Names of the score columns in ``adata.obs``.
    labelDict : dict
        Maps each score name to the label assigned when that score wins.
    ogLabel : str
        Name of the ``adata.obs`` column holding the cluster assignment.
        Defaults to ``"leiden"``, the column that used to be hard-coded.

    Returns
    -------
    (list, pandas.DataFrame)
        The new label of every entry of ``ogLabels`` (same order) and the
        cluster-by-score table of fractions.
    """
    ogLabels = list(ogLabels)
    scoreNames = list(scoreNames)
    ogLabelScoreMe = pd.DataFrame(np.zeros((len(ogLabels),len(scoreNames))),index=ogLabels, columns=scoreNames)

    # One boolean mask per cluster, computed once and reused for every score.
    clusterOfCell = adata.obs[ogLabel]
    clusterMasks = {leid: (clusterOfCell == leid).to_numpy() for leid in ogLabels}

    for score in scoreNames:
        scorMe = np.mean(adata.obs[score])
        print(f"\nscore: {scorMe}\n")
        aboveMean = (adata.obs[score] > scorMe).to_numpy()
        for leid, inCluster in clusterMasks.items():
            # A label without cells (e.g. an unused category) keeps fraction 0
            # instead of triggering a division by zero.
            if not inCluster.any():
                continue
            ogLabelScoreMe.loc[leid,score] = np.round(aboveMean[inCluster].mean(),decimals=4)

    print(ogLabelScoreMe)

    newBClabel = []
    for leid in ogLabels:
        fractions = ogLabelScoreMe.loc[leid,:].to_numpy()
        # Fractions are never negative, so "> 0" is the explicit form of the
        # former truthiness test on the row maximum.
        if fractions.size and np.max(fractions) > 0:
            newBClabel.append(labelDict[scoreNames[int(np.argmax(fractions))]])
        else:
            newBClabel.append("inter")

    # Map by label rather than by int(label): works for any label values and
    # does not rely on labels being consecutive integers in ogLabels order.
    labelOfCluster = dict(zip(ogLabels, newBClabel))
    adata.obs["cellState"] = [labelOfCluster[lab] for lab in adata.obs[ogLabel]]
    return(newBClabel, ogLabelScoreMe)


def scoreAndLabel(adata, sigGenes, labelOfSigGenes, ogLabel="leiden",translate = False, toGenes=None, fromGenes=None):
    """Score the signatures in ``sigGenes`` and label the clusters of ``ogLabel``.

    Calls ``scoreGeneSig`` to add one ``"<name>Score"`` column per signature,
    then ``getNewLabels`` to derive a label per cluster and the per-cell
    ``adata.obs["cellState"]`` column.

    Parameters
    ----------
    adata : AnnData
        Modified in place by both steps.
    sigGenes : pandas.DataFrame
        One gene signature per column.
    labelOfSigGenes : sequence of str
        Label for each signature, in the order of ``sigGenes.columns``.
    ogLabel : str
        Categorical ``adata.obs`` column holding the clusters to label.
    translate, toGenes, fromGenes
        Passed through to ``scoreGeneSig``.

    Returns
    -------
    (list, list, pandas.DataFrame)
        The score column names, the new label per cluster and the
        cluster-by-score table of fractions.

    Raises
    ------
    ValueError
        If ``labelOfSigGenes`` does not provide exactly one label per signature.
    """
    scoreNames = [f"{sigName}Score" for sigName in sigGenes.columns]
    # zip() would silently truncate on a mismatch and fail later with a KeyError.
    if len(labelOfSigGenes) != len(scoreNames):
        raise ValueError("labelOfSigGenes must provide exactly one label per column of sigGenes")
    scoreGeneSig(adata, sigGenes, translate = translate, toGenes=toGenes, fromGenes=fromGenes)
    ogLabels = adata.obs[ogLabel].cat.categories
    labelDict = dict(zip(scoreNames,labelOfSigGenes))
    newBClabel, ogLabelScoreMe = getNewLabels(adata, ogLabels, scoreNames, labelDict, ogLabel=ogLabel)
    return(scoreNames, newBClabel, ogLabelScoreMe)

In [None]:
adata = sc.read(resultsFile)

In [None]:
homology = pd.read_table("data/hgncHM_121.csv", sep=",")
mouseGenes = np.array(homology["mouse"])
humanGenes = np.array(homology["human"])
homology

In [None]:
sigGenes = pd.read_csv("data/PDAC/basClaSigGenes.csv")
sigGenes = sigGenes[["scBasal","scClassical","EMT"]]

In [None]:
#set(sigGenes["basal"]).intersection(set(sigGenes["scBasal"]))

In [None]:
#set(sigGenes["classical"]).intersection(set(sigGenes["scClassical"]))

In [None]:
#set(sigGenes["scBasal"]).intersection(set(sigGenes["EMT"]))

In [None]:
#set(sigGenes["scClassical"]).intersection(set(sigGenes["EMT"]))

In [None]:
scoreNames, newBClabel, ogLabelScoreMe = scoreAndLabel(adata, sigGenes, 
#                                                            labelOfSigGenes= ["basal","basal","classical","classical","EMT"],
                                                            labelOfSigGenes= ["basal","classical","EMT"],
                                                            ogLabel="leiden",
                                                            translate=True, fromGenes=humanGenes, toGenes=mouseGenes)

sc.pl.umap(adata, color=scoreNames+["cellState"], ncols=4, cmap="bwr")


In [None]:
sc.pl.umap(adata, color=scoreNames+["cellState"],ncols=4,cmap="bwr")

In [None]:
sc.pl.pca(adata, color=scoreNames+["cellState"],ncols=4,cmap="bwr")

In [None]:
#sc.pl.heatmap(adata, scoreNames, groupby="treatment", log=True)

In [None]:
#sc.tl.leiden(adata, resolution=0.2)
sc.pl.umap(adata, color = ["leiden","cellState"], legend_loc="on data")

In [None]:
scoreMat = adata.obs[[f"{sigName}Score" for sigName in sigGenes.columns]]

plotScores(scoreMat,figCols = 2)

In [None]:
# Build the (gene, signature label) table handed to decoupler as its network.
classLabDict = dict(zip(sigGenes.columns,["basal","classical","EMT"]))#"basal","classical",

# First-match lookup from humanGenes to mouseGenes, built once instead of
# scanning the homology table for every gene.
humanToMouse = {}
for humanGene, mouseGene in zip(humanGenes, mouseGenes):
    humanToMouse.setdefault(humanGene, mouseGene)

# One row per non-missing signature entry. Collecting records first removes the
# fixed 200-rows-per-signature buffer, which overwrote rows for longer tables.
# Genes without an entry in humanGenes are kept untranslated.
markerRows = [
    (humanToMouse.get(gene, gene), classLabDict[clust])
    for clust in sigGenes.columns
    for gene in sigGenes[clust].dropna()
]
markers = pd.DataFrame(markerRows, columns=["genes", "clustName"])

# Drops rows whose translation is itself missing in the homology table.
markers = markers.dropna()
markers

In [None]:
markers = markers.drop_duplicates()
markers

In [None]:
adata

In [None]:
import decoupler as dc

In [None]:
dc.run_ora(
    mat=adata,
    net=markers,
    source='clustName',
    target='genes',
    min_n=3,
    verbose=True
)

In [None]:
# Pull the ORA estimates stored under obsm['ora_estimate'] into their own object.
acts = dc.get_acts(adata, obsm_key='ora_estimate')

# Replace every non-finite entry of acts.X with the largest finite value observed
# (the original note attributes the infinite entries to p-values of 0).
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
# ~np.isfinite matches NaN as well as +/-inf, so all of them are overwritten here.
acts.X[~np.isfinite(acts.X)] = max_e

acts

In [None]:
# Colour by the ORA activities themselves (the variables of acts). The names in
# scoreNames are obs columns holding the sc.tl.score_genes scores, so using them
# here would only repeat the score plots made from adata.
sc.pl.umap(acts, color=list(acts.var_names)+['leiden'], cmap='RdBu_r')

In [None]:
# Show the ORA activities per cluster (the variables of acts). The names in
# scoreNames are obs columns holding the sc.tl.score_genes scores, which are
# plotted from adata further down.
sc.pl.violin(acts, keys=list(acts.var_names), groupby='leiden')

In [None]:
df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')
df

In [None]:
n_ctypes = 3
ctypes_dict = df.groupby('group').head(n_ctypes).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
ctypes_dict

In [None]:
#sc.pl.matrixplot(acts, ctypes_dict, 'leiden', dendrogram=True, standard_scale='var',colorbar_title='Z-scaled scores', cmap='RdBu_r')

In [None]:
annotation_dict = df.groupby('group').head(1).set_index('group')['names'].to_dict()
annotation_dict

In [None]:
# Add cell type column based on annotation
adata.obs['decoupler'] = [annotation_dict[clust] for clust in adata.obs['leiden']]

# Visualize
sc.pl.umap(adata, color=['decoupler','cellState', "leiden"])

In [None]:
sc.pl.pca(adata, color=["leiden","GMM","cellState"],ncols=4)

In [None]:
scH.addIndvLabel(adata, [f"{sigName}Score" for sigName in sigGenes.columns], obsLabel="zsig", cutoff=-5)

In [None]:
sc.pl.umap(adata, color=["leiden","zsig"])#, legend_loc="on data")

In [None]:
sc.pl.violin(adata, keys=scoreNames, groupby='leiden')

In [None]:
adata.write(resultsFile)
adata

In [None]:
resultsFile

In [None]:
#adata.obs.treatment

In [None]:
#ax=sc.pl.umap(adata,color=['treatment'],groups=['mrtx'], show=False)

# We can change the 'NA' in the legend that represents all cells outside of the
# specified groups
#legend_texts=ax.get_legend().get_texts()
# Find legend object whose text is "NA" and change it
#for legend_text in legend_texts:
#    if legend_text.get_text()=="NA":
#        legend_text.set_text('mtrx')

In [None]:
#sc.pl.umap(acts, color=scoreNames+['leiden'], cmap='RdBu_r')
sc.pl.pca(adata, color=scoreNames+["leiden","GMM","cellState"],ncols=4)

In [None]:
sc.pl.umap(adata, color=scoreNames+["leiden","cellState","tdT+","Classification"],ncols=4)

In [None]:
sc.pl.umap(adata, color=["leiden","cellState","tdT+","EGFP+"],ncols=4)

In [None]:
sc.pl.pca(adata, color=["leiden","cellState","tdT+","EGFP+"],ncols=4)

In [None]:
sc.pl.umap(adata, color=scoreNames+["cellState"],ncols=3,cmap="bwr")

In [None]:
sc.pl.pca(adata, color=scoreNames+["cellState"],ncols=3,cmap="bwr")

Drug-treated samples should have EGFP only in the classical cells.

In [None]:
# Per-cell colours for the scatter plot below. scanpy keeps uns["treatment_colors"]
# in the order of the categories of obs["treatment"], so derive each treatment's
# position from the categories instead of hard-coding it.
# NOTE(review): uns["treatment_colors"] is not created anywhere in this notebook;
# it must already be present in the loaded file - confirm.
treatCategories = adata.obs["treatment"].astype("category").cat.categories
treatColor = {treat: pos for pos, treat in enumerate(treatCategories)}
treatColors = [adata.uns["treatment_colors"][treatColor[i]] for i in adata.obs["treatment"]]

In [None]:
plt.scatter(adata.obs["scClassicalScore"],adata.obs["scBasalScore"], c=treatColors)

In [None]:
sc.pl.violin(adata, keys=scoreNames, groupby="treatment")

In [None]:
resultsFile

In [None]:
adata.write(resultsFile)

In [None]:
adata = sc.read(resultsFile)

In [None]:
# Export a slim, labelled object: the .raw matrix of adata plus the matrix of the
# QC file as a "counts" layer, keeping only the cluster and cell-state columns.
qcdata = sc.read_h5ad(resultsFileQC)
# Per-gene column sums as a quick look at the QC matrix; summing on the matrix
# avoids densifying it and adding it up row by row in Python.
print(qcdata.X.sum(axis=0))

rdata = adata.raw.to_adata()

# Align the QC matrix to rdata on cells AND genes so the layer cannot end up
# misaligned if the gene order differs.
# NOTE(review): uns['log1p'] is edited right after the QC file is loaded above,
# which suggests its X is already log-transformed - confirm it really holds counts.
qcdata = qcdata[rdata.obs_names, rdata.var_names]
rdata.layers["counts"] = qcdata.X.copy()
rdata.obs["cs"] = rdata.obs.cellState
rdata.obs = rdata.obs[['leiden','cs']]
# Same location as before ('write/pdacMouseVeh_Labeled.h5ad'), now built from the
# notebook's path settings.
rdata.write(writeDir + fileName + '_Labeled.h5ad')
rdata

In [None]:
adata