In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.io
import scanpy.external as scex
import sklearn.metrics
import seaborn as sns
from matplotlib import pyplot as plt


In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

Set up output properties

In [None]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
%matplotlib inline

Set scanpy output files

In [None]:
# Output directory and dataset prefix shared by the files this notebook uses.
writeDir = "write/"
fileName = "luadMouse_shKras"

# Final output of the analysis.
resultsFile = f"{writeDir}{fileName}.h5ad"
# Post-QC (pre-analysis) object.
resultsFileQC = f"{writeDir}{fileName}_QC.h5ad"

Set figure parameters

In [None]:
sc.set_figure_params(scanpy=True, dpi=100, dpi_save=150, fontsize=10, format='png')
sc.settings.figdir = "figures/" + fileName + "/"
figName = fileName

In [None]:
adata = sc.read(resultsFileQC)
adata

In [None]:
# Histograms of the two reporter read-outs stored in adata.obs. They are used
# to eyeball the positivity thresholds applied right after this cell.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

mKate2 = adata.obs["mKate2"]
shRNA = adata.obs["shRNA"]

# (title, values, x-axis label). shRNA is drawn as log(x + 1), the same scale
# its threshold is expressed on, and the label now says so.
panels = [
    ("mKate2", mKate2, "exp"),
    ("shRNA", np.log1p(shRNA), "log(exp + 1)"),
]
for ax, (title, values, xlabel) in zip(axs, panels):
    ax.hist(values, bins=100)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('counts')
    # Log y-axis so sparsely populated bins stay visible.
    ax.set_yscale('log')

# plt.show() instead of fig.show(): Figure.show() needs a GUI backend and only
# emits a warning under the inline notebook backend.
plt.show()


In [None]:
# Binary reporter calls, stored as the strings "1" / "0":
# mKate2 is thresholded on its stored scale (> 1), shRNA on log(x + 1) (> 2).
mKateCalls = np.where(adata.obs["mKate2"] > 1, "1", "0")
shRNACalls = np.where(np.log(adata.obs["shRNA"] + 1) > 2, "1", "0")
adata.obs["mKatePos"] = mKateCalls
adata.obs["shRNAPos"] = shRNACalls

In [None]:
sum(adata.obs["mKatePos"]=="1" ), sum(adata.obs["shRNAPos"]=="1")

In [None]:
#adata.obs["mKatePos"] =  [1 if v>0.5 else 0 for v in adata.obs["mKate2"] ]
adata = adata[np.logical_and(adata.obs.treatment=="shRen713",adata.obs.mKatePos=="1")]
adata

In [None]:
sc.pp.normalize_total(adata,target_sum=1e4)

In [None]:
sc.pp.log1p(adata)

In [None]:
sc.pp.highly_variable_genes(adata)#, flavor = "seurat", n_top_genes=2000)

In [None]:
# Cut-offs for highly-variable-gene selection. The dashed lines in the two
# histograms below show where they fall in the distributions computed by the
# preceding highly_variable_genes call.
minMean = 0.075
maxMean = 2.7
minDisp = 0.5

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Select the column as a Series so the boolean mask really filters rows.
# Masking the one-column DataFrame instead keeps every row and fills the
# rejected ones with NaN. The floor keeps the log finite.
means = adata.var["means"]
means = means[means > np.exp(-14)]
axs[0].hist(np.log(means), bins=100)
axs[0].axvline(np.log(minMean), color='k', linestyle='dashed', linewidth=1)
axs[0].axvline(np.log(maxMean), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
axs[0].set_xlabel('means')
axs[0].set_ylabel('counts')

dispNorm = adata.var["dispersions_norm"]
dispNorm = dispNorm[dispNorm > np.exp(-5)]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('dispersions')
axs[1].set_ylabel('counts')

# Re-run the selection with the chosen cut-offs and report how many genes pass.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean)#, batch_key="PDX")
print(int(adata.var.highly_variable.sum()))


In [None]:
sc.pl.highly_variable_genes(adata)

In [None]:
#for inGene in adata.var[-8:].index.tolist():
#    adata.var.loc[inGene,"highly_variable"] = False

In [None]:
#adata.var["mt"] = ["MT-" in g for g in adata.var_names]

In [None]:
adata.raw = adata

In [None]:
#adata = adata[:, adata.var.highly_variable]
# Keep genes that are highly variable and not flagged in adata.var.mt.
keepGenes = np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))
# .copy() materialises the subset so PCA / neighbours / UMAP / Leiden write
# their results into a real AnnData rather than into a view.
adata = adata[:, keepGenes].copy()

In [None]:
#sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
#sc.pp.scale(adata, max_value=10)

In [None]:
sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')

In [None]:
sc.pl.pca_variance_ratio(adata, n_pcs = 100, log=True)

In [None]:
sc.pl.pca_variance_ratio(adata, n_pcs = 100)

In [None]:
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=60)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.tl.leiden(adata, resolution=0.35)

In [None]:
#adata = sc.read_h5ad("write/mouseLUADshRen.h5ad")

In [None]:
sc.pl.umap(adata, color=["leiden","treatment","time","Ptprc","Epcam","Pecam1","Vim","Fbn1"],ncols=2)

In [None]:
sc.pl.umap(adata, color=["Slc4a11","Hopx","Scgb1a1","Cap1"],ncols=4)

In [None]:
sc.pl.pca(adata, color=["leiden","time"], ncols=2)

In [None]:
sc.pl.umap(adata, color=["mKate2","shRNA","mKatePos","shRNAPos"], ncols=2)

In [None]:
adata.uns['log1p']['base']=None

In [None]:
#adata = adata.raw.to_adata()
#adata = adata[np.logical_not(adata.obs.leiden=="5")]

In [None]:
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', use_raw=False)
#sc.tl.filter_rank_genes_groups(adata, groupby="leiden", use_raw=False,
#                                   key_added='rank_genes_groups_filtered', 
#                                  min_in_group_fraction=0.25, min_fold_change=1, max_out_group_fraction=0.5, compare_abs=False)
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(15)

In [None]:
sc.tl.dendrogram(adata,groupby="leiden", n_pcs=60)
sc.pl.rank_genes_groups_dotplot(adata,n_genes=3)#,key="rank_genes_groups_filtered")

In [None]:
resultsFile

In [None]:
#adata.obs["id"] = adata.obs_names

In [None]:
adata.write(resultsFile)
adata

In [None]:
homology = pd.read_table("data/hgncHM_121.csv", sep=",")
homology

In [None]:
import scanpyHelpers as scH
adata = sc.read_h5ad(resultsFile)
adata

In [None]:
sigFile = "data/LUAD/simpLUADcs.csv"
#sigGenes = pd.read_excel(sigFile)
sigGenes = pd.read_excel("data/LUAD/LUADhpcs.xlsx")
#pd.read_csv(sigFile)
sigGenes #= sigGenes[:50]

In [None]:
sigGenes.columns

In [None]:
# Score the signatures in sigGenes with the project helper, using the Leiden
# clusters as the original labels, then plot the returned score columns.
# NOTE(review): the plot also colours by "cellState"; presumably scoreAndLabel
# adds that obs column - confirm in scanpyHelpers.
scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(adata, sigGenes, labelOfSigGenes= sigGenes.columns, ogLabel="leiden")
                                                            #translate=False, humanGenes=humanGenes, mouseGenes=mouseGenes)
sc.pl.umap(adata, color=scoreNames+["cellState"], ncols=4)
                                                            

In [None]:
for gs in sigGenes:
    sc.tl.score_genes(adata,sigGenes[gs].dropna(),score_name=f"{gs}Score")

In [None]:
scoreMat = adata.obs[[f"{sigName}Score" for sigName in sigGenes.columns]]
scoreMat

In [None]:
def plotScores(scoreMat, figCols = 4, cutoff = 1.5):
    """Draw one histogram per column of ``scoreMat`` on a grid of subplots.

    Each panel shows the column's values in 100 bins, with a dashed vertical
    line at the column's 90th percentile and the column name as title.

    Parameters
    ----------
    scoreMat : pandas.DataFrame
        One column per score; every column becomes one panel.
    figCols : int
        Number of panels per row.
    cutoff : float
        Unused; kept so existing calls that pass it keep working.
    """
    numScores = len(scoreMat.columns)
    # Ceiling division so a partially filled last row still gets its axes.
    # The former (n + 1) // figCols under-allocated rows, e.g. 10 scores on
    # 3 columns, and raised IndexError.
    figRows = max(1, -(-numScores // figCols))
    # squeeze=False keeps `axs` two-dimensional even for a single row or
    # column, so the [row, col] indexing below always works.
    fig, axs = plt.subplots(figRows, figCols, figsize=(10, 10), squeeze=False)
    for i,scoreCol in enumerate(scoreMat.columns):
        scoreData = scoreMat[scoreCol]
        ax = axs[i//figCols, i%figCols]
        ax.hist(scoreData, bins=100)
        ax.axvline(np.percentile(scoreData,90), color='k', linestyle='dashed', linewidth=1)
        ax.set_title(scoreCol)

    # Hide grid cells that have no score to show.
    for ax in axs.ravel()[numScores:]:
        ax.set_visible(False)

    # plt.show() rather than fig.show(): the latter requires a GUI backend.
    plt.show()
plotScores(scoreMat)

In [None]:
def zScores(scoreMat, cutoff = 1.5):
    """Label each row with the signature whose z-score is highest.

    Every column of ``scoreMat`` is standardised (mean 0, population standard
    deviation 1, NaNs ignored). A row is labelled with the name of its
    highest-z column, minus that name's last five characters (the "Score"
    suffix of the score columns), provided that z exceeds ``cutoff``.
    Otherwise the row is labelled "out".

    The computation is vectorised over the whole matrix. It therefore does not
    look rows up by label and works with duplicated index labels.

    Parameters
    ----------
    scoreMat : pandas.DataFrame
        Rows are cells, columns are numeric scores.
    cutoff : float
        Minimum z-score the best column must exceed.

    Returns
    -------
    numpy.ndarray
        One string label per row of ``scoreMat``, in row order.
    """
    values = scoreMat.to_numpy(dtype=float)
    if values.size == 0:
        # No rows or no columns: nothing can exceed the cutoff.
        return np.array(["out"] * len(scoreMat))

    # Constant or all-NaN columns give NaN z-scores; silence the warnings and
    # treat those entries as "never the best" below.
    with np.errstate(invalid="ignore", divide="ignore"):
        zscore = (values - np.nanmean(values, axis=0)) / np.nanstd(values, axis=0)
    zscore = np.where(np.isnan(zscore), -np.inf, zscore)

    best = zscore.argmax(axis=1)
    # "Any column above cutoff" is the same as "the maximum is above cutoff".
    hit = zscore.max(axis=1) > cutoff

    shortNames = np.array([str(name)[:-5] for name in scoreMat.columns], dtype=object)
    simple = np.where(hit, shortNames[best], "out")
    return simple.astype(str)

def topPercent(scoreMat, cutoff = .80):
    """Label each row with the signature in which it ranks highest.

    For every column, each value is replaced by the fraction of rows whose
    value is strictly smaller. A row is labelled with the name of the column
    where that fraction is largest, minus the name's last five characters
    (the "Score" suffix), provided the fraction exceeds ``cutoff``. Otherwise
    the row is labelled "out".

    The fractions come from a single rank per column instead of comparing
    every cell against every other cell, so the cost no longer grows
    quadratically with the number of rows.

    Parameters
    ----------
    scoreMat : pandas.DataFrame
        Rows are cells, columns are numeric scores.
    cutoff : float
        Minimum fraction the best column must exceed.

    Returns
    -------
    numpy.ndarray
        One string label per row of ``scoreMat``, in row order.
    """
    numCells = len(scoreMat)
    if numCells == 0 or scoreMat.shape[1] == 0:
        return np.array(["out"] * numCells)

    # rank(method="min") is 1 + the number of strictly smaller values, so
    # rank - 1 is exactly the count the pairwise comparison produced. NaN
    # values are smaller than nothing and greater than nothing: fraction 0.
    perc = (scoreMat.rank(axis=0, method="min") - 1).fillna(0) / numCells
    percValues = perc.to_numpy(dtype=float)

    best = percValues.argmax(axis=1)
    hit = percValues.max(axis=1) > cutoff

    shortNames = np.array([str(name)[:-5] for name in scoreMat.columns], dtype=object)
    simple = np.where(hit, shortNames[best], "out")
    return simple.astype(str)

In [None]:
adata.obs["zsig"] = zScores(scoreMat, cutoff = 1)
adata.obs["topPer"] = topPercent(scoreMat, cutoff=0.8)

In [None]:
adata.obs[["topPer","zsig"]]

In [None]:
sc.pl.umap(adata, color=["leiden","zsig","topPer"])#, legend_loc="on data")

In [None]:
clustToLabel = {"Cluster 1":"AT2-like",
 "Cluster 2":"AT2-like",
 "Cluster 3":"AT1-like",
 "Cluster 4":"Endoderm-like",
 "Cluster 5":"HPCS",
 "Cluster 6":"GiEpi",
 "Cluster 7":"Prolif",
 "Cluster 8":"EmbLiv",
 "Cluster 9":"Rib",
 "Cluster 10":"Gastro",
 "Cluster 11":"EMT",
 "Cluster 12":"Adv"}

In [None]:
from collections import Counter

# Score every signature column of sigGenes again and rebuild the z-score
# labels with cutoff 1 (this overwrites adata.obs["zsig"]).
for gs in sigGenes:
    sc.tl.score_genes(adata,sigGenes[gs].dropna(),score_name=f"{gs}Score")
scoreMat = adata.obs[[f"{sigName}Score" for sigName in sigGenes.columns]]
adata.obs["zsig"] = zScores(scoreMat, cutoff = 1)
# For each Leiden cluster take its most frequent zsig label; if that label is
# not a key of clustToLabel (e.g. "out"), use the second most frequent one.
# NOTE(review): mostCommon[1] raises IndexError when a cluster carries a single
# zsig label that is not in clustToLabel, and KeyError when the runner-up is
# not in clustToLabel either.
relabelDict = {}
for li in adata.obs.leiden.cat.categories:
    mostCommon = Counter(adata[adata.obs.leiden==li].obs.zsig).most_common(2)
    relabel = ""
    if(mostCommon[0][0] in clustToLabel.keys()):
        relabel = clustToLabel[mostCommon[0][0]]
    else:
        relabel = clustToLabel[mostCommon[1][0]]
    relabelDict[li] = relabel
print(relabelDict)
# NOTE(review): the mapping computed above is printed and then discarded; the
# hard-coded dict below is what is actually applied. It covers clusters '0'-'7'
# only, so a clustering with other cluster ids raises KeyError on the next line.
relabelDict = {'0': 'AT2-like', '1': 'Rib', '2': 'Endoderm-like', '3': 'AT1-like', '4': 'HPCS', '5': 'Adv', '6': 'Endoderm-like', '7': 'EMT'}

# Per-cell state label derived from the cell's Leiden cluster.
adata.obs["cs"] = [relabelDict[li] for li in adata.obs.leiden]

In [None]:
sc.pl.umap(adata, color=["leiden","zsig","cs"])#, legend_loc="on data")

In [None]:
sc.pl.umap(adata, color=[f"{sigName}Score" for sigName in sigGenes.columns], cmap="bwr", ncols=4)

In [None]:
adata.write("write/mouseLUADshRen.h5ad")
adata = sc.read_h5ad("write/mouseLUADshRen.h5ad")

In [None]:
resultsFileQC

In [None]:
# Reload the post-QC object so its .X can be attached as a "counts" layer.
# NOTE(review): assumes .X in the QC file holds un-normalised counts - confirm.
qcdata = sc.read_h5ad(resultsFileQC)
# Per-gene column sums as a quick sanity check. Summing on the matrix itself
# avoids densifying it and iterating over its rows with Python's sum().
print(qcdata.X.sum(axis=0))

rdata = adata.raw.to_adata()

# Align the QC matrix to rdata on both axes before copying it into a layer.
# Cells are always subset by name. Genes are only re-indexed when the two gene
# lists differ, so the common identical case is untouched.
qcdata = qcdata[rdata.obs_names,:]
if not qcdata.var_names.equals(rdata.var_names):
    qcdata = qcdata[:, rdata.var_names]
rdata.layers["counts"] = qcdata.X.copy()
# Keep only the cluster id and the cell-state label in the exported object.
rdata.obs = rdata.obs[['leiden','cs']]
rdata.write('write/luadMouse_shRen_Labeled.h5ad')

In [None]:
# Long-format signature table: one row per (gene, signature) pair, with the
# column names "genes" and "clustName" that the ORA call expects.
# melt() adapts to any number of signatures and genes, replacing the fixed
# 500 x 12 buffer that was filled cell by cell.
markers = (
    sigGenes
    .melt(var_name="clustName", value_name="genes")
    .dropna(subset=["genes"])      # signatures shorter than the longest one are NaN-padded
    .drop_duplicates()             # a repeated (signature, gene) edge is redundant for the network
    .loc[:, ["genes", "clustName"]]
    .reset_index(drop=True)
)
markers

In [None]:
import decoupler as dc

In [None]:
dc.run_ora(
    mat=adata,
    net=markers,
    source='clustName',
    target='genes',
    min_n=3,
    verbose=True
)

In [None]:
adata.obsm['ora_estimate']


In [None]:
acts = dc.get_acts(adata, obsm_key='ora_estimate')

# We need to remove inf and set them to the maximum value observed for pvals=0
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
acts.X[~np.isfinite(acts.X)] = max_e

acts

In [None]:
scoreDecop = pd.DataFrame(acts.X, columns = adata.obsm['ora_estimate'].columns, index=adata.obs_names)
plotScores(scoreDecop, figCols = 3)

In [None]:
#sc.pl.umap(acts, color=scoreNames+['leiden'], cmap='RdBu_r')
sc.pl.violin(acts, keys=scoreNames, groupby='leiden')

In [None]:
df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')
df

In [None]:
n_ctypes = 3
ctypes_dict = df.groupby('group').head(n_ctypes).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
ctypes_dict

In [None]:
sc.pl.matrixplot(acts, ctypes_dict, 'leiden', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')


In [None]:
annotation_dict = df.groupby('group').head(1).set_index('group')['names'].to_dict()
annotation_dict

In [None]:
# Add cell type column based on annotation
adata.obs['decoupler'] = [annotation_dict[clust] for clust in adata.obs['leiden']]

# Visualize
sc.pl.umap(adata, color=['decoupler','cellState', "leiden"])

In [None]:
sc.pl.umap(adata, color=['decoupler','cellState', "leiden","zsig"])

In [None]:
#adata.write(resultsFile)
adata

In [None]:
#adata = sc.read(resultsFile)

In [None]:
intGenes = ["Plaur", "Procr", "F2r", "Thbd", "Cldn4"]
sc.pl.umap(adata, color=intGenes+["leiden","cellState"])
sc.pl.pca(adata, color=intGenes+["leiden","cellState"])

In [None]:
plotScores(np.log10(scoreMat+1))

In [None]:
len(scoreMat)

In [None]:
scoreMat.columns

In [None]:
import sklearn.metrics
from matplotlib import pyplot as plt
from sklearn.mixture import GaussianMixture as GMM
from scipy.stats import norm
import pdb

In [None]:
# Fit a two-component 1-D Gaussian mixture to every score column and take as
# that score's boundary the first point, on a 0.1-spaced grid between the two
# fitted means, that the mixture assigns to the higher-mean component.
scoreNames = scoreMat.columns
numScores = len(scoreNames)
plotLen = 4
# Size the grid from the number of scores (ceiling division); a fixed 2-row
# grid overflows as soon as there are more than 2 * plotLen scores.
numRows = max(1, -(-numScores // plotLen))
# Set the figure size before creating the figure so it applies to it.
plt.rcParams["figure.figsize"] = (15,5)
# squeeze=False keeps axs 2-D even when there is a single row.
fig, axs = plt.subplots(numRows, plotLen, squeeze=False)

dfScoreBoundry = pd.DataFrame(np.zeros(numScores),scoreNames, columns=["boundry"])
gmm = GMM(n_components = 2, random_state=10, covariance_type = 'full', n_init=1, means_init=[[0],[0.5]])

for i, scoreName in enumerate(scoreNames):
    scoreCount = np.array(adata.obs[scoreName]).reshape(-1, 1)
    fitGMM = gmm.fit(scoreCount)
    mean = fitGMM.means_
    covs  = fitGMM.covariances_
    weights = fitGMM.weights_

    # Identify the high component from the fitted means instead of assuming
    # that it kept the index it was initialised with.
    highComp = int(np.argmax(mean[:, 0]))
    # Scalar bounds: np.arange is not meant to receive size-1 arrays.
    binEx = np.arange(mean.min(), mean.max(), 0.1).reshape(-1,1)
    # predict() rejects an empty grid (means closer than one grid step).
    fitGmmBound = fitGMM.predict(binEx) if len(binEx) else np.array([], dtype=int)
    print(fitGmmBound)
    highIdx = np.flatnonzero(fitGmmBound == highComp)
    # No grid point assigned to the high component: fall back to the boundary
    # value 1 explicitly, rather than through a bare except.
    scoreBoundry = float(binEx[highIdx[0], 0]) if highIdx.size else 1

    dfScoreBoundry.loc[scoreName] = scoreBoundry

    # Weighted component densities over a fixed display range.
    x_axis = np.arange(-.25, 0.75, 0.05)
    y_axis0 = norm.pdf(x_axis, float(mean[0][0]), np.sqrt(float(covs[0][0][0])))*weights[0] # 1st gaussian
    y_axis1 = norm.pdf(x_axis, float(mean[1][0]), np.sqrt(float(covs[1][0][0])))*weights[1] # 2nd gaussian

    x,y = i//plotLen, i%plotLen
    axs[x,y].set_title(scoreName)
    axs[x,y].axvline(scoreBoundry, c='C2', linestyle='dashed', linewidth=1)  #green
    axs[x,y].hist(scoreCount, density=True, color='black', bins=100)
    axs[x,y].plot(x_axis, y_axis0, lw=3, c='C6')                            #pink
    axs[x,y].plot(x_axis, y_axis1, lw=3, c='C1')                            #orange
    axs[x,y].plot(x_axis, y_axis0+y_axis1, lw=3, c='C0', ls=':')            #dotted blue

# Hide grid cells that have no score to show.
for ax in axs.ravel()[numScores:]:
    ax.set_visible(False)

plt.tight_layout(pad=1.0)
plt.show()

In [None]:
# Boolean frame: True where a cell's score exceeds that score's boundary.
# The frame is built from boolean columns directly. Writing booleans into a
# float copy of scoreMat with .loc[:, col] = ... leaves the resulting dtype to
# pandas' version-dependent casting rules, and a non-boolean row cannot be
# used as a NumPy mask afterwards.
scoreID = np.array(scoreNames)
scoreIDs = pd.DataFrame(index=scoreMat.index)
for scoreName in scoreNames:
    boundary = dfScoreBoundry.loc[scoreName].values[0]
    print(scoreName)
    print(boundary)
    scoreIDs[scoreName] = scoreMat[scoreName] > boundary
scoreIDs

In [None]:
# Per-cell label from the boolean score frame:
#   exactly one score above its boundary -> that score's name without "Score"
#   more than one                        -> "Doublet"
#   none                                 -> "Negative"
classification = np.empty(len(adata), dtype="object")
for i, (cellBar, scoreBool) in enumerate(scoreIDs.iterrows()):
    # Force a real boolean mask: a row of mixed or object dtype cannot be
    # used to index the NumPy array of score names.
    isPositive = scoreBool.to_numpy().astype(bool)
    numscorees = int(isPositive.sum())
    if (numscorees == 1):
        classif = scoreID[isPositive][0][:-5]
    elif (numscorees > 1):
        classif = "Doublet"
    else:
        classif = "Negative"
    classification[i] = classif

In [None]:
# Store the per-cell labels. This must be the `classification` array: at this
# point `classif` is only the scalar label of the last cell in the loop, and
# assigning it would give every cell the same label.
adata.obs["GMM"] = classification
adata.obs["GMM"].value_counts()

In [None]:
sc.pl.heatmap(adata, scoreNames, groupby="GMM", log=True)#, save = f"_{figName}_score.png")


In [None]:
sc.pl.umap(adata, color=['GMM','cellState', "leiden","zsig"])

In [None]:
len(scoreMat.columns)

In [None]:
def gmmScoreGeneSig(scoreMat, meansInit=[[0],[0.5]],plotLen = 3, show=False):
    """Classify cells by thresholding each score with a 2-component GMM.

    For every column of ``scoreMat`` a two-component, one-dimensional Gaussian
    mixture is fitted. The column's boundary is the first point, on a
    0.01-spaced grid between the two fitted means, that the mixture assigns to
    the higher-mean component. If no grid point qualifies, the column maximum
    is used, so no cell exceeds it. A cell is then labelled:

    * with the column name minus its last five characters (the "Score"
      suffix) when exactly one score exceeds its boundary,
    * "Doublet" when more than one does,
    * "Negative" when none does.

    Parameters
    ----------
    scoreMat : pandas.DataFrame
        Rows are cells, columns are numeric scores.
    meansInit : list of list of float
        Initial means handed to the mixture model (two 1-D means). The
        default list is only read, never modified.
    plotLen : int
        Number of panels per row in the diagnostic figure.
    show : bool
        If True, plot each score's histogram, its boundary and the two
        weighted component densities.

    Returns
    -------
    numpy.ndarray
        Object array with one label per row of ``scoreMat``, in row order.

    Notes
    -----
    The fitted means, the grid predictions, and each score name with its
    boundary are printed. When ``show`` is True the global
    ``plt.rcParams["figure.figsize"]`` is set to (15, 5).
    """
    scoreNames = scoreMat.columns
    numScores = len(scoreNames)
    if(show):
        # Ceiling division for the row count. squeeze=False keeps axs 2-D
        # even when everything fits on a single row.
        numRows = max(1, -(-numScores // plotLen))
        # Set before creating the figure so the size applies to it.
        plt.rcParams["figure.figsize"] = (15,5)
        fig, axs = plt.subplots(numRows, plotLen, squeeze=False)

    dfScoreBoundry = pd.DataFrame(np.zeros(numScores),scoreNames, columns=["boundry"])
    gmm = GMM(n_components = 2, random_state=10, covariance_type = 'full', n_init=2, means_init=meansInit)

    for i, scoreName in enumerate(scoreNames):
        scoreCount = np.array(scoreMat[scoreName]).reshape(-1, 1)
        fitGMM = gmm.fit(scoreCount)
        mean = fitGMM.means_
        covs  = fitGMM.covariances_
        weights = fitGMM.weights_
        print(mean)

        # Identify the high component from the fitted means instead of
        # assuming that it kept the index it was initialised with.
        highComp = int(np.argmax(mean[:, 0]))
        binEx = np.arange(mean.min(), mean.max(), 0.01).reshape(-1,1)
        # predict() rejects an empty grid (means closer than one grid step).
        fitGmmBound = fitGMM.predict(binEx) if len(binEx) else np.array([], dtype=int)
        print(fitGmmBound)
        highIdx = np.flatnonzero(fitGmmBound == highComp)
        if highIdx.size:
            scoreBoundry = float(binEx[highIdx[0], 0])
        else:
            # No separable high component: use the column maximum as a plain
            # float so that no cell is called positive for this score.
            scoreBoundry = float(scoreCount.max())

        dfScoreBoundry.loc[scoreName] = scoreBoundry

        if(show):
            x_axis = np.arange(float(scoreCount.min()), float(scoreCount.max()), 0.05)
            y_axis0 = norm.pdf(x_axis, float(mean[0][0]), np.sqrt(float(covs[0][0][0])))*weights[0] # 1st gaussian
            y_axis1 = norm.pdf(x_axis, float(mean[1][0]), np.sqrt(float(covs[1][0][0])))*weights[1] # 2nd gaussian

            x,y = i//plotLen, i%plotLen
            axs[x,y].set_title(scoreName)
            axs[x,y].axvline(scoreBoundry, c='C2', linestyle='dashed', linewidth=1)  #green
            axs[x,y].hist(scoreCount, density=True, color='black', bins=100)
            axs[x,y].plot(x_axis, y_axis0, lw=3, c='C6')                            #pink
            axs[x,y].plot(x_axis, y_axis1, lw=3, c='C1')                            #orange
            axs[x,y].plot(x_axis, y_axis0+y_axis1, lw=3, c='C0', ls=':')            #dotted blue

    if(show):
        # Hide grid cells that have no score to show.
        for ax in axs.ravel()[numScores:]:
            ax.set_visible(False)
        plt.tight_layout(pad=1.0)
        plt.show()

    # Boolean matrix (cells x scores), built as booleans from the start rather
    # than by writing booleans into a float copy of scoreMat.
    isPositive = np.zeros((len(scoreMat), numScores), dtype=bool)
    for j, scoreName in enumerate(scoreNames):
        boundary = dfScoreBoundry.loc[scoreName].values[0]
        print(scoreName)
        print(boundary)
        isPositive[:, j] = np.asarray(scoreMat[scoreName] > boundary).ravel()

    numPositive = isPositive.sum(axis=1)
    shortNames = np.array([str(name)[:-5] for name in scoreNames], dtype=object)

    classification = np.empty(len(scoreMat), dtype="object")
    classification[:] = "Negative"
    classification[numPositive > 1] = "Doublet"
    single = numPositive == 1
    if single.any():
        # Exactly one True per selected row, so argmax returns its column.
        classification[single] = shortNames[isPositive[single].argmax(axis=1)]

    return(classification)

In [None]:
classif = gmmScoreGeneSig(scoreMat, show=True)

In [None]:
classification == classif

In [None]:
from collections import Counter

In [None]:
Counter(classification)

In [None]:
Counter(classif)