In [None]:
# Hwang per-patient KRAS investigation

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.io
import scanpy.external as scex
import sklearn.metrics
import seaborn as sns
from matplotlib import pyplot as plt
import scanpyHelpers as scH

import decoupler as dc

In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

In [None]:
# Global scanpy configuration for this notebook.
sc.settings.verbosity = 0             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Emit scanpy's logging header for the record.
sc.logging.print_header()
# Initial figure defaults; a later cell calls sc.set_figure_params again with dpi=100.
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
%matplotlib inline

In [None]:
# Output locations: files live under `writeDir` and are prefixed with `fileName`.
writeDir = "write/"
fileName = "pdacHwang"

# AnnData checkpoint saved after QC, before any downstream analysis.
resultsFileQC = f"{writeDir}{fileName}_QC.h5ad"

In [None]:
# Basal / classical signature gene lists, one column per signature.
sigGenes = pd.read_csv("data/PDAC/basClaSigGenes.csv")
sigGenes = sigGenes[["scBasal","scClassical"]]

# Map each signature column to the cell-state label used in the marker table.
classLabDict = dict(zip(sigGenes.columns,["basal","classical"]))

# Long-format (genes, clustName) table with one row per signature gene.
# melt() sizes the table from the CSV itself; the previous fixed 200-row stride
# per signature made signatures overwrite each other as soon as the CSV had
# more than 200 rows. Rows are ordered signature by signature, empty cells and
# repeated (gene, label) pairs are dropped.
markers = (
    sigGenes
    .melt(var_name="clustName", value_name="genes")
    .assign(clustName=lambda df: df["clustName"].map(classLabDict))
    .loc[:, ["genes", "clustName"]]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Genes that make up the per-cell "KRASscore" computed further down.
markKRAS = ['DUSP6',  'ETV1', 'ETV5', 'CCND1', 'SPRY2']

# Comma-separated homology table; presumably an HGNC human/mouse mapping
# (TODO confirm) - it is not used by any of the visible cells.
homology = pd.read_csv("data/hgncHM_121.csv")
homology

In [None]:
# Figure defaults used from here on; saved figures go to a per-dataset folder.
sc.set_figure_params(scanpy=True, dpi=100, dpi_save=150, fontsize=10, format='png')
figName = fileName
sc.settings.figdir = f"figures/{fileName}/"

In [None]:
# Load the post-QC AnnData checkpoint and display its summary.
adata = sc.read(resultsFileQC)
adata

In [None]:
# Echo the path of the checkpoint that was just loaded.
resultsFileQC

In [None]:
# Inspect the observation annotations (the `sample` and `cell_type` columns are used below).
adata.obs

In [None]:
# Treatment flag = first character of the sample name; cells flagged "U"
# (presumably untreated - TODO confirm naming scheme) are kept in the next cell.
adata.obs["treated"] = adata.obs["sample"].str[0].to_numpy()

In [None]:
# Restrict the analysis to malignant cells from samples flagged "U".
isMalignant = adata.obs.cell_type == "Malignant"
isFlaggedU = adata.obs.treated == "U"
adata = adata[isMalignant & isFlaggedU]

adata

In [None]:
# Count the remaining cells per sample.
# NOTE(review): this import belongs in the imports cell at the top of the notebook;
# `Counter` is used again in a later cell.
from collections import Counter
Counter(adata.obs["sample"])

In [None]:
# Sample identifiers to iterate over: the categories of the `sample` column.
listPatients = adata.obs["sample"].cat.categories.to_numpy()

In [None]:
# Keep an independent copy of the filtered data: the per-patient loops below
# rebind `adata` on every iteration and always subset from `adataAll`.
adataAll = adata.copy()

In [None]:
# Display the summary of the copy.
adataAll

In [None]:
def positiveExpressionTable(adataSub, genes):
    """Return the non-zero expression values of `genes` in long format.

    Parameters
    ----------
    adataSub
        AnnData whose ``X`` supports ``.todense()`` and whose ``obs`` has a
        ``gmmClass`` column.
    genes
        Gene names to look up in ``adataSub.var_names``; names that are not
        present are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``Gene``, ``CellState`` and ``Value`` with one row per
        (gene, cell) pair whose expression value is > 0.
    """
    geneCol, stateCol, valueCol = [], [], []
    cellStates = adataSub.obs["gmmClass"].to_numpy()
    for gene in genes:
        geneMask = adataSub.var_names == gene
        if not geneMask.any():
            continue
        geneExp = np.asarray(adataSub[:, geneMask].X.todense()).flatten()
        expressed = geneExp > 0
        geneCol.extend([gene] * int(expressed.sum()))
        stateCol.extend(cellStates[expressed])
        valueCol.extend(geneExp[expressed])
    return pd.DataFrame({"Gene": geneCol, "CellState": stateCol, "Value": valueCol})


# Per-cell accumulators across patients; later cells read these three lists.
allPatients = []
allPatCellstate = []
allPatKRASscores = []

for patient in listPatients:
    print(patient)
    # Explicit copy: the preprocessing below modifies the object in place, and a
    # bare slice of `adataAll` is only a view.
    adata = adataAll[adataAll.obs["sample"]==patient].copy()
    resultsFile = writeDir  + fileName + "_" +patient+'.h5ad'       # final output

    # Normalise, log-transform and select highly variable genes.
    sc.pp.normalize_total(adata,target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, flavor = "seurat", n_top_genes=2000)
    # Freeze the full log-normalised matrix in .raw before restricting the genes.
    adata.raw = adata
    adata = adata[:, np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))].copy()

    # Embedding and clustering on the highly variable, non-mitochondrial genes.
    sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=40)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=0.4)

    # Project helpers (scanpyHelpers): score the basal/classical signatures and
    # assign a per-cell "gmmClass". `scoreNames` is used to index adata.obs.
    # (A decoupler ORA-based annotation was tried here previously and is disabled.)
    scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(adata, sigGenes, labelOfSigGenes= ["basal","classical"], ogLabel="leiden")
    gmmClass = scH.gmmScoreGeneSig(adata.obs[scoreNames],show=False)
    adata.obs["gmmClass"] = gmmClass

    # PC indices returned by the helper; they are used as 0-based column
    # indices into X_pca below.
    clasBasPCs = scH.findDiffPCs(adata, label1="scClassicalScore", label2="scBasalScore", lenPCs=20, show=False)

    sc.pl.pca(adata, color=['gmmClass', 'leiden', 'scBasalScore', "scClassicalScore"], cmap="bwr", dimensions=clasBasPCs[:2])

    # Per-cell KRAS signature score and marker expression per cell state.
    sc.tl.score_genes(adata, markKRAS, score_name="KRASscore")
    sc.pl.matrixplot(adata, markKRAS, groupby="gmmClass")

    plt.rcParams["axes.grid"] = False

    # Materialise the raw (all-gene) matrix once per patient; its obs already
    # carries gmmClass and KRASscore at this point.
    adataRaw = adata.raw.to_adata()

    # Marker genes x cells, with cells ordered along the first selected PC.
    krasMarkGex = np.asarray(adataRaw[:,markKRAS].X.todense()).T
    krasMarkGex = krasMarkGex[:,np.argsort(adata.obsm["X_pca"][:,clasBasPCs[0]])]

    sc.tl.rank_genes_groups(adata, 'gmmClass', method='wilcoxon', use_raw=True,key_added='gmmClass')
    sc.pl.rank_genes_groups_matrixplot(adata, key = 'gmmClass', var_names=markKRAS,
                                       values_to_plot="scores", cmap='bwr', colorbar_title='z-scores', dendrogram = False)

    fig, ax = plt.subplots(1,1)
    sns.heatmap(krasMarkGex, yticklabels=markKRAS ,cmap="viridis",xticklabels=[], ax=ax)
    # +1 turns the 0-based PC index into the usual 1-based PC number.
    ax.set_xlabel(f"  ----PC{clasBasPCs[0]+1}--->  ")
    ax.set_ylabel(patient)
    plt.show()

    sc.pl.violin(adata, keys="KRASscore", groupby="gmmClass", inner = "box", stripplot=False)

    # Non-zero marker expression per cell state, excluding "Negative" cells.
    adataR = adataRaw[np.logical_not(adataRaw.obs["gmmClass"]=="Negative"),:]
    krasExp = positiveExpressionTable(adataR, markKRAS)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.violinplot(krasExp, x="Gene", hue="CellState", y="Value", gap=.2, split=True, ax=ax)
    ax.set_title("KRAS Signature Genes")
    ax.legend(loc="center",bbox_to_anchor=(1.1, 0.5))
    plt.show()

    # Accumulate the per-cell KRAS score of the non-"Negative" cells.
    allPatients.extend([patient] * adataR.n_obs)
    allPatCellstate.extend(adataR.obs["gmmClass"].values)
    allPatKRASscores.extend(adataR.obs["KRASscore"].values)

    # Write to the path computed at the top of the iteration.
    adata.write(resultsFile)

# KRAS score per patient, split by cell state.
krasScoreExp = pd.DataFrame({"Patient": allPatients, "CellState": allPatCellstate, "Value": allPatKRASscores})
fig, ax = plt.subplots(figsize=(10, 4))
sns.violinplot(krasScoreExp, x="Patient", hue="CellState", y="Value", gap=.2, split=True, ax=ax)
ax.set_title("KRAS Signature Genes")
ax.legend(loc="center",bbox_to_anchor=(1.1, 0.5))
plt.show()

In [None]:
# Number of non-"Negative" cells each patient contributed to the pooled lists.
Counter(allPatients)

In [None]:
# `adata` is whatever the per-patient loop left behind, i.e. the last patient processed.
notNegative = adata.obs["gmmClass"] != "Negative"
sc.pl.violin(
    adata[notNegative, :],
    keys="KRASscore",
    groupby="gmmClass",
    inner="box",
    stripplot=False,
)

In [None]:
# Pooled KRAS score per patient and cell state, redrawn larger with inner box plots.
# Built column-wise so that "Value" gets a numeric dtype instead of the object
# dtype produced by transposing a list of rows.
krasScoreExp = pd.DataFrame({
    "Patient": allPatients,
    "CellState": allPatCellstate,
    "Value": allPatKRASscores,
})
fig, ax = plt.subplots(figsize=(20, 10))
sns.violinplot(krasScoreExp, x="Patient", hue="CellState", y="Value", inner="box", gap=.2, split=True, ax=ax)
ax.set_title("KRAS Signature Genes")
ax.legend(loc="center",bbox_to_anchor=(1.1, 0.5))
plt.show()

In [None]:
# `ccnd1ExpPos` is not defined in any visible cell, so the previous
# `len(ccnd1ExpPos)` raised a NameError on a clean run. Count the CCND1 rows of
# the long-format table instead (one row per cell with CCND1 expression > 0).
# NOTE(review): assumes this matches the original intent - confirm.
int((krasExp["Gene"] == "CCND1").sum())

In [None]:
# Long-format table of the non-zero expression of each KRAS marker gene in the
# non-"Negative" cells of the current `adata` (the last patient processed above).
adataR = adata.raw.to_adata()

# Subset once instead of re-slicing the AnnData twice for every gene.
adataPos = adataR[np.logical_not(adataR.obs["gmmClass"]=="Negative"),:]
cellStates = adataPos.obs["gmmClass"].to_numpy()

allGene = []
allCellState = []
allValue = []
for kGene in markKRAS:
    geneMask = adataPos.var_names == kGene
    if not geneMask.any():
        # Marker absent from the raw matrix: skip it.
        continue
    geneExp = np.asarray(adataPos[:, geneMask].X.todense()).flatten()
    expressed = geneExp > 0

    allGene.extend([kGene] * int(expressed.sum()))
    allCellState.extend(cellStates[expressed])
    allValue.extend(geneExp[expressed])

# Column-wise construction keeps "Value" numeric.
krasExp = pd.DataFrame({"Gene": allGene, "CellState": allCellState, "Value": allValue})

fig, ax = plt.subplots()
sns.violinplot(krasExp, x="Gene", hue="CellState", y="Value", gap=.2, split=True, ax=ax)
ax.set_title("KRAS Signature Genes")
ax.legend(loc="center",bbox_to_anchor=(1.1, 0.5))
plt.show()

In [None]:
# Drop "Negative" cells, then tabulate the KRAS score per cell under a
# placeholder patient label "Px".
adata = adata[np.logical_not(adata.obs["gmmClass"]=="Negative"),:]

# Built column-wise from plain arrays: keeps a 0..n-1 index and a numeric
# "Value" column, rather than transposing a mixed list of a list and two Series.
krasExp = pd.DataFrame({
    "Patient": ["Px"] * adata.n_obs,
    "CellState": adata.obs["gmmClass"].to_numpy(),
    "Value": adata.obs["KRASscore"].to_numpy(),
})

In [None]:
# Split violin of the KRAS score by cell state for the single "Px" group.
fig, ax = plt.subplots()
sns.violinplot(
    data=krasExp,
    x="Patient",
    y="Value",
    hue="CellState",
    split=True,
    gap=.2,
    ax=ax,
)
ax.set(title="KRAS Signature Genes")
ax.legend(loc="center", bbox_to_anchor=(1.1, 0.5))
plt.show()

In [None]:
# Scratch check of list repetition + concatenation (the pattern used earlier to
# build the long-format columns); leftover debugging, safe to delete.
["CCND1"]*2+["adsf"]*8

In [None]:
# Near-duplicate of the earlier per-patient loop, with the PC diagnostics shown
# (show=True) and both scaled and unscaled marker matrix plots.
for patient in listPatients:    
    print(patient)
    # Explicit copy: the preprocessing below modifies the object in place, and a
    # bare slice of `adataAll` is only a view.
    adata = adataAll[adataAll.obs["sample"]==patient].copy()
    resultsFile = writeDir + fileName + "_" +patient+'.h5ad'       # final output
    
    # Normalise, log-transform and select highly variable genes.
    sc.pp.normalize_total(adata,target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, flavor = "seurat", n_top_genes=2000)
    # Freeze the full log-normalised matrix in .raw before restricting the genes.
    adata.raw = adata
    adata = adata[:, np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))].copy()
    sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=40)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=0.4)
    scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(adata, sigGenes, labelOfSigGenes= ["basal","classical"], ogLabel="leiden")
    
    # Disabled decoupler ORA-based annotation (would create adata.obs['decoupler']):
    #dc.run_ora(mat=adata,net=markers,source='clustName',target='genes',min_n=3, verbose=True)
    #acts = dc.get_acts(adata, obsm_key='ora_estimate')
    # We need to remove inf and set them to the maximum value observed for pvals=0
    #acts_v = acts.X.ravel()
    #max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
    #acts.X[~np.isfinite(acts.X)] = max_e
    #df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')
    #annotation_dict = df.groupby('group').head(1).set_index('group')['names'].to_dict()
    #adata.obs['decoupler'] = [annotation_dict[clust] for clust in adata.obs['leiden']]

    # The helper lives in scanpyHelpers; the unqualified name is not defined in
    # this notebook.
    gmmClass = scH.gmmScoreGeneSig(adata.obs[scoreNames],show=False)
    adata.obs["gmmClass"] = gmmClass
    
    # PC indices returned by the helper; used as 0-based column indices into X_pca.
    clasBasPCs = scH.findDiffPCs(adata, label1="scClassicalScore", label2="scBasalScore", lenPCs=20, show=True)
    
    sc.pl.pca(adata, color=['gmmClass', 'leiden', 'scBasalScore', "scClassicalScore"], cmap="bwr", dimensions=clasBasPCs[:2])
    #sc.pl.umap(acts, color=scoreNames+['leiden','decoupler'], cmap='bwr')
    
    sc.tl.score_genes(adata, markKRAS, score_name="KRASscore")
    # Group by "gmmClass": the "decoupler" obs column is only created by the
    # disabled block above, so grouping by it raised a KeyError.
    sc.pl.MatrixPlot(adata, markKRAS, groupby="gmmClass",standard_scale="var").show()
    sc.pl.MatrixPlot(adata, markKRAS, groupby="gmmClass").show()
    
    plt.rcParams["axes.grid"] = False
    
    # Marker genes x cells, with cells ordered along the first selected PC.
    krasMarkGex = np.asarray(adata.raw.to_adata()[:,markKRAS].X.todense()).T
    krasMarkGex = krasMarkGex[:,np.argsort(adata.obsm["X_pca"][:,clasBasPCs[0]])]
    fig, ax = plt.subplots(1,1)
    
    sns.heatmap(krasMarkGex, yticklabels=markKRAS ,cmap="viridis",xticklabels=[], ax=ax)   
    # +1 turns the 0-based PC index into the usual 1-based PC number.
    # NOTE(review): the classical -> basal direction assumes a fixed PC sign - confirm.
    ax.set_xlabel(f"(classical)  ----PC{clasBasPCs[0]+1}--->  (basal)")
    ax.set_ylabel(patient)
    
    plt.show()
    
    sc.tl.rank_genes_groups(adata, 'gmmClass', method='wilcoxon', use_raw=True,key_added='gmmClass')
    sc.pl.rank_genes_groups_matrixplot(adata, key = 'gmmClass', var_names=markKRAS, 
                                       values_to_plot="scores", cmap='bwr', colorbar_title='z-scores', dendrogram = False)