In [None]:
# Hwang per-patient KRAS investigation

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.io
import scanpy.external as scex
import sklearn.metrics
import seaborn as sns
from matplotlib import pyplot as plt
import scanpyHelpers as scH

import decoupler as dc

In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

In [None]:
# Global scanpy configuration for this notebook: logging level, header printout and default figure style.
sc.settings.verbosity = 0             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()  # emits scanpy's header into the cell output (presumably version info — confirm)
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Output locations: every file written or read by this notebook lives under writeDir
# and is prefixed with the dataset name.
writeDir = "write/"
fileName = "pdacHuman"

# Post-QC (pre-analysis) AnnData file that the analysis starts from.
resultsFileQC = f"{writeDir}{fileName}_QC.h5ad"

In [None]:
# Signature gene table: one column of gene names per signature; only the two
# single-cell signatures are used below.
sigGenes = pd.read_csv("data/PDAC/basClaSigGenes.csv")
sigGenes = sigGenes[["scBasal","scClassical"]]

# Short label attached to each signature column in the marker table.
classLabDict = dict(zip(sigGenes.columns,["basal","classical"]))

# Long-format marker table with one (gene, label) row per signature entry, ordered
# signature by signature. It is derived from the table's actual shape instead of a
# fixed 200-rows-per-signature layout, so a signature with more than 200 genes can no
# longer overwrite the rows of the next one.
markers = sigGenes.melt(var_name="clustName", value_name="genes")
markers["clustName"] = markers["clustName"].map(classLabDict)
markers = markers[["genes", "clustName"]]

# Remove padding rows (missing gene names) and repeated (gene, label) pairs.
markers = markers.dropna()
markers = markers.drop_duplicates()

# Comma-separated homology table (presumably human/mouse gene homologs, judging by the
# file name — confirm); displayed as this cell's output.
homology = pd.read_csv("data/hgncHM_121.csv")
homology

In [None]:
# Figure defaults for the plots below. This call comes after the earlier
# set_figure_params(dpi=80, ...) and therefore takes over from it.
sc.set_figure_params(scanpy=True, dpi=100, dpi_save=150, fontsize=10, format='png')

# Saved figures go to a per-dataset folder and share the dataset name as prefix.
figName = fileName
sc.settings.figdir = f"figures/{fileName}/"

In [None]:
# Load the post-QC AnnData object from resultsFileQC and show its summary as the cell output.
adata = sc.read_h5ad(resultsFileQC)
adata

In [None]:
# Step 1: keep only cells whose obs["tumor"] annotation equals "tumor".
isTumor = adata.obs["tumor"] == "tumor"
adata = adata[isTumor]

# Step 2: of those, keep only the cell-type annotations listed here.
tumorcells = ['Acinar cell', 'Ductal cell type 1', 'Ductal cell type 2','Endocrine cell']
isSelectedType = adata.obs.celltypes.isin(tumorcells).to_numpy()
adata = adata[isSelectedType]

# Second name for the filtered object (same object, not a copy).
rawAdata = adata

In [None]:
from collections import Counter

# Number of retained cells per sample (displayed as the cell output).
Counter(adata.obs.samples)

In [None]:
#mostCom = ["T14","T17","T9","T22","T18"]
#mcAdata = adata[[(sam in mostCom) for sam in adata.obs.samples],:]
#adata = adata[[ct == "Ductal cell type 2" for ct in adata.obs.celltypes]]
#adata = mcAdata

In [None]:
# Sample identifiers to iterate over, taken from the categories of the categorical "samples" column.
listPatients = adata.obs.samples.cat.categories.to_numpy()

In [None]:
# Independent copy of the filtered object; the per-patient loop below slices from it
# while `adata` is reused as a working variable.
adataAll = adata.copy()

In [None]:
# Display the summary of the copied object.
adataAll

In [None]:
# Per-patient clustering and cell-state labelling.
# For every sample: normalise, select variable non-mt genes, embed and cluster, score the
# signatures, then give each Leiden cluster the most frequent per-cell "zsig" call of its
# members. The resulting per-cell label ("cs") is collected for all patients in csHold.
csParts = []        # one "cs" Series per patient; concatenated once after the loop
allPatients = []

for patient in listPatients:
    print(patient)
    # Work on an explicit copy: the preprocessing below modifies the object in place,
    # which should not go through a view of adataAll.
    adata = adataAll[adataAll.obs["samples"]==patient].copy()
    #print(adata)
    resultsFile = writeDir  + fileName + "_" +patient+'.h5ad'       # final output

    sc.pp.normalize_total(adata,target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, flavor = "seurat", n_top_genes=2000)
    # Keep the log-normalised matrix with all genes in .raw before subsetting genes.
    adata.raw = adata
    keepGenes = np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))
    adata = adata[:, keepGenes].copy()
    sc.tl.pca(adata, n_comps = 50, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=20, n_pcs=40)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=0.7)
    # NOTE(review): three labels are passed here although sigGenes has only two columns
    # (scBasal, scClassical) — confirm what scH.scoreAndLabel expects.
    scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(adata, sigGenes, labelOfSigGenes= ["basal","classical","EMT"], ogLabel="leiden")

    # Per-cell call derived from the "<signature>Score" columns
    # (presumably written to adata.obs by scoreAndLabel — confirm).
    scoreMat = adata.obs[[f"{sigName}Score" for sigName in sigGenes.columns]]
    adata.obs["zsig"] = scH.zScores(scoreMat, cutoff = 0)

    # Majority vote: each Leiden cluster takes the most frequent zsig value among its cells.
    relabelDict = {}
    for li in adata.obs.leiden.cat.categories:
        clusterCalls = adata.obs.loc[adata.obs.leiden == li, "zsig"]
        relabelDict[li] = Counter(clusterCalls).most_common(1)[0][0]
    print(relabelDict)
    #if(patient == "T14"):
    #    relabelDict = {'0': 'scClassical', '1': 'scBasal', '2': 'scClassical', '3': 'scClassical', 
    #     '4': 'scClassical', '5': 'scClassical', '6': 'scClassical', '7': 'scClassical', 
    #     '8': 'scClassical', '9': 'scClassical', '10': 'scClassical', '11': 'scClassical', '12': 'scBasal', '13': 'scClassical'}
    #elif(patient == "T17"):
    #    relabelDict = {'0': 'scClassical', '1': 'scBasal', '2': 'scBasal', '3': 'scClassical', 
    #     '4': 'out', '5': 'scBasal', '6': 'scClassical', '7': 'out', '8': 'scClassical', '9': 'scClassical'}
    #elif(patient == "T18"):
    #    relabelDict ={'0': 'scClassical', '1': 'scBasal', '2': 'scClassical', '3': 'scClassical', 
    #                  '4': 'scBasal', '5': 'out', '6': 'scClassical', '7': 'scClassical', '8': 'scClassical', '9': 'EMT'}

    adata.obs["cs"] = [relabelDict[li] for li in adata.obs.leiden]
    # Collect the labels instead of re-concatenating a growing frame every iteration.
    csParts.append(adata.obs["cs"])
    sc.pl.umap(adata, color=["leiden","celltypes","zsig","cs"]+[f"{sigName}Score" for sigName in sigGenes.columns], 
               cmap="bwr",ncols=4)
    #adata.write(f'write/pdacHwang_{patient}.h5ad')

# Single-column frame indexed by cell name. The column is named "cs" explicitly:
# concatenating a Series row-wise onto a DataFrame stores it under the column name 0,
# not under the Series' own name.
csHold = pd.concat(csParts).to_frame("cs") if csParts else pd.DataFrame(columns=["cs"])
    

In [None]:
# Attach the per-patient cell-state labels to the full object as obs["cs"].
# csHold holds one label per cell, indexed by cell name. Its single column is not
# guaranteed to be called "cs" (pandas names a Series concatenated row-wise onto a
# DataFrame 0), so take the first column and name it explicitly.
if isinstance(csHold, pd.DataFrame):
    csLabels = csHold.iloc[:, 0] if csHold.shape[1] else pd.Series(dtype=object)
else:
    csLabels = csHold

# Index-aligned assignment instead of DataFrame.join: re-running this cell overwrites
# the column, whereas join raises on the already-present column name.
adataAll.obs["cs"] = csLabels.rename("cs").reindex(adataAll.obs_names)

# From here on `adata` refers to the full (all-patient) object again.
adata = adataAll

In [None]:
# Cohort-wide pass over all patients together: same steps as the per-patient loop,
# but with 100 principal components computed (40 used for the neighbour graph) and a
# Leiden resolution of 0.6.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000)

# Store the log-normalised matrix with all genes in .raw before restricting genes.
adata.raw = adata

# Keep highly variable genes that are not flagged in var["mt"].
isNotMt = np.logical_not(adata.var.mt)
keepGenes = np.logical_and(adata.var.highly_variable, isNotMt)
adata = adata[:, keepGenes]

sc.tl.pca(adata, n_comps=100, svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=40)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=0.6)

# Joint embedding coloured by cohort-level cluster, per-patient cell state and cell type.
sc.pl.umap(adata, color=["leiden","cs","celltypes"], cmap="bwr", ncols=3)


In [None]:
# csSimp keeps the cell-state label for every cell unchanged.
csValues = np.array(adata.obs.cs.values.copy())
adata.obs["csSimp"] = csValues

# cs keeps the cell-state label only for "Ductal cell type 2" cells; every other cell
# gets its cell-type annotation instead.
isDct2 = (adata.obs.celltypes == "Ductal cell type 2").to_numpy()
ctValues = np.asarray(adata.obs.celltypes.values, dtype=object)
cellstates = np.where(isDct2, csValues, ctValues)
adata.obs["cs"] = cellstates

In [None]:
#adata.write("write/pdacHumanSmall.h5ad")

In [None]:
# Export the labelled cells together with the matrix stored in the post-QC file.
qcdata = sc.read_h5ad(resultsFileQC)
# Per-gene column totals of the post-QC matrix as a quick sanity check. Summing the
# sparse matrix directly gives the same totals without densifying the full
# cells x genes matrix or looping over rows in Python.
print(qcdata.X.sum(axis=0))

# All-gene, log-normalised matrix saved in .raw before the gene subsetting.
rdata = adata.raw.to_adata()

# Align the post-QC object to the retained cells, in the same order.
qcdata = qcdata[rdata.obs_names,:]
# The layer is copied positionally, so the gene order has to match as well.
assert qcdata.var_names.equals(rdata.var_names), "gene order differs between QC file and adata.raw"
rdata.layers["counts"] = qcdata.X.copy()

# Keep only the annotation columns needed downstream, then write the result.
rdata.obs = rdata.obs[['samples','leiden','cs',"csSimp","celltypes"]]
rdata.write('write/pdacHumanAll_Labeled.h5ad')
rdata

In [None]:
# Write a second file containing only the "Ductal cell type 2" cells of the labelled export.
isDct2 = (rdata.obs.celltypes == "Ductal cell type 2").to_numpy()
rdata[isDct2, :].write('write/pdacHumanDCT2_Labeled.h5ad')

In [None]:
# Cohort UMAP coloured by sample, merged label (cs), cell type and unmerged cell state (csSimp).
sc.pl.umap(adata, color=["samples","cs","celltypes","csSimp"], cmap="bwr",ncols=3)

In [None]:
#adata.write("write/pdacHumanAllLab.h5ad")

In [None]:
# Number of cells per sample in the clustered object.
adata.obs.samples.value_counts()

In [None]:
# Restrict to a hand-picked set of samples (presumably the ones with the most cells,
# going by the variable name — confirm against the counts above) ...
mostCom = ["T14","T17","T9","T22","T18"]
inMostCom = adata.obs.samples.isin(mostCom).to_numpy()
mcAdata = adata[inMostCom, :]

# ... and, within those, to "Ductal cell type 2" cells only.
isDct2 = (mcAdata.obs.celltypes == "Ductal cell type 2").to_numpy()
mcAdata = mcAdata[isDct2]

In [None]:
# Same UMAP panels as for the full cohort, restricted to the selected samples' "Ductal cell type 2" cells.
sc.pl.umap(mcAdata, color=["samples","cs","celltypes","csSimp"], cmap="bwr",ncols=3)

In [None]:
# Reload a previously saved labelled object and export it again.
# NOTE(review): the only statement in this notebook that writes
# "write/pdacHumanAllLab.h5ad" is commented out, so on a fresh run this read depends on
# a file left over from an earlier session.
adata = sc.read_h5ad("write/pdacHumanAllLab.h5ad")
sc.pl.umap(adata, color=["samples","cs","celltypes","csSimp"], cmap="bwr",ncols=3)

qcdata = sc.read_h5ad(resultsFileQC)
# Per-gene column totals of the post-QC matrix as a quick sanity check. Summing the
# sparse matrix directly gives the same totals without densifying the full
# cells x genes matrix or looping over rows in Python.
print(qcdata.X.sum(axis=0))

# All-gene matrix stored in .raw of the reloaded object.
rdata = adata.raw.to_adata()

# Align the post-QC object to the retained cells, in the same order.
qcdata = qcdata[rdata.obs_names,:]
# The layer is copied positionally, so the gene order has to match as well.
assert qcdata.var_names.equals(rdata.var_names), "gene order differs between QC file and adata.raw"
rdata.layers["counts"] = qcdata.X.copy()

# NOTE(review): this writes to the same path as the earlier export cell but keeps fewer
# obs columns (no csSimp / celltypes), so it replaces that file with a reduced
# annotation set — confirm this is intended.
rdata.obs = rdata.obs[['samples','leiden','cs']]
rdata.write('write/pdacHumanAll_Labeled.h5ad')
rdata