In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.io
import scanpy.external as scex
import sklearn.metrics
import seaborn as sns
from matplotlib import pyplot as plt
import scanpyHelpers as scH

In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

Set up scanpy output properties (logging verbosity and default figure settings)

In [None]:
# Configure scanpy logging and default figure appearance for this session.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of scanpy and its dependencies for reproducibility.
sc.logging.print_header()
# Default figure settings; dpi is overridden by a later set_figure_params call in this notebook.
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
%matplotlib inline

Set scanpy output files

In [None]:
# Directory and base name shared by every file this notebook reads or writes.
writeDir = "write/"
fileName = "pdacHwang"

# Final annotated object produced by this notebook.
resultsFile = f"{writeDir}{fileName}.h5ad"
# Post-QC (pre-analysis) object that this notebook starts from.
resultsFileQC = f"{writeDir}{fileName}_QC.h5ad"

Set figure parameters

In [None]:
# Figure defaults: 100 dpi on screen, 150 dpi when saved, PNG output.
sc.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=150,
    fontsize=10,
    format='png',
)
# Saved figures go to a per-dataset sub-directory.
sc.settings.figdir = f"figures/{fileName}/"
# Prefix used when naming figures.
figName = fileName

In [None]:
# Load the post-QC AnnData object and display its summary.
adata = sc.read(resultsFileQC)
adata

In [None]:
# Inspect the per-cell metadata table.
adata.obs

In [None]:
# Derive a per-cell "treated" flag from the first character of the sample name.
# NOTE(review): presumably "U" = untreated and another letter = treated — confirm against the sample naming scheme.
adata.obs["treated"] = [x[0] for x in adata.obs["sample"]]

In [None]:
# Keep only malignant cells from samples whose "treated" flag is "U".
# The subset is materialised with .copy(): slicing an AnnData returns a view,
# and the in-place preprocessing that follows (normalisation, log1p, HVG
# annotation) should operate on a real object rather than trigger an implicit
# copy-on-write of a view.
isMalignant = adata.obs.cell_type == "Malignant"
isUntreated = adata.obs.treated == "U"
adata = adata[isMalignant & isUntreated].copy()
adata

In [None]:
from collections import Counter
# Number of remaining cells per sample after filtering.
Counter(adata.obs["sample"])

In [None]:
# Scale every cell to a total of 10,000 counts (modifies adata.X in place).
sc.pp.normalize_total(adata,target_sum=1e4)

In [None]:
# Apply log(1 + x) to the normalised counts (in place).
sc.pp.log1p(adata)

In [None]:
# First pass with default thresholds: populates the per-gene "means" and
# "dispersions_norm" columns in adata.var that the threshold plots below read.
sc.pp.highly_variable_genes(adata)#, flavor = "seurat", n_top_genes=2000)

In [None]:
# Thresholds used to select highly variable genes; shown as dashed lines on
# the histograms below and then passed to sc.pp.highly_variable_genes.
minMean = 0.05
maxMean = 2.5
minDisp = 0.7

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Work on Series with plain boolean indexing so filtered-out genes are dropped
# instead of being kept as NaN rows (which is what DataFrame-on-DataFrame
# masking does).
geneMeans = adata.var["means"]
# Drop (near-)zero means so the log is finite.
means = geneMeans[geneMeans > np.exp(-14)]
axs[0].hist(np.log(means), bins=100)
axs[0].axvline(np.log(minMean), color='k', linestyle='dashed', linewidth=1)
axs[0].axvline(np.log(maxMean), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
axs[0].set_xlabel('log(mean)')  # the histogram is drawn on the log scale
axs[0].set_ylabel('counts')

geneDispNorm = adata.var["dispersions_norm"]
# Only strictly positive normalised dispersions can be log-transformed; genes
# at or below exp(-5) (including all negative values) are left out of the plot.
dispNorm = geneDispNorm[geneDispNorm > np.exp(-5)]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('log(normalized dispersion)')  # log scale, as plotted
axs[1].set_ylabel('counts')

# Re-select highly variable genes with the thresholds shown above and report
# how many genes pass.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean)
print(sum(adata.var.highly_variable))

In [None]:
# Scanpy's standard dispersion-vs-mean plot with the selected genes highlighted.
sc.pl.highly_variable_genes(adata)

In [None]:
#for inGene in adata.var[-8:].index.tolist():
#    adata.var.loc[inGene,"highly_variable"] = False

In [None]:
# Keep a snapshot of the current object (all genes) in .raw before genes are subset,
# so genes outside the selected set remain available, e.g. for plotting.
adata.raw = adata

In [None]:
#adata = adata[:, adata.var.highly_variable]
# Keep highly variable genes that are not flagged as mitochondrial in adata.var.mt.
# .copy() turns the sliced view into a standalone AnnData, so the PCA /
# neighbours / clustering steps that write into the object do not operate on a view.
keepGenes = np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))
adata = adata[:, keepGenes].copy()

In [None]:
#sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
#sc.pp.scale(adata, max_value=10)

In [None]:
# Compute 100 principal components with the ARPACK solver.
# Note: the regress_out / scale cells above are commented out, so PCA runs on
# the unscaled log-normalised values.
sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')

In [None]:
# Variance explained per PC (log scale), used to choose how many PCs to keep.
sc.pl.pca_variance_ratio(adata, n_pcs = 100, log=True)

In [None]:
# Same plot on a linear scale.
sc.pl.pca_variance_ratio(adata, n_pcs = 100)

In [None]:
# Build the k-nearest-neighbour graph (k = 20) on the first 60 principal components.
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)

In [None]:
# 2-D UMAP embedding computed from the neighbour graph.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 0.3; the labels are read back from adata.obs["leiden"] below.
sc.tl.leiden(adata, resolution=0.3)

In [None]:
# Inspect the per-cell metadata again, now including the clustering result.
adata.obs

In [None]:
# Overview UMAPs: clustering, existing annotations, and two marker genes
# (PTPRC, COL1A1), three panels per row.
overviewKeys = [
    "leiden",
    "cell_subtype",
    "sample",
    "cell_type",
    "treated",
    "PTPRC",
    "COL1A1",
]
sc.pl.umap(adata, color=overviewKeys, ncols=3)

In [None]:
# Expression of four marker genes on the UMAP, all in one row.
markerGenes = ["VIM", "FBN1", "EPCAM", "PECAM1"]
sc.pl.umap(adata, color=markerGenes, ncols=4)

In [None]:
# QC metrics on the UMAP, to check that the structure is not driven by cell quality.
qcKeys = ["n_genes", "pct_counts_mt", "total_counts"]
sc.pl.umap(adata, color=qcKeys)

In [None]:
# Show the path the object is about to be written to.
resultsFile

In [None]:
# Summary of the object before saving.
adata

In [None]:
# Save the clustered object to resultsFile (checkpoint).
adata.write(resultsFile)
adata

In [None]:
# Reload the checkpoint; the notebook can be resumed from this cell
# (after running the import and path cells).
adata = sc.read(resultsFile)
adata

In [None]:
# Comma-separated homology table; displayed here for inspection.
homology = pd.read_csv("data/hgncHM_121.csv")
homology

In [None]:
# Signature gene lists, one column per signature.
sigGenes = pd.read_csv("data/PDAC/basClaSigGenes.csv")

# One coarse label per signature column, in column order.
signatureLabels = ["classical", "classical", "basal", "basal", "EMT"]

# Score every signature and label the leiden clusters via the project helper.
# NOTE(review): a later cell plots adata.obs["cellState"], so the helper
# presumably writes that column — confirm in scanpyHelpers.
scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(
    adata,
    sigGenes,
    labelOfSigGenes=signatureLabels,
    ogLabel="leiden",
)
sc.pl.umap(adata, color=scoreNames, ncols=2)


In [None]:
# Signature scores next to the derived "cellState" label.
sc.pl.umap(adata, color=scoreNames+["cellState"],ncols=2)

In [None]:
# Re-cluster at a coarser resolution (0.2). No key_added is given, so this replaces
# the previous adata.obs["leiden"] labels; anything derived from the earlier
# clustering now refers to clusters that are no longer stored.
sc.tl.leiden(adata, resolution=0.2)
sc.pl.umap(adata, color = ["leiden"], legend_loc="on data")

In [None]:
# Compare the new clustering with the cellState labels, with labels drawn on the embedding.
sc.pl.umap(adata, color = ["leiden","cellState"], legend_loc="on data")

In [None]:
# Inspect the signature gene table (one column per signature).
sigGenes

In [None]:
# Map every signature column of sigGenes to a coarse class label.
# NOTE(review): this label order (basal, basal, classical, classical, EMT)
# differs from the order used for the same signatures elsewhere in this
# notebook (classical, classical, basal, basal, EMT) — confirm which one
# matches the column order of sigGenes.
classLabDict = dict(zip(sigGenes.columns,["basal","basal","classical","classical","EMT"]))

# Fail loudly if a signature column has no label, instead of dropping its genes.
unlabelledCols = [col for col in sigGenes.columns if col not in classLabDict]
if unlabelledCols:
    raise KeyError(f"No class label defined for signature columns: {unlabelledCols}")

# Reshape the wide table (one column per signature) into a long
# (gene, class label) table. melt handles any number of rows and columns, so
# no fixed-size buffer or manual row arithmetic is needed.
markers = sigGenes.melt(var_name="sigColumn", value_name="genes")
markers["clustName"] = markers["sigColumn"].map(classLabDict)

# Signatures of unequal length are padded with missing values; drop those rows.
markers = markers[["genes", "clustName"]].dropna()
markers

In [None]:
# A gene can appear in several signatures that share a class label; keep each
# (gene, class) pair once.
markers = markers.drop_duplicates()

In [None]:
import decoupler as dc

In [None]:
# Over-representation analysis of the marker sets in every cell.
# `markers` is the network: 'clustName' is the source (gene set) and 'genes'
# the target. The estimates are read back from adata.obsm['ora_estimate'] below.
# NOTE(review): min_n=3 presumably drops sources with fewer than 3 matching genes — confirm in the decoupler docs.
dc.run_ora(
    mat=adata,
    net=markers,
    source='clustName',
    target='genes',
    min_n=3,
    verbose=True
)

In [None]:
# Extract the ORA estimates stored in adata.obsm['ora_estimate'] as their own object.
acts = dc.get_acts(adata, obsm_key='ora_estimate')

# Some estimates are not finite (inf where the p-value is 0); cap every
# non-finite entry at the largest finite value observed.
finiteMask = np.isfinite(acts.X)
max_e = np.nanmax(acts.X[finiteMask])
acts.X[~finiteMask] = max_e

acts

In [None]:
# Signature scores and clusters on the UMAP carried by `acts`, then the score
# distributions per leiden cluster.
sc.pl.umap(acts, color=scoreNames+['leiden'], cmap='RdBu_r')
sc.pl.violin(acts, keys=scoreNames, groupby='leiden')

In [None]:
# Rank the ORA sources for every leiden cluster against all other cells.
# The result has (at least) 'group' and 'names' columns, which are used below.
df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')
df

In [None]:
# Number of leading sources to keep per cluster.
n_ctypes = 3

# First n_ctypes rows of each group (in the order of df), collected as {cluster: [source names]}.
topSources = df.groupby('group').head(n_ctypes)
ctypes_dict = topSources.groupby('group')['names'].apply(list).to_dict()
ctypes_dict

In [None]:
# Mean activity of the top sources per leiden cluster, with clusters ordered by a dendrogram.
# NOTE(review): standard_scale='var' rescales each variable to the 0-1 range
# (min-max), so the colour-bar title 'Z-scaled scores' may be misleading — confirm.
sc.pl.matrixplot(acts, ctypes_dict, 'leiden', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')

In [None]:
# First row of each cluster in df, as a {cluster: source name} mapping.
topPerGroup = df.groupby('group').head(1)
annotation_dict = topPerGroup.set_index('group')['names'].to_dict()
annotation_dict

In [None]:
# Add cell type column based on annotation.
# Every leiden cluster must be a key of annotation_dict, otherwise this raises KeyError.
adata.obs['decoupler'] = [annotation_dict[clust] for clust in adata.obs['leiden']]

# Visualize the decoupler annotation next to the cellState labels and the clustering.
sc.pl.umap(adata, color=['decoupler','cellState', "leiden"])

In [None]:
# Save the annotated object, overwriting the earlier checkpoint at resultsFile.
adata.write(resultsFile)
adata

In [None]:
# Path the annotated object was written to.
resultsFile

In [None]:
# Per-cell score columns in adata.obs to plot. Defined here so the cell runs
# on a fresh kernel; `scores` was previously only assigned in a later cell.
scores = ["scClassicalScore","classicalScore","basalScore","scBasalScore","emtScore"]

# One histogram per score, with the median marked by a dashed line.
# squeeze=False keeps `axs` two-dimensional even when there is a single score.
fig, axs = plt.subplots(1, len(scores), figsize=(10, 5), squeeze=False)

for ax, score in zip(axs[0], scores):
    ax.hist(adata.obs[score], bins=100)#, log=True),
    ax.set_title(score)
    ax.axvline(np.median(adata.obs[score]), color='k', linestyle='dashed', linewidth=1)

# plt.show() instead of fig.show(): the latter warns on the non-GUI inline backend.
plt.show()

In [None]:
# Label every leiden cluster as classical / basal / EMT / inter from the
# signature scores stored in adata.obs.
cats = adata.obs.leiden.cat.categories
scores = ["scClassicalScore","classicalScore","basalScore","scBasalScore","emtScore"]
label = ["classical","classical","basal","basal","EMT"]
labelDict = dict(zip(scores,label))

# catScoreMed[cluster, score]: fraction of ALL cells whose score is above the
# cluster's median for that score (both rounded to 4 decimals).
catScoreMed = pd.DataFrame(np.zeros((len(cats),len(scores))),index=cats, columns=scores)
newBClabel = list(cats)

for i, leid in enumerate(cats):
    # Boolean mask on .obs avoids building an AnnData view per cluster.
    inCluster = adata.obs.leiden == leid
    for score in scores:
        scorMed = np.round(np.median(adata.obs.loc[inCluster, score]), decimals=4)
        # NOTE(review): with '>' a cluster with a HIGH median gets a LOW
        # fraction, so the argmax below favours the score on which the cluster
        # ranks lowest. Confirm whether '<' (percentile of the cluster median)
        # was intended; the original comparison is kept unchanged here.
        catScoreMed.loc[leid, score] = np.round((adata.obs[score] > scorMed).mean(), decimals=4)

    clusterRow = catScoreMed.loc[leid, :]
    if clusterRow.max() > 0.5:
        # idxmax returns the score name of the (first) largest fraction.
        newBClabel[i] = labelDict[clusterRow.idxmax()]
    else:
        newBClabel[i] = "inter"

#catScoreMed
# Look labels up by category name rather than int(category), so this does not
# depend on the categories being the strings "0", "1", ... in positional order.
clusterToState = dict(zip(cats, newBClabel))
adata.obs["cellState"] = [clusterToState[lei] for lei in adata.obs.leiden]