In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.io
import scanpy.external as scex
import sklearn.metrics
import seaborn as sns
from matplotlib import pyplot as plt


In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

Set up output properties

In [None]:
# Configure scanpy logging, print the scanpy header, and set initial figure defaults.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
%matplotlib inline

Set scanpy output files

In [None]:
# Output locations: every file written by this notebook lives under `writeDir`
# and is prefixed with `fileName`.
writeDir = "write/"
fileName = "pdacMouse"

# final output
resultsFile = f"{writeDir}{fileName}untreated.h5ad"
# post QC (pre-analysis)
resultsFileQC = f"{writeDir}{fileName}_QC.h5ad"

Set figure parameters

In [None]:
# Figure defaults for the rest of the notebook; saved figures go to a
# per-dataset folder under "figures/".
sc.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=150,
    fontsize=10,
    format='png',
)
figName = fileName
sc.settings.figdir = f"figures/{fileName}/"

In [None]:
# Load the post-QC dataset (path set in `resultsFileQC`) and display its summary.
adata = sc.read(resultsFileQC)
adata

In [None]:
# List the available levels of each annotation used for filtering in the next cell.
for obsColumn in ("treatment", "tumor", "10X_version"):
    print(adata.obs[obsColumn].cat.categories)

In [None]:
# Keep only untreated, autochthonous, 10X v3 cells.
# The three conditions are combined into a single mask so the selection is
# applied once, and `.copy()` materialises the subset: the following cells
# modify `adata` in place, which on a view forces an implicit copy with a warning.
keepCells = (
    (adata.obs["treatment"] == "untreated")      # alternative considered: "vehicle"
    & (adata.obs["tumor"] == "autochthonous")
    & (adata.obs["10X_version"] == "v3")
)
adata = adata[keepCells].copy()
#adata = adata[np.logical_not(adata.obs["concat"]=="17")]#immune cells

In [None]:
# Normalise per-cell totals to target_sum (1e4); the result is not assigned, so `adata` is updated in place.
sc.pp.normalize_total(adata,target_sum=1e4)

In [None]:
# Log-transform the normalised matrix with scanpy's log1p (in place).
sc.pp.log1p(adata)

In [None]:
# First pass with default thresholds: this populates the adata.var columns
# ("means", "dispersions_norm") that the histogram cell below reads.
sc.pp.highly_variable_genes(adata)#, flavor = "seurat", n_top_genes=2000)

In [None]:
# Thresholds used to call highly variable genes (HVGs).
minMean = 0.025
maxMean = 3
minDisp = 0.6

# Compute the HVG statistics first, so the histograms below always describe
# the current `adata` and this cell does not depend on an earlier cell
# having filled adata.var.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean)

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: distribution of log mean expression with the min/max cut-offs.
# A Series mask really drops the near-zero genes; the previous
# DataFrame-on-DataFrame mask only replaced them with NaN.
geneMeans = adata.var["means"]
means = geneMeans[geneMeans > np.exp(-14)]
axs[0].hist(np.log(means), bins=100)
axs[0].axvline(np.log(minMean), color='k', linestyle='dashed', linewidth=1)
axs[0].axvline(np.log(maxMean), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
axs[0].set_xlabel('log(means)')  # the plotted quantity is the log of the mean
axs[0].set_ylabel('counts')

# Right panel: distribution of log normalised dispersion with the cut-off.
# Only strictly positive values can be logged, hence the lower bound.
geneDisp = adata.var["dispersions_norm"]
dispNorm = geneDisp[geneDisp > np.exp(-5)]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('log(dispersions_norm)')
axs[1].set_ylabel('counts')

# Number of genes flagged as highly variable with these thresholds.
print(sum(adata.var.highly_variable))

In [None]:
# scanpy's built-in plot of the highly-variable-gene selection made above.
sc.pl.highly_variable_genes(adata)

In [None]:
# Snapshot the current state (all genes, after normalisation and log1p) in .raw
# before the gene subset below; it is restored later via adata.raw.to_adata().
adata.raw = adata

In [None]:
#adata = adata[:, adata.var.highly_variable]
# Keep highly variable genes that are not flagged as mitochondrial (adata.var.mt).
keepGenes = np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))
# .copy() turns the view into a real AnnData: the PCA cell below writes into
# `adata`, which on a view forces an implicit copy with a warning.
adata = adata[:, keepGenes].copy()

In [None]:
#sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
#sc.pp.scale(adata, max_value=10)

In [None]:
# PCA on the gene-subset matrix; 100 components are computed so the variance plots below can guide the n_pcs choice.
sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')

In [None]:
# Variance explained per principal component, log scale.
sc.pl.pca_variance_ratio(adata, n_pcs = 100, log=True)

In [None]:
# Same plot on a linear scale.
sc.pl.pca_variance_ratio(adata, n_pcs = 100)

In [None]:
# Neighbourhood graph from the first 60 PCs with 20 neighbours per cell; used by UMAP and Leiden below.
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)

In [None]:
# UMAP embedding computed from the neighbourhood graph.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at a very low resolution (0.01) for a coarse split;
# a later cell keeps only cluster "1" from this result.
sc.tl.leiden(adata, resolution=0.01)

In [None]:
# UMAP coloured by cluster, sample annotations and selected features.
# NOTE(review): "Ptprc", "Epcam", "Dcn" and "Vim" are presumably gene symbols used as
# compartment markers — confirm they are what decides which cluster is kept next.
sc.pl.umap(adata, color=["leiden", 'tumor', "10X_version", 'treatment','hash',"mouse",'concat',"Ptprc","Epcam","Dcn","Vim"],ncols=4)

In [None]:
#adata = adata[np.logical_not(adata.obs.leiden=="2")]
# Keep only coarse Leiden cluster "1", then rebuild a full AnnData from .raw so
# the kept cells have all genes again for a fresh round of gene selection.
inKeptCluster = adata.obs["leiden"] == "1"
adata = adata[inKeptCluster].raw.to_adata()

In [None]:
# Thresholds used to call highly variable genes (HVGs) within the kept cluster.
minMean = 0.025
maxMean = 3
minDisp = 0.6

# Recompute the HVG statistics BEFORE plotting. `adata` was just rebuilt from
# .raw, so its "means" / "dispersions_norm" columns are the ones stored in .raw
# before the cluster subset; plotting first would show those stale values
# rather than the statistics of the cells now in `adata`.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean)

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: distribution of log mean expression with the min/max cut-offs.
# A Series mask really drops the near-zero genes; the previous
# DataFrame-on-DataFrame mask only replaced them with NaN.
geneMeans = adata.var["means"]
means = geneMeans[geneMeans > np.exp(-14)]
axs[0].hist(np.log(means), bins=100)
axs[0].axvline(np.log(minMean), color='k', linestyle='dashed', linewidth=1)
axs[0].axvline(np.log(maxMean), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
axs[0].set_xlabel('log(means)')  # the plotted quantity is the log of the mean
axs[0].set_ylabel('counts')

# Right panel: distribution of log normalised dispersion with the cut-off.
# Only strictly positive values can be logged, hence the lower bound.
geneDisp = adata.var["dispersions_norm"]
dispNorm = geneDisp[geneDisp > np.exp(-5)]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('log(dispersions_norm)')
axs[1].set_ylabel('counts')

# Number of genes flagged as highly variable with these thresholds.
print(sum(adata.var.highly_variable))

In [None]:
# scanpy's built-in plot of the highly-variable-gene selection for the kept cluster.
sc.pl.highly_variable_genes(adata)

In [None]:
# Snapshot the all-gene state of the kept cells in .raw before the gene subset below;
# the export cell at the end of the notebook rebuilds from it with adata.raw.to_adata().
adata.raw = adata

In [None]:
#adata = adata[:, adata.var.highly_variable]
# Keep highly variable genes that are not flagged as mitochondrial (adata.var.mt).
keepGenes = np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))
# .copy() turns the view into a real AnnData: the PCA cell below writes into
# `adata`, which on a view forces an implicit copy with a warning.
adata = adata[:, keepGenes].copy()

In [None]:
# PCA (100 components) on the gene-subset matrix of the kept cluster.
sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')

In [None]:
# Neighbourhood graph with the same settings as the first pass (20 neighbours, 60 PCs).
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=60)

In [None]:
# UMAP embedding of the kept cluster.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 0.2 (higher than the 0.01 used for the coarse split);
# this overwrites the earlier "leiden" labels in adata.obs.
sc.tl.leiden(adata, resolution=0.2)

In [None]:
# UMAP of the kept cluster coloured by the new clusters, sample annotations and two features.
sc.pl.umap(adata, color=["leiden", 'tumor', "10X_version", 'treatment',"mouse",'concat',"Ptprc","Dcn"],ncols=4)

In [None]:
# Per-cluster marker ranking with the Wilcoxon test. use_raw=False makes the test run on
# adata.X (the gene-subset matrix) rather than on adata.raw.
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', use_raw=False)
# Optional filtering of the ranking (currently disabled):
#sc.tl.filter_rank_genes_groups(adata, groupby="leiden", use_raw=False,
#                                   key_added='rank_genes_groups_filtered', 
#                                   min_in_group_fraction=0.25, min_fold_change=1, max_out_group_fraction=0.5, compare_abs=False)
# Plot the top 25 genes per cluster, each panel with its own y axis.
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# Table of the 15 top-ranked gene names per cluster (one column per cluster).
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(15)

In [None]:
# UMAP with cluster labels drawn on the embedding, next to "Ptprc" and "Lyz2".
sc.pl.umap(adata, color=["leiden","Ptprc","Lyz2"], legend_loc="on data")

In [None]:
#sc.tl.dendrogram(adata,groupby="leiden", n_pcs=60)
#sc.pl.rank_genes_groups_dotplot(adata,n_genes=3)#,key="rank_genes_groups_filtered")

In [None]:
# Show the output path before writing to it.
resultsFile

In [None]:
# Checkpoint: save the clustered object, then display its summary.
adata.write(resultsFile)
adata

In [None]:
# Reload the checkpoint written above.
adata = sc.read(resultsFile)
# Make sure adata.uns['log1p']['base'] exists after the round trip.
# NOTE(review): presumably needed because a None-valued 'base' entry is not
# stored in the h5ad file while scanpy later looks the key up — confirm against
# the installed scanpy/anndata versions.
# setdefault also covers the case where the whole 'log1p' entry is missing,
# which would raise KeyError with plain item assignment.
adata.uns.setdefault('log1p', {})['base'] = None

In [None]:
# Human/mouse homology table; the two columns are kept as parallel arrays so a
# human symbol at position k maps to the mouse symbol at position k.
homology = pd.read_csv("data/hgncHM_121.csv")
humanGenes = homology["human"].to_numpy(copy=True)
mouseGenes = homology["mouse"].to_numpy(copy=True)
homology

In [None]:
# Inspect the mouse gene array taken from the homology table.
mouseGenes

In [None]:
import scanpyHelpers as scH

In [None]:
# Signature gene lists: one column per signature.
sigGenes = pd.read_csv("data/PDAC/basClaSigGenes.csv")

# One label per signature column, in column order.
signatureLabels = ["basal", "basal", "classical", "classical", "EMT"]

# Score the signatures and label cells via the project helper; the signature
# genes are translated from human to mouse symbols with the homology arrays.
# NOTE(review): the helper presumably adds the score columns and "cellState" to
# adata.obs (both are used by later cells) — confirm in scanpyHelpers.
scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(
    adata,
    sigGenes,
    labelOfSigGenes=signatureLabels,
    ogLabel="leiden",
    translate=True,
    fromGenes=humanGenes,
    toGenes=mouseGenes,
)

In [None]:
# UMAP coloured by each signature score and by the "cellState" label.
sc.pl.umap(adata, color=scoreNames+["cellState"], ncols=2)

In [None]:
# Names of the score columns returned by scH.scoreAndLabel.
scoreNames

In [None]:
# Run the project helper gmmScoreGeneSig on the three signature score columns.
# NOTE(review): presumably fits a two-component mixture per score with initial means 0 and 0.5
# and plots the three fits (plotLen = 3) — confirm in scanpyHelpers.
scH.gmmScoreGeneSig(adata.obs[['scBasalScore','scClassicalScore','EMTScore']], meansInit=[[0],[0.5]],plotLen = 3, show=True)

In [None]:
# Checkpoint: save the object with the signature scores, then display its summary.
adata.write(resultsFile)
adata

In [None]:
# Inspect the signature gene table (one column per signature).
sigGenes

In [None]:
# Build a long-format (gene, class) marker table for decoupler from the
# signature columns, translating human symbols to mouse where a homolog exists.

# Class label for every signature column, in column order.
classLabDict = dict(zip(sigGenes.columns,["basal","basal","classical","classical","EMT"]))

# Human -> mouse lookup; setdefault keeps the FIRST homolog listed for a human
# symbol, matching a first-match search through the homology arrays.
humanToMouse = {}
for humanSymbol, mouseSymbol in zip(humanGenes, mouseGenes):
    humanToMouse.setdefault(humanSymbol, mouseSymbol)

# Collect rows in a list instead of writing into a fixed-size buffer at
# i*200+j: that scheme overwrote rows as soon as a signature had more than 200
# entries and failed once the position exceeded the 6000 pre-allocated rows.
markerRows = []
for clust in sigGenes.columns:
    for gene in sigGenes[clust]:
        if pd.isna(gene):
            # shorter signatures are padded with missing values in the table
            continue
        # genes without a homolog keep their original symbol
        markerRows.append((humanToMouse.get(gene, gene), classLabDict[clust]))

markers = pd.DataFrame(markerRows, columns=["genes", "clustName"])
# Drop rows whose homolog entry is itself missing.
markers = markers.dropna().reset_index(drop=True)
markers

In [None]:
# Remove repeated (gene, class) pairs, e.g. a gene listed in two signatures that share a class label.
markers = markers.drop_duplicates()

In [None]:
import decoupler as dc

In [None]:
# Over-representation analysis with decoupler: `markers` is the network, with the class
# label as source and the gene as target; min_n=3 is the minimum-targets setting passed to decoupler.
# NOTE(review): results are presumably stored in adata.obsm (the next cell reads
# obsm_key='ora_estimate') — confirm against the installed decoupler version.
dc.run_ora(
    mat=adata,
    net=markers,
    source='clustName',
    target='genes',
    min_n=3,
    verbose=True
)

In [None]:
# Pull the ORA estimates out of adata.obsm as their own object.
acts = dc.get_acts(adata, obsm_key='ora_estimate')

# Infinite scores (pvals=0) are replaced by the largest finite score observed.
finiteMask = np.isfinite(acts.X)
largestFinite = np.nanmax(acts.X[finiteMask])
acts.X[~finiteMask] = largestFinite

acts

In [None]:
# UMAP and per-cluster violin plots drawn from the `acts` object, coloured/keyed by `scoreNames`.
# NOTE(review): `scoreNames` comes from scH.scoreAndLabel, not from the ORA run — confirm these are
# the intended keys here rather than the ORA source names.
sc.pl.umap(acts, color=scoreNames+['leiden'], cmap='RdBu_r')
sc.pl.violin(acts, keys=scoreNames, groupby='leiden')

In [None]:
# Rank the ORA sources for each Leiden cluster against all other cells ('rest').
df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')
df

In [None]:
# For every cluster, collect the names in its first `n_ctypes` rows of the ranking table.
n_ctypes = 3
topPerGroup = df.groupby('group').head(n_ctypes)
ctypes_dict = topPerGroup.groupby('group')['names'].apply(list).to_dict()
ctypes_dict

In [None]:
# Matrix plot of the selected sources per cluster, scaled per variable (standard_scale='var'),
# with a dendrogram over the clusters.
sc.pl.matrixplot(acts, ctypes_dict, 'leiden', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')

In [None]:
# Cluster -> name of the first row per cluster in the ranking table; used as the cluster annotation below.
annotation_dict = df.groupby('group').head(1).set_index('group')['names'].to_dict()
annotation_dict

In [None]:
# Give every cell the annotation of its Leiden cluster (new obs column "decoupler").
cellClusters = adata.obs['leiden']
adata.obs['decoupler'] = [annotation_dict[cluster] for cluster in cellClusters]

# Compare the new annotation with the earlier labels on the UMAP.
sc.pl.umap(adata, color=['decoupler', 'cellState', "leiden", "Vim"])

In [None]:
# Show the output path before writing to it.
resultsFile

In [None]:
# Final checkpoint: save the annotated object, then display its summary.
adata.write(resultsFile)
adata

In [None]:
# Reload the annotated object written above. `resultsFile` is used instead of a
# repeated literal path so this cell cannot drift from the file that was saved.
adata = sc.read_h5ad(resultsFile)

In [None]:
# Overview UMAP: annotation, cell state, clusters and the three signature scores.
sc.pl.umap(adata, color=["decoupler",'cellState', "leiden",'scBasalScore', 'scClassicalScore', 'EMTScore'], cmap="bwr")

In [None]:
# Export a slim, labelled dataset: all-gene expression from .raw, the matrix of
# the post-QC file as a "counts" layer, and only the cluster / cell-state labels.
qcdata = sc.read_h5ad(resultsFileQC)
# Per-gene column sums of the QC matrix, as a quick look at its values.
# Summing along axis 0 avoids densifying the whole matrix just to add it up.
print(qcdata.X.sum(axis=0))

# All-gene matrix of the annotated cells, as stored in .raw.
rdata = adata.raw.to_adata()

# Align the QC data to the annotated cells. Genes are re-ordered only when the
# two objects disagree, so the layer can never be attached with shifted columns.
qcdata = qcdata[rdata.obs_names,:]
if not qcdata.var_names.equals(rdata.var_names):
    qcdata = qcdata[:, rdata.var_names]
rdata.layers["counts"] = qcdata.X.copy()

# Keep only the two label columns; "cs" is the decoupler annotation.
rdata.obs["cs"] = rdata.obs["decoupler"]
rdata.obs = rdata.obs[['leiden','cs']].copy()
rdata.write('write/pdacMouse_Labeled.h5ad')
rdata

In [None]:
# Show the path of the post-QC input file used above.
resultsFileQC