In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.io
import scanpy.external as scex
import sklearn.metrics
import seaborn as sns
from matplotlib import pyplot as plt
import scanpyHelpers as scH


In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

Set up output properties (scanpy verbosity and default figure settings)

In [None]:
# Configure scanpy logging and default figure appearance for the session.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# NOTE: sc.set_figure_params is called again further down with dpi=100, so the
# dpi chosen here only applies until that cell runs.
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
%matplotlib inline

Set scanpy output files

In [None]:
# Where this notebook reads and writes its AnnData (.h5ad) files.
writeDir = "write/"

fileName = "luadDost"

resultsFile = f"{writeDir}{fileName}.h5ad"        # final output
resultsFileQC = f"{writeDir}{fileName}_QC.h5ad"   # post QC (pre-analysis)

Set figure parameters

In [None]:
# Figure defaults for the rest of the notebook; saved figures go to a
# per-dataset folder named after fileName.
sc.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=150,
    fontsize=10,
    format='png',
)
sc.settings.figdir = f"figures/{fileName}/"
figName = fileName

In [None]:
# Load the post-QC (pre-analysis) AnnData object and display its summary.
adata = sc.read_h5ad(resultsFileQC)
adata

In [None]:
# Keep only the cells whose `source` annotation is "1".
# .copy() turns the subset into a standalone AnnData: the next cells normalise
# and log-transform it in place, which should not happen on a view of the
# loaded object.
adata = adata[adata.obs.source == "1"].copy()

In [None]:
# Normalise each cell to a total of 1e4 counts.
sc.pp.normalize_total(adata,target_sum=1e4)

In [None]:
# Log-transform the normalised values (log1p).
sc.pp.log1p(adata)

In [None]:
# First pass with default settings. The next cell reads adata.var["means"] and
# adata.var["dispersions_norm"], which this call is expected to populate, to
# choose the thresholds for the real HVG selection.
sc.pp.highly_variable_genes(adata)#, flavor = "seurat", n_top_genes=2000)

In [None]:
# Thresholds for highly-variable-gene (HVG) selection. The dashed lines in the
# two histograms show where they fall in the observed distributions.
minMean = 0.075
maxMean = 2.7
minDisp = 0.65

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Filter the Series itself (not a one-column DataFrame): a DataFrame mask keeps
# every row and fills the rejected ones with NaN, whereas a Series mask drops
# them, so only genes above the floor reach the log and the histogram.
means = adata.var["means"]
means = means[means > np.exp(-14)]
axs[0].hist(np.log(means), bins=100)
axs[0].axvline(np.log(minMean), color='k', linestyle='dashed', linewidth=1)
axs[0].axvline(np.log(maxMean), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
axs[0].set_xlabel('log(mean)')  # the histogram is drawn on the log scale
axs[0].set_ylabel('counts')

dispNorm = adata.var["dispersions_norm"]
dispNorm = dispNorm[dispNorm > np.exp(-5)]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('log(normalized dispersion)')  # log scale, as plotted
axs[1].set_ylabel('counts')

# Re-run the selection with the thresholds above and report how many genes pass.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean)
print(adata.var.highly_variable.sum())

In [None]:
# scanpy's own diagnostic plot of the highly-variable-gene selection.
sc.pl.highly_variable_genes(adata)

In [None]:
#for inGene in adata.var[-8:].index.tolist():
#    adata.var.loc[inGene,"highly_variable"] = False

In [None]:
# Snapshot the current object (all genes, after normalisation and log1p) in
# .raw before the gene subset in the next cell.
adata.raw = adata

In [None]:
# Keep genes that are highly variable AND not flagged as mitochondrial in
# adata.var.mt.
# .copy() materialises the subset so PCA and the later steps work on a real
# AnnData object rather than a view.
#adata = adata[:, adata.var.highly_variable]
adata = adata[:, np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))].copy()

In [None]:
#sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
#sc.pp.scale(adata, max_value=10)

In [None]:
# PCA with 100 components. The regress_out and scale cells above are commented
# out, so this runs on the unscaled log-normalised matrix of the selected genes.
sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')

In [None]:
# Variance explained by each of the 100 PCs, log scale.
sc.pl.pca_variance_ratio(adata, n_pcs = 100, log=True)

In [None]:
# Same plot on a linear scale.
sc.pl.pca_variance_ratio(adata, n_pcs = 100)

In [None]:
# Neighbourhood graph: 20 neighbours, using 40 of the 100 computed PCs.
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=40)

In [None]:
# UMAP embedding; uses the neighbour graph computed above.
sc.tl.umap(adata)

In [None]:
# Coarse Leiden clustering (resolution 0.2). The cluster ids are mapped to
# broad compartments by hand a few cells below.
sc.tl.leiden(adata, resolution=0.2)

In [None]:
# Display the AnnData summary after the steps above.
adata

In [None]:
# UMAP coloured by cluster, source and a panel of marker genes; used to decide
# the cluster -> label mapping below.
sc.pl.umap(adata, color=["leiden","source","PTPRC","PECAM1","EPCAM","DCN","VIM"],ncols=4)

In [None]:
# Same colouring on the PCA embedding.
sc.pl.pca(adata, color=["leiden","source","PTPRC","PECAM1","EPCAM","DCN","VIM"],ncols=4)

In [None]:
# UMAP coloured by the per-cell QC metrics stored in adata.obs.
sc.pl.umap(adata, color=["n_genes","pct_counts_mt","total_counts"])

In [None]:
# Marker expression per Leiden cluster.
sc.pl.violin(adata, ["PTPRC","DCN","EPCAM","PECAM1"], groupby="leiden")

In [None]:
# Hand-assigned label for each Leiden cluster id. Any cluster id that is not
# listed here gets the label "imm".
leidenToLabel = {
    "0": "tumor", "3": "tumor",
    "2": "epith", "7": "epith",
    "5": "fibr",  "6": "fibr",
}
label = [leidenToLabel.get(l, "imm") for l in adata.obs.leiden]

In [None]:
# Attach the per-cell labels built in the previous cell as an obs column.
adata.obs["label"] = label

In [None]:
# PCA embedding again, now including the new "label" column.
sc.pl.pca(adata, color=["leiden","source","PTPRC","PECAM1","EPCAM","DCN","VIM","label"],ncols=4)

In [None]:
# UMAP again, now including the new "label" column.
sc.pl.umap(adata, color=["leiden","source","PTPRC","PECAM1","EPCAM","DCN","VIM","label"],ncols=4)

In [None]:
# Rebuild the working object from .raw, which was saved before the gene subset.
# The next cell reads adata.obs.label from the result, so the cell annotations
# added since are expected to carry over.
adata = adata.raw.to_adata()

In [None]:
# Restrict the analysis to cells labelled "tumor".
# A vectorised boolean mask replaces the Python list comprehension, and .copy()
# makes the subset independent, because the cells below write HVG statistics,
# PCA and clustering results into it.
adata = adata[adata.obs.label == "tumor"].copy()

In [None]:
# First pass with default settings on the tumor-only subset. The next cell
# reads adata.var["means"] and adata.var["dispersions_norm"], which this call
# is expected to populate, to choose the thresholds.
sc.pp.highly_variable_genes(adata)#, flavor = "seurat", n_top_genes=2000)

In [None]:
# HVG thresholds for the tumor-only subset. The dashed lines in the two
# histograms show where they fall in the observed distributions.
minMean = 0.075
maxMean = 2.6
minDisp = 0.8

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Filter the Series itself (not a one-column DataFrame): a DataFrame mask keeps
# every row and fills the rejected ones with NaN, whereas a Series mask drops
# them, so only genes above the floor reach the log and the histogram.
means = adata.var["means"]
means = means[means > np.exp(-14)]
axs[0].hist(np.log(means), bins=100)
axs[0].axvline(np.log(minMean), color='k', linestyle='dashed', linewidth=1)
axs[0].axvline(np.log(maxMean), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
axs[0].set_xlabel('log(mean)')  # the histogram is drawn on the log scale
axs[0].set_ylabel('counts')

dispNorm = adata.var["dispersions_norm"]
dispNorm = dispNorm[dispNorm > np.exp(-5)]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('log(normalized dispersion)')  # log scale, as plotted
axs[1].set_ylabel('counts')

# Re-run the selection with the thresholds above and report how many genes pass.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean)
print(adata.var.highly_variable.sum())

In [None]:
# scanpy's diagnostic plot of the HVG selection on the tumor subset.
sc.pl.highly_variable_genes(adata)

In [None]:
#for inGene in adata.var[-8:].index.tolist():
#    adata.var.loc[inGene,"highly_variable"] = False

In [None]:
# Snapshot the tumor-only object (all genes) in .raw before the gene subset in
# the next cell; the export cell at the end of the notebook reads it back.
adata.raw = adata

In [None]:
# Keep genes that are highly variable AND not flagged as mitochondrial in
# adata.var.mt.
# .copy() materialises the subset so PCA and the later steps work on a real
# AnnData object rather than a view.
#adata = adata[:, adata.var.highly_variable]
adata = adata[:, np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))].copy()

In [None]:
#sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
#sc.pp.scale(adata, max_value=10)

In [None]:
# PCA (100 components) on the tumor subset. Scaling and regress_out remain
# commented out above, so the input is the unscaled log-normalised matrix.
sc.tl.pca(adata, n_comps = 100, svd_solver='arpack')

In [None]:
# Variance explained by each of the 100 PCs, log scale.
sc.pl.pca_variance_ratio(adata, n_pcs = 100, log=True)

In [None]:
# Same plot on a linear scale.
sc.pl.pca_variance_ratio(adata, n_pcs = 100)

In [None]:
# Neighbourhood graph for the tumor subset: 20 neighbours, 40 PCs.
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=40)

In [None]:
# UMAP embedding of the tumor subset.
sc.tl.umap(adata)

In [None]:
# Leiden clustering of the tumor cells at resolution 0.3; overwrites the
# "leiden" column carried over from the first pass.
sc.tl.leiden(adata, resolution=0.3)

In [None]:
# UMAP of the tumor subset coloured by cluster, source and selected genes.
sc.pl.umap(adata, color=["leiden","source","EPCAM","PECAM1","VIM","SLC4A11","CAPS","SCGB1A1"],ncols=3)

In [None]:
# UMAP of the tumor subset coloured by the per-cell QC metrics.
sc.pl.umap(adata, color=["n_genes","pct_counts_mt","total_counts"])

In [None]:
# Reset the log base recorded in adata.uns['log1p'] to None.
# NOTE(review): presumably a guard so rank_genes_groups in the next cell finds
# a 'base' entry (the same line appears commented out at the top of the
# notebook) -- confirm it is still needed.
adata.uns['log1p']['base']=None

In [None]:
# Rank genes per Leiden cluster with a Wilcoxon test on the current (non-raw)
# matrix, then plot the top 25 genes of every cluster.
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', use_raw=False)
# Optional filtering of the ranked genes, currently disabled:
#sc.tl.filter_rank_genes_groups(adata, groupby="leiden", use_raw=False,
#                                   key_added='rank_genes_groups_filtered', 
#                                   min_in_group_fraction=0.25, min_fold_change=1, max_out_group_fraction=0.5, compare_abs=False)
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# Top 15 ranked gene names per cluster as a table (one column per cluster).
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(15)

In [None]:
# Cluster dendrogram computed on 60 PCs, then a dot plot of the top 3 ranked
# genes per cluster.
sc.tl.dendrogram(adata,groupby="leiden", n_pcs=60)
sc.pl.rank_genes_groups_dotplot(adata,n_genes=3)#,key="rank_genes_groups_filtered")

In [None]:
# Show the path the result is about to be written to.
resultsFile

In [None]:
# Display the AnnData summary before saving.
adata

In [None]:
# Save the clustered tumor subset to resultsFile and display its summary.
adata.write(resultsFile)
adata

In [None]:
# Mouse/human gene homology table, plus the two name columns as arrays for the
# mouse -> human lookup further down.
# NOTE(review): a later cell reads this table from
# "../mhCompTiss/data/hgncHM_121.csv" instead -- confirm which path is canonical.
homology = pd.read_csv("data/hgncHM_121.csv")
mouseGenes = homology["mouse"].to_numpy()
humanGenes = homology["human"].to_numpy()
homology

In [None]:
import scanpyHelpers as scH

In [None]:
# Reload the saved result so the annotation steps below start from the file
# on disk.
adata = sc.read_h5ad(resultsFile)
adata

In [None]:
# Gene-signature table. The cells below iterate its columns as signature names
# and its rows as genes.
sigFile = "data/LUAD/humanLUADgeneSig.csv"
sigGenes = pd.read_csv(sigFile)
sigGenes

In [None]:
# Names of the signatures (columns of the table).
sigGenes.columns

In [None]:
# Score every signature column on adata through the project helper, then plot
# the scores next to "cellState" and "leiden".
# NOTE(review): scoreAndLabel lives in scanpyHelpers (not shown here). It is
# assumed to add the obs columns named in scoreNames and a "cellState" column,
# since both are plotted immediately afterwards -- confirm.
scoreNames, newBClabel, ogLabelScoreMed = scH.scoreAndLabel(adata, sigGenes, labelOfSigGenes= sigGenes.columns, ogLabel="leiden")
sc.pl.umap(adata, color=scoreNames+["cellState","leiden"], ncols=4)

In [None]:
# Plot the signature score columns with the project helper.
# NOTE(review): the meaning of cutoff=1.5 is defined in scanpyHelpers (not
# shown) -- confirm.
scH.plotScores(adata.obs[scoreNames], figCols = 2, cutoff = 1.5)

In [None]:
# PCA embedding coloured by the signature scores, cluster and cell state.
sc.pl.pca(adata, color=scoreNames+["leiden","cellState"], ncols=4)

In [None]:
# Reshape the wide signature sheet (one column per signature) into the long
# (gene, signature) table that dc.run_ora takes as `net`.
#
# Rows are collected in a list, so no sheet size is assumed; the previous
# fixed 500 x 12 preallocation broke for longer or wider sheets.

# Mouse -> human lookup built once. setdefault keeps the FIRST occurrence of a
# mouse symbol, matching the previous np.where(...)[0] behaviour.
mouseToHuman = {}
for mouseGene, humanGene in zip(mouseGenes, humanGenes):
    mouseToHuman.setdefault(mouseGene, humanGene)

markerRows = []
for clust in sigGenes.columns:
    for gene in sigGenes[clust]:
        # Shorter signatures are padded with NaN in the sheet; skip the padding.
        if pd.isna(gene):
            continue
        # Genes without a mouse entry in the homology table are kept as they are.
        markerRows.append((mouseToHuman.get(gene, gene), clust))

markers = pd.DataFrame(markerRows, columns=["genes", "clustName"])
# dropna removes genes whose human homolog is missing. drop_duplicates removes
# repeated (gene, signature) pairs created when several symbols map to the same
# human gene.
# NOTE(review): decoupler is expected to reject a net with repeated edges -- confirm.
markers = markers.dropna().drop_duplicates()
markers

In [None]:
import decoupler as dc

In [None]:
# Over-representation analysis with decoupler: each signature ("clustName") is
# a source and its genes are the targets. Signatures with fewer than 3 genes
# are excluded via min_n. The next cell reads the result from
# adata.obsm['ora_estimate'].
dc.run_ora(
    mat=adata,
    net=markers,
    source='clustName',
    target='genes',
    min_n=3,
    verbose=True
)

In [None]:
# Per-cell ORA estimates, one column per signature.
adata.obsm['ora_estimate']


In [None]:
# Pull the ORA estimates out of adata.obsm into their own object for plotting.
acts = dc.get_acts(adata, obsm_key='ora_estimate')

# Replace every non-finite estimate (the original note attributes these to
# p-values of 0) with the largest finite value observed.
acts_v = acts.X.ravel()
finiteFlat = np.isfinite(acts_v)
max_e = np.nanmax(acts_v[finiteFlat])
nonFinite = np.logical_not(np.isfinite(acts.X))
acts.X[nonFinite] = max_e

acts

In [None]:
# Show the entries of scoreNames on the activity object: on the UMAP (together
# with "leiden") and as violins per cluster.
sc.pl.umap(acts, color=scoreNames+['leiden'], cmap='RdBu_r')
sc.pl.violin(acts, keys=scoreNames, groupby='leiden')

In [None]:
# Rank signatures per Leiden cluster (each cluster against the rest). The
# resulting table has the 'group' and 'names' columns used by the cells below.
df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')
df

In [None]:
# First n_ctypes rows of the ranking table for each cluster, collected as
# {cluster: [signature names]}.
n_ctypes = 3
topPerGroup = df.groupby('group').head(n_ctypes)
ctypes_dict = topPerGroup.groupby('group')['names'].apply(list).to_dict()
ctypes_dict

In [None]:
# Matrix plot of the selected signatures per cluster, with a dendrogram;
# values are rescaled per variable (standard_scale='var').
sc.pl.matrixplot(acts, ctypes_dict, 'leiden', dendrogram=True, standard_scale='var',
                 colorbar_title='Z-scaled scores', cmap='RdBu_r')


In [None]:
# First row of the ranking table for each cluster, as {cluster: signature name}.
annotation_dict = df.groupby('group').head(1).set_index('group')['names'].to_dict()
annotation_dict

In [None]:
# Give every cell the signature assigned to its Leiden cluster in
# annotation_dict. This raises KeyError if a cluster is missing from that dict.
adata.obs['decoupler'] = [annotation_dict[clust] for clust in adata.obs['leiden']]

# Compare the decoupler annotation with the score-based cell state and the clusters.
sc.pl.umap(adata, color=['decoupler','cellState', "leiden"])

In [None]:
# Save the annotated object over resultsFile and display its summary.
adata.write(resultsFile)
adata

In [None]:
# Show the path that was just written.
resultsFile

In [None]:
# Reload the annotated object from disk, using the same h5ad reader as the
# other load cells in this notebook.
adata = sc.read_h5ad(resultsFile)

In [None]:
# Derive a per-cell label from the "<signature>Score" columns with the project
# helper.
# NOTE(review): addIndvLabel is defined in scanpyHelpers (not shown). It
# presumably writes the "zsig" obs column (plotted in the next cell) using
# cutoff=0.5 -- confirm.
scH.addIndvLabel(adata, [f"{sigName}Score" for sigName in sigGenes.columns], obsLabel="zsig", cutoff=0.5)

In [None]:
# Compare the clusters with the three annotation columns on the UMAP.
sc.pl.umap(adata, color=["leiden",'decoupler','zsig','cellState'])

In [None]:
# Save again, now including the per-cell label column, and display the summary.
adata.write(resultsFile)
adata

In [None]:
from collections import Counter

# Cell-state name for each signature column of the sheet loaded below.
# "out" is the key used for cells that end up with the "unlab" state.
clustToLabel = {"Cluster 1":"AT2-like",
 "Cluster 2":"AT2-like",
 "Cluster 3":"AT1-like",
 "Cluster 4":"Endoderm-like",
 "Cluster 5":"HPCS",
 "Cluster 6":"Endoderm-like",   # was "Endoderm-lik" (typo)
 "Cluster 7":"Prolif",
 "Cluster 8":"EmbLiv",
 "Cluster 9":"Rib",
 "Cluster 10":"Gastro",
 "Cluster 11":"EMT",
 "Cluster 12":"Adv",
 "out":"unlab"}

# Mouse <-> human gene symbol lookups.
# NOTE(review): an earlier cell reads this table from "data/hgncHM_121.csv";
# confirm which of the two paths is canonical.
homology = pd.read_table("../mhCompTiss/data/hgncHM_121.csv", sep=",")
mouseGenes = np.array(homology["mouse"])
humanGenes = np.array(homology["human"])
m2h = dict(zip(mouseGenes,humanGenes))
h2m = dict(zip(humanGenes,mouseGenes))

# sigFile now names the file that is actually read. It used to hold an unused
# CSV path while the Excel path was hard-coded in the read call.
sigFile = "data/LUAD/LUADhpcs.xlsx"
sigGenes = pd.read_excel(sigFile)
sigGenes = sigGenes[[f"Cluster {i}" for i in range(1, 13)]]

adata = sc.read_h5ad(resultsFile)

# Finer clustering for the cell-state assignment.
sc.tl.leiden(adata, resolution=0.6)

# Score every signature after translating its genes through m2h; genes with no
# entry in the homology table are skipped.
for gs in sigGenes:
    humanSig = [m2h[g] for g in sigGenes[gs].dropna() if g in m2h]
    sc.tl.score_genes(adata, humanSig, score_name=f"{gs}Score")
scoreMat = adata.obs[[f"{sigName}Score" for sigName in sigGenes.columns]]

# NOTE(review): zScores is defined in scanpyHelpers (not shown). It is assumed
# to return one signature name (or "out") per cell -- confirm.
adata.obs["zsig"] = scH.zScores(scoreMat, cutoff = 1)

# Baseline: each Leiden cluster takes the state of its most common zsig value.
# The obs column is read directly, so no per-cluster AnnData view is created.
relabelDict = {}
for li in adata.obs.leiden.cat.categories:
    zsigInCluster = adata.obs.loc[adata.obs.leiden == li, "zsig"]
    mostCommon = Counter(zsigInCluster).most_common(1)
    relabelDict[li] = clustToLabel[mostCommon[0][0]]

# Hand-curated assignments take precedence over the computed baseline. They are
# applied as overrides rather than replacing the dict, so a cluster outside
# '0'-'5' keeps its computed label instead of raising KeyError below.
manualRelabel = {'0': 'Adv', '1': 'HPCS', '2': 'unlab', '3': 'AT2-like', '4': 'AT1-like', '5': 'HPCS'}
relabelDict.update(manualRelabel)
print(relabelDict)
adata.obs["cs"] = [relabelDict[li] for li in adata.obs.leiden]

In [None]:
# UMAP coloured by the new clustering and the assigned cell state ("cs").
sc.pl.umap(adata, color=["leiden","cs"],ncols=4)

In [None]:
# One UMAP panel per signature score column.
sc.pl.umap(adata, color=[f"{sigName}Score" for sigName in sigGenes.columns], cmap="bwr", ncols=4)

In [None]:
# Reload the post-QC object; its X is attached below as the "counts" layer.
qcdata = sc.read_h5ad(resultsFileQC)
# Per-gene totals as a quick look at the matrix. Summing on the matrix itself
# avoids densifying it and looping over its rows in Python.
print(qcdata.X.sum(axis=0))

# Full-gene matrix of the labelled tumor cells, taken from .raw.
rdata = adata.raw.to_adata()

# Align the QC object to the same cells, in the same order, and keep its X as
# a layer.
qcdata = qcdata[rdata.obs_names,:]
rdata.layers["counts"] = qcdata.X.copy()
# Keep only the annotations needed downstream.
rdata.obs = rdata.obs[['source','leiden','cs']]
# Same location as before ("write/luadDost_Labeled.h5ad"), now derived from the
# configured writeDir / fileName.
rdata.write(writeDir + fileName + '_Labeled.h5ad')

In [None]:
# Path of the post-QC file the counts layer was taken from.
resultsFileQC

In [None]:
# Display the summary of the exported, labelled object.
rdata