In [None]:
from collections import Counter

import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as scex
import scipy.io
import seaborn as sns
import sklearn.metrics
from matplotlib import pyplot as plt

import scanpyHelpers as scH

In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

Set up output properties

In [None]:
# Logging level: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 0
# Emit scanpy's logging header.
sc.logging.print_header()
# Initial figure defaults; sc.set_figure_params is called again further down
# with dpi=100.
sc.settings.set_figure_params(
    dpi=80,
    facecolor="white",
)

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

Set scanpy output files

In [None]:
# Directory and base name shared by every .h5ad file this notebook writes.
writeDir = "write/"
fileName = "luadMDA"

resultsFile = f"{writeDir}{fileName}_P2T7.h5ad"  # final output
resultsFileQC = f"{writeDir}{fileName}_QC.h5ad"  # post QC (pre-analysis)

In [None]:
#resultsFile = writeDir + fileName + '_PT.h5ad'       # final output


Set figure parameters

In [None]:
# Figure defaults: 100 dpi on screen, 150 dpi when saved, 10 pt font, PNG files.
sc.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=150,
    fontsize=10,
    format="png",
)
# Saved figures go into a per-dataset sub-directory of "figures/".
figName = fileName
sc.settings.figdir = f"figures/{fileName}/"

In [None]:
# Show the path of the post-QC input file that is loaded next.
resultsFileQC

In [None]:
# Load the post-QC AnnData object from resultsFileQC and display its summary.
adata = sc.read_h5ad(resultsFileQC)
adata

In [None]:
# Patient ID = text before the first "-" inside the cell-name prefix
# (the prefix being everything before the first "_").
adata.obs["patient"] = [
    cellName.partition("_")[0].partition("-")[0] for cellName in adata.obs_names
]

In [None]:
# Restrict the dataset to the selected patient(s) and sampling location.
kras = ["P2"]#"P10T","P14T",
#resultsFile = writeDir + fileName + '_P10T.h5ad'       # final output

# Copy the patient subset explicitly: the "local" column is added right below,
# and writing to .obs of an AnnData view would otherwise trigger an implicit
# copy (with an ImplicitModificationWarning).
adata = adata[adata.obs["patient"].isin(kras).to_numpy()].copy()

# Sampling location = second "-"-separated field of the cell-name prefix
# (the prefix being everything before the first "_").
adata.obs["local"] = [name.split("_")[0].split("-")[1] for name in adata.obs_names]
# The result is left as a view of the copied subset, as in the original flow.
adata = adata[adata.obs["local"].isin(["T7"]).to_numpy()]
#adata = adata[adata.obs.celltype=="malig"]
#adata.write('write/luadMDAkras_QC.h5ad')

In [None]:
# Snapshot of the expression matrix before normalization. Copy it so the
# snapshot can never alias adata.X, which later cells normalize and
# log-transform in place.
rawCounts = adata.X.copy()

In [None]:
# Per-gene totals (sum over cells). Summing the matrix directly avoids
# materializing a dense copy of a sparse X and also works when X is already
# a dense array (which has no .todense()).
adata.X.sum(axis=0)

In [None]:
# Normalize every cell to a total of 10,000 counts.
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# Log-transform the normalized matrix (log(1 + x)).
sc.pp.log1p(adata)

In [None]:
# First pass with default settings; the threshold plot in the next cell reads
# the "means" and "dispersions_norm" columns of adata.var.
sc.pp.highly_variable_genes(adata)#, flavor = "seurat", n_top_genes=2000)

In [None]:
# Cut-offs used to select highly variable genes; drawn as dashed lines on the
# histograms below and then passed to sc.pp.highly_variable_genes.
minMean = 0.1
maxMean = 2.7
minDisp = 0.4

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Filter the Series (not a one-column DataFrame) so genes below the floor are
# dropped instead of becoming NaN rows that get passed to np.log and hist.
means = adata.var["means"]
means = means[means > np.exp(-14)]
axs[0].hist(np.log(means), bins=100)
for meanCutoff in (minMean, maxMean):
    axs[0].axvline(np.log(meanCutoff), color='k', linestyle='dashed', linewidth=1)
axs[0].set_title('Gene means counts')
axs[0].set_xlabel('means')
axs[0].set_ylabel('counts')

# Same treatment for the normalized dispersions; values at or below exp(-5)
# (including negatives) are excluded before taking the log.
dispNorm = adata.var["dispersions_norm"]
dispNorm = dispNorm[dispNorm > np.exp(-5)]
axs[1].hist(np.log(dispNorm), bins=100)
axs[1].axvline(np.log(minDisp), color='k', linestyle='dashed', linewidth=1)
axs[1].set_title('Gene dispersions counts')
axs[1].set_xlabel('dispersions')
axs[1].set_ylabel('counts')

# Re-run the selection with the chosen cut-offs and report how many genes pass.
sc.pp.highly_variable_genes(adata, min_disp=minDisp, min_mean=minMean, max_mean=maxMean)
print(adata.var["highly_variable"].sum())

In [None]:
# Plot the highly-variable-gene selection computed above.
sc.pl.highly_variable_genes(adata)

In [None]:
#for inGene in adata.var[-8:].index.tolist():
#    adata.var.loc[inGene,"highly_variable"] = False

In [None]:
# Keep the current all-gene object in .raw before genes are subset below.
# Note: at this point X has already been normalized and log-transformed, so
# .raw does not hold unnormalized counts.
adata.raw = adata

In [None]:
#adata = adata[:, adata.var.highly_variable]
adata = adata[:, np.logical_and(adata.var.highly_variable, np.logical_not(adata.var.mt))]

In [None]:
#sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
#sc.pp.scale(adata, max_value=10)

In [None]:
# PCA with 100 components using the ARPACK solver. Note: the sc.pp.scale call
# in the preceding cell is commented out, so X is not scaled here.
sc.tl.pca(adata, svd_solver="arpack", n_comps=100)

In [None]:
# Variance ratio of the first 100 PCs, log scale.
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=100)

In [None]:
# Same plot on a linear scale.
sc.pl.pca_variance_ratio(adata, n_pcs=100)

In [None]:
# Neighborhood graph from the first 40 PCs with 20 neighbors per cell.
sc.pp.neighbors(adata, n_pcs=40, n_neighbors=20)

In [None]:
# Compute the UMAP embedding from the neighborhood graph.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 0.3; results are plotted below as "leiden".
sc.tl.leiden(adata, resolution=0.3)

In [None]:
# UMAP colored by cluster, annotated cell type and patient (two panels per row).
sc.pl.umap(
    adata,
    ncols=2,
    color=["leiden", "celltype", "patient"],
)

In [None]:
#sc.pl.umap(adata[adata.obs["local"]=="T7",:], color=["leiden","celltype","patient","local"],ncols=2)

In [None]:
# UMAP colored by per-cell QC metrics.
sc.pl.umap(
    adata,
    color=["n_genes", "pct_counts_mt", "total_counts"],
)

In [None]:
# Show the path the processed object will be written to.
resultsFile

In [None]:
# Display the AnnData summary after clustering.
adata

In [None]:
# Load the labeled reference object; its obs["cs"] column is used in the next
# cell to annotate malignant cells.
rdata = sc.read_h5ad('write/luadMDAkras_Labeled.h5ad')
rdata

In [None]:
# Build the "cs" annotation: non-malignant cells keep their "celltype" label,
# malignant cells take the "cs" label recorded for the same cell in rdata.
lablMal = adata.obs["celltype"].astype(object).to_numpy().copy()
isMalig = lablMal == "malig"
# One vectorized lookup instead of a per-cell Python loop. Indexing .loc with
# a list of labels still raises KeyError if any malignant cell is absent from
# rdata, so missing annotations are not silently ignored.
lablMal[isMalig] = rdata.obs.loc[adata.obs_names[isMalig], "cs"].to_numpy()
adata.obs["cs"] = lablMal

In [None]:
# UMAP colored by cluster, cell type and the combined "cs" annotation.
sc.pl.umap(
    adata,
    ncols=2,
    color=["leiden", "celltype", "cs"],
)

In [None]:
# Binary flag: cells annotated "malig" are "tumor", everything else "normal".
adata.obs["Tumor"] = np.where(adata.obs["celltype"] == "malig", "tumor", "normal")

In [None]:
# Fold the 'Endoderm-like' and 'Rib' states into 'AT2-like', then tally the
# number of cells per state. Counter is imported in the notebook's import cell.
mergedStates = {'Endoderm-like', 'Rib'}
# Assign via brackets: attribute-style assignment only updates the column if
# it already exists, otherwise it silently sets a plain attribute.
adata.obs["cs"] = ["AT2-like" if cst in mergedStates else cst for cst in adata.obs["cs"]]
Counter(adata.obs["cs"])

In [None]:
# Confirm the output path before writing.
resultsFile

In [None]:
# Save the processed, annotated object to resultsFile and display its summary.
adata.write(resultsFile)
adata

In [None]:
# Reload the object that was just written. Read it via resultsFile (which
# evaluates to "write/luadMDA_P2T7.h5ad" with the current settings) so the
# path cannot drift from the one used by adata.write above.
ndata = sc.read_h5ad(resultsFile)
ndata

In [None]:
# Store a copy of the current X (all genes) as the "counts" layer.
# NOTE(review): X has already been normalized and log-transformed at this
# point, so this layer does not hold raw counts; adata is also reloaded from
# disk in a later cell before it is written again - confirm this is needed.
adata.layers["counts"] = adata.X.copy()

In [None]:
# Show the processed-output path again.
resultsFile

In [None]:
# Export the reloaded object to the scCompare/scVAE_ABC resources folder.
# Note: a later cell writes to this same destination path again.
ndata.write('../scCompare/scVAE_ABC/resources/luadMDA_P2T7.h5ad')

In [None]:
# Reload the processed object and export it with a "counts" layer and the
# cell-state pairing stored in uns["pairs"].
adata = sc.read_h5ad(resultsFile)
# X in the processed file was normalized and log1p-transformed before it was
# saved, so copying X would put log-normalized values into "counts". Take the
# matrix from the post-QC file instead, aligned to the same cells and genes.
# NOTE(review): assumes X in resultsFileQC holds unnormalized counts - confirm.
qcdata = sc.read_h5ad(resultsFileQC)
adata.layers["counts"] = qcdata[adata.obs_names, adata.var_names].X.copy()
del qcdata  # free the full post-QC object once the layer is extracted
adata.uns["pairs"] = [['AT2-like','alveoli_AT2']]
adata.write('../scCompare/scVAE_ABC/resources/luadMDA_P2T7.h5ad')

In [None]:
# List the categories of the "cs" annotation.
adata.obs.cs.cat.categories

In [None]:
# Display the summary of the exported object.
adata

In [None]:
# Display the summary of the object reloaded from resultsFile.
ndata

In [None]:
# Build a post-QC object for the selected cells that carries the annotations
# from ndata. On a top-to-bottom run, `adata` at this point is the processed
# object (X already normalized and log-transformed), so slicing it would put
# log-normalized values into "counts" and the next cell would normalize a
# second time. Slice the post-QC file instead.
# NOTE(review): assumes X in resultsFileQC holds unnormalized counts and that
# all post-QC genes are wanted here - confirm.
qcdata = sc.read_h5ad(resultsFileQC)
rdata = qcdata[ndata.obs_names, :].copy()
# Copy X so the in-place normalization in the next cell cannot alter the layer.
rdata.layers["counts"] = rdata.X.copy()
rdata.obs = ndata.obs.copy()
rdata

In [None]:
# Normalize each cell of rdata to 10,000 total counts, then log(1 + x).
sc.pp.normalize_total(rdata, target_sum=1e4)
sc.pp.log1p(rdata)

In [None]:
# Save the normalized object (with its "counts" layer) for the P2/T7 subset.
rdata.write('write/luadMDA_P2T7_QC.h5ad')