In [None]:
import os

import anndata as an
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import sklearn.metrics
from matplotlib import pyplot as plt

In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

Set up scanpy output properties (logging verbosity and default figure style)

In [None]:
# Logging level: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
# Print the scanpy header for this session.
sc.logging.print_header()
# Default look for figures drawn from here on.
sc.set_figure_params(dpi=80, facecolor='white')

In [None]:
#%matplotlib inline

Set scanpy output files

In [None]:
# Output directory and base name shared by the files this notebook writes.
writeDir = "write/"
fileName = "pdacMouse"

# Path of the post-QC AnnData file: <writeDir><fileName>_QC.h5ad
resultsFileQC = f"{writeDir}{fileName}_QC.h5ad"


Set figure parameters

In [None]:
# Notebook-wide plotting defaults; saved figures use PNG at dpi_save=150.
sc.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=150,
    fontsize=10,
    format='png',
)
# Per-dataset figure directory, plus the base name kept for figure files.
sc.settings.figdir = f"figures/{fileName}/"
figName = fileName

Read the input file

In [None]:
# Directory and full path of the input file loaded in the next cell.
inDir = 'data/PDAC'
inputFile = inDir + '/202110251102_X_PDAC_Lgr5_all.h5'
#inMetaFile = f'{inDir}/.txt'

In [None]:
# Load the dataset into `adata`; the bare `adata` on the last line displays
# its summary as the cell output.
# NOTE(review): the path ends in `.h5` but is read with `read_h5ad` —
# presumably the file is stored in h5ad layout; confirm.
adata = sc.read_h5ad(inputFile)
adata

In [None]:
# Make both indices unique (cell names in obs, gene names in var) so that
# later label-based selections are unambiguous.
adata.obs_names_make_unique()
adata.var_names_make_unique()

In [None]:
# Display the per-observation (obs) annotation table.
adata.obs 

In [None]:
# Display the per-variable (var) annotation table.
adata.var

# Start QC
Investigate the most highly expressed genes

In [None]:
# Plot the 20 highest-expressed genes.
sc.pl.highest_expr_genes(adata, n_top=20)

In [None]:
# QC thresholds; the filtering cell that follows reuses both.
minGenes = 500  # a cell must have at least this many detected genes
minCells = 20   # a gene must be detected in at least this many cells

# Detection mask: True wherever an entry of the expression matrix is > 0.
valX = adata.X > 0

# Despite their names: summing the mask over axis=1 gives one value per row
# (genes detected in each cell), and summing over axis=0 gives one value per
# column (cells in which each gene is detected).
numCellwExp = valX.sum(axis=1)
numGeneswExp = valX.sum(axis=0).T

fig, axs = plt.subplots(1, 2, figsize=(8, 4))

# Left panel: genes detected per cell on a log(x + 1) axis, with the minGenes
# cut-off as a dashed line. The +1 keeps the transform finite for rows with
# no detected genes (log(0) is -inf, which hist cannot bin) and matches the
# transform used in the right panel.
axs[0].hist(np.log(numCellwExp + 1), bins=100)
axs[0].axvline(np.log(minGenes + 1), color='k', linestyle='dashed', linewidth=1)
axs[0].set_xlabel('num gene')
axs[0].set_ylabel('counts')

# Right panel: cells per gene on a log(x + 1) axis (log-scaled bar heights),
# with the minCells cut-off as a dashed line.
axs[1].hist(np.log(numGeneswExp + 1), bins=100, log=True)
axs[1].axvline(np.log(minCells + 1), color='k', linestyle='dashed', linewidth=1)
axs[1].set_xlabel('num cell')
axs[1].set_ylabel('counts')

# plt.show() instead of fig.show(): Figure.show() needs a GUI backend and
# only emits a warning under a non-GUI (e.g. inline) backend.
plt.show()

In [None]:
# Apply the thresholds in place: first keep cells with at least `minGenes`
# detected genes, then keep genes detected in at least `minCells` cells.
sc.pp.filter_cells(adata, min_genes=minGenes, inplace=True)
sc.pp.filter_genes(adata, min_cells=minCells, inplace=True)

In [None]:
# Drop the Malat1 gene from the matrix.
# .copy() makes the result an independent AnnData: slicing alone returns a
# view, and the in-place writes that follow (adata.var['mt'] = ...,
# calculate_qc_metrics(..., inplace=True)) would otherwise force an implicit
# copy and raise an ImplicitModificationWarning.
adata = adata[:, adata.var_names != "Malat1"].copy()

## Mito QC

In [None]:
# Flag genes whose name starts with 'mt-' in a boolean `mt` column, then let
# scanpy compute the QC metrics in place, with `mt` as an extra QC variable.
is_mito = adata.var_names.str.startswith('mt-')
adata.var['mt'] = is_mito
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], log1p=False, inplace=True)

In [None]:
# Distribution of the three per-cell QC metrics, one panel per metric.
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_metrics, jitter=0.4, multi_panel=True)

In [None]:
# Plot each QC metric against total counts per cell, one scatter per metric.
for y_metric in ('pct_counts_mt', 'n_genes_by_counts'):
    sc.pl.scatter(adata, x='total_counts', y=y_metric)

Remove cells with too high a mitochondrial read fraction, and cells whose gene or total counts are high enough that they could be doublets

In [None]:
# Upper QC limits; a cell is kept only if it is strictly below all three.
maxGenes = 6000     # limit on n_genes_by_counts
maxCounts = 40000   # limit on total_counts
maxPctMito = 10     # limit on pct_counts_mt

keep = (
    (adata.obs['n_genes_by_counts'] < maxGenes)
    & (adata.obs['total_counts'] < maxCounts)
    & (adata.obs['pct_counts_mt'] < maxPctMito)
)

# A single combined mask selects the same cells as three successive slices.
# .copy() makes the result an independent AnnData instead of a view of a
# view, so the later assignments to adata.obs / adata.var / adata.layers do
# not trigger an implicit copy with an ImplicitModificationWarning.
adata = adata[keep, :].copy()

In [None]:
# Same three QC metrics as before, re-plotted after the cell filtering.
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_metrics, jitter=0.4, multi_panel=True)

Save the post-QC AnnData object

In [None]:
# Display the AnnData summary after the QC filtering above.
adata

In [None]:
# Preview the obs columns that are kept when the object is saved below.
adata.obs.loc[:, ['batch', 'mouse', 'tumor', 'donor', 'treatment', 'hash', '10X_version', 'concat']]

In [None]:
# Preview the clustering / doublet-related obs columns already present.
adata.obs.loc[:, ['complexity_res', 'cl_pp', 'doublet_scores', 'doublets']]

In [None]:
# Sum the `doublets` column over all remaining cells.
# NOTE(review): presumably a boolean / 0-1 flag, so this is the number of
# cells marked as doublets — confirm the column's dtype.
sum(adata.obs.doublets)

In [None]:
# Keep only the annotation columns that should be written to disk.
obs_keep = [
    'batch', 'mouse', 'tumor', 'donor', 'treatment',
    'hash', '10X_version', 'concat', 'n_genes_by_counts', 'n_genes',
]
var_keep = ['n_cells', 'mt']

adata.obs = adata.obs[obs_keep]
adata.var = adata.var[var_keep]
# Clear the layers as well.
adata.layers = None

In [None]:
# Display the AnnData summary after slimming obs / var / layers.
adata

In [None]:
# Display the output path the next cell writes to.
resultsFileQC

In [None]:
# Create the output directory first so the write does not fail when the
# directory (e.g. `write/` on a fresh checkout) does not exist yet.
out_dir = os.path.dirname(resultsFileQC)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Save the post-QC AnnData object.
adata.write(resultsFileQC)

In [None]:
# Read the file written above back from disk, replacing the in-memory
# `adata`, and display its summary.
adata = sc.read(resultsFileQC)
adata

In [None]:
# NOTE(review): hard-coded numbers — presumably (cells x genes after QC) /
# (cells x genes before QC), i.e. the fraction of matrix entries retained.
# Confirm, and derive from the AnnData shapes instead of literals.
(140477*27131)/(150699*54838)

In [None]:
# Display the per-variable (var) table of the reloaded object.
adata.var