In [None]:
from pathlib import Path

import anndata as an
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import sklearn.metrics
from matplotlib import pyplot as plt

In [None]:
#adata=sc.read(results_file_post)
#adata.uns['log1p'] = {"base":None}

Set up scanpy output properties (logging verbosity and default figure settings)

In [None]:
# Verbosity levels: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
# Log the versions of scanpy and its dependencies for reproducibility.
sc.logging.print_header()
# Initial figure defaults for this session.
sc.settings.set_figure_params(dpi=80, facecolor="white")

In [None]:
#%matplotlib inline

Set scanpy output files

In [None]:
# Directory for written results and the base name shared by all outputs
# of this dataset.
writeDir = "write/"
fileName = "pdacHwang"

# Path of the AnnData file saved after quality control.
resultsFileQC = f"{writeDir}{fileName}_QC.h5ad"


Set figure parameters

In [None]:
# Base name used for figures of this dataset.
figName = fileName

# Figure defaults: 100 dpi on screen, 150 dpi when saved, PNG output.
sc.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=150,
    fontsize=10,
    format="png",
)
# Saved figures go into a per-dataset sub-directory.
sc.settings.figdir = f"figures/{fileName}/"

Read input files (gene list, metadata, and the three expression-matrix groups)

In [None]:
# Root directory of the dataset and the files shared by all groups.
inDir = 'data/PDAC/Data_Hwang2022_Pancreas'
inputGenes = inDir + '/genes.txt'
inMetaFile = inDir + '/Meta-data.csv'

In [None]:
# Display the resolved metadata path as a quick sanity check.
inMetaFile

In [None]:
# Group 1: expression matrix (Matrix Market) and its cell annotation table.
inputFileMtx = inDir + '/Group1/Exp_data_TP10K_1.mtx'
inputBarcodes = inDir + '/Group1/Cells1.csv'

In [None]:
# Load the group 1 matrix and transpose it.
# NOTE(review): presumably the file is stored genes x cells, so that rows are
# cells after the transpose — confirm against the data.
adata = sc.read_mtx(inputFileMtx).T

In [None]:
# Gene list: headerless file, first column becomes the (unnamed) index.
genes = pd.read_csv(inputGenes, header=None, index_col=0).rename_axis(None)
# Comma-separated annotation tables, indexed by their first column.
cells = pd.read_csv(inputBarcodes, index_col=0)
meta = pd.read_csv(inMetaFile, index_col=0)

In [None]:
# Attach the annotation tables: `cells` as per-row (obs) annotations and
# `genes` as per-column (var) annotations of the group 1 matrix.
adata.obs = cells
adata.var = genes

In [None]:
# De-duplicate gene and cell names within group 1.
adata.var_names_make_unique()
adata.obs_names_make_unique()

In [None]:
#addMeta = pd.DataFrame([meta.loc[samp].values for samp in adata.obs["sample"]], columns = [ 'sex', 'age', 'AJCC_stage', 'sample_primary_met','site', 'grade'], index = adata.obs.index)
#adata.obs = adata.obs.join(addMeta)

In [None]:
# Group 2: expression matrix (Matrix Market) and its cell annotation table.
inputFileMtx = inDir + '/Group2/Exp_data_TP10K_2.mtx'
inputBarcodes = inDir + '/Group2/Cells2.csv'

In [None]:
# Load the group 2 matrix and transpose it.
# NOTE(review): presumably the file is stored genes x cells, so that rows are
# cells after the transpose — confirm against the data.
adata2 = sc.read_mtx(inputFileMtx).T

In [None]:
# Fresh copy of the gene list for group 2 (headerless file, unnamed index).
genes = pd.read_csv(inputGenes, header=None, index_col=0).rename_axis(None)
# Comma-separated annotation tables, indexed by their first column.
cells = pd.read_csv(inputBarcodes, index_col=0)
meta = pd.read_csv(inMetaFile, index_col=0)

In [None]:
# Attach the annotation tables: `cells` as per-row (obs) annotations and
# `genes` as per-column (var) annotations of the group 2 matrix.
adata2.obs = cells
adata2.var = genes

In [None]:
# De-duplicate gene and cell names within group 2.
adata2.var_names_make_unique()
adata2.obs_names_make_unique()

In [None]:
# Group 3: expression matrix (Matrix Market) and its cell annotation table.
inputFileMtx = inDir + '/Group3/Exp_data_TP10K_3.mtx'
inputBarcodes = inDir + '/Group3/Cells3.csv'

In [None]:
# Load the group 3 matrix and transpose it.
# NOTE(review): presumably the file is stored genes x cells, so that rows are
# cells after the transpose — confirm against the data.
adata3 = sc.read_mtx(inputFileMtx).T

In [None]:
# Fresh copy of the gene list for group 3 (headerless file, unnamed index).
genes = pd.read_csv(inputGenes, header=None, index_col=0).rename_axis(None)
# Comma-separated annotation tables, indexed by their first column.
cells = pd.read_csv(inputBarcodes, index_col=0)
meta = pd.read_csv(inMetaFile, index_col=0)

In [None]:
# Attach the annotation tables: `cells` as per-row (obs) annotations and
# `genes` as per-column (var) annotations of the group 3 matrix.
adata3.obs = cells
adata3.var = genes

In [None]:
# De-duplicate gene and cell names within group 3.
adata3.var_names_make_unique()
adata3.obs_names_make_unique()

In [None]:
# Show the summaries of the three per-group AnnData objects before merging.
print(adata,adata2,adata3)

In [None]:
# Stack the three groups into a single AnnData object.
# NOTE: `adata` is overwritten here, so re-running this cell without
# re-running the loading cells appends adata2/adata3 a second time.
groupAdatas = [adata, adata2, adata3]
adata = sc.concat(groupAdatas)
adata

# Start QC
Investigate the most highly expressed genes

In [None]:
# Plot the 20 genes with the highest expression across cells.
sc.pl.highest_expr_genes(adata, n_top=20)

In [None]:
# Histograms of detection counts on a log scale, with the filter thresholds
# (applied in the next cell) drawn as dashed vertical lines.
minGenes = 200  # minimum number of detected genes for a cell to be kept
minCells = 20   # minimum number of expressing cells for a gene to be kept

# Boolean detection mask: True where an entry of the matrix is > 0.
valX = adata.X > 0

# Row sums = number of detected genes per cell; column sums = number of cells
# in which each gene is detected. (Variable names are kept from the original
# notebook even though they read the other way round.)
# np.asarray(...).ravel() turns the 2-D np.matrix that sparse .sum() returns
# into a plain 1-D array, and also works when adata.X is dense.
numCellwExp = np.asarray(valX.sum(axis=1)).ravel()
numGeneswExp = np.asarray(valX.sum(axis=0)).ravel()

fig, axs = plt.subplots(1, 2, figsize=(8, 4))

# log1p (= log(x + 1)) is used on both panels so a zero count maps to 0
# instead of -inf, which would make the histogram range non-finite.
axs[0].hist(np.log1p(numCellwExp), bins=100)
axs[0].axvline(np.log1p(minGenes), color='k', linestyle='dashed', linewidth=1)
axs[0].set_xlabel('num gene')
axs[0].set_ylabel('counts')

axs[1].hist(np.log1p(numGeneswExp), bins=100, log=True)
axs[1].axvline(np.log1p(minCells), color='k', linestyle='dashed', linewidth=1)
axs[1].set_xlabel('num cell')
axs[1].set_ylabel('counts')

# plt.show() is the notebook-safe way to render; Figure.show() targets GUI
# backends and warns under non-GUI ones.
plt.show()

In [None]:
# Apply the detection thresholds: drop cells with fewer than `minGenes`
# detected genes, then genes detected in fewer than `minCells` cells.
# Both thresholds are defined in the histogram cell above.
sc.pp.filter_cells(adata, min_genes = minGenes)
sc.pp.filter_genes(adata, min_cells = minCells)

In [None]:
# Drop the MALAT1 gene column.
# NOTE(review): the reason for excluding MALAT1 is not stated in this notebook.
# .copy() materializes the subset: a plain slice is only a view, and the
# following cell assigns into adata.var, which would otherwise force an
# implicit copy with a warning.
adata = adata[:, adata.var_names != "MALAT1"].copy()

## Mito QC

In [None]:
# Flag genes whose name starts with 'MT-' and compute QC metrics that include
# the share of counts coming from those genes (stored in place on adata).
mitoMask = adata.var_names.str.startswith('MT-')
adata.var['mt'] = mitoMask
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt'],
    log1p=False,
    inplace=True,
)

In [None]:
# Distribution of the three QC metrics over all cells.
sc.pl.violin(
    adata,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Same QC metrics, split by the `cell_type` annotation.
sc.pl.violin(
    adata,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
    groupby="cell_type",
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Derive a `treated` label from the first character of each sample name.
# NOTE(review): presumably that character encodes treatment status — confirm
# against the sample naming scheme.
sampleLabels = adata.obs["sample"]
adata.obs["treated"] = [label[0] for label in sampleLabels]

In [None]:
# QC metrics split by the `treated` label, before threshold filtering.
sc.pl.violin(
    adata,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
    groupby="treated",
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Total counts against mitochondrial share, then against detected genes.
for yMetric in ('pct_counts_mt', 'n_genes_by_counts'):
    sc.pl.scatter(adata, x='total_counts', y=yMetric)

Remove cells that have too high a mitochondrial fraction, too few total counts, or that could be doublets

In [None]:
# Keep only cells that pass every QC threshold. The mask is built once and
# applied in a single slice instead of stacking four views on top of each
# other; the selected cells are the same.
# NOTE(review): the cut-offs are hard-coded, presumably chosen from the
# violin/scatter plots above — confirm they suit this dataset.
keepCells = (
    (adata.obs["n_genes_by_counts"] < 4000)
    & (adata.obs["total_counts"] > 2500)
    & (adata.obs["total_counts"] < 5000)
    & (adata.obs["pct_counts_mt"] < 5)
)
# .copy() turns the filtered view into a standalone AnnData object so later
# modification and writing do not operate on a view.
adata = adata[keepCells.to_numpy(), :].copy()

In [None]:
# QC metrics split by the `treated` label, after threshold filtering.
sc.pl.violin(
    adata,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
    groupby="treated",
    jitter=0.4,
    multi_panel=True,
)

Save the post-QC AnnData object

In [None]:
# Display the summary of the filtered AnnData object before saving.
adata

In [None]:
# Display the output path the filtered object will be written to.
resultsFileQC

In [None]:
# Make sure the output directory exists before writing, so the pipeline does
# not fail at its very last step on a fresh checkout.
Path(resultsFileQC).parent.mkdir(parents=True, exist_ok=True)
# Save the post-QC AnnData object as an .h5ad file.
adata.write(resultsFileQC)