In [1]:
import os
import gzip
import numpy as np
import pandas as pd
import scanpy.api as sc
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import statsmodels.api as sm
import sklearn.preprocessing
import scipy
import scipy.sparse
import sklearn.metrics
import sklearn.mixture
import sklearn.linear_model
from anndata import AnnData
# Global plotting / logging configuration for the whole notebook.
sc.settings.set_figure_params(dpi=100)
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# NOTE(review): superseded by the active `%load_ext rpy2.ipython` further down in this cell.
#%load_ext rpy2.ipython
sns.set(font_scale=1.5)
plt.style.use('seaborn-white')
%matplotlib inline
# NOTE(review): duplicate of the `import sklearn.mixture` already present in the import list above.
import sklearn.mixture

# Load the rpy2 extension; the Harmony step below runs in a `%%R` cell.
%load_ext rpy2.ipython

# Record the package versions this analysis was run with.
sc.logging.print_versions()

scanpy==1.4.3+56.g709bafb anndata==0.6.17 umap==0.3.8 numpy==1.16.2 scipy==1.2.0 pandas==0.24.1 scikit-learn==0.20.2 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1


In [2]:
# Library IDs: used below as per-sample sub-directory / file-name stems and
# matched against the cell names to select each sample's cells.
samples = [
    'JYH_854_1_2',
    'JYH_855_1_2',
    'JYH_856_1_2',
    'JYH_857_1_2',
]
# Display labels, positionally paired with `samples` (used as panel titles).
sample_names = [
    'Veh',
    'OTX',
    'PDL1',
    'OP',
]
# Directory the final clustered AnnData is written to.
output_dir = 'immune_rep2'

## Round 1: merge, normalize, filter windows and cluster

In [None]:
# Load each sample's AnnData, attach its doublet-detection table to the
# per-cell metadata, and rebuild an AnnData on the `.raw` matrix so that all
# samples can be concatenated in the next cell.
wd = os.getcwd()
adatas = []
for sample in samples:
    sample_dir = os.path.join(wd, sample)
    adata = sc.read_h5ad(
        os.path.join(sample_dir, '{0}.adata.h5ad'.format(sample)))

    doublet_res = pd.read_csv(
        os.path.join(sample_dir, '{}.doublet_result.txt'.format(sample)),
        index_col=0,
    )
    # Rank of each cell's doublet score divided by the number of rows in the
    # table, i.e. a within-sample quantile.
    n_scored = len(doublet_res.doublet_scores)
    doublet_res['doublet_quantile'] = doublet_res.doublet_scores.rank() / n_scored

    # Left join: every row of `adata.obs` is kept.
    # NOTE(review): `on='index'` presumably relies on both frames exposing an
    # index level / column literally named 'index' - confirm against the inputs.
    adata.obs = pd.merge(adata.obs, doublet_res, how='left', on='index')

    # X is taken from `.raw`; the original `.raw` is carried along as well.
    adatas.append(
        AnnData(adata.raw.X, var=adata.raw.var, obs=adata.obs, raw=adata.raw))
del adata
sc.logging.print_memory_usage()

In [None]:
# Stack all per-sample objects into one AnnData. `index_unique=None` leaves
# the cell names untouched.
adata_merged = adatas[0].concatenate(*adatas[1:],
                                     batch_key='treat',
                                     index_unique=None)

# Overwrite the batch column written by `concatenate` with the cell-name
# prefix, i.e. everything before the last '_' of each cell name.
adata_merged.obs['treat'] = [
    '_'.join(cell_id.split('_')[:-1]) for cell_id in adata_merged.obs.index
]
adata_merged.obs['log10_unique_usable_reads'] = np.log10(
    adata_merged.obs['unique_usable_reads'])

# The per-sample list is no longer needed once merged.
del adatas
sc.logging.print_memory_usage()

In [None]:
# Keep an untouched copy of the merged object; its X is already log1p-transformed
# (per the original author's note) and is later stored as `adata_norm.raw`.
adata_merged_raw = adata_merged.copy() #log1p already
# Invert the log1p transform element-wise so X is back on the count scale
# before per-cell normalisation in the next cell.
adata_merged.X =adata_merged.X.expm1() # change back to cnt
sc.logging.print_memory_usage()
# Display the merged object's summary as the cell output.
adata_merged

In [None]:
# Normalise every cell to a total of 1e4, then pick highly variable genes
# (Seurat flavour) on the normalised matrix.
sc.pp.normalize_per_cell(adata_merged, counts_per_cell_after=1e4)
adata_merged_filter = sc.pp.filter_genes_dispersion(adata_merged.X, flavor='seurat', n_bins=50)
hvgs = adata_merged.var.loc[adata_merged_filter.gene_subset].index.tolist()

# For each sample, count the cells in which each gene is detected (non-zero).
# The non-zero mask does not depend on the sample, so it is computed once.
expressed = adata_merged_raw.X > 0
for sample in samples:
    # Match on `sample + '_'`, consistent with the per-sample selection used
    # in the regression step below.
    in_sample = adata_merged.obs.index.str.contains(sample + '_')
    adata_merged.var[sample] = expressed[in_sample, :].sum(axis=0).A1
del expressed

# Keep only HVGs detected in at least one cell of EVERY sample.
# The previous `(hvgs > 0).all(1).index` returned the index of the boolean
# Series itself (i.e. every HVG), so no gene was ever filtered out; it also
# compared every `var` column rather than just the per-sample counts.
detected = adata_merged.var.loc[adata_merged.var.index.isin(hvgs), samples]
hvgs = detected.loc[(detected > 0).all(axis=1)].index

# Remove the temporary per-sample count columns again.
adata_merged.var = adata_merged.var.drop(samples, axis=1)

In [None]:
## regress out depth for each sample
# Each sample is handled on its own: restrict to the selected HVGs,
# log-transform, then regress out the log10 of the per-cell HVG count total.
adata_ind = []
for sample in samples:
    sample_tag = sample + '_'

    # Normalised data for this sample, HVG columns only, log1p-transformed.
    cells_here = adata_merged.obs.index.str.contains(sample_tag)
    adata = adata_merged[cells_here, :].copy()
    adata = adata[:, adata.var.index.isin(hvgs)]
    sc.pp.log1p(adata)

    # Depth covariate: the log1p copy is converted back with expm1 and summed
    # per cell over the HVG columns.
    raw_cells_here = adata_merged_raw.obs.index.str.contains(sample_tag)
    adata_raw = adata_merged_raw[raw_cells_here, :].copy()
    hvg_depth = adata_raw[:, adata_raw.var.index.isin(hvgs)].X.expm1().sum(
        axis=1).A1
    adata.obs['log10_usable_counts'] = np.log10(hvg_depth)

    sc.pp.regress_out(adata, ['log10_usable_counts'])
    adata_ind.append(adata)

# Drop the references to the last sample's temporaries.
adata = adata_raw = None
sc.logging.print_memory_usage()

In [None]:
# Re-assemble the per-sample, depth-regressed matrices into one object and
# keep the log1p copy of the full merged data as `.raw` (used by the marker
# plots further down via `use_raw=True`).
adata_norm = AnnData.concatenate(*adata_ind,
                                 batch_key='norm',
                                 index_unique=None)
adata_norm.raw = adata_merged_raw.copy()
# Release the intermediates; only `adata_norm` is used from here on.
adata_ind = adata_merged = adata_merged_raw = None

sc.pp.scale(adata_norm)
sc.tl.pca(adata_norm, zero_center=False, random_state=0)

# Label the PC columns from the embedding's actual width instead of assuming
# exactly 50 components, so the DataFrame construction cannot fail on a
# column-count mismatch.
n_components = adata_norm.obsm['X_pca'].shape[1]
pc = pd.DataFrame(adata_norm.obsm['X_pca'],
                  columns=['PC{}'.format(i) for i in range(1, n_components + 1)],
                  index=adata_norm.obs.index)
# `pc` and `treat` are handed to the R cell below for batch correction.
treat = adata_norm.obs['treat'].tolist()
sc.logging.print_memory_usage()

In [None]:
%%R -i pc -i treat -o harmonized
# R cell: `pc` and `treat` are passed in from the Python session and
# `harmonized` is passed back out (see the -i / -o flags above).
# NOTE(review): hard-coded, machine-specific R library locations.
.libPaths(c('/usr/lib64/R/library','/usr/share/R/library'))

library(harmony)
# NOTE(review): nothing from magrittr is used in this cell.
library(magrittr)

# Flatten the incoming labels and turn them into a factor for Harmony.
treat <- as.factor(unlist(treat))
# do_pca=FALSE: `pc` is handed to Harmony as-is, without a PCA step inside
# HarmonyMatrix (presumably treated as an existing embedding - confirm in the
# harmony docs).
harmonized <- HarmonyMatrix(pc, treat,do_pca=FALSE)
# Convert to a data.frame so it can be returned to Python.
harmonized <- data.frame(harmonized)

In [None]:
# Replace the PCA embedding with the Harmony output so every downstream step
# reads the corrected coordinates from 'X_pca'.
# NOTE(review): assumes the rows of `harmonized` are in the same cell order as
# `adata_norm.obs` - confirm.
adata_norm.obsm['X_pca'] = harmonized.values #pc.values

# Neighbour graph on the first 50 components with cosine distance.
sc.pp.neighbors(
    adata_norm,
    n_neighbors=30,
    n_pcs=50,
    method='umap',
    metric='cosine',
    random_state=0,
)
# Cluster on the graph, then compute the 2-D layout for plotting.
sc.tl.leiden(
    adata_norm,
    resolution=1,
    random_state=0,
)
sc.tl.umap(
    adata_norm,
    min_dist=0.3,
    random_state=0,
)

In [None]:
# 2x2 overview: UMAP by cluster, UMAP by sample, and per-cluster sample
# composition as absolute counts and as percentages.
fig, axs = plt.subplots(2, 2, figsize=(12, 8))

## 1: UMAP coloured by Leiden cluster, labels drawn on the embedding
sc.pl.umap(adata_norm,
           color=['leiden'],
           size=9,
           legend_loc='on data',
           ax=axs[0, 0],
           show=False)

## 2: UMAP coloured by sample
sc.pl.umap(adata_norm, color=['treat'], size=9, ax=axs[0, 1], show=False)

## 3: cells per cluster, stacked by sample
# Rows are clusters, columns are samples; missing combinations become 0.
cell_per_cluster = adata_norm.obs.groupby(
    ["leiden",
     "treat"]).size().reset_index(name="Cells").pivot_table(index='leiden',
                                                            columns='treat',
                                                            values='Cells',
                                                            fill_value=0)

# Plot the table directly. It was previously bound to the name `pd`, which
# shadowed the pandas module for every cell executed (or re-executed) afterwards.
cell_per_cluster.plot(
    kind='bar',
    stacked=True,
    legend=False,
    ax=axs[1, 0],
)
axs[1, 0].set_ylabel('Cell count')

## 4: the same table as row-wise percentages (each cluster sums to ~100)
cell_pct_per_cluster = cell_per_cluster.apply(
    lambda x: round(x / x.sum(), 3) * 100,
    axis=1,
)
cell_pct_per_cluster.plot(kind='bar', stacked=True, legend=False, ax=axs[1, 1])
axs[1, 1].set_ylabel('Percentage')

plt.tight_layout()
plt.show()

In [None]:
import math

# One UMAP panel per sample, laid out on two rows with shared axes so the
# panels are directly comparable.
n_sample = len(sample_names)
n_col = math.ceil(n_sample / 2)

fig, axs = plt.subplots(2,
                        n_col,
                        figsize=(3 * n_col, 6),
                        sharex=True,
                        sharey=True)
for i, ax in enumerate(axs.reshape(-1)):
    if i >= n_sample:
        # More grid slots than samples: leave the spare axes untouched.
        continue
    cells_of_sample = adata_norm[adata_norm.obs.treat == samples[i]]
    panel = sc.pl.umap(
        cells_of_sample,
        color=['treat'],
        size=9,
        ax=ax,
        show=False,
        title=sample_names[i],
    )
    # Each panel holds a single sample, so its legend is redundant.
    panel.get_legend().remove()
plt.tight_layout()
plt.show()

In [None]:
# Per-cell QC columns shown in both figures below: first as per-cluster box
# plots, then as colour overlays on the UMAP.
to_plot = [
    'log10_usable_counts',
    'frac_reads_in_peaks',
    'frac_reads_in_promoters',
    'frac_promoters_used',
    'frac_mito_reads',
    'doublet_quantile',
]

# Figure 1: distribution of each metric per Leiden cluster.
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)
for ax, metric in zip(axs.reshape(-1), to_plot):
    sns.boxplot(x='leiden', y=metric, data=adata_norm.obs, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(metric)
plt.tight_layout()
plt.show()

# Figure 2: each metric on the UMAP; the first panel uses a blue colour map,
# all others red.
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)
for i, (ax, metric) in enumerate(zip(axs.reshape(-1), to_plot)):
    colour_map = 'Blues' if i == 0 else 'Reds'
    sc.pl.umap(adata_norm,
               color=metric,
               cmap=colour_map,
               size=9,
               ax=ax,
               show=False,
               legend_loc='on data')

plt.tight_layout()
plt.show()

## Feature genes

In [None]:
# Reset fonts / style to defaults for the dense marker plots.
sns.set(font_scale=1)
plt.style.use('default')

# Hand-curated marker genes per cell type; the key order is the group order
# passed to the plots.
marker_genes_dic_2 = {
    'Tcell': ["Cd3e.1", "Cd4.1", "Cd8a", 'Tcf7'],
    'Macrophage': ['Adgre1', 'Eif4a1', 'RP23-144N15.4', 'Cx3cr1'],
    'B_cells': ["Cd79b", "Mzb1"],
    'Epithelial Cell': ['Krt19.1'],
    'Fib': [
        'Col1a2', 'Col1a2.1', 'Col1a2.2', 'Col1a2.3', 'Col1a2.4', 'Acta2'
    ],
    'Neutrophil': [
        "Ly6g", "Cebpe", "Csf3r", 'Lcn2', 'Ltf', 'S100a8', 'S100a9'
    ],
    'NK': ['Klrc1'],
    'DCs': ["Cd209a", "Cd74", "Flt3", "H2-Eb1", 'Itgax'],  # 'Itgax'-> Cd11c
    'MHC-II': ["H2-Aa", "H2-Ab1", "H2-Eb1"],
}

# Options common to all three plots: group by cluster, order clusters by
# dendrogram, read values from `.raw`.
shared_plot_args = dict(groupby='leiden', dendrogram=True, use_raw=True)

# Matrix plot of the values as stored.
ax = sc.pl.matrixplot(adata_norm,
                      var_names=marker_genes_dic_2,
                      cmap='Reds',
                      **shared_plot_args)

# Same plot with each gene scaled across clusters (standard_scale='var').
ax = sc.pl.matrixplot(adata_norm,
                      var_names=marker_genes_dic_2,
                      cmap='Reds',
                      standard_scale='var',
                      **shared_plot_args)

# Dot plot; a cell counts as expressing a gene when its value is above 0.
ax = sc.pl.dotplot(adata_norm,
                   marker_genes_dic_2,
                   expression_cutoff=0,
                   **shared_plot_args)

In [None]:
# Reset fonts / style to defaults for the dense marker plots.
sns.set(font_scale=1)
plt.style.use('default')

import json

# Marker-gene groups are read from a JSON file next to the notebook.
with open('./marker_genes_gecodeVm17_tr.json', 'r') as f:
    marker_genes_dic = json.load(f)

# Options common to every plot in this cell.
shared_plot_args = dict(groupby='leiden', dendrogram=True, use_raw=True)

# Two matrix plots: first the values as stored, then each gene scaled across
# clusters.
for extra_args in ({}, {'standard_scale': 'var'}):
    ax = sc.pl.matrixplot(adata_norm,
                          var_names=marker_genes_dic,
                          cmap='Reds',
                          **shared_plot_args,
                          **extra_args)

# Dot plot; a cell counts as expressing a gene when its value is above 0.
ax = sc.pl.dotplot(adata_norm,
                   marker_genes_dic,
                   expression_cutoff=0,
                   **shared_plot_args)

## Save data

In [None]:
# Write the clustered object into `output_dir`, then display its summary as
# the cell output.
clustering_path = os.path.join(output_dir, 'clustering_merge.adata.h5ad')
adata_norm.write(filename=clustering_path)

adata_norm