In [None]:
import gzip
import json
import math
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy.api as sc
import scipy
import scipy.io
import scipy.sparse
import seaborn as sns
import sklearn.linear_model
import sklearn.metrics
import sklearn.mixture
import sklearn.preprocessing
import statsmodels.api as sm
import umap
from anndata import AnnData
# Global plotting / logging configuration for the whole notebook.
# Scanpy figures are rendered at 100 dpi.
sc.settings.set_figure_params(dpi=100)
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# The rpy2 extension is intentionally left disabled.
#%load_ext rpy2.ipython
# Seaborn defaults with a 1.5x font scale, then the 'seaborn-white' matplotlib
# style sheet on top of them.
sns.set(font_scale=1.5)
plt.style.use('seaborn-white')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): duplicate of the `import sklearn.mixture` earlier in this cell;
# harmless but redundant.
import sklearn.mixture


# Print the versions of the packages scanpy reports, for the record.
sc.logging.print_versions()

In [None]:
# Run parameters -- edit these to process a different sample.
# Sample prefix: names both the input sub-directory and the files inside it.
sample_name = 'OTX'
# Cells must have strictly more unique usable reads than this to be kept.
min_usable_reads = 1000
# Cells must have a fraction of reads in promoters strictly above this to be kept.
min_frop = 0.01

# Run one individual sample

## Processing & clustering
Combine the count matrix with the per-cell QC metrics.

In [None]:
# Per-sample input directory: <current working directory>/<sample_name>.
wd = os.path.join(os.getcwd(), sample_name)

# Count matrix stored in Matrix Market format, converted to CSR.
sp = scipy.io.mmread(os.path.join(wd, '{}.mtx'.format(sample_name))).tocsr()

# Column (region) and row (barcode) labels, one per line. Context managers
# make sure the file handles are closed instead of being left to the GC.
with open(os.path.join(wd, '{}.regions'.format(sample_name))) as fh:
    regions = fh.read().splitlines()
with open(os.path.join(wd, '{}.barcodes'.format(sample_name))) as fh:
    barcodes = fh.read().splitlines()

# Barcodes label the observations, regions label the variables.
adata = AnnData(sp, {'obs_names': barcodes}, {'var_names': regions})

# Tab-separated per-barcode QC table; its first column becomes the index.
metrics = pd.read_table(os.path.join(wd,
                                     '{}.qc_metrics.txt'.format(sample_name)),
                        sep='\t',
                        header=0,
                        index_col=0)

# NOTE(review): an inner join drops any barcode that has no row in the metrics
# table, which would leave `obs` shorter than the matrix. Presumably every
# barcode has a metrics row -- confirm.
adata.obs = adata.obs.join(metrics, how='inner')

# log10 of the read depth; the +1 keeps zero-read barcodes finite.
adata.obs['log10_unique_usable_reads'] = np.log10(
    adata.obs['unique_usable_reads'] + 1)

Filter cells on both the minimum number of usable reads and the fraction of reads in promoters (FRoP).

In [None]:
# Show the object before filtering. A bare `adata` in the middle of a cell is
# never displayed (only the last expression is), so print it explicitly.
print(adata)

# Keep cells that pass BOTH thresholds (strictly greater than each cut-off).
enough_reads = adata.obs['unique_usable_reads'] > int(min_usable_reads)
enough_frop = adata.obs['frac_reads_in_promoters'] > float(min_frop)
keep_cell = (enough_reads & enough_frop).values

# Materialise the subset: later cells modify `adata` in place, and doing that
# on a view (of a view) triggers implicit copies and warnings.
adata = adata[keep_cell].copy()
adata

Annotate genomic bins that overlap a promoter by renaming them to the promoter name.

In [None]:
# Disabled step (kept for reference): drop barcodes listed in the
# '<sample>.lowfrip' file.
#low_frip = open(os.path.join(wd, '{}.lowfrip'.format(sample_name))).read().splitlines()
#adata = adata[(~adata.obs.index.isin(low_frip)),:].copy()

# Two-column, header-less TSV: the first column is the index and the second
# column is loaded as 'prom'.
promoter_file = '/home/zhc268/data/GENOME/mm10/gencode.vM17.5kb_tr_promoter_names.txt'
promoters = pd.read_table(promoter_file,
                          sep='\t',
                          header=None,
                          index_col=0,
                          names=['prom'])
promoter_names = promoters['prom'].to_dict()

# Rename every variable that has an entry in the table; leave the rest as is.
adata.var.index = [promoter_names.get(bin_id, bin_id) for bin_id in adata.var.index]

# Renaming can produce repeated names, so disambiguate them with '.<n>' suffixes.
adata.var_names_make_unique(join='.')


Normalize each cell to a total of 10,000 counts.

In [None]:
# Report whether the variable names are unique. As a bare expression in the
# middle of the cell the result was silently discarded.
print('var_names unique:', adata.var_names.is_unique)

# Keep a log1p-transformed copy of the not-yet-normalised counts in `.raw`;
# the marker plots further down read it through `use_raw=True`.
adata.raw = sc.pp.log1p(adata, copy=True)

# Untouched copy of the counts, used later to compute per-cell depth over the
# selected variable bins.
adata_orig = adata.copy()

# Scale every cell to a total of 10,000 counts (in place).
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)


Select the most variable bins, regress out total read depth, and then cluster the cells.

In [None]:
# https://github.com/theislab/scanpy/issues/450
# Called on the bare matrix, filter_genes_dispersion returns a record whose
# `gene_subset` field selects the highly variable bins.
adata_filter = sc.pp.filter_genes_dispersion(adata.X,
                                             flavor='seurat',
                                             n_bins=20)

hvgs = adata.var.loc[adata_filter.gene_subset].index.tolist()

# Materialise the subset with .copy(): everything below modifies `adata` in
# place, which should not happen on a view.
adata = adata[:, adata.var.index.isin(hvgs)].copy()

# Re-normalise over the selected bins only. Per the scanpy documentation,
# normalize_per_cell drops cells whose total count is below its `min_counts`
# default, so `adata` may have fewer cells after this call.
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

# Per-cell depth over the selected bins, taken from the un-normalised counts.
# Restrict `adata_orig` to the cells that are still present in `adata`;
# otherwise the vector has the wrong length whenever normalisation dropped a
# cell and the assignment to `.obs` fails.
kept_cells = adata_orig.obs_names.isin(adata.obs_names)
hvg_counts = adata_orig[kept_cells, :][:, adata_orig.var.index.isin(hvgs)].X.sum(axis=1).A1
adata.obs['log10_usable_counts'] = np.log10(hvg_counts)
adata_orig = None  # release the full-size copy

# Log-transform, regress out depth, scale, then PCA -> kNN graph -> Leiden
# clusters -> UMAP embedding. All random seeds are fixed at 0.
sc.pp.log1p(adata)
sc.pp.regress_out(adata, ['log10_usable_counts'])
sc.pp.scale(adata)
sc.tl.pca(adata, zero_center=False, random_state=0)
sc.pp.neighbors(adata,
                n_neighbors=30,
                method='umap',
                metric='cosine',
                random_state=0,
                n_pcs=50)
sc.tl.leiden(adata, resolution=1, random_state=0)
sc.tl.umap(adata, min_dist=0.3, random_state=0)

## Clustering results
### Examine how many cells per cluster

In [None]:
# Number of cells assigned to each Leiden cluster.
adata.obs['leiden'].value_counts()

### UMAP

In [None]:
# UMAP embedding coloured by Leiden cluster, with cluster labels drawn on the
# points instead of in a separate legend.
cluster_umap_args = dict(color=['leiden'],
                         size=9,
                         title='Final clusters',
                         legend_loc='on data')
sc.pl.umap(adata, **cluster_umap_args)

### QC metrics

In [None]:
# One box plot per QC metric (2 x 3 grid), each grouped by Leiden cluster.
to_plot = [
    'log10_usable_counts', 'frac_reads_in_peaks', 'frac_reads_in_promoters',
    'frac_promoters_used', 'frac_mito_reads', 'frac_duplicated_reads'
]
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)

# Pair each panel with its metric instead of tracking a manual counter.
for ax, metric in zip(axs.reshape(-1), to_plot):
    sns.boxplot(x='leiden', y=metric, data=adata.obs, ax=ax)
    # The panel title carries the metric name, so the axis labels are blanked.
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(metric)

plt.tight_layout()
plt.show()

In [None]:
# One small panel per Leiden cluster, five panels per row. Every panel shows
# all cells (read depth vs. FRoP) with the cells of that cluster in red.
n_cluster = len(adata.obs.leiden.unique())
n_row = math.ceil(n_cluster / 5)
fig, axs = plt.subplots(n_row,
                        5,
                        figsize=(10, 2 * n_row),
                        sharex=True,
                        sharey=True)

cluster_labels = adata.obs.leiden.tolist()
for j, ax in enumerate(axs.reshape(-1)):
    # The grid may contain more panels than clusters; leave the extras empty.
    if j >= n_cluster:
        break
    cluster_id = str(j)  # cluster labels are compared as strings
    cols = [
        'red' if label == cluster_id else 'grey' for label in cluster_labels
    ]

    adata.obs.plot.scatter(x='log10_unique_usable_reads',
                           y='frac_reads_in_promoters',
                           s=2,
                           c=cols,
                           ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title("c{0}:{1} cells".format(cluster_id, str(cols.count('red'))),
                 fontdict={'fontsize': 8})

# Invisible full-figure axis that only carries the shared axis labels.
fig.add_subplot(111, frameon=False)
# Hide the ticks and tick labels of that big axis.
plt.tick_params(labelcolor='none',
                top=False,
                bottom=False,
                left=False,
                right=False)
plt.ylabel('FRoP')
plt.xlabel('log10_unique_usable_reads')
plt.tight_layout()
plt.show()

### Feature genes

In [None]:
# Back to seaborn's default font scale and matplotlib's default style for the
# marker plots.
sns.set(font_scale=1)
plt.style.use('default')

# Marker features per cell type. Names carrying a '.<n>' suffix refer to the
# de-duplicated variable names.
marker_genes_dic_2 = {
    'Tcell': ["Cd3e.1", "Cd4.1", "Cd8a", 'Tcf7'],
    'Macrophage': ['Adgre1', 'Eif4a1', 'RP23-144N15.4', 'Cx3cr1'],
    'B_cells': ["Cd79b", "Mzb1"],
    'Epithelial Cell': ['Krt19.1'],
    'Fib': ['Col1a2', 'Col1a2.1', 'Col1a2.2', 'Col1a2.3', 'Col1a2.4', 'Acta2'],
    'Neutrophil': ["Ly6g", "Cebpe", "Csf3r", 'Lcn2', 'Ltf', 'S100a8', 'S100a9'],
    'NK': ['Klrc1'],
    'DCs': ["Cd209a", "Cd74", "Flt3", "H2-Eb1", 'Itgax'],  # 'Itgax'-> Cd11c
    'MHC-II': ["H2-Aa", "H2-Ab1", "H2-Eb1"],
}

# Options common to all three plots: group by cluster, add a dendrogram, and
# read values from `adata.raw`.
shared_plot_args = dict(groupby='leiden', dendrogram=True, use_raw=True)

# Matrix plot of the values as stored.
ax = sc.pl.matrixplot(adata,
                      var_names=marker_genes_dic_2,
                      cmap='Reds',
                      **shared_plot_args)

# Same plot with standard_scale='var'.
ax = sc.pl.matrixplot(adata,
                      var_names=marker_genes_dic_2,
                      cmap='Reds',
                      standard_scale='var',
                      **shared_plot_args)

# Dot plot with an expression cutoff of 0.
ax = sc.pl.dotplot(adata,
                   marker_genes_dic_2,
                   expression_cutoff=0,
                   **shared_plot_args)

In [None]:
# UMAP embedding coloured by each QC metric in turn (2 x 3 grid).
to_plot = [
    'log10_usable_counts', 'frac_reads_in_peaks', 'frac_reads_in_promoters',
    'frac_promoters_used', 'frac_mito_reads', 'frac_duplicated_reads'
]
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)

for idx, (ax, metric) in enumerate(zip(axs.reshape(-1), to_plot)):
    # The first panel (read depth) is drawn in blue, all others in red.
    panel_cmap = 'Blues' if idx == 0 else 'Reds'
    sc.pl.umap(adata,
               color=metric,
               cmap=panel_cmap,
               size=9,
               ax=ax,
               show=False,
               legend_loc='on data')

plt.tight_layout()
plt.show()

In [None]:
# Marker genes per cell type, read from a JSON file next to the notebook.
# (`json` is imported in the import cell at the top of the notebook.)
with open('./marker_genes_gecodeVm17_tr.json', 'r') as f:
    marker_genes_dic = json.load(f)

# De-duplicated variables carry a trailing '.<n>' suffix. Strip only that
# trailing suffix: anchoring the pattern keeps dotted digits elsewhere in a
# name untouched. The names are processed once, outside the loop.
unique_suffix = re.compile(r'\.[0-9]+$')
raw_names = list(adata.raw.var_names)
base_names = [unique_suffix.sub('', name) for name in raw_names]

for cell_type, markers in marker_genes_dic.items():
    print(cell_type + " markers:", markers)
    try:
        # A feature matches either by its full name (for names that naturally
        # end in '.<n>') or by its name without the de-duplication suffix.
        features = [
            name for name, base in zip(raw_names, base_names)
            if name in markers or base in markers
        ]
        if not features:
            print("  no matching features in adata.raw; skipping")
            continue
        sc.pl.umap(adata,
                   color=features,
                   size=9,
                   color_map='Blues',
                   frameon=True,
                   use_raw=True)
    except Exception as err:
        # Still best-effort (one failing cell type must not stop the others),
        # but report the failure instead of hiding it.
        print("  could not plot {}: {!r}".format(cell_type, err))

## Save data

In [None]:
# Write the processed object to '<sample>.adata.h5ad' in the sample directory,
# then display its summary.
h5ad_path = os.path.join(wd, '{}.adata.h5ad'.format(sample_name))
adata.write(filename=h5ad_path)
adata
