In [None]:
import gzip
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy.api as sc
import scipy
import scipy.io
import scipy.sparse
import seaborn as sns
import sklearn.linear_model
import sklearn.metrics
import sklearn.mixture
import sklearn.preprocessing
import statsmodels.api as sm
import umap
from anndata import AnnData
# Plotting / logging setup. The three styling calls below each modify global
# plotting defaults, so their relative order is kept as-is: scanpy figure
# parameters first, then seaborn, then the matplotlib style sheet.
sc.settings.set_figure_params(dpi=100)
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# R integration via rpy2 is currently disabled.
#%load_ext rpy2.ipython
sns.set(font_scale=1.5)
plt.style.use('seaborn-white')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): sklearn.mixture is already imported in the import list above;
# this repeated import is redundant.
import sklearn.mixture


# Print the versions of scanpy and its dependencies for reproducibility.
sc.logging.print_versions()

In [None]:
# Run parameters.
# sample_name: used as the sub-directory name and as the file-name prefix for
#              every input/output file of this sample.
# min_usable_reads: barcodes must have strictly more than this many unique
#                   usable reads to be kept for clustering.
sample_name, min_usable_reads = 'OTX', 1000

# Read depth distribution

In [None]:
# Working directory for this sample: a folder named after the sample, located
# under the directory the notebook is run from.
wd = os.path.join(os.getcwd(), sample_name)

# Load the per-barcode QC table (first column is used as the index), add a
# log10(unique_usable_reads + 1) column, keep only rows whose log-depth is
# above log10(50), and order the rows by increasing depth.
qc_path = os.path.join(wd, '{}.qc_metrics.txt'.format(sample_name))
qc = (
    pd.read_table(qc_path, sep='\t', header=0, index_col=0)
    .assign(log10_unique_usable_reads=lambda df: np.log10(df['unique_usable_reads'] + 1))
    .loc[lambda df: df['log10_unique_usable_reads'] > np.log10(50)]
    .sort_values('log10_unique_usable_reads')
)


# Run one individual sample

In [None]:
# Load the sparse count matrix for this sample and convert it to CSR.
sp = scipy.io.mmread(os.path.join(wd, '{}.mtx'.format(sample_name))).tocsr()

# Read the label files with context managers so the file handles are closed
# deterministically. One label per line: regions label the matrix columns
# (var), barcodes label the matrix rows (obs).
with open(os.path.join(wd, '{}.regions'.format(sample_name))) as regions_file:
    regions = regions_file.read().splitlines()
with open(os.path.join(wd, '{}.barcodes'.format(sample_name))) as barcodes_file:
    barcodes = barcodes_file.read().splitlines()

adata = AnnData(sp, {'obs_names': barcodes}, {'var_names': regions})

# Attach the per-barcode QC metrics (indexed by the table's first column).
metrics = pd.read_table(os.path.join(wd,
                                     '{}.qc_metrics.txt'.format(sample_name)),
                        sep='\t',
                        header=0,
                        index_col=0)
# NOTE(review): the inner join assumes every barcode in the matrix also has a
# row in the QC table; if some are missing, the joined frame is shorter than
# the matrix and assigning it to `adata.obs` is expected to fail — confirm.
adata.obs = adata.obs.join(metrics, how='inner')

In [None]:
# Keep only barcodes with strictly more than `min_usable_reads` unique usable
# reads. Subsetting an AnnData yields a view of the parent object; the
# following cells rename var labels and normalise the data in place, so take
# an explicit copy here rather than relying on an implicit view-to-copy
# conversion later.
adata = adata[adata.obs['unique_usable_reads'] > int(min_usable_reads)].copy()
adata

In [None]:
# Optional low-FRiP barcode filter (currently disabled):
#low_frip = open(os.path.join(wd, '{}.lowfrip'.format(sample_name))).read().splitlines()
#adata = adata[(~adata.obs.index.isin(low_frip)),:].copy()

# Lookup table used to relabel features: the first column of the file is the
# index (looked up against the current var labels) and the value column is
# named 'prom'.
# NOTE(review): absolute, user-specific path — consider moving it to a
# configurable location.
promoters = pd.read_table(
    '/home/zhc268/data/GENOME/mm10/gencode.vM17.5kb_tr_promoter_names.txt',
    sep='\t', header=None, index_col=0, names=['prom'])
promoter_names = promoters['prom'].to_dict()

# Replace each var label by its mapped name when the table has one; labels
# without an entry are kept unchanged.
adata.var.index = [promoter_names.get(label, label) for label in adata.var.index]

# Several features may map to the same name; disambiguate repeats using '.'
# as the join string.
adata.var_names_make_unique(join='.')


In [None]:
# Fail loudly if the feature labels are not unique. (A bare expression in the
# middle of a cell is evaluated and discarded, so it would never surface a
# problem.)
assert adata.var_names.is_unique, 'adata.var_names must be unique'

# Keep a log1p-transformed copy of the current (not yet normalised) matrix in
# `.raw`; the marker plots further down read it via `use_raw=True`.
adata.raw = sc.pp.log1p(adata, copy=True)

# Snapshot of the data before normalisation; a later cell sums it over the
# selected features to compute per-cell depth.
adata_orig = adata.copy()

sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)


In [None]:
# Dispersion-based feature selection on the normalised matrix (Seurat
# flavour, 20 bins). The bare matrix is passed instead of the AnnData object,
# so `adata` itself is not subset here; the result is kept in `adata_filter`
# and its `gene_subset` field is read in the next cell.
# Workflow reference: https://github.com/theislab/scanpy/issues/450
adata_filter = sc.pp.filter_genes_dispersion(
    adata.X, n_bins=20, flavor='seurat')


In [None]:
# Names of the features selected by the dispersion filter.
hvgs = adata.var.loc[adata_filter.gene_subset].index.tolist()

# Restrict to the selected features. The subset is a view of the parent
# object and everything below modifies it in place (normalisation, new obs
# column, log1p, regression, scaling), so materialise it explicitly.
adata = adata[:, adata.var.index.isin(hvgs)].copy()
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

# Per-cell depth: log10 of the pre-normalisation counts summed over the
# selected features. `.sum(axis=1)` on the sparse matrix gives a column
# matrix; `.A1` flattens it to a 1-D array.
# NOTE(review): a cell with zero counts over these features would yield -inf
# here, and the row order of `adata_orig` is assumed to still match `adata` —
# confirm neither can happen for this data.
adata.obs['log10_usable_counts'] = np.log10(
    adata_orig[:, adata_orig.var.index.isin(hvgs)].X.sum(axis=1).A1)
# Drop the reference to the pre-normalisation snapshot; it is not used below.
adata_orig = None

# Log-transform, regress out depth, scale, then PCA and a cosine kNN graph.
sc.pp.log1p(adata)
sc.pp.regress_out(adata, ['log10_usable_counts'])
sc.pp.scale(adata)
sc.tl.pca(adata, zero_center=False, random_state=0)
sc.pp.neighbors(adata,
                n_neighbors=30,
                method='umap',
                metric='cosine',
                random_state=0,
                n_pcs=50)


In [None]:
# Leiden clustering and UMAP embedding, both with a fixed random seed.
sc.tl.leiden(adata, random_state=0, resolution=1)
sc.tl.umap(adata, random_state=0, min_dist=0.3)

# Persist the processed object next to the sample's input files.
h5ad_path = os.path.join(wd, '{}.adata.h5ad'.format(sample_name))
adata.write(filename=h5ad_path)


In [None]:
# Number of cells assigned to each Leiden cluster.
adata.obs['leiden'].value_counts()

In [None]:
# UMAP coloured by Leiden cluster, with cluster labels drawn on the embedding.
sc.pl.umap(
    adata,
    color=['leiden'],
    title='Final clusters',
    legend_loc='on data',
    size=9,
)

In [None]:
# Per-cluster QC: one box plot per metric on a 2x3 grid with a shared x axis.
to_plot = [
    'log10_usable_counts', 'frac_reads_in_peaks', 'frac_reads_in_promoters',
    'frac_promoters_used', 'frac_mito_reads', 'frac_duplicated_reads',
]
fig, axs = plt.subplots(2, 3, figsize=(12, 6), sharex=True)

# Pair each axis with its metric directly instead of tracking a manual index;
# the panel title carries the metric name, so the axis labels are cleared.
for ax, metric in zip(axs.flat, to_plot):
    sns.boxplot(x='leiden', y=metric, data=adata.obs, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(metric)
plt.tight_layout()
plt.show()

# Re-cluster only Leiden cluster '5' with Louvain at a higher resolution; the
# resulting labels are stored in the 'subset' obs column.
# NOTE(review): the cluster id '5' is hard-coded and depends on the Leiden
# result above — re-check it whenever the upstream clustering changes.
sc.tl.louvain(adata,
              restrict_to=('leiden', ['5']),
              resolution=1.5,
              random_state=0,
              key_added='subset')

sc.pl.umap(adata,
           color=['subset'],
           size=9,
           title='Final clusters',
           legend_loc='on data')
sc.pl.umap(adata,
           color=['log10_usable_counts'],
           title='log10(read depth)',
           size=9,
           color_map='Blues')

# Read depth per sub-cluster; draw on the axes created here explicitly rather
# than relying on matplotlib's implicit "current axes".
fig, ax1 = plt.subplots(1, 1, figsize=(9, 5))
sns.boxplot(x='subset', y='log10_usable_counts', data=adata.obs, ax=ax1)
ax1.set_xlabel('')
ax1.set_ylabel('log10(read depth)')
plt.show()


In [None]:
# UMAP coloured by read depth (blue scale) ...
sc.pl.umap(
    adata,
    color=['log10_usable_counts'],
    title='log10(read depth)',
    size=9,
    color_map='Blues',
)

# ... followed by one UMAP per remaining QC metric (red scale), in this order.
for qc_metric in (
        'frac_reads_in_peaks',
        'frac_reads_in_promoters',
        'frac_promoters_used',
        'frac_mito_reads',
        'frac_duplicated_reads',
):
    sc.pl.umap(adata, color=[qc_metric], cmap='Reds', size=9, legend_loc='on data')

In [None]:
# T cell markers, plotted from `.raw`.
# NOTE(review): the feature names carry a leading space — presumably that is
# how they appear in adata.raw.var_names after the promoter renaming; confirm.
t_cell_markers = [" Cd3e", " Cd4", " Cd8a"]
sc.pl.umap(
    adata,
    color=t_cell_markers,
    use_raw=True,
    color_map='Blues',
    size=9,
    frameon=True,
)

In [None]:
# Cd4+ T cell markers, plotted from `.raw` (" Trbc2" is currently left out).
cd4_t_cell_markers = [" Cd4", " Igfbp4", " Tcf7"]
sc.pl.umap(
    adata,
    color=cd4_t_cell_markers,
    use_raw=True,
    color_map='Blues',
    size=9,
    frameon=True,
)

In [None]:
# Cd8+ T cell markers, plotted from `.raw`.
cd8_t_cell_markers = [' Cd8a', ' Cd8b1', ' Nkg7']
sc.pl.umap(
    adata,
    color=cd8_t_cell_markers,
    use_raw=True,
    color_map='Blues',
    size=9,
    frameon=True,
)

In [None]:
# CXCR6+ T cell markers, plotted from `.raw`.
cxcr6_t_cell_markers = [" Cd3g", " Cxcr6", " Icos", " Il7r"]
sc.pl.umap(
    adata,
    color=cxcr6_t_cell_markers,
    use_raw=True,
    color_map='Blues',
    size=9,
    frameon=True,
)


In [None]:
# Macrophage markers, plotted from `.raw`.
macrophage_markers = [" Adgre1", " Cd14", " Csf1r", " Fcgr1"]
sc.pl.umap(
    adata,
    color=macrophage_markers,
    use_raw=True,
    color_map='Blues',
    size=9,
    frameon=True,
)


In [None]:
# MHC class II markers, plotted from `.raw`.
mhcii_markers = [" H2-Ab1", " H2-Aa", " H2-Ob", " H2-Eb1"]
sc.pl.umap(
    adata,
    color=mhcii_markers,
    use_raw=True,
    color_map='Blues',
    size=9,
    frameon=True,
)


In [None]:
# B cell markers, plotted from `.raw`.
b_cell_markers = [" Cd79a", " Cd79b", " Ly6d", " Mzb1", ' Cd19']
sc.pl.umap(
    adata,
    color=b_cell_markers,
    use_raw=True,
    color_map='Blues',
    size=9,
    frameon=True,
)


In [None]:
# Dendritic cell markers, plotted from `.raw`.
dc_markers = [" Cd209a", " Cd74", " Flt3", " H2-Eb1"]
sc.pl.umap(
    adata,
    color=dc_markers,
    use_raw=True,
    color_map='Blues',
    size=9,
    frameon=True,
)


In [None]:
# Final dimensions of the processed object: (observations, variables).
(adata.n_obs, adata.n_vars)