![](neuromasts.jpeg)

# Neuromast single-cell pipeline

**Caleb Reagor, Rockefeller University**

Notebook summary:
* Cluster gene trajectories into modules
* Analyze pathway enrichment in pseudotime
* Differential expression analysis of hair cell polarities

In [None]:
# additional dependencies
import h5py, matplotlib, rpy2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# R interface via rpy2
from rpy2 import robjects
import rpy2.robjects.numpy2ri
# switch on rpy2's numpy <-> R conversion for the rest of the session
robjects.numpy2ri.activate()
from rpy2.robjects.packages import importr
# handle on R's `base` package
base = importr("base")
# R's `$` accessor: "$" is not a valid Python attribute name, so it is
# fetched by key; intended for calls such as dollar(pc, 's')
dollar = base.__dict__["$"]


# custom class for sc datasets
# * additional dependencies:
#   * scipy
#   * sklearn
#   * tqdm
#   * scprep
#   * gseapy
#   * magic-impute
#   * bblocks.py

from dataset import dataset

In [None]:
%matplotlib inline
matplotlib.rcParams['figure.dpi']= 1000

from IPython.display import Markdown
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 3)

---
## Load data from Lush et al., 2019

In [None]:
lush = dataset(name='lush et al')

# load GEO dataset from hdf5 file
# * counts are stored as sparse-matrix components
# *** data / indices / indptr / shape
# * the `with` block closes the file handle once the counts are loaded
# *** previously the handle stayed open for the whole session
# NOTE(review): assumes raw_counts_from_sparse_matrix copies the arrays
# out of the file before returning -- confirm in dataset.py

group = 'danRer10.Ens_84'

with h5py.File(('geo-datasets/'
                'GSE123241.h5'),'r') as f:

    lush.raw_counts_from_sparse_matrix(

        cell_names = [i.decode('ascii') for i in f[group]['barcodes'][:]],
        gene_names = [i.decode('ascii') for i in f[group]['genes'][:]],
        data = f[group]['data'], dtype = 'i4', indices = f[group]['indices'],
        # the stored shape is handed over in reversed order
        indptr = f[group]['indptr'], shape = tuple(reversed(f[group]['shape'])) )

# load pseudotime/clustering from fig 4I
# * rows are ordered by pseudotime (no in-place mutation, so re-running
#   the cell always starts from the file contents)
lush4i = pd.read_csv(('geo-datasets/'
                      'lush_fig4i.csv'),
                     index_col=0).sort_values('pseudotime')

display(Markdown('### Raw counts'))
display(lush.raw_counts)

In [None]:
# pre-process and scale expression (imputation follows in the next cell)
# * filter rare genes and cells with low counts
# * normalize library sizes, then log scale

# the GEO data are already filtered, hence no library size cutoff
preprocess_options = dict(library_size_cutoff=0)
lush.preprocess_raw_counts(**preprocess_options)

display(Markdown('### Filtered, normalized and scaled counts'),
        lush.normalized)

In [None]:
# impute expression from the normalized counts
# * run over all genes
lush.impute_from_normalized(genes='all_genes')

display(Markdown('### Imputed counts'),
        lush.imputed)

In [None]:
# separate dataset object holding only the differentiating hair cell trajectory
diff_traj = dataset(name='diff hair cell trajectory')

# barcodes of the cells whose cluster belongs to the trajectory
in_trajectory = lush4i['cluster'].isin([14,4,2,1])
trajectory = lush4i.loc[in_trajectory].index

# copy each expression table over from the lush dataset object,
# restricted to the trajectory cells
for layer in ('raw_counts', 'normalized', 'imputed'):
    setattr(diff_traj, layer, getattr(lush, layer).loc[trajectory])

display(Markdown('### Differentiating hair cell trajectory (imputed counts)'),
        diff_traj.imputed)

In [None]:
# take pseudotime and cluster labels for the trajectory cells
# * source: lush et al figure 4I table
for attribute, column in (('pseudotimes', 'pseudotime'),
                          ('clusters', 'cluster')):
    setattr(diff_traj, attribute, lush4i.loc[trajectory, column])

display(Markdown('### Trajectory pseudotimes'),
        diff_traj.pseudotimes)

---
## Cluster gene trajectories into modules

In [None]:
# lateral line gene list (lush et al. & zfin)
lat_line = pd.read_csv('refs/drerio_latline.csv')
lat_line_genes = lat_line['Ensembl_id'].unique()

# bin the imputed data along two axes
# * pseudotime: simple histogram binning
# *** spaces the data more evenly in pseudotime
# * expression: Bayesian blocks adaptive binning
# *** lets us calculate MI by expression

diff_traj.bin_data(data = 'imputed',
                   in_pt = True,
                   pt_bin = 0.025,
                   genes = lat_line_genes)

display(Markdown('### Bin imputed data in pseudotime and expression'),
        diff_traj.binned)

In [None]:
# pairwise gene-gene similarity matrix
# * metric: adjusted mutual information
similarity_options = dict(n_runs=5)
diff_traj.find_gene_similarities(**similarity_options)

display(Markdown('### Gene similarities (Adjusted Mutual Information)'),
        diff_traj.gene_similarities)

### Find the optimal number of clusters using spectral clustering and silhouette score

In [None]:
# cluster genes into modules
# * tries up to 10 clusters and plots the silhouette score
clustering_options = dict(n_components=2,
                          max_clusters=10,
                          plot_silhouette=True)
diff_traj.cluster_genes(**clustering_options)

### Plot average gene module trajectories and errors

In [None]:
# plot smoothed average and errors for each module
diff_traj.plot_modules(data='imputed', smoothing=0.1)

# cell stage labels
# * (cluster id, label text, pseudotime statistic for x, horizontal alignment)
stage_labels = [(14, 'central s.c.', 'min',  'left'),
                (4,  'diff. s.c.',   'max',  'left'),
                (2,  'young h.c.',   'mean', 'center'),
                (1,  'mature h.c.',  'mean', 'center')]

# x positions depend only on pseudotimes/clusters, not on the axes
# * so compute them once instead of once per module axis
# * flatnonzero gives a flat array of positions for .iloc
# *** np.where would return a tuple of arrays here
label_positions = []
for cluster_id, text, statistic, h_align in stage_labels:
    members = np.flatnonzero(diff_traj.clusters == cluster_id)
    x = getattr(diff_traj.pseudotimes.iloc[members], statistic)()
    label_positions.append((x, text, h_align))

# add the labels to every module axis
# * faint (alpha) and behind the curves (zorder) at a fixed height
for ax in diff_traj.module_axes:
    for x, text, h_align in label_positions:
        ax.text(x, 0.5, text, va='center', ha=h_align, alpha=0.33, zorder=0)

---
## Cell signaling pathway enrichment in pseudotime

In [None]:
# place each gene on the pseudotime axis
# * criterion: pseudotime of maximum expression
ordering_method = 'max'
diff_traj.order_genes_pt(method=ordering_method)

display(Markdown('### Maximum expression'),
        diff_traj.genes_1d)

In [None]:
# pathway enrichment along pseudotime
# * genes are binned in pseudotime
# * term enrichment for KEGG pathways via Enrichr

display(Markdown('### GO term enrichment for developmental cell signaling pathways'))

# developmental signaling pathways to test
signaling_pathways = ['Cell cycle',
                      'Notch signaling pathway',
                      'Wnt signaling pathway',
                      'Hedgehog signaling pathway',
                      'TGF-beta signaling pathway']

# NOTE(review): pt_bin is 0.09999 rather than 0.1 -- presumably to control
# where the last bin edge falls; confirm against pathway_ea_in_pt
diff_traj.pathway_ea_in_pt(pathways = signaling_pathways,
                           pt_bin = 0.09999,
                           plot = True)

---
## Find differentially expressed genes between hair cells of opposite polarities

In [None]:
# # new dataset object for polarizing hair cells
# pols = dataset(name='polarizing hair cells')

# # cell barcodes for differentiating central support cells
# cells = lush4i.loc[lush4i['cluster'].isin([14,4])].index

# # assign data from previous trajectory dataset object
# pols.raw_counts = diff_traj.raw_counts.loc[cells]
# pols.normalized = diff_traj.normalized.loc[cells]
# pols.imputed = diff_traj.imputed.loc[cells]
# pols.pseudotimes = diff_traj.pseudotimes[cells]
# pols.clusters = diff_traj.clusters[cells]

# display(Markdown('### Hair cells undergoing the polarity determination (imputed counts)'))
# display(pols.imputed)

### Embed cells in low dimensions using known polarity genes

In [None]:
# # load polarity genes: deltas, notch genes, emx2
# pol_genes = pd.read_csv('refs/polarity_genes.csv')
# pol_genes.set_index('Ensembl_id', drop=False, inplace=True)
# g = pol_genes['Ensembl_id'].isin(pols.imputed.columns).index
# g_names = pol_genes.loc[g]
# g_names.drop('Ensembl_id', axis=1, inplace=True)

# # dimensionality reduction
# pols.embed_pca(data='imputed', 
#                n_components=5, 
#                genes=g)

# # t-distributed stochastic neighbor embedding
# pols.embed_tsne(data='pca')

# display(Markdown('### Polarity genes'))
# display(g_names)

### Fit a principal curve to the differentiating hair cell trajectory

In [None]:
# # use the princurve package in R
# princurve = importr('princurve', on_conflict='warn')
# pc = princurve.principal_curve(pols.tsne_embedding.values)
# cur = np.array(dollar(pc,'s'))
# ordr = np.array(dollar(pc,'ord')) - 1

# pol_point = -35 # polarization point
# tsne1_pre = cur[ordr,0][:pol_point+1]
# tsne2_pre = cur[ordr,1][:pol_point+1]
# tsne1_pol = cur[ordr,0][pol_point:]
# tsne2_pol = cur[ordr,1][pol_point:]

# # new cluster labels for hair cells split by principal curve
# new_labels = pols.tsne_embedding.iloc[ordr,1][pol_point:] > tsne2_pol
# pols.clusters.loc[new_labels.index] = new_labels.astype(int)

In [None]:
# # plot the polarity tsne and principal curve
# pols.plot_embedding(data='tsne', ar=0.6, 
#                     labels=['central s.c.', 
#                             'diff s.c.', 
#                             'polarity0',
#                             'polarity1'])

# pols.embedding_axes.plot(tsne1_pol, 
#                          tsne2_pol, 
#                          c='gray')

# pols.embedding_axes.plot(tsne1_pre, 
#                          tsne2_pre,
#                          linestyle='--', 
#                          c='gray')
# display(Markdown('### A principal curve separates differentiating hair cells of opposite polarities'))
# display(Markdown('* tSNE embedding based only on genes known to participate in the polarity determination'))
# plt.show()

### Test for differential gene expression between cells of opposite polarities

In [None]:
# # deseq2 diff expression analysis
# pols.diff_exp2(clusters=[0,1])

### Plot expression of known polarity genes

In [None]:
# pols.plot_violin(clusters=[0,1], cluster_labels=['polarity0', 'polarity1'],
#                  gene=g_names.index[5], gene_label=g_names['gene_name'].values[5], ar=1)

In [None]:
# pols.plot_violin(clusters=[0,1], cluster_labels=['polarity0', 'polarity1'],
#                  gene='ENSDARG00000054562', gene_label='her15.1', ar=0.2)