# Cluster gene trajectories into modules

In [None]:
# project root directory; hard-coded absolute path, so it must be edited
# before running this notebook on another machine
wd = '/Users/calebreagor/Documents/hudspeth-lab'

# script dependencies
import sys, pickle
import h5py, rpy2
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# put the project root first on the module search path so that the
# project-local `classes` package imported below can be found
sys.path.insert(0, wd)

# custom class for scRNA-seq datasets
from classes.singlecell import dataset

# R interface via rpy2
from rpy2 import robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import numpy2ri
from rpy2.robjects import pandas2ri
# switch on the numpy and pandas converters of the rpy2 interface
numpy2ri.activate()
pandas2ri.activate()
# handle on R's `base` package
base = importr("base")
# entry named "$" of the imported base package
# (presumably R's `$` element accessor -- TODO confirm)
dollar = base.__dict__["$"]

In [None]:
%matplotlib inline
mpl.rcParams['figure.dpi']= 1000

from IPython.display import Markdown
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 5)

---
## Cluster gene trajectories into modules

In [None]:
# restore the trajectory dataset that was pickled to disk
# NOTE: unpickling runs code stored in the file -- only load pickles
# that were produced by a trusted source
with open('diff_traj.pickle', mode='rb') as pickle_file:
    diff_traj = pickle.load(pickle_file)

In [None]:
# reference table of lateral line genes (lush et al. & zfin)
lat_line = pd.read_csv('refs/drerio_latline.csv')

# The data are binned in two domains:
#   1. pseudotime -- simple histogram binning, which spaces the
#      data more evenly along the trajectory
#   2. expression -- Bayesian blocks adaptive binning, which makes
#      it possible to calculate MI
# Only the genes listed in the reference table are binned.
diff_traj.bin_data(
    data='imputed',
    in_pt=True,
    pt_bin=0.025,
    genes=lat_line['Ensembl_id'].unique(),
)

# show a heading followed by the binned data
display(
    Markdown('### Bin imputed data in pseudotime and expression'),
    diff_traj.binned,
)

In [None]:
# pairwise similarity between genes, measured as adjusted MI
diff_traj.find_gene_similarities(
    n_runs=10,
)

# show a heading followed by the similarity results
display(
    Markdown('### Gene similarities (Adjusted Mutual Information)'),
    diff_traj.gene_similarities,
)

### Find the optimal number of clusters using spectral clustering and silhouette score

In [None]:
# cluster the genes into modules; the silhouette plot is requested so the
# number of clusters (searched up to `max_clusters`) can be inspected
diff_traj.cluster_genes(
    n_components=2,
    max_clusters=20,
    plot_silhouette=True,
)

### Plot average gene module trajectories and errors

In [None]:
# plot smoothed average and errors for each module
diff_traj.plot_modules(data='imputed', smoothing=0.1)

# text styling shared by every stage label:
# y position, alpha (transparency), z-order, vertical alignment
y, a, z, v = 0.5, 0.33, 0, 'center'

# one row per annotated cell stage:
# (cluster id, pseudotime statistic used as the x position,
#  label text, horizontal alignment)
stage_labels = (
    (14, 'min', 'central s.c.', 'left'),
    (4, 'max', 'diff. s.c.', 'left'),
    (2, 'mean', 'young h.c.', 'center'),
    (1, 'mean', 'mature h.c.', 'center'),
)

# The x positions depend only on the dataset, not on the axis being
# annotated, so they are computed once here instead of once per axis.
stage_positions = []
for cluster_id, statistic, label, h_align in stage_labels:
    # pseudotimes of the cells assigned to this cluster
    stage_pt = diff_traj.pseudotimes.iloc[np.where(diff_traj.clusters == cluster_id)]
    stage_positions.append((getattr(stage_pt, statistic)(), label, h_align))

# add labels for cell stage to every module axis
for ax in diff_traj.module_axes:
    for x, label, h_align in stage_positions:
        ax.text(x, y, label, va=v, ha=h_align, alpha=a, zorder=z)

---
## Cell signaling pathway enrichment in pseudotime

In [None]:
# place each gene on the pseudotime axis
# * ordering criterion: pseudotime of maximum expression
diff_traj.order_genes_pt(
    method='max',
)

# show a heading followed by the ordered genes
display(
    Markdown('### Maximum expression'),
    diff_traj.genes_1d,
)

In [None]:
# pathway enrichment along pseudotime
# * genes are binned in pseudotime (bin width `pt_bin`)
# * term enrichment for the listed KEGG pathways via Enrichr

display(Markdown('### GO term enrichment for developmental cell signaling pathways'))

diff_traj.pathway_ea_in_pt(
    pathways=[
        'Cell cycle',
        'Notch signaling pathway',
        'Wnt signaling pathway',
        'Hedgehog signaling pathway',
        'TGF-beta signaling pathway',
    ],
    pt_bin=0.09999,
    plot=True,
)