In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

In [2]:
# Directory layout, relative to the notebook's working directory.
pth = os.path.join(os.pardir, os.pardir)        # base directory, two levels up
pth_data = os.path.join(pth, 'data')
pth_model = os.path.join(pth_data, 'modeling')
# 'predict' holds this notebook's input; 'landscape' holds the reference
# object that is read and the filtered object that is written.
pth_in, pth_out = (os.path.join(pth_model, sub)
                   for sub in ('predict', 'landscape'))
# gene-module CSVs (one file per signature)
pth_mod = os.path.join(pth_data, 'features', 'biomart', 'modules')
# destination for saved figures
pth_fig = os.path.join(pth, 'figures', 'cancer', 'CCCA', 'landscape')

In [3]:
# CCCA object stored in the 'predict' directory
fn = os.path.join(pth_in, 'CCCA.h5ad')
adata = sc.read_h5ad(fn)
adata

AnnData object with n_obs × n_vars = 1406087 × 856
    obs: 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'ebv', 'celltype', 'Title', 'Category', 'Disease', 'Technology', 'Samples', 'Cells ', 'Name', 'patient', 'cell_subtype', 'source', 'disease', 'p16_status', 'site', 't_stage', 'n_stage', 'gender', 'race', 'age', 'tobacco_use', 'alcohol_use', 'hpv', 'hpv_score', 'treatment', 'procedure', 'sorting', 'cluster', 'malignant', 'sample_id', 'cell_subtype_clusters', 'tnm_stage', 'viral', 'location', 'cancer_type', 'treatment_exposure', 'treatment_response', 'cell_lineage', 'type', 'clusters_by_authors', 'cell_QCpass', 'is_tumor', 'cell_compartment', 'cluster_assignment', 'metastasis', 'sex', 'prior_enzalutamide', 'prior_abiraterone', 'prior_taxane', 'prior_platinum', 'prior_sipuleucel_T', 'purity', 'has_bulk_RNA', 'cancer_cell_state', 'PSA', 'source_region', 'gleason_score', 'ERG_status', 'core

In [4]:
# development reference object stored in the 'landscape' directory
fn = os.path.join(pth_out, 'development.h5ad')
adata_ref = sc.read_h5ad(fn)
adata_ref

AnnData object with n_obs × n_vars = 13613 × 856
    obs: 'Stages', 'Clusters', 'Type', 'LineageAnnotations', 'celltype', 'trajectory', 't', 'training', 'validation', 'sample', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'plates', 'devtime', 'location', 'total_counts_ERCC', 'pct_counts_ERCC', 'doublet_scores', 'CytoTRACE', 'Gut_neuron', 'Sensory', 'Symp', 'enFib', 'ChC', 'Gut_glia', 'NCC', 'Mesenchyme', 'Melanocytes', 'SatGlia', 'SC', 'BCC', 'conflict', 'assignments', 'Hub', 'Hub_leiden', 'total_counts_ribo', 'pct_counts_ribo', 'T+ Sox2+', 'batch', 'percent_mito', 'n_counts', 'bh_pval', 'region', 'tissue', 'adj_stage', 'adj_sample', 'S_score', 'G2M_score', 'phase', 'leiden_R', 'celltype_original', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'RNA_snn_res.0.1', 'seurat_clusters', 'age', 'RNA_snn_res.0.5', 'ident', 'velocity_self_transition', 'source', 'weight_celltype', 'weight_source', 'weight', 'latent_z', 'Early', 'Late', 'X_mesen1', '

In [5]:
# Early/Late signature scores, computed separately within each 'source'
mod_names = ['Early', 'Late']
for sig in mod_names:
    print(sig)

    # gene list for this signature: the 'mmusculus' column of <sig>.csv,
    # restricted to genes that are present in adata
    sig_file = os.path.join(pth_mod, f'{sig.lower()}.csv')
    genes = pd.read_csv(sig_file).mmusculus.copy()
    genes = genes[genes.isin(adata.var_names)]

    for _, obs_src in adata.obs.groupby('source', observed = True):
        # boolean mask over all cells selecting this source's cells
        in_src = adata.obs_names.isin(obs_src.index)
        adata_src = adata[in_src].copy()

        # score on the per-source copy, then write the scores back into
        # the matching rows of the full object (aligned on cell names)
        sc.tl.score_genes(adata_src,
                          gene_list = genes,
                          ctrl_as_ref = False,
                          score_name = sig,
                          random_state = 1234)
        adata.obs.loc[in_src, sig] = adata_src.obs[sig]

# save basis: one column per signature, in mod_names order (Early, Late)
X = adata.obs[mod_names].values
adata.obsm['X_mod'] = X.copy()

Early
Late


In [6]:
# landscape coordinates: column 0 = Late - Early score, column 1 = latent_z
late_minus_early = X[:, 1] - X[:, 0]    # X columns follow mod_names: Early, Late
latent = adata.obs.latent_z.values
X_mesen = np.column_stack((late_minus_early, latent))
adata.obsm['X_mesen'] = X_mesen.copy()

In [7]:
# select landscape (Early + Late > 0)
dims = ['X_mesen1', 'X_mesen2']

# Build the mask from the scores stored on `adata` itself (obsm['X_mod'])
# rather than from the notebook-level array `X`. `X` keeps its pre-filter
# length, so once `adata` has been overwritten below, re-running this cell
# with an `X`-based mask fails with a length mismatch. obsm is sliced
# together with the object, which makes the cell safe to re-run.
# Cells whose scores are NaN compare False and are dropped, as before.
keep = adata.obsm['X_mod'].sum(axis = 1) > 0
adata = adata[keep].copy()

# expose the two landscape coordinates as obs columns, then save
adata.obs[dims] = adata.obsm['X_mesen'].copy()
adata.write(os.path.join(pth_out, 'CCCA.h5ad'))
adata

AnnData object with n_obs × n_vars = 96126 × 856
    obs: 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'ebv', 'celltype', 'Title', 'Category', 'Disease', 'Technology', 'Samples', 'Cells ', 'Name', 'patient', 'cell_subtype', 'source', 'disease', 'p16_status', 'site', 't_stage', 'n_stage', 'gender', 'race', 'age', 'tobacco_use', 'alcohol_use', 'hpv', 'hpv_score', 'treatment', 'procedure', 'sorting', 'cluster', 'malignant', 'sample_id', 'cell_subtype_clusters', 'tnm_stage', 'viral', 'location', 'cancer_type', 'treatment_exposure', 'treatment_response', 'cell_lineage', 'type', 'clusters_by_authors', 'cell_QCpass', 'is_tumor', 'cell_compartment', 'cluster_assignment', 'metastasis', 'sex', 'prior_enzalutamide', 'prior_abiraterone', 'prior_taxane', 'prior_platinum', 'prior_sipuleucel_T', 'purity', 'has_bulk_RNA', 'cancer_cell_state', 'PSA', 'source_region', 'gleason_score', 'ERG_status', 'cores'

In [8]:
# Per-dataset density on the reference landscape: for every reference cell,
# count how many cells of each dataset ('Name') lie within `thresh` of it in
# the X_mesen plane. Both axes are divided by the reference's per-axis range
# so the radius is comparable across the two dimensions.
thresh = .1     # neighborhood radius, in range-normalized units
chunk = 512     # dataset cells per distance block; bounds peak memory
scale = np.ptp(adata_ref.obsm['X_mesen'], axis = 0)
X_ref = adata_ref.obsm['X_mesen'][np.newaxis, :, :] / scale

for name, sdf in adata.obs.groupby('Name', observed = True):
    X_name = sdf[dims].values / scale
    counts = np.zeros(X_ref.shape[1], dtype = np.int64)
    # Accumulate in blocks: a single (n_cells, n_ref, 2) broadcast can need
    # several GB for large datasets, while the summed counts are identical.
    for start in range(0, X_name.shape[0], chunk):
        block = X_name[start:start + chunk, np.newaxis, :]
        dist = np.linalg.norm(block - X_ref, axis = 2)
        counts += (dist <= thresh).sum(axis = 0)
    # one obs column per dataset, named after the dataset
    adata_ref.obs[name] = counts

In [12]:
# (disabled) per-dataset neighbor-count overlay on the reference landscape
# for name in adata.obs.Name.cat.categories:
#     print(adata[adata.obs.Name == name].obsm['X_mod'].sum(axis = 1).mean())
#     # plot signature/latent landscape
#     fig, ax = plt.subplots(1, 1, figsize = (5, 4.75))
#     sc.pl.embedding(adata_ref, 'X_mesen', size = 50, alpha = .4,
#                     color = name, cmap = 'Reds', frameon = False,
#                     show = False, ax = ax)

#     # sc.pl.embedding(,
#     #                 'X_mesen', size = 50, alpha = .4,
#     #                 color = 'latent_z', vmin = 0, vmax = 1,
#     #                 cmap = 'vanimo', colorbar_loc = None,
#     #                 frameon = False, show = False, ax = ax)

#     # draw trajectories
#     for src, sdf in adata_ref.obs.groupby('source', observed = True):
#         sdf = sdf.copy()
#         sdf['latent_z_bin'] = pd.qcut(sdf.latent_z, 13)
#         sdf = sdf.groupby('latent_z_bin', observed = True)[dims].mean()
#         ax.plot(sdf.X_mesen1, sdf.X_mesen2, lw = 2.5, color = 'gray')

#     ax.invert_yaxis()
#     # ax.set_title('MesenCoder Landscape', size = 11.5)
#     # fn = os.path.join(pth_out, 'landscape.pdf')
#     # plt.savefig(fn, bbox_inches = 'tight')
#     plt.show()


In [None]:
# plot landscape: cells in the X_mesen plane, colored by latent_z
fig, ax = plt.subplots(1, 1, figsize = (5, 5))
sc.pl.embedding(adata, 'X_mesen', size = 80, alpha = .35,
                color = 'latent_z', vmin = 0, vmax = 1,
                cmap = 'vanimo', colorbar_loc = None,
                frameon = False, show = False, ax = ax)

# Trajectory colors for known sources. Sources missing from this mapping
# fall back to matplotlib's default color cycle instead of raising KeyError.
# NOTE(review): these keys look like they were carried over from another
# notebook - confirm they match the values of adata.obs['source'].
c = {'GSE162534'         : 'gold',
     'GSE229103'         : 'tab:cyan',
     'rRNAModifications' : 'orangered'}

# draw trajectories: per source, the mean position within latent_z quantile bins
n_bins = 13
for src, sdf in adata.obs.groupby('source', observed = True):
    # Bin on a separate Series rather than writing a column onto the group
    # frame. duplicates = 'drop' keeps qcut from raising when tied latent_z
    # values produce repeated bin edges.
    z_bin = pd.qcut(sdf.latent_z, n_bins, duplicates = 'drop')
    z_bin = z_bin.rename('latent_z_bin')
    traj = sdf.groupby(z_bin, observed = True)[dims].mean()
    # Label with the source key itself: the previously referenced `src_dict`
    # is not defined anywhere in this notebook, so the cell raised NameError.
    ax.plot(traj[dims[0]], traj[dims[1]], lw = 2.75,
            color = c.get(src), label = src)

ax.invert_yaxis()
ax.legend(bbox_to_anchor = (.875, -.1),
          frameon = False, fontsize = 10.5)
ax.set_title('MesenCoder Landscape', size = 13)

# make sure the figure directory exists before saving
os.makedirs(pth_fig, exist_ok = True)
fn = os.path.join(pth_fig, 'landscape.pdf')
plt.savefig(fn, bbox_inches = 'tight')