In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
# Relative data locations used throughout the notebook:
#   pth     - data root, three directory levels up
#   pth_in  - input directory holding the AnnData file
#   pth_mod - directory holding the gene-module CSV files
pth = os.path.join(*(['..'] * 3), 'data')
pth_in, pth_mod = (
    os.path.join(pth, *parts)
    for parts in (('modeling', 'predict'),
                  ('features', 'biomart', 'modules'))
)

In [3]:
# Read the CCCA AnnData file from the input directory; the bare name on the
# last line displays the object's summary as the cell output.
fn_h5ad = os.path.join(pth_in, 'CCCA.h5ad')
adata = sc.read_h5ad(fn_h5ad)
adata

AnnData object with n_obs × n_vars = 1406087 × 856
    obs: 'sample', 'cell_type', 'complexity', 'umap1', 'umap2', 'g1s_score', 'g2m_score', 'cell_cycle_phase', 'mp_top_score', 'mp_top', 'mp_assignment', 'ebv', 'celltype', 'Title', 'Category', 'Disease', 'Technology', 'Samples', 'Cells ', 'Name', 'patient', 'cell_subtype', 'source', 'disease', 'p16_status', 'site', 't_stage', 'n_stage', 'gender', 'race', 'age', 'tobacco_use', 'alcohol_use', 'hpv', 'hpv_score', 'treatment', 'procedure', 'sorting', 'cluster', 'malignant', 'sample_id', 'cell_subtype_clusters', 'tnm_stage', 'viral', 'location', 'cancer_type', 'treatment_exposure', 'treatment_response', 'cell_lineage', 'type', 'clusters_by_authors', 'cell_QCpass', 'is_tumor', 'cell_compartment', 'cluster_assignment', 'metastasis', 'sex', 'prior_enzalutamide', 'prior_abiraterone', 'prior_taxane', 'prior_platinum', 'prior_sipuleucel_T', 'purity', 'has_bulk_RNA', 'cancer_cell_state', 'PSA', 'source_region', 'gleason_score', 'ERG_status', 'core

In [4]:
# Score the 'Early' and 'Late' gene modules for every cell. Scoring is run
# independently on each 'source' subset and the result is written back into
# adata.obs[<module name>].
#
# The per-source mask is taken from the 'source' column itself rather than
# from obs-name membership, and scores are written back positionally. This
# avoids re-hashing every obs name for every group and does not depend on
# obs_names being unique.
src = adata.obs['source']
sources = src.dropna().unique()  # observed, non-missing sources only
for mod in ('Early', 'Late'):
    print(mod)
    df_mod = pd.read_csv(os.path.join(pth_mod, f'{mod.lower()}.csv'))
    # keep only the module genes that are present in adata
    g = df_mod.mmusculus.copy()
    g = g.loc[g.isin(adata.var_names)]
    for s in sources:
        # missing 'source' values never match a group
        msk = (src == s).to_numpy(dtype = bool, na_value = False)
        adata_group = adata[msk].copy()
        sc.tl.score_genes(adata_group,
                          gene_list = g,
                          ctrl_as_ref = False,
                          score_name = mod,
                          random_state = 1234)
        # adata_group keeps the row order of adata[msk], so a positional
        # write-back lines up without any index alignment
        adata.obs.loc[msk, mod] = adata_group.obs[mod].to_numpy()

Early
Late


In [None]:
# Early vs. Late vs. latent_z (correlations)
# For every dataset ('Name'), compute the pairwise correlations between the
# columns in `keys` and keep each unordered pair once (strict upper triangle).
# Result: `df`, one row per dataset, MultiIndex columns (first key, second key).
keys = ['Early', 'Late', 'latent_z']
m = np.full([len(keys)] * 2, True)
msk_triu = np.triu(m, k = 1)
# Column labels are derived from the same mask that extracts the values, so
# they always match the row-major order of the upper-triangle entries.
i_row, i_col = np.nonzero(msk_triu)
arr = [[keys[i] for i in i_row],
       [keys[j] for j in i_col]]
cols = pd.MultiIndex.from_arrays(arr)
grp = adata.obs.groupby('Name', observed = True)
# Collect one value vector per dataset and build the frame once, instead of
# enlarging a DataFrame row by row. Boolean-mask extraction keeps NaN
# correlations in place and does not rely on DataFrame.stack() dropping them.
rows = {}
for name, sdf in grp:
    R = sdf[keys].corr()
    rows[name] = R.to_numpy()[msk_triu]
# reshape keeps a well-formed (0, n_pairs) array when there are no groups
vals = np.array(list(rows.values()), dtype = float).reshape(-1, len(cols))
df = pd.DataFrame(vals, index = list(rows), columns = cols)

In [None]:
# Early ~ 1/(Late,z) (inverse); Late ~ z (direct)
# Keep datasets where every 'Early' correlation is below -thresh and every
# 'Late' correlation is above +thresh, ordered by the Early-vs-Late value.
thresh = .25
# Both masks are reduced to one boolean per row, so they combine directly
# with no transposes and no 2-D boolean indexer.
msk_early = (df.Early < -thresh).all(axis = 1)
msk_late = (df.Late > thresh).all(axis = 1)
msk_both = (msk_early & msk_late).to_numpy()
df.loc[msk_both].sort_values(('Early', 'Late'))

Unnamed: 0_level_0,Early,Early,Late
Unnamed: 0_level_1,Late,latent_z,latent_z
Data_Hovestadt2019_Brain,-0.455929,-0.256742,0.293333
Data_Giustacchini2017_Hematologic,-0.433493,-0.329373,0.306998
Data_Neftel2019_Brain,-0.402575,-0.288898,0.260721
Data_Tirosh2016_Skin,-0.394077,-0.25994,0.33321
Data_Puram2017_Head-and-Neck,-0.326223,-0.452397,0.356743
