In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, Isomap, MDS, LocallyLinearEmbedding

# Global seaborn theme for every figure below: 'white' style with fonts scaled 1.5x.
sns.set(style = 'white', font_scale=1.5)

In [2]:
# Make the local `conceptor` checkout importable so its palette can be reused.
# NOTE(review): this is an absolute, user-specific path; the cell only runs on a
# machine that has this checkout. Consider installing the package or making the
# location configurable.
import sys
sys.path.insert(0, '/home/was966/Research/mims-conceptor/')
from conceptor.tokenizer import CONCEPT_palette

# Reverse the palette's entry order (last entry first). The mapping itself is
# unchanged; only the iteration order differs.
# Assumes CONCEPT_palette is a dict-like {name: colour} -- TODO confirm.
CONCEPT_palette = dict(reversed(list(CONCEPT_palette.items())))

# Parallel views over the reversed palette, passed to seaborn as
# `hue_order` and `palette` in the plotting cells.
hue_order = CONCEPT_palette.keys()
hue_color = CONCEPT_palette.values()

In [3]:
# Build one combined table `dfc` holding the cell-type feature rows of both
# cohorts (TCGA and ITRP), each prefixed with `cancer_type` and `domain` columns.

# --- TCGA ---------------------------------------------------------------
tcga_features = pd.read_csv('../../02_extract_readouts//TCGA/04_features_celltype.csv', index_col=0)
tcga_labels = pd.read_csv('../../02_extract_readouts//TCGA/00_clinical_label_orignal.csv', index_col=0)

# Keep only the second dash-separated token of each TCGA cancer_type label.
tcga_cancer_type = tcga_labels['cancer_type'].apply(lambda x: x.split('-')[1])

# Look up each row's cancer type through its patient barcode
# (the label table's index is used as the lookup key).
tcga = tcga_features['bcr_patient_barcode'].map(tcga_cancer_type).to_frame(name='cancer_type')
tcga['domain'] = 'TCGA'
tcga_dfc = tcga.join(tcga_features)

# --- ITRP ---------------------------------------------------------------
itrp_features = pd.read_csv('../../02_extract_readouts//ITRP/04_features_celltype.csv', index_col=0)
# Harmonise the identifier column name with the TCGA table.
itrp_features = itrp_features.rename(columns = {'Index':'bcr_patient_barcode'})

itrp_labels = pd.read_csv('../../02_extract_readouts//ITRP/00_clinical_label.csv', index_col=0)
itrp = itrp_features['bcr_patient_barcode'].map(itrp_labels['cancer_type']).to_frame(name='cancer_type')
itrp['domain'] = 'ITRP'
itrp_dfc = itrp.join(itrp_features)

# --- Combined -----------------------------------------------------------
# Stack the two cohorts row-wise with the public pd.concat API instead of the
# private DataFrame._append (same defaults: original indices are kept).
# NOTE(review): later cells join on this index, so it is assumed to be unique
# across both cohorts -- TODO confirm.
dfc = pd.concat([tcga_dfc, itrp_dfc])

In [4]:
# Project the feature matrix to 2-D with PCA.

# The embedding input is the trailing block of columns in `dfc`.
# NOTE(review): presumably these 32 columns are the cell-type feature
# dimensions -- confirm against the feature CSV layout.
N_FEATURE_COLS = 32
data = dfc.iloc[:, -N_FEATURE_COLS:]

# Seeded for reproducibility: depending on the scikit-learn version and data
# shape, the default 'auto' solver may pick the randomized SVD, whose output
# varies between runs unless random_state is fixed (same seed as the UMAP cell).
mp = PCA(n_components = 2, random_state = 42)
pca2d = mp.fit_transform(data)

# Keep the sample index so the coordinates can be joined back onto `dfc`.
df_pca2d  = pd.DataFrame(pca2d, index=data.index, columns = ['PCA1', 'PCA2'])

In [None]:
# 2-D UMAP embedding of the same feature matrix used for PCA.
# The seed is fixed, so (per the warning UMAP prints) it runs single-threaded.
umap_params = dict(
    n_components=2,
    n_neighbors=200,
    min_dist=0.5,
    random_state=42,
    verbose=1,
)
mp = UMAP(**umap_params)
umap2d = mp.fit_transform(data)

# Keep the sample index so the coordinates can be joined back onto `dfc`.
df_umap2d = pd.DataFrame(
    umap2d,
    index=data.index,
    columns=['UMAP1', 'UMAP2'],
)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


UMAP(min_dist=0.5, n_neighbors=200, random_state=42, verbose=1)
Fri Mar 15 23:04:25 2024 Construct fuzzy simplicial set
Fri Mar 15 23:04:25 2024 Finding Nearest Neighbors
Fri Mar 15 23:04:25 2024 Building RP forest with 41 trees
Fri Mar 15 23:04:31 2024 NN descent for 19 iterations
	 1  /  19
	 2  /  19


In [None]:
# Metadata columns carried alongside the 2-D coordinates.
meta_cols = ['cancer_type', 'domain', 'bcr_patient_barcode', 'feature_name']

# Index-aligned joins: attach the PCA coordinates first, then the UMAP ones.
df2d = (
    dfc[meta_cols]
    .join(df_pca2d)
    .join(df_umap2d)
)

In [None]:
# Persist the metadata + PCA/UMAP coordinates table (index included) to the working directory.
df2d.to_csv('celltype_space.csv')

In [None]:
# Quick look at the first rows of the combined coordinates table.
df2d.head()

In [None]:
# TCGA rows in PCA space, coloured by `feature_name`.
# The same scatter is saved twice: once with a text label at each group's
# median position, and once without labels.
dfp = df2d[df2d.domain == 'TCGA']

x = 'PCA1'
y = 'PCA2'
hue = 'feature_name'

# Per-group median coordinates, used as anchor points for the text labels.
centroids = dfp.groupby(hue)[[x, y]].median()

# (draw group labels?, output file, extra savefig options)
variants = [
    (True, 'cell_TCGA_PCA_labelled.pdf', {}),
    (False, 'cell_TCGA_PCA_unlabelled.pdf', {'bbox_inches': 'tight'}),
]

for annotate, pdf_name, save_kwargs in variants:
    fig, ax = plt.subplots(figsize=(10, 10))

    sns.scatterplot(data = dfp, x = x, y = y, hue = hue,  alpha = 0.8,
                    linewidth=0.0, hue_order = hue_order, palette=hue_color, s = 1,
                    ax=ax, legend=False)

    if annotate:
        for name, pos in centroids.iterrows():
            ax.text(pos[x], pos[y], name,  fontdict={'fontsize':9})

    # tick_params expects booleans; the non-empty string 'off' is truthy, so
    # passing it would not reliably hide the left ticks.
    ax.tick_params(bottom=True, left=False, labelleft=True, labelbottom=True, pad=-.6)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

    fig.savefig(pdf_name, **save_kwargs)

In [None]:
# ITRP rows in PCA space, coloured by `feature_name`.
# The same scatter is saved twice: once with a text label at each group's
# median position, and once without labels.
dfp = df2d[df2d.domain == 'ITRP']

x = 'PCA1'
y = 'PCA2'
hue = 'feature_name'

# Per-group median coordinates, used as anchor points for the text labels.
centroids = dfp.groupby(hue)[[x, y]].median()

# (draw group labels?, output file, extra savefig options)
variants = [
    (True, 'cell_ITRP_PCA_labelled.pdf', {}),
    (False, 'cell_ITRP_PCA_unlabelled.pdf', {'bbox_inches': 'tight'}),
]

for annotate, pdf_name, save_kwargs in variants:
    fig, ax = plt.subplots(figsize=(10, 10))

    sns.scatterplot(data = dfp, x = x, y = y, hue = hue,  alpha = 0.8,
                    linewidth=0.0, hue_order = hue_order, palette=hue_color, s =1.5,
                    ax=ax, legend=False)

    if annotate:
        for name, pos in centroids.iterrows():
            ax.text(pos[x], pos[y], name,  fontdict={'fontsize':9})

    # tick_params expects booleans; the non-empty string 'off' is truthy, so
    # passing it would not reliably hide the left ticks.
    ax.tick_params(bottom=True, left=False, labelleft=True, labelbottom=True, pad=-.6)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

    fig.savefig(pdf_name, **save_kwargs)

In [None]:
# TCGA rows in UMAP space, coloured by `feature_name`, with a text label at
# each group's median position.
dfp = df2d[df2d.domain == 'TCGA']

fig, ax = plt.subplots(figsize=(10, 10))

x = 'UMAP1'
y = 'UMAP2'
hue = 'feature_name'

sns.scatterplot(data = dfp, x = x, y = y, hue = hue,  alpha = 0.8,
                linewidth=0.0, hue_order = hue_order, palette=hue_color, s = 0.5,
                ax=ax, legend=False)

# Per-group median coordinates, used as anchor points for the text labels.
centroids = dfp.groupby(hue)[[x, y]].median()
for name, pos in centroids.iterrows():
    ax.text(pos[x], pos[y], name,  fontdict={'fontsize':8})

# tick_params expects booleans; the non-empty string 'off' is truthy, so
# passing it would not reliably hide the left ticks.
ax.tick_params(bottom=True, left=False, labelleft=True, labelbottom=True, pad=-.6)
ax.set_xlabel('UMAP1')
ax.set_ylabel('UMAP2')

fig.savefig('cell_TCGA_UMAP_labelled.pdf', bbox_inches ='tight')

In [None]:
# ITRP rows in UMAP space, coloured by `feature_name`, with a text label at
# each group's median position.
dfp = df2d[df2d.domain == 'ITRP']

fig, ax = plt.subplots(figsize=(10, 10))

x = 'UMAP1'
y = 'UMAP2'
hue = 'feature_name'

sns.scatterplot(data = dfp, x = x, y = y, hue = hue,  alpha = 0.8,
                linewidth=0.0, hue_order = hue_order, palette=hue_color, s = 0.5,
                ax=ax, legend=False)

# Per-group median coordinates, used as anchor points for the text labels.
centroids = dfp.groupby(hue)[[x, y]].median()
for name, pos in centroids.iterrows():
    ax.text(pos[x], pos[y], name,  fontdict={'fontsize':9})

# tick_params expects booleans; the non-empty string 'off' is truthy, so
# passing it would not reliably hide the left ticks.
ax.tick_params(bottom=True, left=False, labelleft=True, labelbottom=True, pad=-.6)
ax.set_xlabel('UMAP1')
ax.set_ylabel('UMAP2')

fig.savefig('cell_ITRP_UMAP_labelled.pdf', bbox_inches ='tight')