In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Global seaborn theme for every figure in this notebook: 'white' style, fonts scaled 1.5x.
sns.set(font_scale=1.5, style='white')

from umap import UMAP

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
# Read the study table; the first CSV column becomes the row index.
# Later cells read `cohort` and `flag` columns from it and treat columns from
# position 2 onward as genes (presumably TPM values, going by the name — TODO confirm).
df_study_tpm = pd.read_csv(
    './model_data/itrp.v1.pc.csv',
    index_col=0,
)
df_study_tpm.shape

(349, 14880)

In [None]:
# NOTE(review): read_pickle can execute arbitrary code while unpickling — only load trusted files.
df_tcga_tpm = pd.read_pickle('../../tcga_dataset/df8.pkl')

print(df_tcga_tpm.shape)

# NOTE(review): absolute, cluster-specific path; the notebook is not portable as written.
tcga_data_path = '/n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37'
df_tcga_mut = pd.read_pickle(os.path.join(tcga_data_path, 'df_patient_mut.pkl'))

# Row labels present in BOTH tables. Sorted on purpose: iterating a set of strings
# yields a different order in every interpreter session (hash randomisation), which
# would silently reorder the rows fed to the seeded UMAP fits further down.
spm = sorted(set(df_tcga_mut.index) & set(df_tcga_tpm.index))

# Restrict both tables to the shared rows, in the same order.
df_tcga_tpm = df_tcga_tpm.loc[spm]
df_tcga_mut = df_tcga_mut.loc[spm]
df_tcga_mut.shape

(8971, 20191)


In [None]:
# Columns shared by both tables, skipping the leading non-gene columns by position
# (two in the study table, one in the TCGA table — presumably metadata; TODO confirm).
# Sorted so that the column order — and therefore every downstream matrix and the
# seeded UMAP embedding — is identical across kernel restarts; plain set iteration
# order is not stable between interpreter sessions.
com_genes = sorted(set(df_study_tpm.columns[2:]) & set(df_tcga_tpm.columns[1:]))

In [None]:
# Study samples: shared gene columns plus `cohort`, with every row labelled
# 'TCGA-SKCM' so the cancer_type filter applied later keeps them.
dfm1 = (
    df_study_tpm[com_genes]
    .join(df_study_tpm['cohort'])
    .assign(cancer_type='TCGA-SKCM')
)

In [None]:
# TCGA samples: shared gene columns plus their own `cancer_type`, with a constant
# `cohort` label of 'TCGA'.
dfm2 = (
    df_tcga_tpm[com_genes]
    .join(df_tcga_tpm['cancer_type'])
    .assign(cohort='TCGA')
)

In [None]:
# Stack study rows on top of TCGA rows (row-wise concatenation).
dfm = pd.concat((dfm1, dfm2), axis=0)

In [None]:
# Keep only rows whose cancer_type is 'TCGA-SKCM' (all study rows carry that label too).
skcm = dfm.loc[dfm['cancer_type'] == 'TCGA-SKCM']

In [None]:
def _sel_genes(x):
    var = x[com_genes].var(axis=0) 
    sel_genes = var[var > 1].index
    return sel_genes

In [None]:
# One list of high-variance genes per cohort ...
sgenes = skcm.groupby('cohort').apply(lambda x:_sel_genes(x).tolist()).tolist()
# ... reduced to the genes selected in EVERY cohort. Sorted so the resulting column
# order is the same on every run; the iteration order of a set of strings changes
# between interpreter sessions and would change the matrix passed to UMAP.
sgenes = sorted(set.intersection(*map(set, sgenes)))

In [None]:
# Number of genes that passed the per-cohort variance filter.
print(len(sgenes))

# log2(value + 1) transform of the selected gene columns.
dfmp = np.log2(skcm[sgenes] + 1)

In [None]:
# Embed the log-transformed matrix with UMAP (fixed seed, spread=2.5).
mp = UMAP(random_state=123, spread=2.5)
skcm_genes_2d = mp.fit_transform(dfmp)

In [None]:
# Embedding coordinates as a frame indexed like `skcm`, with the last two columns
# of `skcm` joined on for plotting.
dfp_skcm = pd.DataFrame(
    skcm_genes_2d,
    index=skcm.index,
    columns=['x', 'y'],
).join(skcm[skcm.columns[-2:]])

In [None]:
# Scatter of the UMAP embedding, one colour per cohort.
fig, ax = plt.subplots(figsize=(4,5))
cohorts = [ 'Hugo','Riaz', 'Liu', 'MGH', 'Allen', 'Gide', 'TCGA', ]
for bt in cohorts:
    dfp1 = dfp_skcm[dfp_skcm.cohort == bt]
    if bt == 'TCGA':
        # Rows are stored with cohort 'TCGA' but shown as 'TCGA-SKCM' in the legend.
        bt = 'TCGA-SKCM'
    ax.scatter(dfp1.x, dfp1.y, label = bt, s = 20)
# Legend outside the axes, vertically centred on the right edge.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# tick_params takes booleans for these flags. The former 'on'/'off' strings are both
# truthy on current Matplotlib, so left='off' did not actually hide the left ticks.
ax.tick_params(bottom=True, left=False, labelleft=True, labelbottom=True, pad=-.6,)
sns.despine(top=True, right=True, left=False, bottom=False)
ax.set_xlabel('UMAP1')
ax.set_ylabel('UMAP2')

In [None]:
# Inspect the last two column labels of `skcm` (the same slice that is joined onto the embedding frame).
skcm.columns[-2:]

In [None]:
def convert2fp(x, genes=None):
    """Binarise each selected column of `x` against its own lower-tertile value.

    For every column the 1/3 quantile is computed over the rows of `x`; a cell
    becomes 1.0 when its value is strictly greater than that quantile and 0.0
    otherwise. The notebook applies this per cohort through `groupby('cohort')`,
    so the thresholds are cohort-specific.

    Parameters
    ----------
    x : pandas.DataFrame
        Table containing (at least) the columns listed in `genes`.
    genes : list-like of column labels, optional
        Columns to binarise. Defaults to every column of the notebook-level
        `skcm` frame except its last two.

    Returns
    -------
    pandas.DataFrame
        Float frame (values 0.0 / 1.0) with the index of `x` and columns `genes`.
    """
    if genes is None:
        # Backward-compatible default: everything but the two trailing columns of `skcm`.
        genes = skcm.columns[:-2]
    x = x[genes]
    tertile_thres = x.quantile(1/3)
    # Vectorised column-wise comparison; same result as comparing row by row
    # with DataFrame.apply, without the per-row Python overhead.
    return x.gt(tertile_thres, axis='columns') * 1.0
    
# Binarise within each cohort, then restore the original row index and row order.
# NOTE(review): 'level_1' is the column reset_index() creates for an unnamed inner
# index level, so this relies on `skcm.index` having no name — confirm.
skcm_new = skcm.groupby('cohort').apply(convert2fp).reset_index().set_index('level_1')
skcm_new = skcm_new.loc[skcm.index]

# Second UMAP fit, this time on the binarised values of the selected genes.
mp = UMAP(random_state=123, spread=2.5)
skcm_genes_2d = mp.fit_transform(skcm_new[sgenes])

dfp_skcm = pd.DataFrame(skcm_genes_2d, columns=['x', 'y'],index=skcm.index)
dfp_skcm = dfp_skcm.join(skcm[skcm.columns[-2:]])

# Scatter of the new embedding, one colour per cohort.
fig, ax = plt.subplots(figsize=(4,5))
cohorts = [ 'Hugo','Riaz', 'Liu', 'MGH', 'Allen', 'Gide', 'TCGA', ]
for bt in cohorts:
    dfp1 = dfp_skcm[dfp_skcm.cohort == bt]
    if bt == 'TCGA':
        # Rows are stored with cohort 'TCGA' but shown as 'TCGA-SKCM' in the legend.
        bt = 'TCGA-SKCM'
    ax.scatter(dfp1.x, dfp1.y, label = bt, s = 20)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# tick_params takes booleans for these flags. The former 'on'/'off' strings are both
# truthy on current Matplotlib, so left='off' did not actually hide the left ticks.
ax.tick_params(bottom=True, left=False, labelleft=True, labelbottom=True, pad=-.6,)
sns.despine(top=True, right=True, left=False, bottom=False)
ax.set_xlabel('UMAP1')
ax.set_ylabel('UMAP2')

In [None]:
# Attach the study `flag` column, keep only rows that have one, and turn it into a
# readable label. Dropping any existing 'flag' column first makes the cell safe to
# re-run (a second join would otherwise fail on the overlapping column name), and
# the chained assign avoids writing into a filtered slice.
dfp_skcm = (
    dfp_skcm
    .drop(columns='flag', errors='ignore')
    .join(df_study_tpm['flag'])
    .dropna(subset=['flag'])
    .assign(flag=lambda d: d['flag'].map({1:'responder', 0:'non-responder'}))
)


# Scatter of the embedding, one colour per response label.
fig, ax = plt.subplots(figsize=(4,5))
for bt in dfp_skcm.flag.unique():
    dfp1 = dfp_skcm[dfp_skcm.flag == bt]

    ax.scatter(dfp1.x, dfp1.y, label = bt, s = 20)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# tick_params takes booleans for these flags. The former 'on'/'off' strings are both
# truthy on current Matplotlib, so left='off' did not actually hide the left ticks.
ax.tick_params(bottom=True, left=False, labelleft=True, labelbottom=True, pad=-.6,)
sns.despine(top=True, right=True, left=False, bottom=False)
ax.set_xlabel('UMAP1')
ax.set_ylabel('UMAP2')