# 04 - Model Evaluation (Clustering)

## I. Preliminaries

Import the libraries and functions for model evaluation.

In [None]:
# Libraries Here
# NOTE(review): this cell contains no import statements, but the cells below
# reference `np`, `torch`, `plt`, `sns` and `model`, as well as the data objects
# `np_normal_log`, `train_loader`, `cohorts` and `tumor_rep_final`. On a fresh
# kernel those names must be provided here (or by an earlier step) before the
# rest of the notebook can run - TODO confirm where each one comes from.

Rebuild the DGD model and load the trained decoder and GMM weights for evaluation.

In [None]:
# Rebuild the DGD architecture before restoring its weights. The decoder and
# GMM state dicts loaded below must fit this configuration, so these
# hyperparameters presumably mirror the training run - TODO confirm.
dgd_final = model.MetaboDGD(
    latent_dim=50,
    output_dim=np_normal_log.shape[1],
    dec_hidden_layers_dim=[500, 1500],
    dec_output_prediction_type='mean',
    dec_output_activation_type='softplus',
    n_comp=8,
    cm_type='diagonal',
    gaussian_stddev=0.5
)

# map_location='cpu' lets checkpoints that were written from a CUDA device be
# deserialised on a CPU-only machine; the later cells convert tensors straight
# to NumPy, so the model is evaluated on the CPU anyway.
# NOTE: an alternative checkpoint set with the same file names is referenced
# under the '98_85_acc/' directory; swap the directory to evaluate it.
dgd_final.dec.load_state_dict(
    torch.load('torch_outputs/torch_outputs_dec.pt', map_location='cpu')
)
dgd_final.gmm.load_state_dict(
    torch.load('torch_outputs/torch_outputs_gmm.pt', map_location='cpu')
)

Load the learned latent representations of the training samples.

In [None]:
from metaboDGD.src.latent import RepresentationLayer

# Allocate a zero-filled placeholder with one row per row of `np_normal_log`
# and one column per GMM dimension; the saved values overwrite it below.
train_rep_final = RepresentationLayer(
    values=torch.zeros(size=(np_normal_log.shape[0], dgd_final.gmm.dim))
)

# map_location='cpu' keeps the load working on machines without CUDA even if
# the checkpoint was saved from a GPU.
# NOTE: an alternative checkpoint with the same file name is referenced under
# the '98_85_acc/' directory.
train_rep_final.load_state_dict(
    torch.load('torch_outputs/torch_outputs_train_rep.pt', map_location='cpu')
)

## II. Model Evaluation - Figure X

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import adjusted_rand_score, confusion_matrix
from scipy.optimize import linear_sum_assignment

# Encode the dataset labels of the training samples as integers.
le = LabelEncoder()
true_labels = le.fit_transform(train_loader.dataset.get_labels())

# Assign every training representation to its most probable GMM component.
# The model and representations loaded in Section I (`dgd_final`,
# `train_rep_final`) are used here; the previous names `dgd_model` and
# `train_rep` are never defined in this notebook.
clustering  = torch.exp(dgd_final.gmm.get_log_prob_comp(train_rep_final.z.detach()))
pred_labels = torch.max(clustering, dim=-1).indices.cpu().detach().numpy()

# Rows are the encoded true labels, columns the predicted component indices.
cm_test = confusion_matrix(true_labels, pred_labels)

# Component indices are arbitrary, so find the one-to-one row/column matching
# that maximises the matched counts (cost = max - count, minimised) and
# reorder the columns accordingly.
idxs = linear_sum_assignment(-cm_test + np.max(cm_test))
cm_test2 = cm_test[:, idxs[1]]

# Optional single-number summary:
# adjusted_rand_score(true_labels, pred_labels)

In [None]:
# `cm` was never assigned anywhere in this notebook; bind it to the
# column-aligned confusion matrix computed in the previous cell.
# NOTE(review): confirm that `cm_test2` is the matrix intended for this figure.
cm = cm_test2

# Express every cell as a percentage of its column (component) total.
# Columns without any samples are left at 0 instead of dividing by zero.
comp_totals = np.sum(cm, axis=0)
cm_norm = np.round(
    np.divide(
        cm,
        comp_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=comp_totals > 0,
    ) * 100,
    decimals=1,
)

class_lbls = list(cohorts.keys())
# Hard-coded number of samples per cohort, shown on the right-hand axis.
# NOTE(review): the matrix rows follow the label encoding used to build the
# confusion matrix, while these labels follow `cohorts` key order - verify
# that both orders agree.
sample_num_lbls = [47, 47, 24, 39, 6, 3, 12, 46]

# Hide cells that contain no samples so only populated cells are drawn.
zr_mask = cm_norm <= 0

fig, ax = plt.subplots(figsize=(8,6))

sns.heatmap(cm_norm, cmap='Blues',
            annot=True,
            fmt='g',
            xticklabels=comp_totals,
            yticklabels=class_lbls,
            linewidths=0.1,
            mask=zr_mask,
            linecolor='black',
            ax=ax,
            cbar_kws={
                'pad': 0.15,
                'shrink': 0.375,
                'aspect': 5,
                'anchor': (0.0, 1.0)
            })

# Secondary y-axis on the right that carries the per-cohort sample counts.
ax_y = ax.twinx()
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
ax.tick_params(axis='both', length=0)

# Mirror the heatmap's tick positions and limits so both axes line up.
ax_y.set_yticks(ax.get_yticks())
ax_y.set_ylim(ax.get_ylim())
ax_y.set_yticklabels(sample_num_lbls)
ax_y.tick_params(axis='y', length=0)

ax.set_xlabel('Samples per component', fontsize=14, labelpad=15)
ax_y.set_ylabel('Samples per tissue' , fontsize=14, labelpad=30, rotation=270)

ax.tick_params(  axis='y', direction='out', pad=5)
ax_y.tick_params(axis='y', direction='out', pad=5)

ax.set_title('Gaussian Components', pad=15, fontsize=14)

plt.tight_layout()

## III. Model Evaluation - Figure Y

PCA projection of the training representations, with 1σ and 2σ ellipses for each GMM component.

In [None]:
from sklearn.decomposition import PCA
from matplotlib.patches import Ellipse

# Fit a single 2-D PCA on the training representations stacked with the GMM
# means, so the means take part in defining the projection.
train_reps_np = train_rep_final.z.detach().cpu().numpy()
gmm_means_np  = dgd_final.gmm.means.detach().cpu().numpy()
means_and_reps = np.vstack((train_reps_np, gmm_means_np))

pca = PCA(n_components=2)
pca_fit = pca.fit_transform(means_and_reps)

# Cohort label and number of training samples, in row order of the
# representation matrix.
# NOTE(review): assumes the rows are grouped by cohort in exactly this order.
train_cohort_sizes = [
    ('Breast (BRCA1)'     , 47),
    ('Kidney (ccRCC3)'    , 47),
    ('Kidney (ccRCC4)'    , 24),
    ('Colon (COAD)'       , 39),
    ('Brain (GBM)'        ,  6),
    ('Thyroid (HurthleCC)',  3),
    ('Pancreas (PDAC)'    , 12),
    ('Prostate (PRAD)'    , 46),
]
# Fail loudly instead of silently mis-colouring points when the loaded
# representations do not match the hard-coded cohort sizes.
assert sum(n for _, n in train_cohort_sizes) == train_reps_np.shape[0], \
    'cohort sizes do not add up to the number of training representations'

fig, ax = plt.subplots(figsize=(6,4))

# One scatter call per cohort so each gets its own colour and legend entry.
row_start = 0
for cohort_label, n_samples in train_cohort_sizes:
    row_stop = row_start + n_samples
    ax.scatter(pca_fit[row_start:row_stop, 0],
               pca_fit[row_start:row_stop, 1],
               label=cohort_label)
    row_start = row_stop

# Project every GMM component into the PCA plane and draw its 1- and 2-sigma
# ellipses. `log_var` is exponentiated and placed on a diagonal, i.e. it is
# treated as per-dimension log-variances of a diagonal covariance.
P = pca.components_
gmm_vars_np = torch.exp(dgd_final.gmm.log_var).detach().cpu().numpy()
for comp_idx in range(dgd_final.gmm.n_comp):
    cov_latent = np.diag(gmm_vars_np[comp_idx])
    mean_2d = P @ (gmm_means_np[comp_idx] - pca.mean_)
    cov_2d_projected = P @ cov_latent @ P.T

    # Eigen-decomposition gives the ellipse axes; sort so the major axis
    # comes first and defines the rotation angle.
    vals, vecs = np.linalg.eigh(cov_2d_projected)
    order = vals.argsort()[::-1]
    vals, vecs = vals[order], vecs[:, order]
    angle = np.degrees(np.arctan2(vecs[1, 0], vecs[0, 0]))

    for n_std in (1.0, 2.0):
        w, h = 2 * n_std * np.sqrt(vals)
        ax.add_patch(Ellipse(xy=mean_2d, width=w, height=h, angle=angle,
                             facecolor='black', alpha=0.1))

# The points are grouped by tissue/cohort, so title and legend say so (they
# previously read "Analytical Platforms", left over from another colouring).
ax.set_title('Training Representations in Latent Space')

ax.set_xlabel(f'PC1 ({(pca.explained_variance_ratio_[0] * 100):.2f}%)', labelpad=10)
ax.set_ylabel(f'PC2 ({(pca.explained_variance_ratio_[1] * 100):.2f}%)')
ax.legend(bbox_to_anchor=(1.40, 1.025),loc='upper right', fancybox=False, framealpha=0.0, title='Tissue Type', alignment='left')

## IV. Model Evaluation - Figure Z

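PCA projection of the tumor representations together with the GMM means, with 1σ and 2σ ellipses for each GMM component.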

In [None]:
# Fit a 2-D PCA on the tumor representations stacked with the GMM means.
# The loaded model `dgd_final` is used; `dgd_model` is never defined in this
# notebook.
# NOTE(review): `tumor_rep_final` is not created in any cell above - it is
# expected to already exist in the kernel; confirm where it comes from.
tumor_reps_np = tumor_rep_final.detach().cpu().numpy()
gmm_means_np  = dgd_final.gmm.means.detach().cpu().numpy()

pca_tumor = PCA(n_components=2)
means_and_tumor_reps = np.vstack((tumor_reps_np, gmm_means_np))
pca_fit = pca_tumor.fit_transform(means_and_tumor_reps)

# Cohort label and number of tumor samples, in row order of the
# representation matrix.
# NOTE(review): assumes the rows are grouped by cohort in exactly this order.
tumor_cohort_sizes = [
    ('Breast (BRCA1)'     , 61),
    ('Kidney (ccRCC3)'    , 67),
    ('Kidney (ccRCC4)'    , 52),
    ('Colon (COAD)'       , 37),
    ('Brain (GBM)'        , 74),
    ('Thyroid (HurthleCC)', 28),
    ('Pancreas (PDAC)'    , 27),
    ('Prostate (PRAD)'    , 91),
]
n_tumor_rows = sum(n for _, n in tumor_cohort_sizes)
# Fail loudly instead of silently mis-colouring points when the tumor
# representations do not match the hard-coded cohort sizes.
assert n_tumor_rows == tumor_reps_np.shape[0], \
    'cohort sizes do not add up to the number of tumor representations'

fig, ax = plt.subplots(figsize=(6,4))

# One scatter call per cohort so each gets its own colour and legend entry.
row_start = 0
for cohort_label, n_samples in tumor_cohort_sizes:
    row_stop = row_start + n_samples
    ax.scatter(pca_fit[row_start:row_stop, 0],
               pca_fit[row_start:row_stop, 1],
               label=cohort_label)
    row_start = row_stop

# Rows after the tumor samples are the projected GMM means.
ax.scatter(pca_fit[n_tumor_rows:,0], pca_fit[n_tumor_rows:,1], label='GMM Means', c='black', marker='*')
ax.set_title('Tumor Representations and Means in Latent Space')

# Project every GMM component into the PCA plane and draw its 1- and 2-sigma
# ellipses. `log_var` is exponentiated and placed on a diagonal, i.e. it is
# treated as per-dimension log-variances of a diagonal covariance.
P = pca_tumor.components_
gmm_vars_np = torch.exp(dgd_final.gmm.log_var).detach().cpu().numpy()
for comp_idx in range(dgd_final.gmm.n_comp):
    cov_latent = np.diag(gmm_vars_np[comp_idx])
    mean_2d = P @ (gmm_means_np[comp_idx] - pca_tumor.mean_)
    cov_2d_projected = P @ cov_latent @ P.T

    # Eigen-decomposition gives the ellipse axes; sort so the major axis
    # comes first and defines the rotation angle.
    vals, vecs = np.linalg.eigh(cov_2d_projected)
    order = vals.argsort()[::-1]
    vals, vecs = vals[order], vecs[:, order]
    angle = np.degrees(np.arctan2(vecs[1, 0], vecs[0, 0]))

    for n_std in (1.0, 2.0):
        w, h = 2 * n_std * np.sqrt(vals)
        ax.add_patch(Ellipse(xy=mean_2d, width=w, height=h, angle=angle,
                             facecolor='black', alpha=0.1))

ax.set_xlabel(f'PC1 ({(pca_tumor.explained_variance_ratio_[0] * 100):.2f}%)', labelpad=10)
ax.set_ylabel(f'PC2 ({(pca_tumor.explained_variance_ratio_[1] * 100):.2f}%)')
ax.legend(bbox_to_anchor=(1.40, 1.025),loc='upper right', fancybox=False, framealpha=0.0, title='Tissue Type', alignment='left')