This is the **2nd** Notebook in the clustering pipeline. It allows you to take the DataSet object and compute projections to allow for visualization of the data. This step is purely exploratory, but can give you a better sense of the data. In the end, the projections generated here are saved as a .npz file.

Use <u>***pappas_tadam***</u> virtual environment.

In [None]:
# Set this to whatever directory GoodCopy is in. Later cells build file paths by
# plain string concatenation (home_dir + "GoodCopy/..."), so it must end with "/".

home_dir = "/home/l/lungboy/tadam/Project/"

# Guard against a forgotten trailing slash so the concatenated paths stay valid.
if not home_dir.endswith("/"):
    home_dir += "/"

# Importing Data

In [None]:
# Importing packages and functions

import numpy as np
import pandas as pd
import umap.umap_ as umap
import matplotlib.pyplot as plt
import sys

sys.path.append(home_dir + 'GoodCopy/Functions')

import FunctionsOOPGood as func

In [None]:
# Load the previously saved DataSet object

# Location of the saved object, relative to the project directory set above
dataset_path = home_dir + "DataSet_Objects/data_saved"

# Construct the object with empty=True, then read the saved state into it.
# NOTE(review): open_DataSet presumably fills `data` in place (its return value
# is not used here) -- confirm in FunctionsOOPGood.
data = func.DataSet(empty=True)
data.open_DataSet(dataset_path)

# UMAP
The UMAP projections are generated with the `umap` package for Python. The documentation is available at https://umap-learn.readthedocs.io/en/latest/

Feel free to change the parameters `n_neighbors`, `min_dist` and `n_components`, as well as any other parameters you find in the documentation, to get a nicer-looking visualization.

In [None]:
# Generating UMAP projections for four views of the data: the Gower distance matrix
# of the risk factors, the risk-factor input data itself, the SNF distance matrix,
# and the biomarker data.

# Hyperparameters shared by all four projections -- tune them here, in one place.
umap_settings = dict(n_neighbors=10, min_dist=0.5, n_components=2)

# data.gower is handed to UMAP as a precomputed distance matrix
UMAP_gower = umap.UMAP(metric="precomputed", **umap_settings).fit_transform(data.gower)

# no metric given, so UMAP's default metric is applied to the raw input data
UMAP_onehot = umap.UMAP(**umap_settings).fit_transform(data.input_data)

# data.snf_dist is also handed over as a precomputed distance matrix
UMAP_snf = umap.UMAP(metric="precomputed", **umap_settings).fit_transform(data.snf_dist)

# dropna() is applied before embedding, so UMAP_biodata may contain fewer rows
# than the other projections (assuming bio_data is a pandas object with missing values)
bio_complete = data.bio_data.dropna()
UMAP_biodata = umap.UMAP(**umap_settings).fit_transform(bio_complete)

### Plotting Overall

In [None]:
# Plotting the four projections side by side, one subplot per projection

fig, axes = plt.subplots(1, 4, figsize=(24, 8))

# (projection, marker color, panel title) for each subplot, left to right
panels = [
    (UMAP_gower, 'blue', 'gower'),
    (UMAP_onehot, 'green', 'onehot'),
    (UMAP_snf, 'red', 'SNF'),
    (UMAP_biodata, 'yellow', 'bio'),
]

for ax, (projection, color, title) in zip(axes, panels):
    # projection.T unpacks into the x and y coordinate arrays;
    # the label matches the panel title (the bio panel used to be labelled 'SNF')
    ax.scatter(*projection.T, color=color, label=title)
    ax.set_title(title)

# Add the figure title first, then adjust the layout, so that the layout step can
# take the title into account instead of placing it on top of the subplot titles
fig.suptitle('UMAP Projections')
plt.tight_layout()

# Save before showing, so the file is written from the finished figure
fig.savefig(home_dir + "GoodCopy/Plots/UMAP.png")

# Show the plot
plt.show()

### Plotting colored by PE

In [None]:
# Plotting colored by PE

from matplotlib.colors import ListedColormap

# Creating color map to visualize nicely, here red is PE (label 1) and gray is
# non-PE (label 0)

cmap_colors = ['gray', 'red']
cmap = ListedColormap(cmap_colors)

# PE labels for every sample, flattened to 1-D for matplotlib
pe_all = np.ravel(data.pe_labels)

# The bio projection was fitted on data.bio_data.dropna(), so only the labels of
# the rows that survived dropna() belong to it. Computed once and reused.
bio_rows = data.pe_labels.index.isin(data.bio_data.dropna().index)
pe_bio = np.ravel(data.pe_labels[bio_rows])

# (projection, PE labels, panel title) for each subplot, left to right
panels = [
    (UMAP_gower, pe_all, 'gower'),
    (UMAP_onehot, pe_all, 'onehot'),
    (UMAP_snf, pe_all, 'SNF'),
    (UMAP_biodata, pe_bio, 'bio'),
]

# Creating figure

fig, axes = plt.subplots(1, 4, figsize=(24, 8))

for ax, (projection, pe, title) in zip(axes, panels):
    # alpha is 0.2 where the label is 0 and 1.0 where the label is 1, i.e. PE points
    # are made more visible. vmin/vmax pin the color scale to the 0/1 labels so a
    # panel that happens to contain only one class is still colored correctly.
    ax.scatter(*projection.T, c=pe, cmap=cmap, vmin=0, vmax=1,
               alpha=0.2 + 0.8 * pe, label=title)
    ax.set_title(title)

# Add the figure title first, then adjust the layout, so that the layout step can
# take the title into account instead of placing it on top of the subplot titles
fig.suptitle('UMAP Projections Colored by PE')
plt.tight_layout()

# Save before showing, so the file is written from the finished figure
fig.savefig(home_dir + "GoodCopy/Plots/UMAP_PE.png")

# Show the plot
plt.show()

### Plotting with site labels

In [None]:
# Plotting colored by site label

from matplotlib.colors import ListedColormap
from sklearn.preprocessing import LabelEncoder

# Encode every site as an integer code. encoder.classes_ lists the site names in
# code order, so category_names[i] is the site that was encoded as i.
encoder = LabelEncoder()
site_codes = encoder.fit_transform(np.ravel(data.site_labels))
category_names = encoder.classes_
n_sites = len(category_names)

# One color per site. The hand-picked palette covers up to six sites; with more
# sites fall back to matplotlib's 'tab20' palette (colors repeat beyond 20 sites).
base_colors = ['red', 'blue', 'green', 'purple', 'orange', 'cyan']
if n_sites <= len(base_colors):
    colors = base_colors[:n_sites]
else:
    tab20 = plt.get_cmap('tab20')
    colors = [tab20(i % 20) for i in range(n_sites)]
cmap = ListedColormap(colors)

# Keep the index of data.site_labels (when it has one) so the encoded labels can be
# matched against data.bio_data's index below. A fresh 0..n-1 index would only line
# up if the data happened to be indexed that way.
site_labels_encoded = pd.DataFrame(site_codes,
                                   index=getattr(data.site_labels, "index", None))

codes_all = np.ravel(site_labels_encoded)

# The bio projection was fitted on data.bio_data.dropna(), so only the codes of
# the rows that survived dropna() belong to it
bio_rows = site_labels_encoded.index.isin(data.bio_data.dropna().index)
codes_bio = np.ravel(site_labels_encoded[bio_rows])

# Legend entries shared by all panels: one marker per site, in code order
legend_artists = [
    plt.Line2D([0], [0], marker='o', color='w', label=name,
               markerfacecolor=color, markersize=10)
    for name, color in zip(category_names, colors)
]

# (projection, site codes, panel title, extra legend options), left to right
panels = [
    (UMAP_gower, codes_all, 'gower', {}),
    (UMAP_onehot, codes_all, 'onehot', {}),
    (UMAP_snf, codes_all, 'SNF', {'loc': "lower right"}),
    (UMAP_biodata, codes_bio, 'bio', {}),
]

# Creating figure

fig, axes = plt.subplots(1, 4, figsize=(30, 8))

for ax, (projection, codes, title, legend_options) in zip(axes, panels):
    # vmin/vmax pin the color scale to the full range of site codes, so code i is
    # always drawn with colors[i] -- also in a panel where some site is absent
    ax.scatter(*projection.T, c=codes, cmap=cmap, vmin=0, vmax=max(n_sites - 1, 1),
               alpha=0.5)
    ax.set_title(title)
    ax.legend(handles=legend_artists, **legend_options)

# Add the figure title first, then adjust the layout, so that the layout step can
# take the title into account instead of placing it on top of the subplot titles
fig.suptitle('UMAP Projections Colored by Site Labels')
plt.tight_layout()

# Save before showing, so the file is written from the finished figure
fig.savefig(home_dir + "GoodCopy/Plots/UMAP_site.png")

# Show the plot
plt.show()

## Exporting Visualizations for Future Use

In [None]:
# Bundle the four projections into a single .npz archive; each array is stored
# under the name of the variable that holds it.
projections = {
    "UMAP_gower": UMAP_gower,
    "UMAP_biodata": UMAP_biodata,
    "UMAP_snf": UMAP_snf,
    "UMAP_onehot": UMAP_onehot,
}
np.savez(home_dir + 'GoodCopy/Objects/UMAP_projections.npz', **projections)