<a href="https://colab.research.google.com/github/benjaminnigjeh/keyProteoforms/blob/main/dimensionalityReduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries and dependencies

Dimensionality reduction is performed using Keras as the frontend and TensorFlow as the backend. The datasets are prepared with the NumPy and pandas libraries.

In [38]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from keras.callbacks import TensorBoard
from google.colab import files

# Set up the tensorboard callback function

In [39]:
# Run name; it is substituted into the TensorBoard log directory path below.
NAME = "Unsupervised_clustering"
LOG_DIR = '/content/{}'.format(NAME)

# Callback handed to every `fit` call so training is logged under LOG_DIR.
tensorboard = TensorBoard(log_dir=LOG_DIR)

# Upload dataset from a local folder


In [None]:
# Upload a CSV from the local machine under the name 'dataset'; it is then
# read back from /content/dataset.
dataset = files.upload_file('dataset')
df = pd.read_csv('/content/dataset')

# Split the table into the feature matrix and the "target" label column.
X = df.copy()
Y = X.pop("target")
x_train = np.array(X)
y_train = np.array(Y)

# The encoder input / decoder output width must equal the number of feature
# columns, so take it from the data instead of hard-coding it (previously
# fixed at 1369, which only works for a CSV with exactly that many features).
n_dim = x_train.shape[1]


# Generate random dataset

In [65]:
# Synthetic stand-in data: 32 samples with n_dim uniform [0, 1) features each,
# paired with random binary (0/1) labels.
n_dim = 1369
x_train = np.random.random_sample((32, n_dim))
y_train = np.random.randint(0, 2, size=32)

# Encoder and decoder architecture and VAE class

In [96]:

# Width of the latent space: the z_mean and z_log_var layers each have this
# many units, and the latent scatter plot draws components 0 and 1.
latent_dim = 2

# Sampling layer for latent space
class Sampling(layers.Layer):
    """Draw a latent vector z from a (z_mean, z_log_var) pair.

    The sample is computed as ``z_mean + exp(0.5 * z_log_var) * epsilon``
    with ``epsilon`` drawn from a standard normal, so the result stays a
    differentiable function of both inputs.
    """

    def call(self, inputs):
        """Return one sample per row of ``z_mean``.

        Args:
            inputs: A pair ``(z_mean, z_log_var)``; the first two dimensions
                of ``z_mean`` determine the shape of the noise tensor.
        """
        mean, log_var = inputs
        # Read the shape dynamically so any batch size is supported.
        n_rows = tf.shape(mean)[0]
        n_cols = tf.shape(mean)[1]
        noise = tf.keras.backend.random_normal(shape=(n_rows, n_cols))
        # exp(0.5 * log-variance) is the standard deviation.
        std = tf.exp(0.5 * log_var)
        return mean + std * noise

# VAE Model for 1D input
class VAE(keras.Model):
    """Variational autoencoder built from an encoder and a decoder model.

    The encoder must return ``(z_mean, z_log_var, z)`` when called on a batch
    and the decoder must map ``z`` back to a tensor comparable to the input.
    Training minimises reconstruction loss plus KL divergence; the running
    means of all three quantities are tracked as metrics.
    """

    def __init__(self, encoder, decoder, **kwargs):
        """Store the sub-models and create one running-mean tracker per loss.

        Args:
            encoder: Model returning ``(z_mean, z_log_var, z)``.
            decoder: Model reconstructing the input from ``z``.
            **kwargs: Forwarded unchanged to ``keras.Model``.
        """
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Return the loss trackers so Keras manages (and resets) them."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        """Run one optimisation step and return the running loss means.

        Args:
            data: A batch of inputs. If ``fit`` was given targets or sample
                weights, Keras delivers a tuple whose first element is the
                input batch; only that element is used because training is
                unsupervised.

        Returns:
            Dict with the running means under the keys ``"loss"``,
            ``"reconstruction_loss"`` and ``"kl_loss"``.
        """
        # fit(x, y) hands over (x, y[, sample_weight]); the encoder accepts
        # only x, so drop everything else instead of failing on the tuple.
        if isinstance(data, tuple):
            data = data[0]

        with tf.GradientTape() as tape:
            # Get the latent variables from the encoder
            z_mean, z_log_var, z = self.encoder(data)

            # Reconstruct the input from the latent space using the decoder
            reconstruction = self.decoder(z)

            # Compute reconstruction loss (binary crossentropy for 1D data),
            # averaged over the batch.
            # NOTE(review): binary cross-entropy assumes inputs scaled to
            # [0, 1] - confirm for real datasets.
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(data, reconstruction)
            )

            # Compute KL divergence loss, averaged over every element of the
            # (batch, latent) tensor.
            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            )

            # Total loss is the sum of reconstruction and KL loss
            total_loss = reconstruction_loss + kl_loss

        # Apply gradients to the model
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        # Update the metrics
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)

        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }


# Training function

In [90]:
def vae_training(random_seed, epochs=50):
    """Build a fresh VAE, train it on the global ``x_train`` and return its loss curve.

    Relies on the module-level names ``n_dim``, ``latent_dim``, ``x_train``,
    ``tensorboard``, ``Sampling`` and ``VAE``.

    Args:
        random_seed: Seed applied to NumPy and TensorFlow before any layer
            is created.
        epochs: Number of training epochs. Defaults to 50, the value that
            was previously hard-coded.

    Returns:
        The per-epoch ``"loss"`` values recorded in the training history.
    """
    np.random.seed(random_seed)
    tf.random.set_seed(random_seed)

    # Encoder: n_dim -> 120 -> 120 -> 16 (ReLU), then two linear heads for
    # the latent mean and log-variance, and a sampled z.
    encoder_inputs = keras.Input(shape=(n_dim,))
    x = layers.Dense(120, activation="relu")(encoder_inputs)
    x = layers.Dense(120, activation="relu")(x)
    x = layers.Dense(16, activation="relu")(x)
    z_mean = layers.Dense(latent_dim, name="z_mean")(x)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
    z = Sampling()([z_mean, z_log_var])
    encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")

    # Decoder: latent_dim -> n_dim (ReLU) -> n_dim (sigmoid).
    latent_inputs = keras.Input(shape=(latent_dim,))
    x = layers.Dense(n_dim, activation="relu")(latent_inputs)
    decoder_outputs = layers.Dense(n_dim, activation="sigmoid")(x)
    decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")

    vae = VAE(encoder, decoder)
    vae.compile(optimizer=keras.optimizers.Adam())

    # Unsupervised training: only the inputs are passed to fit.
    history = vae.fit(x_train, epochs=epochs, callbacks=[tensorboard])
    return history.history['loss']

# Training with multiple random initializations

In [None]:
# One independent training run per seed (10, 20, ..., 100); each entry is the
# per-epoch loss list returned by vae_training.
pellet_loss = [vae_training(seed) for seed in range(10, 101, 10)]

# The plotting cell also reads soluble_loss and random_loss. They are built
# the same way - presumably after loading the matching dataset into x_train
# (confirm against the intended workflow):
# soluble_loss = [vae_training(seed) for seed in range(10, 101, 10)]
# random_loss = [vae_training(seed) for seed in range(10, 101, 10)]

# Plotting loss versus epoch

In [None]:
# Loss curves to draw, in legend order: (variable name, marker colour, label).
# Each variable is expected to hold one per-epoch loss list per training run.
_loss_series = (
    ("random_loss", "black", "random loss"),
    ("soluble_loss", "blue", "soluble loss"),
    ("pellet_loss", "red", "pellet loss"),
)

plt.figure(figsize=(8, 5))

for _var_name, _color, _label in _loss_series:
    # Not every curve is produced in every session (two of the three training
    # loops are commented out above), so look the variable up by name and
    # skip it when it is missing instead of failing with a NameError.
    _runs = globals().get(_var_name)
    if _runs is None or len(_runs) == 0:
        print("Skipping '{}': {} is not defined or is empty.".format(_label, _var_name))
        continue

    # Rows are runs, columns are epochs: aggregate across runs per epoch.
    data_array = np.array(_runs)
    mean_values = np.mean(data_array, axis=0)
    std_values = np.std(data_array, axis=0)
    x = np.arange(1, data_array.shape[1] + 1)

    plt.errorbar(x, mean_values, yerr=std_values, fmt='o', color=_color, label=_label, capsize=5)


# Adding labels and title
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.ylim(0.5, 0.8)
plt.title('')
plt.legend()

# Display the plot
plt.grid(True)
plt.show()


# Visualizing the latent layer

In [None]:
# Seed NumPy and TensorFlow before any layer is created.
np.random.seed(42)
tf.random.set_seed(42)
# Encoder: n_dim inputs -> 120 -> 120 -> 16 ReLU units, followed by two linear
# heads (latent mean and log-variance) and a Sampling layer that draws z.
encoder_inputs = keras.Input(shape=(n_dim,))
x = layers.Dense(120 , activation="relu")(encoder_inputs)
x = layers.Dense(120 , activation="relu")(x)
x = layers.Dense(16, activation="relu")(x)
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
# Decoder: latent_dim inputs -> n_dim ReLU units -> n_dim sigmoid outputs.
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(n_dim, activation="relu")(latent_inputs)
decoder_outputs = layers.Dense(n_dim, activation="sigmoid")(x)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
# Same architecture as vae_training, but trained for 10 epochs; the trained
# model stays in the global `vae` for plotting. Labels are not passed to fit.
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())
vae.fit(x_train, epochs=10, callbacks=[tensorboard])


def plot_label_clusters(vae, data, labels):
    """Scatter the first two latent-mean components of ``data``, coloured by ``labels``.

    Args:
        vae: Model exposing an ``encoder`` whose ``predict`` returns the
            latent mean as its first output.
        data: Samples to encode.
        labels: One colour value per sample, passed to ``plt.scatter`` as ``c``.
    """
    # The encoder yields (z_mean, z_log_var, z); only the mean is plotted.
    latent_mean, _log_var, _sample = vae.encoder.predict(data)
    first_axis = latent_mean[:, 0]
    second_axis = latent_mean[:, 1]

    plt.figure(figsize=(12, 10))
    plt.scatter(first_axis, second_axis, c=labels)
    plt.colorbar()
    plt.xlabel("z[0]")
    plt.ylabel("z[1]")
    plt.show()

# Show where the training samples land in latent space, coloured by y_train.
plot_label_clusters(vae, x_train, y_train)