In [2]:
#@title Libraries
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, metrics, Model, losses
import pandas as pd
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import time

# ***Libraries*** 👆
---
# ***Methods and Classes*** 👇


In [3]:
#@title Sampling
class Sampling(layers.Layer):
  '''
  Reparameterization layer.
  Draws z = z_mean + sigma * epsilon, where sigma = exp(0.5 * z_log_var)
  and epsilon is standard-normal noise with one row per sample and one column per latent dimension.
  '''
  def call(self, z_mean, z_log_var):
    # Noise has the same (batch, dim) extent as z_mean, read dynamically from the tensor.
    noise_shape = (tf.shape(z_mean)[0], tf.shape(z_mean)[1])
    epsilon = tf.random.normal(shape=noise_shape)
    # z_log_var holds log(sigma^2), so exp(0.5 * z_log_var) is sigma.
    sigma = tf.exp(0.5 * z_log_var)
    return z_mean + sigma * epsilon

In [4]:
#@title VAE Model
class VAE(keras.Model):
  '''
  Variational autoencoder made of a dense encoder and a dense decoder.
  The encoder maps an input vector to (z_mean, z_log_var, z); the decoder maps z back to the input space.
  Training minimises Reconstruction_loss + beta * KL_loss (see Loss and train_step).
  '''

  def __init__(self, input_dim:int, hidden_dim:int, latent_dim:int, **kwargs):
    '''
    Define the model structure and its properties.

    input_dim:  size of each input vector
    hidden_dim: width of the single hidden layer in both encoder and decoder
    latent_dim: size of the latent vector z
    kwargs:     forwarded to keras.Model
    '''
    super().__init__(**kwargs)
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.latent_dim = latent_dim
    self.encoder = self.Encoder()
    self.decoder = self.Decoder()
    # Running means of the three losses; they accumulate until reset_state() is called on them.
    self.total_loss = metrics.Mean(name="total_loss")
    self.reconstruction_loss = metrics.Mean(name="reconstruction_loss")
    self.kl_loss = metrics.Mean(name="kl_loss")
    # Not read anywhere inside this class; kept so external code that references them keeps working.
    self.epoch = 0
    self.x = tf.Variable(10, trainable=False, dtype=float)


  @property
  def metrics(self):
    '''
    Loss metrics tracked by this model: total, reconstruction and KL.
    '''
    return [self.total_loss, self.reconstruction_loss, self.kl_loss,]


  def Encoder(self)->Model:
    '''
    Encoder model to transform the input to the latent space (compress).
    Outputs [z_mean, z_log_var, z], where z is sampled by the Sampling layer.
    Side effects: prints the model summary and writes the architecture plot to 'Encoder.png'.
    '''
    encoder_inputs = keras.Input(shape=(self.input_dim,))
    x = layers.Dense(self.hidden_dim, activation="relu")(encoder_inputs)
    z_mean = layers.Dense(self.latent_dim, name="z_mean")(x)
    z_log_var = layers.Dense(self.latent_dim, name="z_log_var")(x)
    z = Sampling()(z_mean, z_log_var)
    encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    encoder.summary()
    keras.utils.plot_model(encoder, show_shapes=True, to_file='Encoder.png')
    return encoder


  def Decoder(self)->Model:
    '''
    Decoder model to reconstruct the input from the latent vector (decompress).
    Side effects: prints the model summary and writes the architecture plot to 'Decoder.png'.
    '''
    latent_inputs = keras.Input(shape=(self.latent_dim,))
    x = layers.Dense(self.hidden_dim, activation="relu")(latent_inputs)
    # NOTE(review): a ReLU output can never be negative; if the inputs contain negative
    # values they cannot be reconstructed exactly -- confirm this activation is intended.
    decoder_outputs = layers.Dense(self.input_dim, activation="relu")(x)
    decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
    decoder.summary()
    keras.utils.plot_model(decoder, show_shapes=True, to_file='Decoder.png')
    return decoder


  def Loss(self, input: tf.Tensor, output: tf.Tensor, z_mean: tf.Tensor, z_log_var: tf.Tensor, beta: float)->list:
    '''
    The loss function of the VAE (Reconstruction_loss + beta * KL_loss).

    input:     batch fed to the encoder
    output:    decoder reconstruction of that batch
    z_mean:    encoder output, mean of the latent distribution
    z_log_var: encoder output, log-variance of the latent distribution
    beta:      weight of the KL term in the total loss
    Returns [total_loss, Reconstruction_loss, KL_loss] as scalar tensors.
    '''
    # Align dtypes so the subtraction below does not fail when the data is not stored as the model's dtype.
    input = tf.cast(input, output.dtype)
    Reconstruction_loss = tf.reduce_mean(tf.square(input - output))
    KL_per_dim = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
    # Sum over the latent axis to get one KL value per sample, then average over the batch.
    # (Summing over every axis would make the KL term grow with the batch size.)
    KL_loss = tf.reduce_mean(tf.reduce_sum(KL_per_dim, axis=-1))
    return [Reconstruction_loss + beta*KL_loss, Reconstruction_loss, KL_loss]


  def train_step(self, input: tf.Tensor, beta:float=0)->dict:
    '''
    Calculate the output of the model and the errors.
    Update the model's weights by backpropagating the error.
    As the KL-divergence loss is bigger than the reconstruction loss, first warm up the model by setting beta=0.

    input: one batch of input vectors
    beta:  weight of the KL term (default 0, i.e. reconstruction only)
    Returns a dict with the current running means "Loss", "Reconstruction_loss" and "KL_loss".
    '''
    # Only the forward pass and the loss need to be recorded by the tape.
    with tf.GradientTape() as tape:
      z_mean, z_log_var, z = self.encoder(input)
      output = self.decoder(z)
      Total_loss, Reconstruction_loss, KL_loss = self.Loss(input, output, z_mean, z_log_var, beta)
    # Gradients are computed and applied outside the recording context.
    grads = tape.gradient(Total_loss, self.trainable_weights)
    self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
    self.total_loss.update_state(Total_loss)
    self.reconstruction_loss.update_state(Reconstruction_loss)
    self.kl_loss.update_state(KL_loss)
    return { "Loss": self.total_loss.result(), "Reconstruction_loss": self.reconstruction_loss.result(), "KL_loss": self.kl_loss.result(),}


In [5]:
#@title ML Classifiers
class classifiers:
  '''
  Fit scikit-learn baseline classifiers on a train split and store their dev-split reports.
  Every *_classification method adds one entry to `baseline_results`: the
  classification_report dict of the dev predictions plus an 'infer_time' value in seconds.
  '''

  def __init__(self, baseline_results, X_train, y_train, X_dev, y_dev,) -> None:
    '''
    baseline_results: dict that the methods fill in place (key -> report dict)
    X_train, y_train: features and labels used to fit each classifier
    X_dev, y_dev:     features and labels used to evaluate each classifier
    '''
    self.baseline_results = baseline_results
    self.X_train = X_train
    self.y_train = y_train
    self.X_dev = X_dev
    self.y_dev = y_dev

  def _fit_and_report(self, clf, key, label, verbose):
    '''
    Fit `clf`, predict the dev split, and store the report under `key`; print `label` and the dev score when verbose.
    '''
    clf.fit(self.X_train, self.y_train)
    # Time the prediction only, so 'infer_time' does not include the training time.
    beg = time.perf_counter()
    y_pred = clf.predict(self.X_dev)
    end = time.perf_counter()
    report = classification_report(self.y_dev, y_pred, output_dict=True, zero_division=1)
    report['infer_time'] = end-beg
    self.baseline_results[key] = report
    if verbose:
      print(f'{label}\t{clf.score(self.X_dev, self.y_dev)}')

  def naive_bayes_classification(self, suffix, verbose):
    '''
    Gaussian Naive Bayes; result key is 'Naive-Bayes {suffix}'.
    '''
    self._fit_and_report(GaussianNB(), f'Naive-Bayes {suffix}', 'Naive Bayes', verbose)


  def knn_classification(self, n_neighbors, suffix, verbose):
    '''
    K-nearest-neighbours with `n_neighbors` neighbours; result key is 'KNN-{n_neighbors} {suffix}'.
    '''
    self._fit_and_report(
      KNeighborsClassifier(n_neighbors=n_neighbors),
      f'KNN-{n_neighbors} {suffix}',
      f"KNN with n_neighbors: {n_neighbors}",
      verbose,
    )


  def random_forest_classification(self, n_estimators, suffix, verbose):
    '''
    Random forest with `n_estimators` trees; result key is 'RandomForest-{n_estimators} {suffix}'.
    '''
    self._fit_and_report(
      RandomForestClassifier(n_estimators=n_estimators),
      f'RandomForest-{n_estimators} {suffix}',
      f"Random Forest with n_estimators: {n_estimators}",
      verbose,
    )


  def mlp_classification(self, layers, suffix, verbose):
    '''
    Multi-layer perceptron with hidden layer sizes `layers` (max_iter=3000); result key is 'MLP-{layers} {suffix}'.
    '''
    self._fit_and_report(
      MLPClassifier(hidden_layer_sizes=layers, max_iter=3000),
      f'MLP-{layers} {suffix}',
      f"MLP with layers: {layers}",
      verbose,
    )


  def svm_classification(self, kernels, suffix, verbose):
    '''
    Support vector classifier with kernel `kernels`; result key is 'SVM-{kernels} {suffix}'.
    '''
    self._fit_and_report(
      SVC(kernel=kernels),
      f'SVM-{kernels} {suffix}',
      f"SVM with kernel: {kernels}",
      verbose,
    )

# ***Methods and Classes*** 👆
---
# ***Main*** 👇

In [8]:
# Load the dataset and set the hyperparams
# NOTE: unpickling can execute arbitrary code -- only load embedding files from a trusted source.
Train = pd.read_pickle("/content/train_embeddings.p")
Test = pd.read_pickle("/content/dev_embeddings_originals.p")
input_dim = Train.shape[1]
hidden_dim = 128
latent_dim = 50
epochs = 30

# Build and Train the model
vae = VAE(input_dim, hidden_dim, latent_dim)
vae.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001))
# A manual loop is used instead of vae.fit(...) so that a per-epoch beta can be passed to train_step.
trainset = tf.data.Dataset.from_tensor_slices(Train).batch(16)
for epoch in range(epochs):
  # KL weight: 0 in the first epoch, then grows linearly with the epoch index.
  beta = (epoch/10**8)
  # The loss metrics are running means; reset them so each epoch reports only its own batches.
  for metric in vae.metrics:
    metric.reset_state()
  for batch in trainset:
    vae.train_step(batch, beta)
  # After the last batch each metric holds the mean over all batches of this epoch.
  epoch_loss = vae.total_loss.result().numpy()
  epoch_reconstruction_loss = vae.reconstruction_loss.result().numpy()
  epoch_kl_loss = vae.kl_loss.result().numpy()
  print(f"\x1b[1mEpoch {epoch+1}/{epochs}\x1b[0m ==> \x1b[0;31mLoss\x1b[0m: {epoch_loss} \t \x1b[0;31mReconstruction_Loss\x1b[0m: {epoch_reconstruction_loss} \t \x1b[0;31mKL_Loss\x1b[0m: {epoch_kl_loss}")

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 384)]        0           []                               
                                                                                                  
 dense_3 (Dense)                (None, 128)          49280       ['input_3[0][0]']                
                                                                                                  
 z_mean (Dense)                 (None, 50)           6450        ['dense_3[0][0]']                
                                                                                                  
 z_log_var (Dense)              (None, 50)           6450        ['dense_3[0][0]']                
                                                                                            

In [9]:
# Use the trained encoder to get the compressed (latent) vectors
trained_encoder = vae.encoder
trained_encoder.trainable = False  # freeze the weights
# Trainset: the encoder returns [z_mean, z_log_var, z]
# NOTE(review): z is the *sampled* latent vector, so it contains random noise and differs
# between runs; z_mean would be the deterministic alternative -- confirm which one is intended.
z_mean, z_log_var, z = trained_encoder.predict(Train, verbose=0)
print(f'Size of the trainset before {Train.nbytes} and after {z.nbytes} dimensionality reduction.')
# Testset
z_mean_test, z_log_var_test, z_test = trained_encoder.predict(Test, verbose=0)
# Results: filled in place by the classifier methods below (key -> report dict)
baseline_results = {}
# Labels: exactly one of the dataset variants below should be active; the others stay commented out.
# df = pd.read_json('train.json', orient='index') # AspectSentiment
# labels = df['polarity'].values
# df_test = pd.read_json('test.json', orient='index')
# labels_test = df_test['polarity'].values
df = pd.read_table('train_ml.tsv') # CheckThatLab
labels = df['label'].values
df_test = pd.read_table('dev_ml.tsv')
labels_test = df_test['label'].values
# df = pd.read_csv('Train.csv') # FEVER
# labels = df['label'].values
# df_test = pd.read_csv('Test.csv')
# labels_test = df_test['label'].values
# Classifiers: fit each baseline on the latent train vectors and evaluate on the latent dev vectors.
# `suffix` is part of every result key and is also used as the output file name in the next cell.
suffix = 'VAE_' + 'CheckThatLab'
cls = classifiers(baseline_results, X_train=z, y_train=labels, X_dev=z_test, y_dev=labels_test)
cls.naive_bayes_classification(suffix, True)
cls.knn_classification(50, suffix, True)
cls.random_forest_classification(100, suffix, True)
cls.mlp_classification((200, 100), suffix, True)
cls.svm_classification('rbf', suffix, True)

Size of the trainset before 10180608 and after 1325600 dimensionality reduction.
Naive Bayes	0.655
KNN with n_neighbors: 50	0.6483333333333333
Random Forest with n_estimators: 100	0.745
MLP with layers: (200, 100)	0.8
SVM with kernel: rbf	0.7433333333333333


In [None]:
import pickle

# Persist the collected classifier reports to '<suffix>.pickle' using the newest pickle protocol.
with open(f'{suffix}.pickle', mode='wb') as results_file:
    pickle.Pickler(results_file, protocol=pickle.HIGHEST_PROTOCOL).dump(baseline_results)