<a href="https://colab.research.google.com/github/cchang-vassar/Semantic-Relations-in-Vector-Embeddings/blob/main/study5_nomic_autoencoder_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Nomic] Autoencoder: Generate Corresponding Embedding

## Set Up

### Imports

In [None]:
import os
import subprocess
import zipfile
import shutil
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np
from google.colab import userdata
from scipy import spatial
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from plotnine import ggplot, geom_line, aes, ggsave, labs, theme, element_text, guides, guide_legend

### OSF Setup

In [None]:
!pip install osfclient

Collecting osfclient
  Downloading osfclient-0.0.5-py2.py3-none-any.whl (39 kB)
Installing collected packages: osfclient
Successfully installed osfclient-0.0.5


In [None]:
# Read the OSF username from Colab's secret store, then mirror it into the
# process environment (presumably for the `osf` CLI used later - verify).
OSF_USERNAME = userdata.get("OSF_USERNAME")
os.environ["OSF_USERNAME"] = OSF_USERNAME

In [None]:
# Read the OSF password from Colab's secret store, then mirror it into the
# process environment (presumably for the `osf` CLI used later - verify).
OSF_PASSWORD = userdata.get("OSF_PASSWORD")
os.environ["OSF_PASSWORD"] = OSF_PASSWORD

In [None]:
# Read the OSF access token from Colab's secret store, then mirror it into the
# process environment (presumably for the `osf` CLI used later - verify).
OSF_TOKEN = userdata.get("OSF_TOKEN")
os.environ["OSF_TOKEN"] = OSF_TOKEN

In [None]:
# Read the OSF project id from Colab's secret store and mirror it into the
# process environment.
OSF_PROJECT_ID = userdata.get("OSF_PROJECT_ID")
os.environ["OSF_PROJECT_ID"] = OSF_PROJECT_ID

## Data

### Load embeddings data from OSF

In [None]:
# Download the zipped embeddings from OSF storage. check=True makes a failed
# fetch raise CalledProcessError instead of falling through to the success
# message; passing an argument list avoids going through a shell.
subprocess.run(
  [
    "osf", "-p", "sakjg", "fetch", "--force",
    "osfstorage/data-dump/nomic-autoencoder/nomic_embeddings_dump.zip",
  ],
  check=True,
)
print("nomic_embeddings_dump.zip successfully imported")

nomic_embeddings_dump_file_path_zip = 'nomic_embeddings_dump.zip'
# Extract into 'embeddings_dump' (underscore): that is the directory the
# "Global data" cell reads the pickles from.
nomic_embeddings_dump_file_path = 'current-data-dump/embeddings_dump'
with zipfile.ZipFile(nomic_embeddings_dump_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(nomic_embeddings_dump_file_path)

extracted_files = os.listdir(nomic_embeddings_dump_file_path)
print("Files extracted:", extracted_files)

### Functions to prepare data

In [None]:
def prepare_training_df(data: pd.DataFrame):
  """Keep only complete 'point' -> 'counter' row pairs and return their numeric columns.

  A row survives when it is a 'point' immediately followed by a 'counter', or a
  'counter' immediately preceded by a 'point'. Adjacency is judged by row
  position, so the result does not depend on the frame's index labels. Every
  other row is dropped, including a 'counter' in the very first row and rows
  whose type is neither 'point' nor 'counter', so the surviving rows strictly
  alternate point, counter, point, counter, ...

  Args:
    data: frame with a 'type' column plus the embedding columns.

  Returns:
    A new frame holding only the numeric columns of the surviving rows, with a
    fresh 0..n-1 index (points on even rows, counters on odd rows).
  """
  is_point = (data['type'] == 'point').to_numpy()
  is_counter = (data['type'] == 'counter').to_numpy()

  # np.roll wraps around, so the wrapped-in boundary element is cleared by hand.
  followed_by_counter = np.roll(is_counter, -1)
  followed_by_counter[-1:] = False
  preceded_by_point = np.roll(is_point, 1)
  preceded_by_point[:1] = False

  keep = (is_point & followed_by_counter) | (is_counter & preceded_by_point)
  paired = data[keep]
  return paired.select_dtypes(include=[np.number]).reset_index(drop=True)

In [None]:
def prepare_training_df_shuffled(data: pd.DataFrame):
  """Keep only complete 'point' -> 'counter' row pairs, retaining every column.

  Same pair filter as `prepare_training_df`, but non-numeric columns are kept;
  the within-topic shuffling cell later groups on 'topic'. A row survives when
  it is a 'point' immediately followed by a 'counter', or a 'counter'
  immediately preceded by a 'point'. Adjacency is judged by row position. Every
  other row is dropped, including a 'counter' in the very first row.

  Args:
    data: frame with a 'type' column plus metadata and embedding columns.

  Returns:
    A new frame with all original columns, only the surviving rows, and a fresh
    0..n-1 index (points on even rows, counters on odd rows).
  """
  is_point = (data['type'] == 'point').to_numpy()
  is_counter = (data['type'] == 'counter').to_numpy()

  # np.roll wraps around, so the wrapped-in boundary element is cleared by hand.
  followed_by_counter = np.roll(is_counter, -1)
  followed_by_counter[-1:] = False
  preceded_by_point = np.roll(is_point, 1)
  preceded_by_point[:1] = False

  keep = (is_point & followed_by_counter) | (is_counter & preceded_by_point)
  return data[keep].reset_index(drop=True)

In [None]:
def _train_cutoff(data: pd.DataFrame) -> int:
  """Row position where the test split starts: 80% of the rows, rounded down to an even number so no pair is split."""
  cutoff = int(0.8 * data.shape[0])
  return cutoff - (cutoff % 2)

def make_x_train(data: pd.DataFrame) -> pd.DataFrame:
  """Return the training inputs: the even-positioned rows before the 80% cutoff.

  `data` is expected to alternate input row, target row. Rows are selected by
  position rather than by index label, and the result gets a fresh 0..n-1 index.
  """
  return data.iloc[:_train_cutoff(data):2].reset_index(drop=True)

def make_y_train(data: pd.DataFrame) -> pd.DataFrame:
  """Return the training targets: the odd-positioned rows before the 80% cutoff."""
  return data.iloc[1:_train_cutoff(data):2].reset_index(drop=True)

def make_x_test(data: pd.DataFrame) -> pd.DataFrame:
  """Return the test inputs: the even-positioned rows from the 80% cutoff onwards."""
  return data.iloc[_train_cutoff(data)::2].reset_index(drop=True)

def make_y_test(data: pd.DataFrame) -> pd.DataFrame:
  """Return the test targets: the odd-positioned rows from the 80% cutoff onwards."""
  return data.iloc[_train_cutoff(data) + 1::2].reset_index(drop=True)

### Global data

In [None]:
# Load the four pickled embedding frames extracted from the OSF dump.
# NOTE: unpickling can execute arbitrary code; only load dumps you trust.
(
  global_sd_embeddings_data,
  global_sq_embeddings_data,
  global_clu_embeddings_data,
  global_cla_embeddings_data,
) = (
  pd.read_pickle(pickle_path)
  for pickle_path in (
    "current-data-dump/embeddings_dump/nomic_embeddings_dump/global_sd_embeddings.pkl",
    "current-data-dump/embeddings_dump/nomic_embeddings_dump/global_sq_embeddings.pkl",
    "current-data-dump/embeddings_dump/nomic_embeddings_dump/global_clu_embeddings.pkl",
    "current-data-dump/embeddings_dump/nomic_embeddings_dump/global_cla_embeddings.pkl",
  )
)

In [None]:
# Drop rows that break the 'point' -> 'counter' pattern and keep only the
# numeric columns of each embeddings frame.
(
  global_sd_training_df,
  global_sq_training_df,
  global_clu_training_df,
  global_cla_training_df,
) = map(
  prepare_training_df,
  (
    global_sd_embeddings_data,
    global_sq_embeddings_data,
    global_clu_embeddings_data,
    global_cla_embeddings_data,
  ),
)

In [None]:
# Training inputs: even-indexed rows from the first 80% of each frame.
(
  global_sd_x_train,
  global_sq_x_train,
  global_clu_x_train,
  global_cla_x_train,
) = map(
  make_x_train,
  (
    global_sd_training_df,
    global_sq_training_df,
    global_clu_training_df,
    global_cla_training_df,
  ),
)

In [None]:
# Training targets: odd-indexed rows from the first 80% of each frame.
(
  global_sd_y_train,
  global_sq_y_train,
  global_clu_y_train,
  global_cla_y_train,
) = map(
  make_y_train,
  (
    global_sd_training_df,
    global_sq_training_df,
    global_clu_training_df,
    global_cla_training_df,
  ),
)

In [None]:
# Test inputs: even-indexed rows from the last 20% of each frame.
(
  global_sd_x_test,
  global_sq_x_test,
  global_clu_x_test,
  global_cla_x_test,
) = map(
  make_x_test,
  (
    global_sd_training_df,
    global_sq_training_df,
    global_clu_training_df,
    global_cla_training_df,
  ),
)

In [None]:
# Test targets: odd-indexed rows from the last 20% of each frame.
(
  global_sd_y_test,
  global_sq_y_test,
  global_clu_y_test,
  global_cla_y_test,
) = map(
  make_y_test,
  (
    global_sd_training_df,
    global_sq_training_df,
    global_clu_training_df,
    global_cla_training_df,
  ),
)

In [None]:
# Stack each dataset's training targets on top of its test targets. Index
# labels are kept as-is, so both halves restart at 0.
(
  global_sd_y_train_test,
  global_sq_y_train_test,
  global_clu_y_train_test,
  global_cla_y_train_test,
) = (
  pd.concat([y_train_part, y_test_part], axis=0)
  for y_train_part, y_test_part in (
    (global_sd_y_train, global_sd_y_test),
    (global_sq_y_train, global_sq_y_test),
    (global_clu_y_train, global_clu_y_test),
    (global_cla_y_train, global_cla_y_test),
  )
)

### Global data shuffled

In [None]:
# Same point/counter filtering as above, but all columns are kept so that
# 'topic' is still available for the within-topic shuffle.
(
  global_sd_training_df_shuffled,
  global_sq_training_df_shuffled,
  global_clu_training_df_shuffled,
  global_cla_training_df_shuffled,
) = map(
  prepare_training_df_shuffled,
  (
    global_sd_embeddings_data,
    global_sq_embeddings_data,
    global_clu_embeddings_data,
    global_cla_embeddings_data,
  ),
)

In [None]:
# Training targets (still carrying their non-numeric columns) that the next
# cell shuffles within each topic.
(
  global_sd_y_train_shuffled,
  global_sq_y_train_shuffled,
  global_clu_y_train_shuffled,
  global_cla_y_train_shuffled,
) = map(
  make_y_train,
  (
    global_sd_training_df_shuffled,
    global_sq_training_df_shuffled,
    global_clu_training_df_shuffled,
    global_cla_training_df_shuffled,
  ),
)

In [None]:
SHUFFLE_SEED = 42  # fixed so the within-topic permutation is reproducible across runs


def shuffle_within_topic(y_train: pd.DataFrame, seed: int = SHUFFLE_SEED) -> pd.DataFrame:
  """Permute target rows inside each topic and return only the numeric columns.

  Args:
    y_train: target rows that still carry a 'topic' column.
    seed: random_state passed to the per-topic sampling.

  Returns:
    A frame with the same rows as `y_train` (rows whose topic is NaN are
    dropped by groupby), reordered so that each topic's rows are randomly
    permuted, with a fresh 0..n-1 index and non-numeric columns removed.
  """
  # sort=False keeps topics in order of first appearance. Presumably each
  # topic's rows are contiguous so the output stays aligned with x_train
  # topic-by-topic - verify against the data.
  shuffled = y_train.groupby('topic', sort=False).sample(frac=1, random_state=seed)
  return shuffled.reset_index(drop=True).select_dtypes(include=[np.number])


# Rebuilt from the filtered frames (instead of overwriting the previous cell's
# output in place) so this cell can be re-run safely.
global_sd_y_train_shuffled = shuffle_within_topic(make_y_train(global_sd_training_df_shuffled))
global_sq_y_train_shuffled = shuffle_within_topic(make_y_train(global_sq_training_df_shuffled))
global_clu_y_train_shuffled = shuffle_within_topic(make_y_train(global_clu_training_df_shuffled))
global_cla_y_train_shuffled = shuffle_within_topic(make_y_train(global_cla_training_df_shuffled))

## Model

### Architecture

In [None]:
# Layers: 768-d input -> one 768-unit ReLU hidden layer -> linear 768-d output
input_layer = tf.keras.layers.Input(name="Input", shape=(768, ))
hidden_layer = tf.keras.layers.Dense(
  units=768,
  activation="relu",
  name="Hidden",
)(input_layer)
output_layer = tf.keras.layers.Dense(
  units=768,
  activation="linear",
  name="Output",
)(hidden_layer)

In [None]:
# Model: base architecture that every per-dataset model below is cloned from
autoencoder_model = tf.keras.Model(
  inputs=input_layer,
  outputs=output_layer,
)
autoencoder_model.summary()

In [None]:
def metric_choose_argument_global_sd_y_train(y_true, y_pred):
  """global_sd_metric: retrieval accuracy against the rows of `global_sd_training_df`.

  For every sample in the batch, the candidate row with the highest cosine
  similarity to the prediction is compared with the candidate row closest to
  the target; the sample counts as a hit when both are the same row.

  Args:
    y_true: (batch, dim) target embeddings.
    y_pred: (batch, dim) predicted embeddings.

  Returns:
    Scalar float32 tensor: the fraction of samples in the batch that are hits.
    With batch_size=1 this is 0 or 1, the same value a raw hit count gives.
  """
  candidates = tf.cast(global_sd_training_df, dtype=tf.float32)
  candidate_norms = tf.norm(candidates, axis=1, keepdims=True)  # (n_candidates, 1)

  def nearest_candidate(vectors):
    vectors = tf.cast(vectors, dtype=tf.float32)
    # Per-sample norms as a (1, batch) row so they broadcast over the
    # (n_candidates, batch) similarity matrix.
    vector_norms = tf.transpose(tf.norm(vectors, axis=1, keepdims=True))
    cos_sim = tf.matmul(candidates, vectors, transpose_b=True) / (candidate_norms * vector_norms)
    return tf.math.argmax(cos_sim, axis=0)

  hits = tf.equal(nearest_candidate(y_pred), nearest_candidate(y_true))
  return tf.reduce_mean(tf.cast(hits, dtype=tf.float32))

In [None]:
def metric_choose_argument_global_sq_y_train(y_true, y_pred):
  """global_sq_metric: retrieval accuracy against the rows of `global_sq_training_df`.

  For every sample in the batch, the candidate row with the highest cosine
  similarity to the prediction is compared with the candidate row closest to
  the target; the sample counts as a hit when both are the same row.

  Args:
    y_true: (batch, dim) target embeddings.
    y_pred: (batch, dim) predicted embeddings.

  Returns:
    Scalar float32 tensor: the fraction of samples in the batch that are hits.
    With batch_size=1 this is 0 or 1, the same value a raw hit count gives.
  """
  candidates = tf.cast(global_sq_training_df, dtype=tf.float32)
  candidate_norms = tf.norm(candidates, axis=1, keepdims=True)  # (n_candidates, 1)

  def nearest_candidate(vectors):
    vectors = tf.cast(vectors, dtype=tf.float32)
    # Per-sample norms as a (1, batch) row so they broadcast over the
    # (n_candidates, batch) similarity matrix.
    vector_norms = tf.transpose(tf.norm(vectors, axis=1, keepdims=True))
    cos_sim = tf.matmul(candidates, vectors, transpose_b=True) / (candidate_norms * vector_norms)
    return tf.math.argmax(cos_sim, axis=0)

  hits = tf.equal(nearest_candidate(y_pred), nearest_candidate(y_true))
  return tf.reduce_mean(tf.cast(hits, dtype=tf.float32))

In [None]:
def metric_choose_argument_global_clu_y_train(y_true, y_pred):
  """global_clu_metric: retrieval accuracy against the rows of `global_clu_training_df`.

  For every sample in the batch, the candidate row with the highest cosine
  similarity to the prediction is compared with the candidate row closest to
  the target; the sample counts as a hit when both are the same row.

  Args:
    y_true: (batch, dim) target embeddings.
    y_pred: (batch, dim) predicted embeddings.

  Returns:
    Scalar float32 tensor: the fraction of samples in the batch that are hits.
    With batch_size=1 this is 0 or 1, the same value a raw hit count gives.
  """
  candidates = tf.cast(global_clu_training_df, dtype=tf.float32)
  candidate_norms = tf.norm(candidates, axis=1, keepdims=True)  # (n_candidates, 1)

  def nearest_candidate(vectors):
    vectors = tf.cast(vectors, dtype=tf.float32)
    # Per-sample norms as a (1, batch) row so they broadcast over the
    # (n_candidates, batch) similarity matrix.
    vector_norms = tf.transpose(tf.norm(vectors, axis=1, keepdims=True))
    cos_sim = tf.matmul(candidates, vectors, transpose_b=True) / (candidate_norms * vector_norms)
    return tf.math.argmax(cos_sim, axis=0)

  hits = tf.equal(nearest_candidate(y_pred), nearest_candidate(y_true))
  return tf.reduce_mean(tf.cast(hits, dtype=tf.float32))

In [None]:
def metric_choose_argument_global_cla_y_train(y_true, y_pred):
  """global_cla_metric: retrieval accuracy against the rows of `global_cla_training_df`.

  For every sample in the batch, the candidate row with the highest cosine
  similarity to the prediction is compared with the candidate row closest to
  the target; the sample counts as a hit when both are the same row.

  Args:
    y_true: (batch, dim) target embeddings.
    y_pred: (batch, dim) predicted embeddings.

  Returns:
    Scalar float32 tensor: the fraction of samples in the batch that are hits.
    With batch_size=1 this is 0 or 1, the same value a raw hit count gives.
  """
  candidates = tf.cast(global_cla_training_df, dtype=tf.float32)
  candidate_norms = tf.norm(candidates, axis=1, keepdims=True)  # (n_candidates, 1)

  def nearest_candidate(vectors):
    vectors = tf.cast(vectors, dtype=tf.float32)
    # Per-sample norms as a (1, batch) row so they broadcast over the
    # (n_candidates, batch) similarity matrix.
    vector_norms = tf.transpose(tf.norm(vectors, axis=1, keepdims=True))
    cos_sim = tf.matmul(candidates, vectors, transpose_b=True) / (candidate_norms * vector_norms)
    return tf.math.argmax(cos_sim, axis=0)

  hits = tf.equal(nearest_candidate(y_pred), nearest_candidate(y_true))
  return tf.reduce_mean(tf.cast(hits, dtype=tf.float32))

### Compile

In [None]:
# Global SD Model: a clone of the base architecture, compiled with a
# cosine-similarity loss and the SD retrieval metric.
global_sd_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_sd_autoencoder_model.compile(
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_sd_y_train],
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

In [None]:
# Global SQ Model: a clone of the base architecture, compiled with a
# cosine-similarity loss and the SQ retrieval metric.
global_sq_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_sq_autoencoder_model.compile(
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_sq_y_train],
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

In [None]:
# Global CLU Model: a clone of the base architecture, compiled with a
# cosine-similarity loss and the CLU retrieval metric.
global_clu_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_clu_autoencoder_model.compile(
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_clu_y_train],
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

In [None]:
# Global CLA Model: a clone of the base architecture, compiled with a
# cosine-similarity loss and the CLA retrieval metric.
global_cla_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_cla_autoencoder_model.compile(
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_cla_y_train],
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

## Training

In [None]:
# CSVLogger opens its log file when training begins, so the output directory
# has to exist before fit() is called.
os.makedirs('current-data-dump/nomic-autoencoder', exist_ok=True)
checkpoint_callback = ModelCheckpoint(filepath='current-data-dump/nomic-autoencoder/global_sd_autoencoder_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to the existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='current-data-dump/nomic-autoencoder/global_sd_training_log.csv', separator=',', append=True)

# batch_size=1: one point -> counter pair per step.
global_sd_history = global_sd_autoencoder_model.fit(
  x=global_sd_x_train,
  y=global_sd_y_train,
  batch_size=1,
  epochs=20,
  validation_data = (global_sd_x_test, global_sd_y_test),
  callbacks=[checkpoint_callback, csv_logger_callback]
)

In [None]:
# Save next to the SD checkpoint and training log rather than in a separate
# top-level 'nomic-autoencoder/' directory; create the directory if needed.
os.makedirs('current-data-dump/nomic-autoencoder', exist_ok=True)
global_sd_autoencoder_model.save('current-data-dump/nomic-autoencoder/global_sd_autoencoder_model.keras')

In [None]:
# Write the SQ checkpoint and log under 'current-data-dump/nomic-autoencoder/',
# the directory the history-loading cell reads global_sq_training_log.csv from.
# CSVLogger opens its file when training begins, so create the directory first.
os.makedirs('current-data-dump/nomic-autoencoder', exist_ok=True)
checkpoint_callback = ModelCheckpoint(filepath='current-data-dump/nomic-autoencoder/global_sq_autoencoder_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to the existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='current-data-dump/nomic-autoencoder/global_sq_training_log.csv', separator=',', append=True)

# batch_size=1: one point -> counter pair per step.
global_sq_history = global_sq_autoencoder_model.fit(
  x=global_sq_x_train,
  y=global_sq_y_train,
  batch_size=1,
  epochs=20,
  validation_data = (global_sq_x_test, global_sq_y_test),
  callbacks=[checkpoint_callback, csv_logger_callback]
)

In [None]:
# Save under the same 'current-data-dump/nomic-autoencoder/' directory the rest
# of the notebook reads from and writes to; create the directory if needed.
os.makedirs('current-data-dump/nomic-autoencoder', exist_ok=True)
global_sq_autoencoder_model.save('current-data-dump/nomic-autoencoder/global_sq_autoencoder_model.keras')

In [None]:
# Write the CLU checkpoint and log under 'current-data-dump/nomic-autoencoder/',
# the directory the history-loading cell reads global_clu_training_log.csv from.
# CSVLogger opens its file when training begins, so create the directory first.
os.makedirs('current-data-dump/nomic-autoencoder', exist_ok=True)
checkpoint_callback = ModelCheckpoint(filepath='current-data-dump/nomic-autoencoder/global_clu_autoencoder_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to the existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='current-data-dump/nomic-autoencoder/global_clu_training_log.csv', separator=',', append=True)

# batch_size=1: one point -> counter pair per step.
global_clu_history = global_clu_autoencoder_model.fit(
  x=global_clu_x_train,
  y=global_clu_y_train,
  batch_size=1,
  epochs=20,
  validation_data = (global_clu_x_test, global_clu_y_test),
  callbacks=[checkpoint_callback, csv_logger_callback]
)

In [None]:
# Save under the same 'current-data-dump/nomic-autoencoder/' directory the rest
# of the notebook reads from and writes to; create the directory if needed.
os.makedirs('current-data-dump/nomic-autoencoder', exist_ok=True)
global_clu_autoencoder_model.save('current-data-dump/nomic-autoencoder/global_clu_autoencoder_model.keras')

In [None]:
# Write the CLA checkpoint and log under 'current-data-dump/nomic-autoencoder/',
# the directory the history-loading cell reads global_cla_training_log.csv from.
# CSVLogger opens its file when training begins, so create the directory first.
os.makedirs('current-data-dump/nomic-autoencoder', exist_ok=True)
checkpoint_callback = ModelCheckpoint(filepath='current-data-dump/nomic-autoencoder/global_cla_autoencoder_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to the existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='current-data-dump/nomic-autoencoder/global_cla_training_log.csv', separator=',', append=True)

# batch_size=1: one point -> counter pair per step.
global_cla_history = global_cla_autoencoder_model.fit(
  x=global_cla_x_train,
  y=global_cla_y_train,
  batch_size=1,
  epochs=20,
  validation_data = (global_cla_x_test, global_cla_y_test),
  callbacks=[checkpoint_callback, csv_logger_callback]
)

In [None]:
# Save under the same 'current-data-dump/nomic-autoencoder/' directory the rest
# of the notebook reads from and writes to; create the directory if needed.
os.makedirs('current-data-dump/nomic-autoencoder', exist_ok=True)
global_cla_autoencoder_model.save('current-data-dump/nomic-autoencoder/global_cla_autoencoder_model.keras')

## Load Training History

In [None]:
# Access training history
loaded_sd_global_history = pd.DataFrame(pd.read_csv("current-data-dump/nomic-autoencoder/global_sd_training_log.csv"))
loaded_sd_global_history = pd.melt(loaded_sd_global_history, id_vars='epoch', value_vars=['metric_choose_argument_global_sd_y_train', 'val_metric_choose_argument_global_sd_y_train'], var_name='dataset', value_name='accuracy')
loaded_sd_global_history = loaded_sd_global_history.replace(['metric_choose_argument_global_sd_y_train', 'val_metric_choose_argument_global_sd_y_train'], ['training set', 'validation set'])
loaded_sd_global_history.rename(columns = {'Unnamed: 0':'epoch'}, inplace = True)
loaded_sd_global_history['shuffled'] = False

In [None]:
# Access training history
loaded_sq_global_history = pd.DataFrame(pd.read_csv("current-data-dump/nomic-autoencoder/global_sq_training_log.csv"))
loaded_sq_global_history = pd.melt(loaded_sq_global_history, id_vars='epoch', value_vars=['metric_choose_argument_global_sq_y_train', 'val_metric_choose_argument_global_sq_y_train'], var_name='dataset', value_name='accuracy')
loaded_sq_global_history = loaded_sq_global_history.replace(['metric_choose_argument_global_sq_y_train', 'val_metric_choose_argument_global_sq_y_train'], ['training set', 'validation set'])
# loaded_sq_global_history.rename(columns = {'Unnamed: 0':'epoch'}, inplace = True)
loaded_sq_global_history['shuffled'] = False

In [None]:
# Access training history
loaded_clu_global_history = pd.DataFrame(pd.read_csv("current-data-dump/nomic-autoencoder/global_clu_training_log.csv"))
loaded_clu_global_history = pd.melt(loaded_clu_global_history, id_vars='epoch', value_vars=['metric_choose_argument_global_clu_y_train', 'val_metric_choose_argument_global_clu_y_train'], var_name='dataset', value_name='accuracy')
loaded_clu_global_history = loaded_clu_global_history.replace(['metric_choose_argument_global_clu_y_train', 'val_metric_choose_argument_global_clu_y_train'], ['training set', 'validation set'])
loaded_clu_global_history['shuffled'] = False

In [None]:
# Access training history
loaded_cla_global_history = pd.DataFrame(pd.read_csv("current-data-dump/nomic-autoencoder/global_cla_training_log.csv"))
loaded_cla_global_history = pd.melt(loaded_cla_global_history, id_vars='epoch', value_vars=['metric_choose_argument_global_cla_y_train', 'val_metric_choose_argument_global_cla_y_train'], var_name='dataset', value_name='accuracy')
loaded_cla_global_history = loaded_cla_global_history.replace(['metric_choose_argument_global_cla_y_train', 'val_metric_choose_argument_global_cla_y_train'], ['training set', 'validation set'])
loaded_cla_global_history['shuffled'] = False

In [None]:
# Learning curve of the unshuffled SD run: one line per dataset (training / validation).
global_sd_training_plot = (
  ggplot(loaded_sd_global_history, aes(x='epoch', y='accuracy', linetype='dataset'))
  + geom_line()
  + labs(title='Learning Curve of Model Trained on Unshuffled Data', x='Epoch', y='Accuracy')
)
# Saving fails if the target directory is missing, so create it first.
os.makedirs("current-data-dump/nomic-autoencoder/nomic-training-plots-dump", exist_ok=True)
ggsave(global_sd_training_plot, "current-data-dump/nomic-autoencoder/nomic-training-plots-dump/global_sd_training_plot.png")

## Global Shuffled Training

### Compile

In [None]:
# Global Shuffled SD Model: a separate clone of the base architecture for the
# within-topic shuffled targets, compiled like the unshuffled SD model.
global_sd_autoencoder_model_shuffled = tf.keras.models.clone_model(autoencoder_model)
global_sd_autoencoder_model_shuffled.compile(
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_sd_y_train],
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

In [None]:
# Global Shuffled SQ Model: a separate clone of the base architecture for the
# within-topic shuffled targets, compiled like the unshuffled SQ model.
global_sq_autoencoder_model_shuffled = tf.keras.models.clone_model(autoencoder_model)
global_sq_autoencoder_model_shuffled.compile(
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_sq_y_train],
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

In [None]:
# Global Shuffled CLU Model: a separate clone of the base architecture for the
# within-topic shuffled targets, compiled like the unshuffled CLU model.
global_clu_autoencoder_model_shuffled = tf.keras.models.clone_model(autoencoder_model)
global_clu_autoencoder_model_shuffled.compile(
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_clu_y_train],
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

In [None]:
# Global Shuffled CLA Model: a separate clone of the base architecture for the
# within-topic shuffled targets, compiled like the unshuffled CLA model.
global_cla_autoencoder_model_shuffled = tf.keras.models.clone_model(autoencoder_model)
global_cla_autoencoder_model_shuffled.compile(
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_cla_y_train],
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

### Shuffled Training

In [None]:
checkpoint_callback = ModelCheckpoint(filepath='current-data-dump/nomic-autoencoder/global_sd_autoencoder_shuffled_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
csv_logger_callback = CSVLogger(filename='current-data-dump/nomic-autoencoder/global_sd_training_shuffled_log.csv', separator=',', append=True)

global_sd_history = global_sd_autoencoder_model.fit(
  x=global_sd_x_train,
  y=global_sd_y_train_shuffled,
  batch_size=1,
  epochs=20,
  validation_data = (global_sd_x_test, global_sd_y_test),
  callbacks=[checkpoint_callback, csv_logger_callback]
)

In [None]:
global_sd_autoencoder_model.save('current-data-dump/nomic_autoencoder/global_sd_autoencoder_shuffled_model.keras')

In [None]:
checkpoint_callback = ModelCheckpoint(filepath='current-data-dump/nomic-autoencoder/global_sq_autoencoder_shuffled_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
csv_logger_callback = CSVLogger(filename='current-data-dump/nomic-autoencoder/global_sq_training_shuffled_log.csv', separator=',', append=True)

global_sq_history = global_sq_autoencoder_model.fit(
    x=global_sq_x_train,
    y=global_sq_y_train_shuffled,
    batch_size=1,
    epochs=20,
    validation_data = (global_sq_x_test, global_sq_y_test),
    callbacks=[checkpoint_callback, csv_logger_callback]
)

In [None]:
global_sq_autoencoder_model.save('current-data-dump/nomic-autoencoder/global_sq_autoencoder_shuffled_model.keras')

In [None]:
checkpoint_callback = ModelCheckpoint(filepath='current-data-dump/nomic-autoencoder/global_clu_autoencoder_shuffled_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
csv_logger_callback = CSVLogger(filename='current-data-dump/nomic-autoencoder/global_clu_training_shuffled_log.csv', separator=',', append=True)

global_clu_history = global_clu_autoencoder_model.fit(
  x=global_clu_x_train,
  y=global_clu_y_train_shuffled,
  batch_size=1,
  epochs=20,
  validation_data = (global_clu_x_test, global_clu_y_test),
  callbacks=[checkpoint_callback, csv_logger_callback]
)

In [None]:
global_clu_autoencoder_model.save('current-data-dump/nomic-autoencoder/global_clu_autoencoder_shuffled_model.keras')

In [None]:
checkpoint_callback = ModelCheckpoint(filepath='current-data-dump/nomic-autoencoder/global_cla_autoencoder_shuffled_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
csv_logger_callback = CSVLogger(filename='current-data-dump/nomic-autoencoder/global_cla_training_shuffled_log.csv', separator=',', append=True)

global_cla_history = global_cla_autoencoder_model.fit(
  x=global_cla_x_train,
  y=global_cla_y_train_shuffled,
  batch_size=1,
  epochs=20,
  validation_data = (global_cla_x_test, global_cla_y_test),
  callbacks=[checkpoint_callback, csv_logger_callback]
)

In [None]:
global_cla_autoencoder_model.save('current-data-dump/nomic-autoencoder/global_cla_autoencoder_shuffled_model.keras')

## Load Training History

In [None]:
# Read the log written by the shuffled SD training cell. The unshuffled learning
# curve it is compared against is also the SD one.
loaded_global_shuffled_history = pd.read_csv("current-data-dump/nomic-autoencoder/global_sd_training_shuffled_log.csv")

In [None]:
# Keep the rows labelled 0-19 (the first 20 logged epochs) - presumably to
# ignore rows appended by earlier runs, since the logger uses append=True.
loaded_global_shuffled_history = loaded_global_shuffled_history.loc[0:19]
# The shuffled SD model is compiled with the SD metric, so the log columns carry
# the '..._global_sd_y_train' names.
loaded_global_shuffled_history = pd.melt(loaded_global_shuffled_history, id_vars='epoch', value_vars=['metric_choose_argument_global_sd_y_train', 'val_metric_choose_argument_global_sd_y_train'], var_name='dataset', value_name='accuracy')
loaded_global_shuffled_history = loaded_global_shuffled_history.replace(['metric_choose_argument_global_sd_y_train', 'val_metric_choose_argument_global_sd_y_train'], ['training set', 'validation set'])
loaded_global_shuffled_history['shuffled'] = True

In [None]:
# Learning curve of the within-topic shuffled run: one line per dataset.
global_training_shuffled_plot = (
  ggplot(loaded_global_shuffled_history, aes(x='epoch', y='accuracy', linetype='dataset'))
  + geom_line()
  + labs(title='Learning Curve of Model Trained on Within-Topic Shuffled Data', x='Epoch', y='Accuracy')
)
# Saving fails if the target directory is missing, so create it first.
os.makedirs("current-data-dump/nomic-autoencoder/training_plots_dump", exist_ok=True)
ggsave(global_training_shuffled_plot, "current-data-dump/nomic-autoencoder/training_plots_dump/global_shuffled_training_plot.png")

In [None]:
# Stack the unshuffled SD history on top of the shuffled one; the 'shuffled'
# column tells the two runs apart.
combined_global_training_df = pd.concat([loaded_sd_global_history, loaded_global_shuffled_history])
combined_global_training_df

In [None]:
# Combined learning curves: colour separates shuffled from unshuffled runs,
# line type separates the training set from the validation set.
combined_global_plot = (
    ggplot(combined_global_training_df, aes(x='epoch', y='accuracy', linetype='dataset', color='shuffled')) +
    geom_line(size=2) +
    labs(title='Learning Curve of Model Trained on Unshuffled vs. Within-Topic Shuffled Data', x='Epoch', y='Accuracy') +
    theme(
        figure_size=(16,24),
        axis_title=element_text(size=32),
        axis_text=element_text(size=24),
        legend_title=element_text(size=32, lineheight=1.5),
        legend_text=element_text(size=24, lineheight=1.5),
        plot_title=element_text(size=40, wrap=True, lineheight=1.5),
        legend_position="bottom",
        legend_key_width=64
    ) +
    guides(fill = guide_legend(byrow = True))
)
# The directory is spelled 'nomic-autoencoder', matching the other output
# paths; create it first because saving fails if it is missing.
os.makedirs("current-data-dump/nomic-autoencoder/training_plots_dump", exist_ok=True)
ggsave(combined_global_plot, "current-data-dump/nomic-autoencoder/training_plots_dump/combined_global_training_plot.png")