# [Nomic] Autoencoder: Generate Corresponding Embedding

### Imports

In [1]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
from scipy import spatial
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from plotnine import ggplot, geom_line, aes, ggsave, labs, theme, element_text, guides, guide_legend

2024-02-27 14:11:50.755027: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data

In [2]:
# Drop rows that do not follow 'point' -> 'counter' pattern
def prepare_training_df(data: pd.DataFrame) -> pd.DataFrame:
    """Keep only adjacent ('point', 'counter') row pairs and numeric columns.

    A 'point' row is dropped unless the row directly after it is a 'counter';
    a 'counter' row is dropped unless the row directly before it is a 'point'.
    Rows whose 'type' is neither value are left untouched.

    Neighbours are found by position (``Series.shift``), not by index-label
    arithmetic, so the function works for any index and also removes a
    'counter' sitting in the very first row (it has no preceding 'point').

    Args:
        data: Frame with a 'type' column.

    Returns:
        A new frame restricted to numeric columns, with a fresh 0..n-1 index.
    """
    row_type = data['type']
    # Comparisons against the NaN produced by shift() at the boundaries are
    # False, so a trailing 'point' and a leading 'counter' count as unpaired.
    followed_by_counter = row_type.shift(-1) == 'counter'
    preceded_by_point = row_type.shift(1) == 'point'
    unpaired_point = (row_type == 'point') & ~followed_by_counter
    unpaired_counter = (row_type == 'counter') & ~preceded_by_point
    data = data[~(unpaired_point | unpaired_counter)]
    data = data.select_dtypes(include=[np.number])
    data = data.reset_index(drop=True)
    return data

In [3]:
# Drop rows that do not follow 'point' -> 'counter' pattern
def prepare_training_df_shuffled(data: pd.DataFrame) -> pd.DataFrame:
    """Keep only adjacent ('point', 'counter') row pairs, retaining every column.

    Same pairing rule as the numeric-only variant, but non-numeric columns are
    preserved so later steps can still group on them.

    A 'point' row is dropped unless the row directly after it is a 'counter';
    a 'counter' row is dropped unless the row directly before it is a 'point'.
    Neighbours are found by position (``Series.shift``), so the index labels do
    not matter and a 'counter' in the first row is removed as well.

    Args:
        data: Frame with a 'type' column.

    Returns:
        A new frame with all original columns and a fresh 0..n-1 index.
    """
    row_type = data['type']
    # shift() yields NaN at the edges; NaN == 'x' is False, so edge rows
    # without a partner are flagged as unpaired.
    followed_by_counter = row_type.shift(-1) == 'counter'
    preceded_by_point = row_type.shift(1) == 'point'
    unpaired_point = (row_type == 'point') & ~followed_by_counter
    unpaired_counter = (row_type == 'counter') & ~preceded_by_point
    data = data[~(unpaired_point | unpaired_counter)]
    data = data.reset_index(drop=True)
    return data

In [4]:
# Make training and testing datasets
def _train_cutoff(n_rows: int, train_fraction: float = 0.8) -> int:
    """Return the number of leading rows used for training, rounded down to even.

    Rounding to an even count keeps every (point, counter) pair on one side of
    the split.
    """
    cutoff = int(train_fraction * n_rows)
    return cutoff - (cutoff % 2)


def _alternate_rows(rows: pd.DataFrame, offset: int) -> pd.DataFrame:
    """Take every second row starting at position ``offset`` and renumber the index."""
    # Positional slicing: parity is decided by row position, not by index labels.
    return rows.iloc[offset::2].reset_index(drop=True)


def make_x_train(data: pd.DataFrame, train_fraction: float = 0.8) -> pd.DataFrame:
    """Return the even-positioned rows of the training portion of ``data``."""
    cutoff = _train_cutoff(data.shape[0], train_fraction)
    return _alternate_rows(data.iloc[:cutoff, :], 0)


def make_y_train(data: pd.DataFrame, train_fraction: float = 0.8) -> pd.DataFrame:
    """Return the odd-positioned rows of the training portion of ``data``."""
    cutoff = _train_cutoff(data.shape[0], train_fraction)
    return _alternate_rows(data.iloc[:cutoff, :], 1)


def make_x_test(data: pd.DataFrame, train_fraction: float = 0.8) -> pd.DataFrame:
    """Return the even-positioned rows of the held-out portion of ``data``."""
    cutoff = _train_cutoff(data.shape[0], train_fraction)
    return _alternate_rows(data.iloc[cutoff:, :], 0)


def make_y_test(data: pd.DataFrame, train_fraction: float = 0.8) -> pd.DataFrame:
    """Return the odd-positioned rows of the held-out portion of ``data``."""
    cutoff = _train_cutoff(data.shape[0], train_fraction)
    return _alternate_rows(data.iloc[cutoff:, :], 1)

#### Global data

In [5]:
# Load the four pickled embedding tables (sd / sq / clu / cla variants).
# NOTE(review): unpickling can execute arbitrary code, so these dumps must come
# from a trusted source.
# Each frame needs a 'type' column (read by prepare_training_df); the shuffled
# pipeline additionally groups on 'topic' — presumably present here; verify.
global_sd_embeddings_data = pd.read_pickle("../data_dump/nomic_embeddings_dump/global_sd_embeddings.pkl")
global_sq_embeddings_data = pd.read_pickle("../data_dump/nomic_embeddings_dump/global_sq_embeddings.pkl")
global_clu_embeddings_data = pd.read_pickle("../data_dump/nomic_embeddings_dump/global_clu_embeddings.pkl")
global_cla_embeddings_data = pd.read_pickle("../data_dump/nomic_embeddings_dump/global_cla_embeddings.pkl")

In [6]:
# Reduce every raw embedding table to paired rows with numeric columns only.
(
    global_sd_training_df,
    global_sq_training_df,
    global_clu_training_df,
    global_cla_training_df,
) = map(
    prepare_training_df,
    (
        global_sd_embeddings_data,
        global_sq_embeddings_data,
        global_clu_embeddings_data,
        global_cla_embeddings_data,
    ),
)

In [7]:
# Model inputs for training: the first row of each pair in the training portion.
(
    global_sd_x_train,
    global_sq_x_train,
    global_clu_x_train,
    global_cla_x_train,
) = map(
    make_x_train,
    (
        global_sd_training_df,
        global_sq_training_df,
        global_clu_training_df,
        global_cla_training_df,
    ),
)

In [8]:
# Training targets: the second row of each pair in the training portion.
(
    global_sd_y_train,
    global_sq_y_train,
    global_clu_y_train,
    global_cla_y_train,
) = map(
    make_y_train,
    (
        global_sd_training_df,
        global_sq_training_df,
        global_clu_training_df,
        global_cla_training_df,
    ),
)

In [9]:
# Held-out model inputs: the first row of each pair after the training cutoff.
(
    global_sd_x_test,
    global_sq_x_test,
    global_clu_x_test,
    global_cla_x_test,
) = map(
    make_x_test,
    (
        global_sd_training_df,
        global_sq_training_df,
        global_clu_training_df,
        global_cla_training_df,
    ),
)

In [10]:
# Held-out targets: the second row of each pair after the training cutoff.
(
    global_sd_y_test,
    global_sq_y_test,
    global_clu_y_test,
    global_cla_y_test,
) = map(
    make_y_test,
    (
        global_sd_training_df,
        global_sq_training_df,
        global_clu_training_df,
        global_cla_training_df,
    ),
)

In [11]:
# Stack the training targets on top of the held-out targets (row-wise).
# Both halves were produced with reset_index(drop=True) and no ignore_index is
# passed here, so index labels restart at 0 in the second half and are not unique.
global_sd_y_train_test = pd.concat([global_sd_y_train, global_sd_y_test], axis=0)
global_sq_y_train_test = pd.concat([global_sq_y_train, global_sq_y_test], axis=0)
global_clu_y_train_test = pd.concat([global_clu_y_train, global_clu_y_test], axis=0)
global_cla_y_train_test = pd.concat([global_cla_y_train, global_cla_y_test], axis=0)

#### Global data shuffled

In [12]:
# Paired rows with every column retained (non-numeric columns are needed for the
# later group-wise shuffle).
(
    global_sd_training_df_shuffled,
    global_sq_training_df_shuffled,
    global_clu_training_df_shuffled,
    global_cla_training_df_shuffled,
) = map(
    prepare_training_df_shuffled,
    (
        global_sd_embeddings_data,
        global_sq_embeddings_data,
        global_clu_embeddings_data,
        global_cla_embeddings_data,
    ),
)

In [13]:
# Training targets taken from the all-columns frames; still in original order here.
(
    global_sd_y_train_shuffled,
    global_sq_y_train_shuffled,
    global_clu_y_train_shuffled,
    global_cla_y_train_shuffled,
) = map(
    make_y_train,
    (
        global_sd_training_df_shuffled,
        global_sq_training_df_shuffled,
        global_clu_training_df_shuffled,
        global_cla_training_df_shuffled,
    ),
)

In [14]:
# Seed for the within-topic permutation so a fresh "Run All" reproduces the same targets.
SHUFFLE_SEED = 42


def _shuffle_within_topic(paired_rows: pd.DataFrame, seed: int = SHUFFLE_SEED) -> pd.DataFrame:
    """Randomly permute rows inside each 'topic' group and keep numeric columns only.

    Args:
        paired_rows: Frame that still contains the 'topic' column.
        seed: ``random_state`` forwarded to ``GroupBy.sample``.

    Returns:
        A new numeric-only frame with a fresh 0..n-1 index.
    """
    shuffled = (
        paired_rows
        .groupby(['topic'], sort=False)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )
    return shuffled.select_dtypes(include=[np.number])


# Each target frame is rebuilt from its all-columns source instead of from the
# variable being overwritten, so this cell can be re-run safely (the previous
# form dropped 'topic' from its own input and failed on a second execution).
global_sd_y_train_shuffled = _shuffle_within_topic(make_y_train(global_sd_training_df_shuffled))
global_sq_y_train_shuffled = _shuffle_within_topic(make_y_train(global_sq_training_df_shuffled))
global_clu_y_train_shuffled = _shuffle_within_topic(make_y_train(global_clu_training_df_shuffled))
global_cla_y_train_shuffled = _shuffle_within_topic(make_y_train(global_cla_training_df_shuffled))

## Model

In [24]:
# Layers
# 768-wide input -> 768-unit ReLU hidden layer -> 768-unit linear output.
# The hidden layer is as wide as the input, so there is no bottleneck.
input_layer = tf.keras.layers.Input(shape=(768, ), name="Input")
hidden_layer = tf.keras.layers.Dense(units=768, activation="relu", name="Hidden")(input_layer)
output_layer = tf.keras.layers.Dense(units=768, activation="linear", name="Output")(hidden_layer)

In [25]:
# Model
# Base architecture; the per-dataset models below are created from it with clone_model.
autoencoder_model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
autoencoder_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 768)]             0         
                                                                 
 Hidden (Dense)              (None, 768)               590592    
                                                                 
 Output (Dense)              (None, 768)               590592    
                                                                 
Total params: 1181184 (4.51 MB)
Trainable params: 1181184 (4.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
def metric_choose_argument_global_sd_y_train(y_true, y_pred):
    """Count batch entries whose prediction and target retrieve the same SD row.

    Every row of ``global_sd_training_df`` is a retrieval candidate. The
    prediction and the target are each scored against all candidates, the
    best-scoring candidate index is taken per batch entry, and the number of
    entries where both indices agree is returned.
    """
    candidates = tf.cast(global_sd_training_df, dtype=tf.float32)
    candidate_norms = tf.norm(candidates, axis=1)

    def _best_candidate(vectors):
        # tf.norm(vectors) is called without an axis: one norm over the whole tensor.
        denominators = tf.reshape(tf.norm(vectors) * candidate_norms, [-1, 1])
        similarities = tf.matmul(candidates, vectors, transpose_b=True) / denominators
        return tf.math.argmax(similarities)

    predicted_choice = _best_candidate(y_pred)
    true_choice = _best_candidate(y_true)
    return tf.math.count_nonzero(tf.equal(predicted_choice, true_choice))

In [28]:
def metric_choose_argument_global_sq_y_train(y_true, y_pred):
    """Count batch entries whose prediction and target retrieve the same SQ row.

    Every row of ``global_sq_training_df`` is a retrieval candidate. The
    prediction and the target are each scored against all candidates, the
    best-scoring candidate index is taken per batch entry, and the number of
    entries where both indices agree is returned.
    """
    candidates = tf.cast(global_sq_training_df, dtype=tf.float32)
    candidate_norms = tf.norm(candidates, axis=1)

    def _best_candidate(vectors):
        # tf.norm(vectors) is called without an axis: one norm over the whole tensor.
        denominators = tf.reshape(tf.norm(vectors) * candidate_norms, [-1, 1])
        similarities = tf.matmul(candidates, vectors, transpose_b=True) / denominators
        return tf.math.argmax(similarities)

    predicted_choice = _best_candidate(y_pred)
    true_choice = _best_candidate(y_true)
    return tf.math.count_nonzero(tf.equal(predicted_choice, true_choice))

In [29]:
def metric_choose_argument_global_clu_y_train(y_true, y_pred):
    """Count batch entries whose prediction and target retrieve the same CLU row.

    Every row of ``global_clu_training_df`` is a retrieval candidate. The
    prediction and the target are each scored against all candidates, the
    best-scoring candidate index is taken per batch entry, and the number of
    entries where both indices agree is returned.
    """
    candidates = tf.cast(global_clu_training_df, dtype=tf.float32)
    candidate_norms = tf.norm(candidates, axis=1)

    def _best_candidate(vectors):
        # tf.norm(vectors) is called without an axis: one norm over the whole tensor.
        denominators = tf.reshape(tf.norm(vectors) * candidate_norms, [-1, 1])
        similarities = tf.matmul(candidates, vectors, transpose_b=True) / denominators
        return tf.math.argmax(similarities)

    predicted_choice = _best_candidate(y_pred)
    true_choice = _best_candidate(y_true)
    return tf.math.count_nonzero(tf.equal(predicted_choice, true_choice))

In [30]:
def metric_choose_argument_global_cla_y_train(y_true, y_pred):
    """Count batch entries whose prediction and target retrieve the same CLA row.

    Every row of ``global_cla_training_df`` is a retrieval candidate. The
    prediction and the target are each scored against all candidates, the
    best-scoring candidate index is taken per batch entry, and the number of
    entries where both indices agree is returned.
    """
    candidates = tf.cast(global_cla_training_df, dtype=tf.float32)
    candidate_norms = tf.norm(candidates, axis=1)

    def _best_candidate(vectors):
        # tf.norm(vectors) is called without an axis: one norm over the whole tensor.
        denominators = tf.reshape(tf.norm(vectors) * candidate_norms, [-1, 1])
        similarities = tf.matmul(candidates, vectors, transpose_b=True) / denominators
        return tf.math.argmax(similarities)

    predicted_choice = _best_candidate(y_pred)
    true_choice = _best_candidate(y_true)
    return tf.math.count_nonzero(tf.equal(predicted_choice, true_choice))

#### Global Training

In [39]:
# Global SD Model
# Clone of the base architecture, compiled with Adam (lr 1e-3), the built-in
# "cosine_similarity" loss, and the SD retrieval-match metric.
# NOTE(review): this cell's execution count (39) is out of sequence with the
# sibling compile cells (32-34); re-run top to bottom before trusting outputs.
global_sd_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_sd_autoencoder_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="cosine_similarity",
    metrics=[metric_choose_argument_global_sd_y_train]
)

In [32]:
# Global SQ Model
# Clone of the base architecture, compiled with Adam (lr 1e-3), the built-in
# "cosine_similarity" loss, and the SQ retrieval-match metric.
global_sq_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_sq_autoencoder_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="cosine_similarity",
    metrics=[metric_choose_argument_global_sq_y_train]
)

In [33]:
# Global CLU Model
# Clone of the base architecture, compiled with Adam (lr 1e-3), the built-in
# "cosine_similarity" loss, and the CLU retrieval-match metric.
global_clu_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_clu_autoencoder_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="cosine_similarity",
    metrics=[metric_choose_argument_global_clu_y_train]
)

In [34]:
# Global CLA Model
# Clone of the base architecture, compiled with Adam (lr 1e-3), the built-in
# "cosine_similarity" loss, and the CLA retrieval-match metric.
global_cla_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_cla_autoencoder_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="cosine_similarity",
    metrics=[metric_choose_argument_global_cla_y_train]
)

## Training

In [40]:
# Train the SD model on (point -> counter) pairs for 20 epochs.
# The checkpoint is written after every epoch (save_best_only=False) and stores
# the full model, not just weights (save_weights_only=False), despite the file name.
checkpoint_callback = ModelCheckpoint(filepath='nomic_autoencoder/global_sd_autoencoder_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to an existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='nomic_autoencoder/global_sd_training_log.csv', separator=',', append=True)

# batch_size=1 means the count-based metric is 0 or 1 per batch.
global_sd_history = global_sd_autoencoder_model.fit(
    x=global_sd_x_train,
    y=global_sd_y_train,
    batch_size=1,
    epochs=20,
    validation_data = (global_sd_x_test, global_sd_y_test),
    callbacks=[checkpoint_callback, csv_logger_callback]
)

Epoch 1/20
Epoch 1: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 2/20
Epoch 2: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 3/20
Epoch 3: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 4/20
Epoch 4: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 5/20
Epoch 5: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 6/20
Epoch 6: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 7/20
Epoch 7: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 8/20
Epoch 8: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 9/20
Epoch 9: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 10/20
Epoch 10: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 11/20
Epoch 11: saving model to nomic_autoencoder/global_sd_autoencoder_weights.keras
Epoch 

In [38]:
# Persist the SD model.
# NOTE(review): this cell's execution count (38) is lower than the fit cell's (40),
# so the file on disk may predate the last training run — re-run to confirm.
global_sd_autoencoder_model.save('nomic_autoencoder/global_sd_autoencoder_model.keras')

In [51]:
# Train the SQ model on (point -> counter) pairs for 20 epochs.
# The checkpoint is written after every epoch (save_best_only=False) and stores
# the full model, not just weights (save_weights_only=False), despite the file name.
checkpoint_callback = ModelCheckpoint(filepath='nomic_autoencoder/global_sq_autoencoder_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to an existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='nomic_autoencoder/global_sq_training_log.csv', separator=',', append=True)

# batch_size=1 means the count-based metric is 0 or 1 per batch.
global_sq_history = global_sq_autoencoder_model.fit(
    x=global_sq_x_train,
    y=global_sq_y_train,
    batch_size=1,
    epochs=20,
    validation_data = (global_sq_x_test, global_sq_y_test),
    callbacks=[checkpoint_callback, csv_logger_callback]
)

Epoch 1/20
Epoch 1: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 2/20
Epoch 2: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 3/20
Epoch 3: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 4/20
Epoch 4: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 5/20
Epoch 5: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 6/20
Epoch 6: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 7/20
Epoch 7: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 8/20
Epoch 8: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 9/20
Epoch 9: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 10/20
Epoch 10: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 11/20
Epoch 11: saving model to nomic_autoencoder/global_sq_autoencoder_weights.keras
Epoch 

In [53]:
# Persist the SQ model after training on unshuffled pairs.
global_sq_autoencoder_model.save('nomic_autoencoder/global_sq_autoencoder_model.keras')

In [54]:
# Train the CLU model on (point -> counter) pairs for 20 epochs.
# The checkpoint is written after every epoch (save_best_only=False) and stores
# the full model, not just weights (save_weights_only=False), despite the file name.
checkpoint_callback = ModelCheckpoint(filepath='nomic_autoencoder/global_clu_autoencoder_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to an existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='nomic_autoencoder/global_clu_training_log.csv', separator=',', append=True)

# batch_size=1 means the count-based metric is 0 or 1 per batch.
global_clu_history = global_clu_autoencoder_model.fit(
    x=global_clu_x_train,
    y=global_clu_y_train,
    batch_size=1,
    epochs=20,
    validation_data = (global_clu_x_test, global_clu_y_test),
    callbacks=[checkpoint_callback, csv_logger_callback]
)

Epoch 1/20
Epoch 1: saving model to nomic_autoencoder/global_clu_autoencoder_weights.keras
Epoch 2/20
Epoch 2: saving model to nomic_autoencoder/global_clu_autoencoder_weights.keras
Epoch 3/20
Epoch 3: saving model to nomic_autoencoder/global_clu_autoencoder_weights.keras
Epoch 4/20
Epoch 4: saving model to nomic_autoencoder/global_clu_autoencoder_weights.keras
Epoch 5/20
Epoch 5: saving model to nomic_autoencoder/global_clu_autoencoder_weights.keras
Epoch 6/20
Epoch 6: saving model to nomic_autoencoder/global_clu_autoencoder_weights.keras
Epoch 7/20
Epoch 7: saving model to nomic_autoencoder/global_clu_autoencoder_weights.keras
Epoch 8/20
Epoch 8: saving model to nomic_autoencoder/global_clu_autoencoder_weights.keras
Epoch 9/20
Epoch 9: saving model to nomic_autoencoder/global_clu_autoencoder_weights.keras
Epoch 10/20
Epoch 10: saving model to nomic_autoencoder/global_clu_autoencoder_weights.keras
Epoch 11/20
Epoch 11: saving model to nomic_autoencoder/global_clu_autoencoder_weights.k

In [55]:
# Persist the CLU model after training on unshuffled pairs.
global_clu_autoencoder_model.save('nomic_autoencoder/global_clu_autoencoder_model.keras')

In [56]:
# Train the CLA model on (point -> counter) pairs for 20 epochs.
# The checkpoint is written after every epoch (save_best_only=False) and stores
# the full model, not just weights (save_weights_only=False), despite the file name.
checkpoint_callback = ModelCheckpoint(filepath='nomic_autoencoder/global_cla_autoencoder_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to an existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='nomic_autoencoder/global_cla_training_log.csv', separator=',', append=True)

# batch_size=1 means the count-based metric is 0 or 1 per batch.
global_cla_history = global_cla_autoencoder_model.fit(
    x=global_cla_x_train,
    y=global_cla_y_train,
    batch_size=1,
    epochs=20,
    validation_data = (global_cla_x_test, global_cla_y_test),
    callbacks=[checkpoint_callback, csv_logger_callback]
)

Epoch 1/20
Epoch 1: saving model to nomic_autoencoder/global_cla_autoencoder_weights.keras
Epoch 2/20
Epoch 2: saving model to nomic_autoencoder/global_cla_autoencoder_weights.keras
Epoch 3/20
Epoch 3: saving model to nomic_autoencoder/global_cla_autoencoder_weights.keras
Epoch 4/20
Epoch 4: saving model to nomic_autoencoder/global_cla_autoencoder_weights.keras
Epoch 5/20
Epoch 5: saving model to nomic_autoencoder/global_cla_autoencoder_weights.keras
Epoch 6/20
Epoch 6: saving model to nomic_autoencoder/global_cla_autoencoder_weights.keras
Epoch 7/20
Epoch 7: saving model to nomic_autoencoder/global_cla_autoencoder_weights.keras
Epoch 8/20
Epoch 8: saving model to nomic_autoencoder/global_cla_autoencoder_weights.keras
Epoch 9/20
Epoch 9: saving model to nomic_autoencoder/global_cla_autoencoder_weights.keras
Epoch 10/20
Epoch 10: saving model to nomic_autoencoder/global_cla_autoencoder_weights.keras
Epoch 11/20
Epoch 11: saving model to nomic_autoencoder/global_cla_autoencoder_weights.k

In [57]:
# Persist the CLA model after training on unshuffled pairs.
global_cla_autoencoder_model.save('nomic_autoencoder/global_cla_autoencoder_model.keras')

## Load Training History

In [42]:
# Access training history
# Read the SD per-epoch log and reshape it to long form: one row per
# (epoch, dataset) with the metric value in 'accuracy'.
# The former rename of 'Unnamed: 0' was dropped: after melt() the only columns
# are 'epoch', 'dataset' and 'accuracy', so it never matched anything.
loaded_sd_global_history = (
    pd.read_csv("nomic_autoencoder/global_sd_training_log.csv")
    .melt(
        id_vars='epoch',
        value_vars=['metric_choose_argument_global_sd_y_train', 'val_metric_choose_argument_global_sd_y_train'],
        var_name='dataset',
        value_name='accuracy',
    )
    .replace(
        ['metric_choose_argument_global_sd_y_train', 'val_metric_choose_argument_global_sd_y_train'],
        ['training set', 'validation set'],
    )
    # Tag the run so it can be told apart from shuffled runs when frames are combined.
    .assign(shuffled=False)
)

In [43]:
# Access training history
# SQ per-epoch log in long form: one row per (epoch, dataset), metric in 'accuracy'.
_sq_metric_columns = ['metric_choose_argument_global_sq_y_train', 'val_metric_choose_argument_global_sq_y_train']
loaded_sq_global_history = (
    pd.DataFrame(pd.read_csv("nomic_autoencoder/global_sq_training_log.csv"))
    .melt(id_vars='epoch', value_vars=_sq_metric_columns, var_name='dataset', value_name='accuracy')
    .replace(_sq_metric_columns, ['training set', 'validation set'])
    .assign(shuffled=False)
)

In [44]:
# Access training history
# CLU per-epoch log in long form: one row per (epoch, dataset), metric in 'accuracy'.
_clu_metric_columns = ['metric_choose_argument_global_clu_y_train', 'val_metric_choose_argument_global_clu_y_train']
loaded_clu_global_history = (
    pd.DataFrame(pd.read_csv("nomic_autoencoder/global_clu_training_log.csv"))
    .melt(id_vars='epoch', value_vars=_clu_metric_columns, var_name='dataset', value_name='accuracy')
    .replace(_clu_metric_columns, ['training set', 'validation set'])
    .assign(shuffled=False)
)

In [45]:
# Access training history
# CLA per-epoch log in long form: one row per (epoch, dataset), metric in 'accuracy'.
_cla_metric_columns = ['metric_choose_argument_global_cla_y_train', 'val_metric_choose_argument_global_cla_y_train']
loaded_cla_global_history = (
    pd.DataFrame(pd.read_csv("nomic_autoencoder/global_cla_training_log.csv"))
    .melt(id_vars='epoch', value_vars=_cla_metric_columns, var_name='dataset', value_name='accuracy')
    .replace(_cla_metric_columns, ['training set', 'validation set'])
    .assign(shuffled=False)
)

In [46]:
# SD learning curve: one line per dataset (training vs. validation), written to PNG.
global_sd_training_plot = (
    ggplot(loaded_sd_global_history, aes(x='epoch', y='accuracy', linetype='dataset'))
    + geom_line()
    + labs(title='Learning Curve of Model Trained on Unshuffled Data', x='Epoch', y='Accuracy')
)
ggsave(global_sd_training_plot, "../data_dump/nomic_training_plots_dump/global_sd_training_plot.png")



## Global Shuffled Training

In [48]:
# Global Shuffled SD Model
# Separate clone of the base architecture intended for the within-topic shuffled
# targets; compiled identically to the unshuffled SD model.
global_sd_autoencoder_model_shuffled = tf.keras.models.clone_model(autoencoder_model)
global_sd_autoencoder_model_shuffled.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="cosine_similarity",
    metrics=[metric_choose_argument_global_sd_y_train]
)

In [49]:
# Global Shuffled SQ Model
# Separate clone of the base architecture intended for the within-topic shuffled
# targets; compiled identically to the unshuffled SQ model.
global_sq_autoencoder_model_shuffled = tf.keras.models.clone_model(autoencoder_model)
global_sq_autoencoder_model_shuffled.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="cosine_similarity",
    metrics=[metric_choose_argument_global_sq_y_train]
)

In [50]:
# Global Shuffled CLU Model
# Separate clone of the base architecture intended for the within-topic shuffled
# targets; compiled identically to the unshuffled CLU model.
global_clu_autoencoder_model_shuffled = tf.keras.models.clone_model(autoencoder_model)
global_clu_autoencoder_model_shuffled.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="cosine_similarity",
    metrics=[metric_choose_argument_global_clu_y_train]
)

In [51]:
# Global Shuffled CLA Model
# Separate clone of the base architecture intended for the within-topic shuffled
# targets; compiled identically to the unshuffled CLA model.
global_cla_autoencoder_model_shuffled = tf.keras.models.clone_model(autoencoder_model)
global_cla_autoencoder_model_shuffled.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="cosine_similarity",
    metrics=[metric_choose_argument_global_cla_y_train]
)

## Shuffled Training

In [52]:
# Train the SD model dedicated to shuffled targets, then persist it.
# Fix: the fit and save calls previously used `global_sd_autoencoder_model`,
# i.e. the model already trained on unshuffled pairs, leaving the freshly
# compiled `global_sd_autoencoder_model_shuffled` unused. Fit and save now
# live in one cell so the saved file always matches the model trained here.
# The captured log of the earlier run was removed because it came from the wrong model.
checkpoint_callback = ModelCheckpoint(filepath='nomic_autoencoder/global_sd_autoencoder_shuffled_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to an existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='nomic_autoencoder/global_sd_training_shuffled_log.csv', separator=',', append=True)

# Inputs stay in original order; only the training targets are shuffled within
# topic. Validation still uses the unshuffled held-out pairs.
# Note: this rebinds `global_sd_history`, replacing the unshuffled run's History object.
global_sd_history = global_sd_autoencoder_model_shuffled.fit(
    x=global_sd_x_train,
    y=global_sd_y_train_shuffled,
    batch_size=1,
    epochs=20,
    validation_data=(global_sd_x_test, global_sd_y_test),
    callbacks=[checkpoint_callback, csv_logger_callback]
)

global_sd_autoencoder_model_shuffled.save('nomic_autoencoder/global_sd_autoencoder_shuffled_model.keras')

In [54]:
# Train the SQ model dedicated to shuffled targets, then persist it.
# Fix: the fit and save calls previously used `global_sq_autoencoder_model`,
# i.e. the model already trained on unshuffled pairs, leaving the freshly
# compiled `global_sq_autoencoder_model_shuffled` unused. Fit and save now
# live in one cell so the saved file always matches the model trained here.
# The captured log of the earlier run was removed because it came from the wrong model.
checkpoint_callback = ModelCheckpoint(filepath='nomic_autoencoder/global_sq_autoencoder_shuffled_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to an existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='nomic_autoencoder/global_sq_training_shuffled_log.csv', separator=',', append=True)

# Inputs stay in original order; only the training targets are shuffled within
# topic. Validation still uses the unshuffled held-out pairs.
# Note: this rebinds `global_sq_history`, replacing the unshuffled run's History object.
global_sq_history = global_sq_autoencoder_model_shuffled.fit(
    x=global_sq_x_train,
    y=global_sq_y_train_shuffled,
    batch_size=1,
    epochs=20,
    validation_data=(global_sq_x_test, global_sq_y_test),
    callbacks=[checkpoint_callback, csv_logger_callback]
)

global_sq_autoencoder_model_shuffled.save('nomic_autoencoder/global_sq_autoencoder_shuffled_model.keras')

In [56]:
# Train the CLU model dedicated to shuffled targets, then persist it.
# Fix: the fit and save calls previously used `global_clu_autoencoder_model`,
# i.e. the model already trained on unshuffled pairs, leaving the freshly
# compiled `global_clu_autoencoder_model_shuffled` unused. Fit and save now
# live in one cell so the saved file always matches the model trained here.
# The captured log of the earlier run was removed because it came from the wrong model.
checkpoint_callback = ModelCheckpoint(filepath='nomic_autoencoder/global_clu_autoencoder_shuffled_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to an existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='nomic_autoencoder/global_clu_training_shuffled_log.csv', separator=',', append=True)

# Inputs stay in original order; only the training targets are shuffled within
# topic. Validation still uses the unshuffled held-out pairs.
# Note: this rebinds `global_clu_history`, replacing the unshuffled run's History object.
global_clu_history = global_clu_autoencoder_model_shuffled.fit(
    x=global_clu_x_train,
    y=global_clu_y_train_shuffled,
    batch_size=1,
    epochs=20,
    validation_data=(global_clu_x_test, global_clu_y_test),
    callbacks=[checkpoint_callback, csv_logger_callback]
)

global_clu_autoencoder_model_shuffled.save('nomic_autoencoder/global_clu_autoencoder_shuffled_model.keras')

In [58]:
# Train the CLA model dedicated to shuffled targets, then persist it.
# Fix: the fit and save calls previously used `global_cla_autoencoder_model`,
# i.e. the model already trained on unshuffled pairs, leaving the freshly
# compiled `global_cla_autoencoder_model_shuffled` unused. Fit and save now
# live in one cell so the saved file always matches the model trained here.
# The captured log of the earlier run was removed because it came from the wrong model.
checkpoint_callback = ModelCheckpoint(filepath='nomic_autoencoder/global_cla_autoencoder_shuffled_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to an existing log instead of replacing it.
csv_logger_callback = CSVLogger(filename='nomic_autoencoder/global_cla_training_shuffled_log.csv', separator=',', append=True)

# Inputs stay in original order; only the training targets are shuffled within
# topic. Validation still uses the unshuffled held-out pairs.
# Note: this rebinds `global_cla_history`, replacing the unshuffled run's History object.
global_cla_history = global_cla_autoencoder_model_shuffled.fit(
    x=global_cla_x_train,
    y=global_cla_y_train_shuffled,
    batch_size=1,
    epochs=20,
    validation_data=(global_cla_x_test, global_cla_y_test),
    callbacks=[checkpoint_callback, csv_logger_callback]
)

global_cla_autoencoder_model_shuffled.save('nomic_autoencoder/global_cla_autoencoder_shuffled_model.keras')

## Load Training History

In [174]:
# Read a shuffled-run training log from the notebook's working directory.
# NOTE(review): this path does not match any CSVLogger filename used above
# (those write to nomic_autoencoder/global_<name>_training_shuffled_log.csv);
# looks like a leftover from an earlier notebook version — confirm the intended file.
loaded_global_shuffled_history = pd.read_csv("./global_training_shuffled_log.csv")

In [175]:
# Keep rows labelled 0 through 19 (label slice, both ends inclusive) — presumably
# one 20-epoch run out of a log that accumulates runs; confirm.
loaded_global_shuffled_history = loaded_global_shuffled_history.loc[0:19]
# Long form: one row per (epoch, dataset) with the metric value in 'accuracy'.
# NOTE(review): the metric column names here have no sd/sq/clu/cla infix, unlike
# the metric functions defined in this notebook — verify they exist in the file read above.
loaded_global_shuffled_history = pd.melt(loaded_global_shuffled_history, id_vars='epoch', value_vars=['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'], var_name='dataset', value_name='accuracy')
loaded_global_shuffled_history = loaded_global_shuffled_history.replace(['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'], ['training set', 'validation set'])
# Tag the run as shuffled so it can be distinguished after concatenation.
loaded_global_shuffled_history['shuffled'] = True

In [176]:
# Display the long-form shuffled history.
loaded_global_shuffled_history

Unnamed: 0,epoch,dataset,accuracy,shuffled
0,0,training set,0.012608,True
1,1,training set,0.035363,True
2,2,training set,0.048893,True
3,3,training set,0.073801,True
4,4,training set,0.108549,True
5,5,training set,0.154367,True
6,6,training set,0.217405,True
7,7,training set,0.273063,True
8,8,training set,0.338561,True
9,9,training set,0.405904,True


In [170]:
# Learning curve for the shuffled run: one line per dataset, written to PNG.
# NOTE(review): the output directory (training_plots_dump) differs from the one
# used for the unshuffled SD plot (nomic_training_plots_dump) — confirm which is intended.
global_training_shuffled_plot = ggplot(loaded_global_shuffled_history, aes(x='epoch', y='accuracy', linetype='dataset')) + geom_line() + labs(title='Learning Curve of Model Trained on Within-Topic Shuffled Data', x='Epoch', y='Accuracy')
ggsave(global_training_shuffled_plot, "../data_dump/training_plots_dump/global_shuffled_training_plot.png")



In [178]:
# Stack unshuffled and shuffled histories; the 'shuffled' column tells them apart.
# NOTE(review): `loaded_global_history` is not assigned in any visible cell (only
# loaded_sd/sq/clu/cla_global_history are) — this would raise NameError on a
# fresh kernel; confirm which frame is meant.
combined_global_training_df = pd.concat([loaded_global_history, loaded_global_shuffled_history])
combined_global_training_df

Unnamed: 0,epoch,dataset,accuracy,shuffled
0,0,training set,0.040283,False
1,1,training set,0.124231,False
2,2,training set,0.200800,False
3,3,training set,0.281058,False
4,4,training set,0.382226,False
...,...,...,...,...
35,15,validation set,0.071341,True
36,16,validation set,0.073801,True
37,17,validation set,0.066421,True
38,18,validation set,0.066421,True


In [218]:
# Combined learning curves: line type encodes training vs. validation, colour
# encodes shuffled vs. unshuffled. Large font sizes and a 16x24 figure are set
# through theme().
combined_global_plot = (
    ggplot(combined_global_training_df, aes(x='epoch', y='accuracy', linetype='dataset', color='shuffled')) +
    geom_line(size=2) +
    labs(title='Learning Curve of Model Trained on Unshuffled vs. Within-Topic Shuffled Data', x='Epoch', y='Accuracy') +
    theme(
        figure_size=(16,24),
        axis_title=element_text(size=32),
        axis_text=element_text(size=24),
        legend_title=element_text(size=32, lineheight=1.5),
        legend_text=element_text(size=24, lineheight=1.5),
        plot_title=element_text(size=40, wrap=True, lineheight=1.5),
        legend_position="bottom",
        legend_key_width=64
    ) +
    # NOTE(review): only 'linetype' and 'color' are mapped above, so a guide for
    # 'fill' likely has no effect — confirm whether color/linetype was intended.
    guides(fill = guide_legend(byrow = True))
)
ggsave(combined_global_plot, "../data_dump/training_plots_dump/combined_global_training_plot.png")

