<a href="https://colab.research.google.com/github/cchang-vassar/Semantic-Relations-in-Vector-Embeddings/blob/main/study2_9_%5Bnomic%5D_scifact_autoencoder_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [nomic] Autoencoder: Generate Corresponding Embedding (Scifact)

## Set Up

### Imports

In [None]:
!pip install tensorflow



In [None]:
!pip install plotnine



In [None]:
import os
import subprocess
import zipfile
import shutil
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np
from google.colab import userdata
from scipy import spatial
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from plotnine import ggplot, geom_line, aes, ggsave, labs, theme, element_text, guides, guide_legend, scale_color_manual, scale_y_continuous, scale_linetype_manual

### OSF Setup

In [None]:
!pip install osfclient

Collecting osfclient
  Downloading osfclient-0.0.5-py2.py3-none-any.whl (39 kB)
Installing collected packages: osfclient
Successfully installed osfclient-0.0.5


In [None]:
import osfclient.cli

In [None]:
from osfclient.api import OSF
from osfclient.models import Project, Storage
from io import BytesIO

In [None]:
# Copy the OSF user name from the Colab secret store into the process
# environment, then mirror it in a module-level constant.
os.environ.update({"OSF_USERNAME": userdata.get("OSF_USERNAME")})
OSF_USERNAME = os.getenv("OSF_USERNAME")

In [None]:
# Copy the OSF password from the Colab secret store into the process
# environment, then mirror it in a module-level constant.
os.environ.update({"OSF_PASSWORD": userdata.get("OSF_PASSWORD")})
OSF_PASSWORD = os.getenv("OSF_PASSWORD")

In [None]:
# Copy the OSF access token from the Colab secret store into the process
# environment, then mirror it in a module-level constant.
os.environ.update({"OSF_TOKEN": userdata.get("OSF_TOKEN")})
OSF_TOKEN = os.getenv("OSF_TOKEN")

In [None]:
# Copy the OSF project id from the Colab secret store into the process
# environment, then mirror it in a module-level constant.
os.environ.update({"OSF_PROJECT_ID": userdata.get("OSF_PROJECT_ID")})
OSF_PROJECT_ID = os.getenv("OSF_PROJECT_ID")

## Data

### Import corpora data from OSF

In [None]:
# Fetch the embeddings archive from OSF project "sakjg"; the next cell opens it
# by its bare file name, i.e. relative to the current working directory.
!osf -p sakjg fetch osfstorage/data-dump/nomic-autoencoder/nomic_scifact_embeddings_dump.zip

100% 7.96M/7.96M [00:00<00:00, 49.0Mbytes/s]


In [None]:
# Unpack the downloaded embeddings archive into a local folder and list
# what was extracted.
embeddings_file_path = 'nomic_scifact_embeddings_dump.zip'
output_folder_path = 'current-data-dump/embeddings-dump'
os.makedirs(output_folder_path, exist_ok=True)

with zipfile.ZipFile(embeddings_file_path, mode='r') as archive:
  archive.extractall(path=output_folder_path)

extracted_files = os.listdir(output_folder_path)
print("Files extracted:", extracted_files)

Files extracted: ['training_pro_argument_embeddings.pkl', 'combined_counter_evidence_embeddings.pkl', 'training_pro_evidence_embeddings.pkl', 'combined_pro_argument_embeddings.pkl', 'test_counter_evidence_embeddings.pkl', 'training_counter_argument_embeddings.pkl', 'test_counter_argument_embeddings.pkl', 'combined_counter_argument_embeddings.pkl', 'combined_pro_evidence_embeddings.pkl', 'test_pro_evidence_embeddings.pkl', 'test_pro_argument_embeddings.pkl', 'training_counter_evidence_embeddings.pkl']


In [None]:
# Load the "pro" argument (x) and evidence (y) frames for the training and
# test splits from the extracted pickles.
embeddings_dump_dir = 'current-data-dump/embeddings-dump'
x_train_pro = pd.read_pickle(f'{embeddings_dump_dir}/training_pro_argument_embeddings.pkl')
y_train_pro = pd.read_pickle(f'{embeddings_dump_dir}/training_pro_evidence_embeddings.pkl')
x_test_pro = pd.read_pickle(f'{embeddings_dump_dir}/test_pro_argument_embeddings.pkl')
y_test_pro = pd.read_pickle(f'{embeddings_dump_dir}/test_pro_evidence_embeddings.pkl')

In [None]:
# Load the "counter" argument (x) and evidence (y) frames for the training
# and test splits from the extracted pickles.
embeddings_dump_dir = 'current-data-dump/embeddings-dump'
x_train_counter = pd.read_pickle(f'{embeddings_dump_dir}/training_counter_argument_embeddings.pkl')
y_train_counter = pd.read_pickle(f'{embeddings_dump_dir}/training_counter_evidence_embeddings.pkl')
x_test_counter = pd.read_pickle(f'{embeddings_dump_dir}/test_counter_argument_embeddings.pkl')
y_test_counter = pd.read_pickle(f'{embeddings_dump_dir}/test_counter_evidence_embeddings.pkl')

In [None]:
# Remove the 'argument' / 'evidence' columns from the pro frames.
# errors='ignore' keeps this cell re-runnable: once the column is gone a
# second execution is a no-op instead of raising KeyError.
x_train_pro = x_train_pro.drop(columns=['argument'], errors='ignore')
y_train_pro = y_train_pro.drop(columns=['evidence'], errors='ignore')
x_test_pro = x_test_pro.drop(columns=['argument'], errors='ignore')
y_test_pro = y_test_pro.drop(columns=['evidence'], errors='ignore')

In [None]:
# Remove the 'argument' / 'evidence' columns from the counter frames.
# errors='ignore' keeps this cell re-runnable: once the column is gone a
# second execution is a no-op instead of raising KeyError.
x_train_counter = x_train_counter.drop(columns=['argument'], errors='ignore')
y_train_counter = y_train_counter.drop(columns=['evidence'], errors='ignore')
x_test_counter = x_test_counter.drop(columns=['argument'], errors='ignore')
y_test_counter = y_test_counter.drop(columns=['evidence'], errors='ignore')

In [None]:
# Stack the pro rows on top of the counter rows to form the combined
# train/test inputs (x) and targets (y). Row indices are kept as they are.
x_train_combined = pd.concat(objs=[x_train_pro, x_train_counter], axis=0)
y_train_combined = pd.concat(objs=[y_train_pro, y_train_counter], axis=0)
x_test_combined = pd.concat(objs=[x_test_pro, x_test_counter], axis=0)
y_test_combined = pd.concat(objs=[y_test_pro, y_test_counter], axis=0)

### Create shuffled training targets

In [None]:
# Row-shuffled copies of the training targets, re-indexed from 0.
# A fixed seed makes the permutation (and therefore the shuffled-target runs)
# reproducible across kernel restarts. sample() already returns a new frame,
# so no explicit copy is needed.
SHUFFLE_SEED = 42
y_train_shuffled = y_train_combined.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
y_train_pro_shuffled = y_train_pro.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
y_train_counter_shuffled = y_train_counter.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

### Save training and test DataFrames

In [None]:
# Pickle the combined training inputs and targets to the autoencoder dump folder.
x_train_combined_folder_path = 'current-data-dump/nomic-autoencoder/'
x_train_combined_file_path = f'{x_train_combined_folder_path}x_train_combined.pkl'
y_train_combined_folder_path = 'current-data-dump/nomic-autoencoder/'
# The y path is derived from its own folder variable so the x and y
# locations can be changed independently.
y_train_combined_file_path = f'{y_train_combined_folder_path}y_train_combined.pkl'
os.makedirs(x_train_combined_folder_path, exist_ok=True)
os.makedirs(y_train_combined_folder_path, exist_ok=True)
with open(x_train_combined_file_path, 'wb') as file:
  pickle.dump(x_train_combined, file)
  print(f"File uploaded to {x_train_combined_file_path}")
with open(y_train_combined_file_path, 'wb') as file:
  pickle.dump(y_train_combined, file)
  print(f"File uploaded to {y_train_combined_file_path}")

File uploaded to current-data-dump/nomic-autoencoder/x_train_combined.pkl
File uploaded to current-data-dump/nomic-autoencoder/y_train_combined.pkl


In [None]:
# Pickle the combined test inputs and targets to the autoencoder dump folder.
x_test_combined_folder_path = 'current-data-dump/nomic-autoencoder/'
x_test_combined_file_path = f'{x_test_combined_folder_path}x_test_combined.pkl'
y_test_combined_folder_path = 'current-data-dump/nomic-autoencoder/'
# The y path is derived from its own folder variable so the x and y
# locations can be changed independently.
y_test_combined_file_path = f'{y_test_combined_folder_path}y_test_combined.pkl'
os.makedirs(x_test_combined_folder_path, exist_ok=True)
os.makedirs(y_test_combined_folder_path, exist_ok=True)
with open(x_test_combined_file_path, 'wb') as file:
  pickle.dump(x_test_combined, file)
  print(f"File uploaded to {x_test_combined_file_path}")
with open(y_test_combined_file_path, 'wb') as file:
  pickle.dump(y_test_combined, file)
  print(f"File uploaded to {y_test_combined_file_path}")

File uploaded to current-data-dump/nomic-autoencoder/x_test_combined.pkl
File uploaded to current-data-dump/nomic-autoencoder/y_test_combined.pkl


In [None]:
# Pickle the pro training inputs and targets to the autoencoder dump folder.
x_train_pro_folder_path = 'current-data-dump/nomic-autoencoder/'
x_train_pro_file_path = f'{x_train_pro_folder_path}x_train_pro.pkl'
y_train_pro_folder_path = 'current-data-dump/nomic-autoencoder/'
# The y path is derived from its own folder variable so the x and y
# locations can be changed independently.
y_train_pro_file_path = f'{y_train_pro_folder_path}y_train_pro.pkl'
os.makedirs(x_train_pro_folder_path, exist_ok=True)
os.makedirs(y_train_pro_folder_path, exist_ok=True)
with open(x_train_pro_file_path, 'wb') as file:
  pickle.dump(x_train_pro, file)
  print(f"File uploaded to {x_train_pro_file_path}")
with open(y_train_pro_file_path, 'wb') as file:
  pickle.dump(y_train_pro, file)
  print(f"File uploaded to {y_train_pro_file_path}")

File uploaded to current-data-dump/nomic-autoencoder/x_train_pro.pkl
File uploaded to current-data-dump/nomic-autoencoder/y_train_pro.pkl


In [None]:
# Pickle the pro test inputs and targets to the autoencoder dump folder.
x_test_pro_folder_path = 'current-data-dump/nomic-autoencoder/'
x_test_pro_file_path = f'{x_test_pro_folder_path}x_test_pro.pkl'
y_test_pro_folder_path = 'current-data-dump/nomic-autoencoder/'
# The y path is derived from its own folder variable so the x and y
# locations can be changed independently.
y_test_pro_file_path = f'{y_test_pro_folder_path}y_test_pro.pkl'
os.makedirs(x_test_pro_folder_path, exist_ok=True)
os.makedirs(y_test_pro_folder_path, exist_ok=True)
with open(x_test_pro_file_path, 'wb') as file:
  pickle.dump(x_test_pro, file)
  print(f"File uploaded to {x_test_pro_file_path}")
with open(y_test_pro_file_path, 'wb') as file:
  pickle.dump(y_test_pro, file)
  print(f"File uploaded to {y_test_pro_file_path}")

File uploaded to current-data-dump/nomic-autoencoder/x_test_pro.pkl
File uploaded to current-data-dump/nomic-autoencoder/y_test_pro.pkl


In [None]:
# Pickle the counter training inputs and targets to the autoencoder dump folder.
x_train_counter_folder_path = 'current-data-dump/nomic-autoencoder/'
x_train_counter_file_path = f'{x_train_counter_folder_path}x_train_counter.pkl'
y_train_counter_folder_path = 'current-data-dump/nomic-autoencoder/'
# The y path is derived from its own folder variable so the x and y
# locations can be changed independently.
y_train_counter_file_path = f'{y_train_counter_folder_path}y_train_counter.pkl'
os.makedirs(x_train_counter_folder_path, exist_ok=True)
os.makedirs(y_train_counter_folder_path, exist_ok=True)
with open(x_train_counter_file_path, 'wb') as file:
  pickle.dump(x_train_counter, file)
  print(f"File uploaded to {x_train_counter_file_path}")
with open(y_train_counter_file_path, 'wb') as file:
  pickle.dump(y_train_counter, file)
  print(f"File uploaded to {y_train_counter_file_path}")

File uploaded to current-data-dump/nomic-autoencoder/x_train_counter.pkl
File uploaded to current-data-dump/nomic-autoencoder/y_train_counter.pkl


In [None]:
# Pickle the counter test inputs and targets to the autoencoder dump folder.
x_test_counter_folder_path = 'current-data-dump/nomic-autoencoder/'
x_test_counter_file_path = f'{x_test_counter_folder_path}x_test_counter.pkl'
y_test_counter_folder_path = 'current-data-dump/nomic-autoencoder/'
# The y path is derived from its own folder variable so the x and y
# locations can be changed independently.
y_test_counter_file_path = f'{y_test_counter_folder_path}y_test_counter.pkl'
os.makedirs(x_test_counter_folder_path, exist_ok=True)
os.makedirs(y_test_counter_folder_path, exist_ok=True)
with open(x_test_counter_file_path, 'wb') as file:
  pickle.dump(x_test_counter, file)
  print(f"File uploaded to {x_test_counter_file_path}")
with open(y_test_counter_file_path, 'wb') as file:
  pickle.dump(y_test_counter, file)
  print(f"File uploaded to {y_test_counter_file_path}")

File uploaded to current-data-dump/nomic-autoencoder/x_test_counter.pkl
File uploaded to current-data-dump/nomic-autoencoder/y_test_counter.pkl


## Model

### Architecture

In [None]:
# Layers: a 768-wide input, one ReLU hidden layer and a linear output layer,
# both dense layers having 768 units.
input_layer = tf.keras.layers.Input(name="Input", shape=(768, ))
hidden_dense = tf.keras.layers.Dense(name="Hidden", units=768, activation="relu")
hidden_layer = hidden_dense(input_layer)
output_dense = tf.keras.layers.Dense(name="Output", units=768, activation="linear")
output_layer = output_dense(hidden_layer)

In [None]:
# Model
# Base (uncompiled) architecture; each training section below clones it with
# tf.keras.models.clone_model and compiles the clone.
autoencoder_model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
autoencoder_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 768)]             0         
                                                                 
 Hidden (Dense)              (None, 768)               590592    
                                                                 
 Output (Dense)              (None, 768)               590592    
                                                                 
Total params: 1181184 (4.51 MB)
Trainable params: 1181184 (4.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Metric

In [None]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_combined_y_train(y_true, y_pred):
  """Nearest-neighbour retrieval accuracy against the combined embedding pool.

  For every sample in the batch, the pool row with the highest cosine
  similarity to the prediction is compared with the pool row closest to the
  true target; the sample scores 1.0 when both are the same row, else 0.0.

  The pool is the row-wise concatenation of the notebook globals
  x_train_combined, y_train_combined, x_test_combined and y_test_combined
  (not only y_train, despite the function name), so those frames must exist
  whenever this metric is evaluated.

  Args:
    y_true: 2-D tensor of target embeddings, one row per sample.
    y_pred: 2-D tensor of predicted embeddings, same shape as y_true.

  Returns:
    float32 tensor with one 0.0/1.0 entry per sample. Keras averages these,
    so the logged value is an accuracy for any batch size (for batch_size=1
    it equals the previous 0/1 match count).
  """
  pool = tf.cast(pd.concat([x_train_combined, y_train_combined, x_test_combined, y_test_combined]), dtype=tf.float32)

  # Normalise each row separately so the dot products are true cosine
  # similarities regardless of how many samples are in the batch.
  pool_unit = tf.math.l2_normalize(pool, axis=1)
  pred_unit = tf.math.l2_normalize(tf.cast(y_pred, tf.float32), axis=1)
  true_unit = tf.math.l2_normalize(tf.cast(y_true, tf.float32), axis=1)

  # Shape (pool_rows, batch): column j holds sample j's similarity to every pool row.
  cos_sim_pred = tf.matmul(pool_unit, pred_unit, transpose_b=True)
  cos_sim_true = tf.matmul(pool_unit, true_unit, transpose_b=True)

  max_cos_sim_pred = tf.math.argmax(cos_sim_pred, axis=0)
  max_cos_sim_true = tf.math.argmax(cos_sim_true, axis=0)

  return tf.cast(tf.equal(max_cos_sim_pred, max_cos_sim_true), tf.float32)

In [None]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_pro_y_train(y_true, y_pred):
  """Nearest-neighbour retrieval accuracy against the pro embedding pool.

  For every sample in the batch, the pool row with the highest cosine
  similarity to the prediction is compared with the pool row closest to the
  true target; the sample scores 1.0 when both are the same row, else 0.0.

  The pool is the row-wise concatenation of the notebook globals
  x_train_pro, y_train_pro, x_test_pro and y_test_pro (not only y_train,
  despite the function name), so those frames must exist whenever this
  metric is evaluated.

  Args:
    y_true: 2-D tensor of target embeddings, one row per sample.
    y_pred: 2-D tensor of predicted embeddings, same shape as y_true.

  Returns:
    float32 tensor with one 0.0/1.0 entry per sample. Keras averages these,
    so the logged value is an accuracy for any batch size (for batch_size=1
    it equals the previous 0/1 match count).
  """
  pool = tf.cast(pd.concat([x_train_pro, y_train_pro, x_test_pro, y_test_pro]), dtype=tf.float32)

  # Normalise each row separately so the dot products are true cosine
  # similarities regardless of how many samples are in the batch.
  pool_unit = tf.math.l2_normalize(pool, axis=1)
  pred_unit = tf.math.l2_normalize(tf.cast(y_pred, tf.float32), axis=1)
  true_unit = tf.math.l2_normalize(tf.cast(y_true, tf.float32), axis=1)

  # Shape (pool_rows, batch): column j holds sample j's similarity to every pool row.
  cos_sim_pred = tf.matmul(pool_unit, pred_unit, transpose_b=True)
  cos_sim_true = tf.matmul(pool_unit, true_unit, transpose_b=True)

  max_cos_sim_pred = tf.math.argmax(cos_sim_pred, axis=0)
  max_cos_sim_true = tf.math.argmax(cos_sim_true, axis=0)

  return tf.cast(tf.equal(max_cos_sim_pred, max_cos_sim_true), tf.float32)

In [None]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_counter_y_train(y_true, y_pred):
  """Nearest-neighbour retrieval accuracy against the counter embedding pool.

  For every sample in the batch, the pool row with the highest cosine
  similarity to the prediction is compared with the pool row closest to the
  true target; the sample scores 1.0 when both are the same row, else 0.0.

  The pool is the row-wise concatenation of the notebook globals
  x_train_counter, y_train_counter, x_test_counter and y_test_counter (not
  only y_train, despite the function name), so those frames must exist
  whenever this metric is evaluated.

  Args:
    y_true: 2-D tensor of target embeddings, one row per sample.
    y_pred: 2-D tensor of predicted embeddings, same shape as y_true.

  Returns:
    float32 tensor with one 0.0/1.0 entry per sample. Keras averages these,
    so the logged value is an accuracy for any batch size (for batch_size=1
    it equals the previous 0/1 match count).
  """
  pool = tf.cast(pd.concat([x_train_counter, y_train_counter, x_test_counter, y_test_counter]), dtype=tf.float32)

  # Normalise each row separately so the dot products are true cosine
  # similarities regardless of how many samples are in the batch.
  pool_unit = tf.math.l2_normalize(pool, axis=1)
  pred_unit = tf.math.l2_normalize(tf.cast(y_pred, tf.float32), axis=1)
  true_unit = tf.math.l2_normalize(tf.cast(y_true, tf.float32), axis=1)

  # Shape (pool_rows, batch): column j holds sample j's similarity to every pool row.
  cos_sim_pred = tf.matmul(pool_unit, pred_unit, transpose_b=True)
  cos_sim_true = tf.matmul(pool_unit, true_unit, transpose_b=True)

  max_cos_sim_pred = tf.math.argmax(cos_sim_pred, axis=0)
  max_cos_sim_true = tf.math.argmax(cos_sim_true, axis=0)

  return tf.cast(tf.equal(max_cos_sim_pred, max_cos_sim_true), tf.float32)

### Combined Training

In [None]:
# combined Model
# Clone the base architecture so this run trains its own model object, then
# compile it with Adam, a cosine-similarity loss and the combined-pool metric.
combined_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
combined_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
combined_autoencoder_model.compile(
  loss="cosine_similarity",
  optimizer=combined_optimizer,
  metrics=[metric_choose_argument_combined_y_train],
)

In [None]:
# Train on the combined pairs. The checkpoint callback writes the whole model
# (not just weights) and the CSV logger appends to any existing log file.
checkpoint_callback = ModelCheckpoint(
  filepath='combined_autoencoder_weights.keras',
  save_best_only=False,
  save_weights_only=False,
  verbose=1,
)
csv_logger_callback = CSVLogger(
  filename='combined_training_log.csv',
  separator=',',
  append=True,
)
combined_callbacks = [checkpoint_callback, csv_logger_callback]
combined_history = combined_autoencoder_model.fit(
  x=x_train_combined,
  y=y_train_combined,
  validation_data=(x_test_combined, y_test_combined),
  epochs=100,
  batch_size=1,
  callbacks=combined_callbacks,
)

Epoch 1/100
Epoch 1: saving model to combined_autoencoder_weights.keras
Epoch 2/100
Epoch 2: saving model to combined_autoencoder_weights.keras
Epoch 3/100
Epoch 3: saving model to combined_autoencoder_weights.keras
Epoch 4/100
Epoch 4: saving model to combined_autoencoder_weights.keras
Epoch 5/100
Epoch 5: saving model to combined_autoencoder_weights.keras
Epoch 6/100
Epoch 6: saving model to combined_autoencoder_weights.keras
Epoch 7/100
Epoch 7: saving model to combined_autoencoder_weights.keras
Epoch 8/100
Epoch 8: saving model to combined_autoencoder_weights.keras
Epoch 9/100
Epoch 9: saving model to combined_autoencoder_weights.keras
Epoch 10/100
Epoch 10: saving model to combined_autoencoder_weights.keras
Epoch 11/100
Epoch 11: saving model to combined_autoencoder_weights.keras
Epoch 12/100
Epoch 12: saving model to combined_autoencoder_weights.keras
Epoch 13/100
Epoch 13: saving model to combined_autoencoder_weights.keras
Epoch 14/100
Epoch 14: saving model to combined_autoenco

In [None]:
# Tabulate the values recorded by fit() for the combined run.
combined_history_df = pd.DataFrame(combined_history.history)

In [None]:
# Write the combined run's history table to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
combined_log_csv_path = output_folder_path + 'combined_training_log.csv'
combined_history_df.to_csv(combined_log_csv_path)

In [None]:
# Save the trained combined model to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
combined_model_path = output_folder_path + 'combined_autoencoder_model.keras'
combined_autoencoder_model.save(combined_model_path)

### Combined Shuffled Training

In [None]:
# combined Shuffled Model
# Same architecture and compile settings as the combined run, but fitted
# against the row-shuffled training targets; validation uses the unshuffled
# test pairs.
combined_shuffled_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
combined_shuffled_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
combined_shuffled_autoencoder_model.compile(
  loss="cosine_similarity",
  optimizer=combined_shuffled_optimizer,
  metrics=[metric_choose_argument_combined_y_train],
)

combined_shuffled_history = combined_shuffled_autoencoder_model.fit(
  x=x_train_combined,
  y=y_train_shuffled,
  validation_data=(x_test_combined, y_test_combined),
  epochs=20,
  batch_size=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Tabulate the values recorded by fit() for the combined shuffled run.
combined_shuffled_history_df = pd.DataFrame(combined_shuffled_history.history)

In [None]:
# Write the combined shuffled run's history table to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
combined_shuffled_log_csv_path = output_folder_path + 'combined_shuffled_training_log.csv'
combined_shuffled_history_df.to_csv(combined_shuffled_log_csv_path)

In [None]:
# Save the trained combined shuffled model to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
combined_shuffled_model_path = output_folder_path + 'combined_shuffled_autoencoder_model.keras'
combined_shuffled_autoencoder_model.save(combined_shuffled_model_path)

### Pro Training

In [None]:
# pro Model
# Clone the base architecture so this run trains its own model object, then
# compile it with Adam, a cosine-similarity loss and the pro-pool metric.
pro_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
pro_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
pro_autoencoder_model.compile(
  loss="cosine_similarity",
  optimizer=pro_optimizer,
  metrics=[metric_choose_argument_pro_y_train],
)

In [None]:
# Train on the pro pairs. The checkpoint callback writes the whole model
# (not just weights) and the CSV logger appends to any existing log file.
checkpoint_callback = ModelCheckpoint(
  filepath='pro_autoencoder_weights.keras',
  save_best_only=False,
  save_weights_only=False,
  verbose=1,
)
csv_logger_callback = CSVLogger(
  filename='pro_training_log.csv',
  separator=',',
  append=True,
)
pro_callbacks = [checkpoint_callback, csv_logger_callback]
pro_history = pro_autoencoder_model.fit(
  x=x_train_pro,
  y=y_train_pro,
  validation_data=(x_test_pro, y_test_pro),
  epochs=20,
  batch_size=1,
  callbacks=pro_callbacks,
)

Epoch 1/20
Epoch 1: saving model to pro_autoencoder_weights.keras
Epoch 2/20
Epoch 2: saving model to pro_autoencoder_weights.keras
Epoch 3/20
Epoch 3: saving model to pro_autoencoder_weights.keras
Epoch 4/20
Epoch 4: saving model to pro_autoencoder_weights.keras
Epoch 5/20
Epoch 5: saving model to pro_autoencoder_weights.keras
Epoch 6/20
Epoch 6: saving model to pro_autoencoder_weights.keras
Epoch 7/20
Epoch 7: saving model to pro_autoencoder_weights.keras
Epoch 8/20
Epoch 8: saving model to pro_autoencoder_weights.keras
Epoch 9/20
Epoch 9: saving model to pro_autoencoder_weights.keras
Epoch 10/20
Epoch 10: saving model to pro_autoencoder_weights.keras
Epoch 11/20
Epoch 11: saving model to pro_autoencoder_weights.keras
Epoch 12/20
Epoch 12: saving model to pro_autoencoder_weights.keras
Epoch 13/20
Epoch 13: saving model to pro_autoencoder_weights.keras
Epoch 14/20
Epoch 14: saving model to pro_autoencoder_weights.keras
Epoch 15/20
Epoch 15: saving model to pro_autoencoder_weights.kera

In [None]:
# Tabulate the values recorded by fit() for the pro run.
pro_history_df = pd.DataFrame(pro_history.history)

In [None]:
# Write the pro run's history table to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
pro_log_csv_path = output_folder_path + 'pro_training_log.csv'
pro_history_df.to_csv(pro_log_csv_path)

In [None]:
# Save the trained pro model to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
pro_model_path = output_folder_path + 'pro_autoencoder_model.keras'
pro_autoencoder_model.save(pro_model_path)

### Pro Shuffled Training

In [None]:
# pro Shuffled Model
# Same architecture and compile settings as the pro run, but fitted against
# the row-shuffled pro training targets; validation uses the unshuffled
# pro test pairs.
pro_shuffled_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
pro_shuffled_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
pro_shuffled_autoencoder_model.compile(
  loss="cosine_similarity",
  optimizer=pro_shuffled_optimizer,
  metrics=[metric_choose_argument_pro_y_train],
)

pro_shuffled_history = pro_shuffled_autoencoder_model.fit(
  x=x_train_pro,
  y=y_train_pro_shuffled,
  validation_data=(x_test_pro, y_test_pro),
  epochs=20,
  batch_size=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Tabulate the values recorded by fit() for the pro shuffled run.
pro_shuffled_history_df = pd.DataFrame(pro_shuffled_history.history)

In [None]:
# Write the pro shuffled run's history table to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
pro_shuffled_log_csv_path = output_folder_path + 'pro_shuffled_training_log.csv'
pro_shuffled_history_df.to_csv(pro_shuffled_log_csv_path)

In [None]:
# Save the trained pro shuffled model to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
pro_shuffled_model_path = output_folder_path + 'pro_shuffled_autoencoder_model.keras'
pro_shuffled_autoencoder_model.save(pro_shuffled_model_path)

### Counter Training

In [None]:
# counter Model
# Clone the base architecture so this run trains its own model object, then
# compile it with Adam, a cosine-similarity loss and the counter-pool metric.
counter_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
counter_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
counter_autoencoder_model.compile(
  loss="cosine_similarity",
  optimizer=counter_optimizer,
  metrics=[metric_choose_argument_counter_y_train],
)

In [None]:
# Train on the counter pairs. The checkpoint callback writes the whole model
# (not just weights) and the CSV logger appends to any existing log file.
checkpoint_callback = ModelCheckpoint(
  filepath='counter_autoencoder_weights.keras',
  save_best_only=False,
  save_weights_only=False,
  verbose=1,
)
csv_logger_callback = CSVLogger(
  filename='counter_training_log.csv',
  separator=',',
  append=True,
)
counter_callbacks = [checkpoint_callback, csv_logger_callback]
counter_history = counter_autoencoder_model.fit(
  x=x_train_counter,
  y=y_train_counter,
  validation_data=(x_test_counter, y_test_counter),
  epochs=20,
  batch_size=1,
  callbacks=counter_callbacks,
)

Epoch 1/20
Epoch 1: saving model to counter_autoencoder_weights.keras
Epoch 2/20
Epoch 2: saving model to counter_autoencoder_weights.keras
Epoch 3/20
Epoch 3: saving model to counter_autoencoder_weights.keras
Epoch 4/20
Epoch 4: saving model to counter_autoencoder_weights.keras
Epoch 5/20
Epoch 5: saving model to counter_autoencoder_weights.keras
Epoch 6/20
Epoch 6: saving model to counter_autoencoder_weights.keras
Epoch 7/20
Epoch 7: saving model to counter_autoencoder_weights.keras
Epoch 8/20
Epoch 8: saving model to counter_autoencoder_weights.keras
Epoch 9/20
Epoch 9: saving model to counter_autoencoder_weights.keras
Epoch 10/20
Epoch 10: saving model to counter_autoencoder_weights.keras
Epoch 11/20
Epoch 11: saving model to counter_autoencoder_weights.keras
Epoch 12/20
Epoch 12: saving model to counter_autoencoder_weights.keras
Epoch 13/20
Epoch 13: saving model to counter_autoencoder_weights.keras
Epoch 14/20
Epoch 14: saving model to counter_autoencoder_weights.keras
Epoch 15/2

In [None]:
# Tabulate the values recorded by fit() for the counter run.
counter_history_df = pd.DataFrame(counter_history.history)

In [None]:
# Write the counter run's history table to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
counter_log_csv_path = output_folder_path + 'counter_training_log.csv'
counter_history_df.to_csv(counter_log_csv_path)

In [None]:
# Save the trained counter model to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
counter_model_path = output_folder_path + 'counter_autoencoder_model.keras'
counter_autoencoder_model.save(counter_model_path)

### Counter Shuffled Training

In [None]:
# counter Shuffled Model
# Same architecture and compile settings as the counter run, but fitted
# against the row-shuffled counter training targets; validation uses the
# unshuffled counter test pairs.
counter_shuffled_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
counter_shuffled_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
counter_shuffled_autoencoder_model.compile(
  loss="cosine_similarity",
  optimizer=counter_shuffled_optimizer,
  metrics=[metric_choose_argument_counter_y_train],
)

counter_shuffled_history = counter_shuffled_autoencoder_model.fit(
  x=x_train_counter,
  y=y_train_counter_shuffled,
  validation_data=(x_test_counter, y_test_counter),
  epochs=20,
  batch_size=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Tabulate the values recorded by fit() for the counter shuffled run.
counter_shuffled_history_df = pd.DataFrame(counter_shuffled_history.history)

In [None]:
# Write the counter shuffled run's history table to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
counter_shuffled_log_csv_path = output_folder_path + 'counter_shuffled_training_log.csv'
counter_shuffled_history_df.to_csv(counter_shuffled_log_csv_path)

In [None]:
# Save the trained counter shuffled model to the autoencoder dump folder.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
counter_shuffled_model_path = output_folder_path + 'counter_shuffled_autoencoder_model.keras'
counter_shuffled_autoencoder_model.save(counter_shuffled_model_path)

## Export data

In [None]:
def export_nomic_autoencoder():
  """Zip the local nomic-autoencoder folder and upload the archive to OSF.

  Archives 'current-data-dump/nomic-autoencoder' into
  'current-data-dump/nomic-autoencoder.zip' and uploads it with the `osf`
  command-line client to project 'sakjg', overwriting any existing remote
  file (--force). The client's stderr is always printed; the success message
  is printed only when the client exits with status 0, otherwise a failure
  message with the exit code is printed instead.

  Returns:
    None.
  """
  nomic_autoencoder_file_path = 'current-data-dump/nomic-autoencoder'
  # make_archive appends '.zip' itself, so the base name carries no extension.
  nomic_autoencoder_file_path_zip = 'current-data-dump/nomic-autoencoder'
  shutil.make_archive(nomic_autoencoder_file_path_zip, 'zip', nomic_autoencoder_file_path)
  print(f"Zip file created at: {nomic_autoencoder_file_path_zip}")
  # With shell=True the command is passed as a single string.
  upload_command = f"osf -p sakjg upload --force {nomic_autoencoder_file_path_zip}.zip data-dump/nomic-scifact-autoencoder/nomic_scifact_autoencoder.zip"
  result = subprocess.run(upload_command, shell=True, capture_output=True, text=True)
  print(result.stderr)
  if result.returncode != 0:
    # Do not claim success when the client reported an error.
    print(f"Upload of {nomic_autoencoder_file_path_zip}.zip failed (exit code {result.returncode})")
    return
  print(f"File: {nomic_autoencoder_file_path_zip} uploaded at osfstorage")
In [None]:
# Zip current-data-dump/nomic-autoencoder and upload the archive to OSF.
export_nomic_autoencoder()

Zip file created at: current-data-dump/nomic-autoencoder

File: current-data-dump/nomic-autoencoder uploaded at osfstorage


## Import data

In [None]:
def import_nomic_autoencoder():
  """Fetch the autoencoder archive from OSF and extract it locally.

  Runs ``osf fetch --force`` against project ``sakjg`` to download
  ``nomic_scifact_autoencoder.zip`` into the working directory, extracts it
  into ``current-data-dump/nomic-autoencoder`` (created if missing), and
  prints the resulting directory listing.

  Raises:
    RuntimeError: if the ``osf`` command exits with a non-zero status.
    FileNotFoundError: if the archive is not present after the fetch.
  """
  nomic_autoencoder_file_path_zip = 'nomic_scifact_autoencoder.zip'
  nomic_autoencoder_file_path = 'current-data-dump/nomic-autoencoder'
  result = subprocess.run(
    "osf -p sakjg fetch --force osfstorage/data-dump/nomic-scifact-autoencoder/nomic_scifact_autoencoder.zip",
    shell=True,
    capture_output=True,
    text=True,
  )
  # Fail before extraction so a stale archive from an earlier run is never
  # unpacked and reported as a fresh import.
  if result.returncode != 0:
    raise RuntimeError(f"osf fetch failed with exit code {result.returncode}: {result.stderr}")
  if not os.path.isfile(nomic_autoencoder_file_path_zip):
    raise FileNotFoundError(f"{nomic_autoencoder_file_path_zip} was not downloaded by osf fetch")
  print("nomic_autoencoder.zip successfully imported")
  os.makedirs(nomic_autoencoder_file_path, exist_ok=True)
  with zipfile.ZipFile(nomic_autoencoder_file_path_zip, 'r') as zip_ref:
    zip_ref.extractall(nomic_autoencoder_file_path)
  extracted_files = os.listdir(nomic_autoencoder_file_path)
  print("Files extracted:", extracted_files)

## Load global training history

In [None]:
# Download the archive from OSF and extract it into
# current-data-dump/nomic-autoencoder, where the cells below read the CSV logs.
import_nomic_autoencoder()

nomic_autoencoder.zip successfully imported
Files extracted: ['combined_shuffled_training_log.csv', 'pro_training_log.csv', 'counter_training_log.csv', 'counter_training_plot.png', 'x_test_pro.pkl', 'y_train_combined.pkl', 'counter_shuffled_training_log.csv', 'pro_shuffled_training_plot.png', 'y_test_combined.pkl', 'y_test_pro.pkl', 'combined_shuffled_training_plot.png', 'pro_shuffled_autoencoder_model.keras', 'combined_autoencoder_model.keras', 'pro_training_plot.png', 'y_train_counter.pkl', 'y_test_counter.pkl', 'y_train_pro.pkl', 'x_train_combined.pkl', 'combined_training_plot.png', 'all_pro_training_plot.png', 'x_train_counter.pkl', 'counter_shuffled_autoencoder_model.keras', 'x_test_combined.pkl', 'counter_shuffled_training_plot.png', 'counter_autoencoder_model.keras', 'all_combined_training_plot.png', 'x_test_counter.pkl', 'pro_autoencoder_model.keras', 'combined_shuffled_autoencoder_model.keras', 'all_counter_training_plot.png', 'combined_training_log.csv', 'x_train_pro.pkl', 'p

#### Unshuffled training history

In [None]:
# Read each unshuffled training log and reshape it to long format: one row
# per (epoch, dataset) pair, with the metric value in `accuracy`.
# 'Unnamed: 0' is presumably the index column written by to_csv; it is
# renamed to `epoch`. `shuffled` tags the rows for the combined plots.
loaded_combined_history = (
    pd.read_csv("current-data-dump/nomic-autoencoder/combined_training_log.csv")
    .melt(
        id_vars='Unnamed: 0',
        value_vars=['metric_choose_argument_combined_y_train', 'val_metric_choose_argument_combined_y_train'],
        var_name='dataset',
        value_name='accuracy',
    )
    .replace(
        ['metric_choose_argument_combined_y_train', 'val_metric_choose_argument_combined_y_train'],
        ['training set', 'validation set'],
    )
    .rename(columns={'Unnamed: 0': 'epoch'})
    .assign(shuffled=False)
)

loaded_pro_history = (
    pd.read_csv("current-data-dump/nomic-autoencoder/pro_training_log.csv")
    .melt(
        id_vars='Unnamed: 0',
        value_vars=['metric_choose_argument_pro_y_train', 'val_metric_choose_argument_pro_y_train'],
        var_name='dataset',
        value_name='accuracy',
    )
    .replace(
        ['metric_choose_argument_pro_y_train', 'val_metric_choose_argument_pro_y_train'],
        ['training set', 'validation set'],
    )
    .rename(columns={'Unnamed: 0': 'epoch'})
    .assign(shuffled=False)
)

loaded_counter_history = (
    pd.read_csv("current-data-dump/nomic-autoencoder/counter_training_log.csv")
    .melt(
        id_vars='Unnamed: 0',
        value_vars=['metric_choose_argument_counter_y_train', 'val_metric_choose_argument_counter_y_train'],
        var_name='dataset',
        value_name='accuracy',
    )
    .replace(
        ['metric_choose_argument_counter_y_train', 'val_metric_choose_argument_counter_y_train'],
        ['training set', 'validation set'],
    )
    .rename(columns={'Unnamed: 0': 'epoch'})
    .assign(shuffled=False)
)

In [None]:
# Learning curve from loaded_combined_history: accuracy per epoch, one line
# per dataset (training vs. validation), saved as a PNG in the output folder.
linetype_map = {'validation set': 'solid', 'training set': 'dashed'}

combined_training_plot = (
    ggplot(loaded_combined_history, aes(x='epoch', y='accuracy', linetype='dataset'))
    + geom_line(size=2)
    + labs(title='Learning Curve of Model Trained on Unshuffled Data', x='Epoch', y='Accuracy')
    + scale_y_continuous(limits=[0,1])
    + scale_linetype_manual(linetype_map)
    + theme(
        figure_size=[16,24],
        plot_title=element_text(size=40, lineheight=1.5, wrap=True),
        axis_title=element_text(size=32),
        axis_text=element_text(size=24),
        legend_position='bottom',
        legend_title=element_text(size=32),
        legend_text=element_text(size=24),
    )
)
combined_training_plot.save("current-data-dump/nomic-autoencoder/combined_training_plot.png")



In [None]:
# Learning curve from loaded_pro_history: accuracy per epoch, one line per
# dataset (training vs. validation), saved as a PNG in the output folder.
linetype_map = {'validation set': 'solid', 'training set': 'dashed'}

pro_training_plot = (
    ggplot(loaded_pro_history, aes(x='epoch', y='accuracy', linetype='dataset'))
    + geom_line(size=2)
    + labs(title='Learning Curve of Model Trained on Unshuffled Data (Pro-evidence Only)', x='Epoch', y='Accuracy')
    + scale_y_continuous(limits=[0,1])
    + scale_linetype_manual(linetype_map)
    + theme(
        figure_size=[16,24],
        plot_title=element_text(size=40, lineheight=1.5, wrap=True),
        axis_title=element_text(size=32),
        axis_text=element_text(size=24),
        legend_position='bottom',
        legend_title=element_text(size=32),
        legend_text=element_text(size=24),
    )
)
pro_training_plot.save("current-data-dump/nomic-autoencoder/pro_training_plot.png")



In [None]:
# Learning curve from loaded_counter_history: accuracy per epoch, one line
# per dataset (training vs. validation), saved as a PNG in the output folder.
linetype_map = {'validation set': 'solid', 'training set': 'dashed'}

counter_training_plot = (
    ggplot(loaded_counter_history, aes(x='epoch', y='accuracy', linetype='dataset'))
    + geom_line(size=2)
    + labs(title='Learning Curve of Model Trained on Unshuffled Data (Counterevidence Only)', x='Epoch', y='Accuracy')
    + scale_y_continuous(limits=[0,1])
    + scale_linetype_manual(linetype_map)
    + theme(
        figure_size=[16,24],
        plot_title=element_text(size=40, lineheight=1.5, wrap=True),
        axis_title=element_text(size=32),
        axis_text=element_text(size=24),
        legend_position='bottom',
        legend_title=element_text(size=32),
        legend_text=element_text(size=24),
    )
)
counter_training_plot.save("current-data-dump/nomic-autoencoder/counter_training_plot.png")



#### Shuffled training history

In [None]:
# Read each shuffled training log and reshape it to long format: one row per
# (epoch, dataset) pair, with the metric value in `accuracy`.
# 'Unnamed: 0' is presumably the index column written by to_csv; it is
# renamed to `epoch`. `shuffled` tags the rows for the combined plots.
loaded_combined_shuffled_history = (
    pd.read_csv("current-data-dump/nomic-autoencoder/combined_shuffled_training_log.csv")
    .melt(
        id_vars='Unnamed: 0',
        value_vars=['metric_choose_argument_combined_y_train', 'val_metric_choose_argument_combined_y_train'],
        var_name='dataset',
        value_name='accuracy',
    )
    .replace(
        ['metric_choose_argument_combined_y_train', 'val_metric_choose_argument_combined_y_train'],
        ['training set', 'validation set'],
    )
    .rename(columns={'Unnamed: 0': 'epoch'})
    .assign(shuffled=True)
)

loaded_pro_shuffled_history = (
    pd.read_csv("current-data-dump/nomic-autoencoder/pro_shuffled_training_log.csv")
    .melt(
        id_vars='Unnamed: 0',
        value_vars=['metric_choose_argument_pro_y_train', 'val_metric_choose_argument_pro_y_train'],
        var_name='dataset',
        value_name='accuracy',
    )
    .replace(
        ['metric_choose_argument_pro_y_train', 'val_metric_choose_argument_pro_y_train'],
        ['training set', 'validation set'],
    )
    .rename(columns={'Unnamed: 0': 'epoch'})
    .assign(shuffled=True)
)

loaded_counter_shuffled_history = (
    pd.read_csv("current-data-dump/nomic-autoencoder/counter_shuffled_training_log.csv")
    .melt(
        id_vars='Unnamed: 0',
        value_vars=['metric_choose_argument_counter_y_train', 'val_metric_choose_argument_counter_y_train'],
        var_name='dataset',
        value_name='accuracy',
    )
    .replace(
        ['metric_choose_argument_counter_y_train', 'val_metric_choose_argument_counter_y_train'],
        ['training set', 'validation set'],
    )
    .rename(columns={'Unnamed: 0': 'epoch'})
    .assign(shuffled=True)
)

In [None]:
# Learning curve from loaded_combined_shuffled_history: accuracy per epoch,
# one line per dataset (training vs. validation), saved as a PNG.
linetype_map = {'validation set': 'solid', 'training set': 'dashed'}

combined_shuffled_training_plot = (
    ggplot(loaded_combined_shuffled_history, aes(x='epoch', y='accuracy', linetype='dataset'))
    + geom_line(size=2)
    + labs(title='Learning Curve of Model Trained on Shuffled Data', x='Epoch', y='Accuracy')
    + scale_y_continuous(limits=[0,1])
    + scale_linetype_manual(linetype_map)
    + theme(
        figure_size=[16,24],
        plot_title=element_text(size=40, lineheight=1.5, wrap=True),
        axis_title=element_text(size=32),
        axis_text=element_text(size=24),
        legend_position='bottom',
        legend_title=element_text(size=32),
        legend_text=element_text(size=24),
    )
)
combined_shuffled_training_plot.save("current-data-dump/nomic-autoencoder/combined_shuffled_training_plot.png")



In [None]:
# Learning curve from loaded_pro_shuffled_history: accuracy per epoch, one
# line per dataset (training vs. validation), saved as a PNG.
linetype_map = {'validation set': 'solid', 'training set': 'dashed'}

pro_shuffled_training_plot = (
    ggplot(loaded_pro_shuffled_history, aes(x='epoch', y='accuracy', linetype='dataset'))
    + geom_line(size=2)
    + labs(title='Learning Curve of Model Trained on Shuffled Data (Pro-evidence Only)', x='Epoch', y='Accuracy')
    + scale_y_continuous(limits=[0,1])
    + scale_linetype_manual(linetype_map)
    + theme(
        figure_size=[16,24],
        plot_title=element_text(size=40, lineheight=1.5, wrap=True),
        axis_title=element_text(size=32),
        axis_text=element_text(size=24),
        legend_position='bottom',
        legend_title=element_text(size=32),
        legend_text=element_text(size=24),
    )
)
pro_shuffled_training_plot.save("current-data-dump/nomic-autoencoder/pro_shuffled_training_plot.png")



In [None]:
# Learning curve from loaded_counter_shuffled_history: accuracy per epoch,
# one line per dataset (training vs. validation), saved as a PNG.
linetype_map = {'validation set': 'solid', 'training set': 'dashed'}

counter_shuffled_training_plot = (
    ggplot(loaded_counter_shuffled_history, aes(x='epoch', y='accuracy', linetype='dataset'))
    + geom_line(size=2)
    + labs(title='Learning Curve of Model Trained on Shuffled Data (Counterevidence Only)', x='Epoch', y='Accuracy')
    + scale_y_continuous(limits=[0,1])
    + scale_linetype_manual(linetype_map)
    + theme(
        figure_size=[16,24],
        plot_title=element_text(size=40, lineheight=1.5, wrap=True),
        axis_title=element_text(size=32),
        axis_text=element_text(size=24),
        legend_position='bottom',
        legend_title=element_text(size=32),
        legend_text=element_text(size=24),
    )
)
counter_shuffled_training_plot.save("current-data-dump/nomic-autoencoder/counter_shuffled_training_plot.png")



## Combined Training Plots

In [None]:
# Stack each unshuffled history on top of its shuffled counterpart; the
# `shuffled` column distinguishes the two halves in the comparison plots.
all_combined_training_df = pd.concat(
    objs=[loaded_combined_history, loaded_combined_shuffled_history]
)
all_pro_training_df = pd.concat(
    objs=[loaded_pro_history, loaded_pro_shuffled_history]
)
all_counter_training_df = pd.concat(
    objs=[loaded_counter_history, loaded_counter_shuffled_history]
)

In [None]:
# Comparison plot from all_combined_training_df: accuracy per epoch, with
# line type keyed to `dataset` and colour keyed to `shuffled`; saved as a PNG.
linetype_map = {'validation set': 'solid', 'training set': 'dashed'}

all_combined_plot = (
  ggplot(all_combined_training_df, aes(x='epoch', y='accuracy', linetype='dataset', color='shuffled'))
  + geom_line(size=2)
  + labs(title='Learning Curve of Model Trained on Unshuffled vs. Within-Topic Shuffled Data', x='Epoch', y='Accuracy')
  + scale_linetype_manual(linetype_map)
  + theme(
    plot_title=element_text(size=40, wrap=True, lineheight=1.5),
    axis_title=element_text(size=32),
    axis_text=element_text(size=24),
    legend_position="bottom",
    legend_key_width=64,
    legend_title=element_text(size=32, lineheight=1.5),
    legend_text=element_text(size=24, lineheight=1.5),
    figure_size=(16,24),
  )
  # NOTE(review): this guide targets `fill`, but the plot maps only `color`
  # and `linetype` - confirm which aesthetic was intended.
  + guides(fill = guide_legend(byrow = True))
)
all_combined_plot.save("current-data-dump/nomic-autoencoder/all_combined_training_plot.png")



In [None]:
# Comparison plot from all_pro_training_df: accuracy per epoch, with line
# type keyed to `dataset` and colour keyed to `shuffled`; saved as a PNG.
linetype_map = {'validation set': 'solid', 'training set': 'dashed'}

all_pro_plot = (
  ggplot(all_pro_training_df, aes(x='epoch', y='accuracy', linetype='dataset', color='shuffled'))
  + geom_line(size=2)
  + labs(title='Learning Curve of Model Trained on Unshuffled vs. Within-Topic Shuffled Data (Pro-evidence Only)', x='Epoch', y='Accuracy')
  + scale_linetype_manual(linetype_map)
  + theme(
    plot_title=element_text(size=40, wrap=True, lineheight=1.5),
    axis_title=element_text(size=32),
    axis_text=element_text(size=24),
    legend_position="bottom",
    legend_key_width=64,
    legend_title=element_text(size=32, lineheight=1.5),
    legend_text=element_text(size=24, lineheight=1.5),
    figure_size=(16,24),
  )
  # NOTE(review): this guide targets `fill`, but the plot maps only `color`
  # and `linetype` - confirm which aesthetic was intended.
  + guides(fill = guide_legend(byrow = True))
)
all_pro_plot.save("current-data-dump/nomic-autoencoder/all_pro_training_plot.png")



In [None]:
# Comparison plot from all_counter_training_df: accuracy per epoch, with
# line type keyed to `dataset` and colour keyed to `shuffled`; saved as a PNG.
linetype_map = {'validation set': 'solid', 'training set': 'dashed'}

all_counter_plot = (
  ggplot(all_counter_training_df, aes(x='epoch', y='accuracy', linetype='dataset', color='shuffled'))
  + geom_line(size=2)
  + labs(title='Learning Curve of Model Trained on Unshuffled vs. Within-Topic Shuffled Data (Counterevidence Only)', x='Epoch', y='Accuracy')
  + scale_linetype_manual(linetype_map)
  + theme(
    plot_title=element_text(size=40, wrap=True, lineheight=1.5),
    axis_title=element_text(size=32),
    axis_text=element_text(size=24),
    legend_position="bottom",
    legend_key_width=64,
    legend_title=element_text(size=32, lineheight=1.5),
    legend_text=element_text(size=24, lineheight=1.5),
    figure_size=(16,24),
  )
  # NOTE(review): this guide targets `fill`, but the plot maps only `color`
  # and `linetype` - confirm which aesthetic was intended.
  + guides(fill = guide_legend(byrow = True))
)
all_counter_plot.save("current-data-dump/nomic-autoencoder/all_counter_training_plot.png")



## Final Export

In [None]:
# Re-upload current-data-dump/nomic-autoencoder so the plot PNGs saved above
# are included in the OSF archive.
# Requires the cell under "## Export data" that defines
# export_nomic_autoencoder to have been run in the current kernel session;
# otherwise this call raises NameError.
export_nomic_autoencoder()

NameError: name 'export_nomic_autoencoder' is not defined