<a href="https://colab.research.google.com/github/cchang-vassar/Semantic-Relations-in-Vector-Embeddings/blob/main/study2_6_%5Bnomic%5D_counterev_autoencoder_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [nomic] Autoencoder: Generate Corresponding Embedding

## Set Up

### Imports

In [None]:
!pip install tensorflow



In [None]:
import os
import subprocess
import zipfile
import shutil
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np
from google.colab import userdata
from scipy import spatial
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from plotnine import ggplot, geom_line, aes, ggsave, labs, theme, element_text, guides, guide_legend

### OSF Setup

In [None]:
!pip install osfclient

Collecting osfclient
  Downloading osfclient-0.0.5-py2.py3-none-any.whl (39 kB)
Installing collected packages: osfclient
Successfully installed osfclient-0.0.5


In [None]:
import osfclient.cli

In [None]:
from osfclient.api import OSF
from osfclient.models import Project, Storage
from io import BytesIO

In [None]:
# Read the OSF username from Colab's secret store and mirror it into the
# process environment (presumably so the osf CLI can pick it up — verify).
OSF_USERNAME = userdata.get("OSF_USERNAME")
os.environ["OSF_USERNAME"] = OSF_USERNAME

In [None]:
# Read the OSF password from Colab's secret store and mirror it into the
# process environment; the value is never printed.
OSF_PASSWORD = userdata.get("OSF_PASSWORD")
os.environ["OSF_PASSWORD"] = OSF_PASSWORD

In [None]:
# Read the OSF access token from Colab's secret store and mirror it into the
# process environment; the value is never printed.
OSF_TOKEN = userdata.get("OSF_TOKEN")
os.environ["OSF_TOKEN"] = OSF_TOKEN

In [None]:
# Read the OSF project id from Colab's secret store and mirror it into the
# process environment.
# NOTE(review): the osf commands in this notebook pass the project id literally
# (-p sakjg) instead of using this value — confirm which one is authoritative.
OSF_PROJECT_ID = userdata.get("OSF_PROJECT_ID")
os.environ["OSF_PROJECT_ID"] = OSF_PROJECT_ID

## Data

### Import corpora data from OSF

In [None]:
!osf -p sakjg fetch osfstorage/data-dump/nomic-autoencoder/nomic_evidence_embeddings_dump.zip

  0% 0.00/17.7M [00:00<?, ?bytes/s] 43% 7.55M/17.7M [00:00<00:00, 75.3Mbytes/s] 95% 16.8M/17.7M [00:00<00:00, 61.3Mbytes/s]100% 17.7M/17.7M [00:00<00:00, 65.2Mbytes/s]


In [None]:
# Unpack the downloaded embeddings archive into the working data directory.
embeddings_file_path = 'nomic_evidence_embeddings_dump.zip'
output_folder_path = 'current-data-dump/embeddings-dump'

os.makedirs(output_folder_path, exist_ok=True)
with zipfile.ZipFile(embeddings_file_path, mode='r') as archive:
  archive.extractall(path=output_folder_path)

# List what is on disk as a quick sanity check.
extracted_files = os.listdir(output_folder_path)
print("Files extracted:", extracted_files)

Files extracted: ['argument_embeddings.pkl', 'evidence_embeddings.pkl', 'bad_argument_embeddings.pkl', 'bad_evidence_embeddings.pkl', '.ipynb_checkpoints']


In [None]:
# Split sizes and column layout of the combined argument/evidence frame.
TRAIN_ROWS = 1499     # rows [0, TRAIN_ROWS) form the training split; the rest is held out
EMBEDDING_DIM = 768   # width of one embedding; equals the model's input/output size
SAMPLING_SEED = 42    # fixes which row is kept per argument so re-runs are reproducible

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
x = pd.read_pickle('current-data-dump/embeddings-dump/argument_embeddings.pkl')
y = pd.read_pickle('current-data-dump/embeddings-dump/evidence_embeddings.pkl')
x_train = x.iloc[:TRAIN_ROWS]
x_test = x.iloc[TRAIN_ROWS:]
# Presumably the evidence table's text column is also named 'argument'; it is
# renamed so the two text columns stay distinguishable after the concat.
y_train = y.iloc[:TRAIN_ROWS].rename(columns={'argument': 'evidence'})
y_test = y.iloc[TRAIN_ROWS:].rename(columns={'argument': 'evidence'})
combined_train_df = pd.concat([x_train, y_train], axis=1).reset_index(drop=True)
combined_test_df = pd.concat([x_test, y_test], axis=1).reset_index(drop=True)

# Keep one randomly chosen row per distinct argument. The seed makes the choice
# deterministic, so the saved splits and the training runs can be reproduced.
combined_train_df = combined_train_df.groupby('argument').sample(1, random_state=SAMPLING_SEED)
combined_test_df = combined_test_df.groupby('argument').sample(1, random_state=SAMPLING_SEED)

# Positional layout assumed for the combined frame (TODO confirm against the
# pickles): [text, EMBEDDING_DIM argument dims, text, EMBEDDING_DIM evidence dims].
x_columns = slice(1, 1 + EMBEDDING_DIM)
y_columns = slice(EMBEDDING_DIM + 2, 2 * EMBEDDING_DIM + 2)
x_train = combined_train_df.iloc[:, x_columns].reset_index(drop=True)
y_train = combined_train_df.iloc[:, y_columns].reset_index(drop=True)
x_test = combined_test_df.iloc[:, x_columns].reset_index(drop=True)
y_test = combined_test_df.iloc[:, y_columns].reset_index(drop=True)

# Fail early with a clear message if the column layout is not what was assumed.
assert x_train.shape == y_train.shape and x_train.shape[1] == EMBEDDING_DIM, "unexpected training split shape"
assert x_test.shape == y_test.shape and x_test.shape[1] == EMBEDDING_DIM, "unexpected test split shape"

### Shuffle global training data

In [None]:
# Permute the evidence rows so they no longer line up with their arguments;
# this is the target set used to train the shuffled model below.
# A fixed random_state keeps the permutation identical across runs.
y_train_shuffled = y_train.sample(frac=1, random_state=42).reset_index(drop=True)

### Save global training df

In [None]:
# Persist the training split locally so it is included in the exported archive.
global_x_train_folder_path = 'current-data-dump/nomic-autoencoder/'
global_x_train_file_path = f'{global_x_train_folder_path}global_x_train.pkl'
global_y_train_folder_path = 'current-data-dump/nomic-autoencoder/'
# Build the y path from the y folder (it previously reused the x folder variable).
global_y_train_file_path = f'{global_y_train_folder_path}global_y_train.pkl'
os.makedirs(global_x_train_folder_path, exist_ok=True)
os.makedirs(global_y_train_folder_path, exist_ok=True)
with open(global_x_train_file_path, 'wb') as file:
  pickle.dump(x_train, file)
# Report only after the handle is closed, i.e. once the data is fully written.
print(f"File uploaded to {global_x_train_file_path}")
with open(global_y_train_file_path, 'wb') as file:
  pickle.dump(y_train, file)
print(f"File uploaded to {global_y_train_file_path}")

File uploaded to current-data-dump/nomic-autoencoder/global_x_train.pkl
File uploaded to current-data-dump/nomic-autoencoder/global_y_train.pkl


In [None]:
# Persist the held-out split locally so it is included in the exported archive.
global_x_test_folder_path = 'current-data-dump/nomic-autoencoder/'
global_x_test_file_path = f'{global_x_test_folder_path}global_x_test.pkl'
global_y_test_folder_path = 'current-data-dump/nomic-autoencoder/'
# Build the y path from the y folder (it previously reused the x folder variable).
global_y_test_file_path = f'{global_y_test_folder_path}global_y_test.pkl'
os.makedirs(global_x_test_folder_path, exist_ok=True)
os.makedirs(global_y_test_folder_path, exist_ok=True)
with open(global_x_test_file_path, 'wb') as file:
  pickle.dump(x_test, file)
# Report only after the handle is closed, i.e. once the data is fully written.
print(f"File uploaded to {global_x_test_file_path}")
with open(global_y_test_file_path, 'wb') as file:
  pickle.dump(y_test, file)
print(f"File uploaded to {global_y_test_file_path}")

File uploaded to current-data-dump/nomic-autoencoder/global_x_test.pkl
File uploaded to current-data-dump/nomic-autoencoder/global_y_test.pkl


## Model

### Architecture

In [None]:
# Layers
# One fully connected ReLU layer followed by a linear projection; input,
# hidden and output widths are all the embedding size.
embedding_dim = 768
input_layer = tf.keras.layers.Input(shape=(embedding_dim,), name="Input")
hidden_layer = tf.keras.layers.Dense(embedding_dim, activation="relu", name="Hidden")(input_layer)
output_layer = tf.keras.layers.Dense(embedding_dim, activation="linear", name="Output")(hidden_layer)

In [None]:
# Model
# Template network; the experiments below each clone it before compiling.
autoencoder_model = tf.keras.Model(
  inputs=input_layer,
  outputs=output_layer,
)
autoencoder_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 768)]             0         
                                                                 
 Hidden (Dense)              (None, 768)               590592    
                                                                 
 Output (Dense)              (None, 768)               590592    
                                                                 
Total params: 1181184 (4.51 MB)
Trainable params: 1181184 (4.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Metric

In [None]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_global_y_train(y_true, y_pred):
  """Retrieval accuracy against the pool of training argument embeddings.

  For each row of the batch, finds the row of the notebook-level `x_train`
  with the highest cosine similarity to the predicted embedding and checks
  whether it is the same row that is most similar to the target embedding.
  Despite the `y_train` in the name, the candidate pool is `x_train`.

  Args:
    y_true: target embeddings, shape (batch, dim); dim must equal the number
      of columns of `x_train`.
    y_pred: predicted embeddings, same shape as `y_true`.

  Returns:
    Scalar float32 tensor: the fraction of rows in the batch whose predicted
    and target embeddings select the same candidate. With one example per
    batch this is 0.0 or 1.0, matching the previous match count.
  """
  # Unit-normalise every vector along the feature axis so the dot products
  # below are cosine similarities; l2_normalize is safe for all-zero rows.
  candidates = tf.math.l2_normalize(tf.cast(x_train, dtype=tf.float32), axis=1)
  pred_unit = tf.math.l2_normalize(tf.cast(y_pred, dtype=tf.float32), axis=1)
  true_unit = tf.math.l2_normalize(tf.cast(y_true, dtype=tf.float32), axis=1)

  # Shape (n_candidates, batch): one column of similarities per batch row.
  cos_sim_pred = tf.matmul(candidates, pred_unit, transpose_b=True)
  cos_sim_true = tf.matmul(candidates, true_unit, transpose_b=True)

  # Best candidate index for every batch row (argmax over the candidate axis).
  max_cos_sim_pred = tf.math.argmax(cos_sim_pred, axis=0)
  max_cos_sim_true = tf.math.argmax(cos_sim_true, axis=0)

  # Mean instead of count keeps the value in [0, 1] for any batch size.
  matches = tf.cast(tf.equal(max_cos_sim_pred, max_cos_sim_true), tf.float32)
  return tf.reduce_mean(matches)

### Global Training

In [None]:
# Global Model
# Clone the template architecture and compile it with the cosine-similarity
# loss and the retrieval metric defined above.
global_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_autoencoder_model.compile(
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_y_train],
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

In [None]:
# Save the full model after every epoch and stream per-epoch metrics to a CSV
# in the working directory.
# NOTE(review): append=True adds to an existing log file, so re-running this
# cell in the same session accumulates rows — confirm this is intended.
checkpoint_callback = ModelCheckpoint(
  filepath='global_autoencoder_weights.keras',
  save_weights_only=False,
  save_best_only=False,
  verbose=1,
)
csv_logger_callback = CSVLogger(
  filename='global_training_log.csv',
  append=True,
  separator=',',
)

# With one example per batch the custom metric is 0 or 1 per step, so its
# epoch average reads as an accuracy.
global_history = global_autoencoder_model.fit(
  x_train,
  y_train,
  epochs=100,
  batch_size=1,
  callbacks=[checkpoint_callback, csv_logger_callback],
  validation_data=(x_test, y_test),
)

Epoch 1/100
Epoch 1: saving model to global_autoencoder_weights.keras
Epoch 2/100
Epoch 2: saving model to global_autoencoder_weights.keras
Epoch 3/100
Epoch 3: saving model to global_autoencoder_weights.keras
Epoch 4/100
Epoch 4: saving model to global_autoencoder_weights.keras
Epoch 5/100
Epoch 5: saving model to global_autoencoder_weights.keras
Epoch 6/100
Epoch 6: saving model to global_autoencoder_weights.keras
Epoch 7/100
Epoch 7: saving model to global_autoencoder_weights.keras
Epoch 8/100
Epoch 8: saving model to global_autoencoder_weights.keras
Epoch 9/100
Epoch 9: saving model to global_autoencoder_weights.keras
Epoch 10/100
Epoch 10: saving model to global_autoencoder_weights.keras
Epoch 11/100
Epoch 11: saving model to global_autoencoder_weights.keras
Epoch 12/100
Epoch 12: saving model to global_autoencoder_weights.keras
Epoch 13/100
Epoch 13: saving model to global_autoencoder_weights.keras
Epoch 14/100
Epoch 14: saving model to global_autoencoder_weights.keras
Epoch 15/1

In [None]:
# One row per epoch, one column per logged loss/metric of the unshuffled run.
global_history_df = pd.DataFrame(global_history.history)

In [None]:
# Store the history with the other exported artefacts. The default index is
# written as an unnamed first column, which the plotting cells later read back
# as the epoch number.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
global_history_df.to_csv(output_folder_path + 'global_training_log.csv')

In [None]:
# Save the trained unshuffled model with the other exported artefacts.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
global_autoencoder_model.save(output_folder_path + 'global_autoencoder_model.keras')

### Global Shuffled Training

In [None]:
# Global Shuffled Model
# Same architecture and compile settings as the unshuffled model, but fitted
# on the permuted targets; validation still uses the original test pairs.
global_shuffled_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_shuffled_autoencoder_model.compile(
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_y_train],
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

global_shuffled_history = global_shuffled_autoencoder_model.fit(
  x_train,
  y_train_shuffled,
  epochs=20,
  batch_size=1,
  validation_data=(x_test, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# One row per epoch, one column per logged loss/metric of the shuffled run.
global_shuffled_history_df = pd.DataFrame(global_shuffled_history.history)

In [None]:
# Store the shuffled-run history with the other exported artefacts. The
# default index becomes the unnamed first column of the CSV.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
global_shuffled_history_df.to_csv(output_folder_path + 'global_shuffled_training_log.csv')

In [None]:
# Save the trained shuffled model with the other exported artefacts.
output_folder_path = 'current-data-dump/nomic-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
global_shuffled_autoencoder_model.save(output_folder_path + 'global_shuffled_autoencoder_model.keras')

## Export data

In [None]:
def export_nomic_autoencoder():
  """Zip the nomic-autoencoder output directory and upload the archive to OSF.

  Archives the contents of `current-data-dump/nomic-autoencoder` into
  `current-data-dump/nomic-autoencoder.zip`, then calls the `osf` CLI to
  upload it to `data-dump/nomic-evidence-autoencoder/nomic_evidence_autoencoder.zip`
  in project `sakjg` (with `--force`).

  The CLI's stderr is always printed. Success is reported only when the CLI
  exits with status 0; otherwise a failure message is printed. Nothing is
  raised for a failed upload, and the function returns None.
  """
  nomic_autoencoder_file_path = 'current-data-dump/nomic-autoencoder'
  nomic_autoencoder_file_path_zip = 'current-data-dump/nomic-autoencoder'
  # make_archive appends the ".zip" suffix itself and returns the archive path,
  # so the printed/uploaded path is the file that actually exists.
  archive_path = shutil.make_archive(nomic_autoencoder_file_path_zip, 'zip', nomic_autoencoder_file_path)
  print(f"Zip file created at: {archive_path}")

  # Argument list without a shell: no quoting pitfalls, and the one-element
  # list + shell=True combination is avoided.
  upload_command = [
    "osf", "-p", "sakjg", "upload", "--force",
    archive_path,
    "data-dump/nomic-evidence-autoencoder/nomic_evidence_autoencoder.zip",
  ]
  try:
    result = subprocess.run(upload_command, capture_output=True, text=True)
  except FileNotFoundError:
    # Without a shell a missing executable raises instead of returning 127.
    print("Upload failed: the osf command was not found")
    return

  print(result.stderr)
  if result.returncode == 0:
    print(f"File: {archive_path} uploaded at osfstorage")
  else:
    print(f"File: {archive_path} NOT uploaded (osf exit status {result.returncode})")

In [None]:
# Archive and upload everything saved under current-data-dump/nomic-autoencoder.
export_nomic_autoencoder()

Zip file created at: current-data-dump/nomic-autoencoder

File: current-data-dump/nomic-autoencoder uploaded at osfstorage


## Import data

In [None]:
def import_nomic_autoencoder():
  """Fetch the exported autoencoder archive from OSF and unpack it locally.

  Calls the `osf` CLI to fetch
  `osfstorage/data-dump/nomic-evidence-autoencoder/nomic_evidence_autoencoder.zip`
  from project `sakjg` (with `--force`), then extracts
  `nomic_evidence_autoencoder.zip` from the working directory into
  `current-data-dump/nomic-autoencoder` and prints the directory listing.

  If the fetch does not succeed a warning is printed and extraction is still
  attempted, so a previously downloaded archive can be reused; when no archive
  exists `zipfile.ZipFile` raises as before. Returns None.
  """
  nomic_autoencoder_file_path_zip = 'nomic_evidence_autoencoder.zip'
  nomic_autoencoder_file_path = 'current-data-dump/nomic-autoencoder'

  fetch_command = [
    "osf", "-p", "sakjg", "fetch", "--force",
    "osfstorage/data-dump/nomic-evidence-autoencoder/nomic_evidence_autoencoder.zip",
  ]
  try:
    fetch_status = subprocess.run(fetch_command).returncode
  except FileNotFoundError:
    # Without a shell a missing executable raises instead of returning 127.
    fetch_status = None

  # Only claim success when the CLI actually reported it.
  if fetch_status == 0:
    print(f"{nomic_autoencoder_file_path_zip} successfully imported")
  else:
    print(f"Warning: osf fetch did not succeed (status {fetch_status}); using any existing local {nomic_autoencoder_file_path_zip}")

  os.makedirs(nomic_autoencoder_file_path, exist_ok=True)
  with zipfile.ZipFile(nomic_autoencoder_file_path_zip, 'r') as zip_ref:
    zip_ref.extractall(nomic_autoencoder_file_path)
  extracted_files = os.listdir(nomic_autoencoder_file_path)
  print("Files extracted:", extracted_files)

## Load global training history

In [None]:
# Fetch and unpack the exported artefacts so the plots below read the saved logs.
import_nomic_autoencoder()

nomic_autoencoder.zip successfully imported
Files extracted: ['global_x_train.pkl', 'global_y_train.pkl', 'global_training_log.csv', 'global_shuffled_training_log.csv', 'global_autoencoder_model.keras', 'global_shuffled_autoencoder_model.keras', 'global_x_test.pkl', 'global_y_test.pkl']


#### Unshuffled training history

In [None]:
# Access training history
loaded_global_history = pd.DataFrame(pd.read_csv("current-data-dump/nomic-autoencoder/global_training_log.csv"))
loaded_global_history = pd.melt(loaded_global_history, id_vars='Unnamed: 0', value_vars=['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'], var_name='dataset', value_name='accuracy')
loaded_global_history = loaded_global_history.replace(['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'], ['training set', 'validation set'])
loaded_global_history.rename(columns = {'Unnamed: 0':'epoch'}, inplace = True)
loaded_global_history['shuffled'] = False

In [None]:
# Training vs. validation curve for the unshuffled run, saved as a PNG.
global_training_plot = (
  ggplot(loaded_global_history, aes(x='epoch', y='accuracy', linetype='dataset'))
  + geom_line()
  + labs(title='Learning Curve of Model Trained on Unshuffled Data', x='Epoch', y='Accuracy')
)
ggsave(global_training_plot, "current-data-dump/nomic-autoencoder/global_training_plot.png")



#### Shuffled training history

In [None]:
# Access training history
loaded_global_shuffled_history = pd.DataFrame(pd.read_csv("current-data-dump/nomic-autoencoder/global_shuffled_training_log.csv"))
loaded_global_shuffled_history = pd.melt(loaded_global_shuffled_history, id_vars='Unnamed: 0', value_vars=['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'], var_name='dataset', value_name='accuracy')
loaded_global_shuffled_history = loaded_global_shuffled_history.replace(['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'], ['training set', 'validation set'])
loaded_global_shuffled_history.rename(columns = {'Unnamed: 0':'epoch'}, inplace = True)
loaded_global_shuffled_history['shuffled'] = False

In [None]:
# Training vs. validation curve for the shuffled run, saved as a PNG.
global_shuffled_training_plot = (
  ggplot(loaded_global_shuffled_history, aes(x='epoch', y='accuracy', linetype='dataset'))
  + geom_line()
  + labs(title='Learning Curve of Model Trained on Shuffled Data', x='Epoch', y='Accuracy')
)
ggsave(global_shuffled_training_plot, "current-data-dump/nomic-autoencoder/global_shuffled_training_plot.png")



## Combined Training Plots

In [None]:
# Stack both long-form histories; the 'shuffled' column identifies the run.
# Row labels are kept as-is, so index values repeat across the two parts.
combined_global_training_df = pd.concat([loaded_global_history, loaded_global_shuffled_history])

In [None]:
# Overlay both learning curves: line type follows the 'dataset' column
# (training/validation) and colour follows the 'shuffled' column.
# NOTE(review): the title says "Within-Topic Shuffled", but y_train_shuffled in
# this notebook is a permutation of the whole training set — confirm wording.
# NOTE(review): guides(fill=...) addresses the fill aesthetic, which this plot
# does not map; presumably color/linetype was intended — verify.
combined_global_plot = (
  ggplot(combined_global_training_df, aes(x='epoch', y='accuracy', linetype='dataset', color='shuffled')) +
  geom_line(size=2) +
  labs(title='Learning Curve of Model Trained on Unshuffled vs. Within-Topic Shuffled Data', x='Epoch', y='Accuracy') +
  theme(
    figure_size=(16,24),
    axis_title=element_text(size=32),
    axis_text=element_text(size=24),
    legend_title=element_text(size=32, lineheight=1.5),
    legend_text=element_text(size=24, lineheight=1.5),
    plot_title=element_text(size=40, wrap=True, lineheight=1.5),
    legend_position="bottom",
    legend_key_width=64
  ) +
  guides(fill = guide_legend(byrow = True))
)
ggsave(combined_global_plot, "current-data-dump/nomic-autoencoder/combined_global_training_plot.png")



## Category and Debate Training

#### Category Data (Economy)

In [None]:
# Load the category-level embeddings for the economy topic.
# NOTE(review): the economy/ folder is not among the files listed by the
# extraction cell earlier in this notebook; presumably it is produced by
# another notebook — confirm the file exists before running.
economy_embeddings_data = pd.read_pickle("current-data-dump/embeddings-dump/economy/economy_embeddings.pkl")

In [None]:
# Build the train/test inputs and targets for the economy category.
# NOTE(review): prepare_training_df, make_x_train, make_y_train, make_x_test and
# make_y_test are not defined or imported in the visible notebook; unless they
# are provided elsewhere this cell fails with NameError on a fresh kernel.
economy_training_df = prepare_training_df(economy_embeddings_data)
economy_x_train = make_x_train(economy_training_df)
economy_y_train = make_y_train(economy_training_df)
economy_x_test = make_x_test(economy_training_df)
economy_y_test = make_y_test(economy_training_df)

#### Debate Data (Economy)

In [None]:
# Load the embeddings of a single economy debate.
# NOTE(review): the economy/ folder is not among the files listed by the
# extraction cell earlier in this notebook; presumably it is produced by
# another notebook — confirm the file exists before running.
economy_debate_embeddings_data = pd.read_pickle("current-data-dump/embeddings-dump/economy/business_economy_general_house_would_prohibit_retailers_selling_certain_items_embeddings.pkl")

In [None]:
# Build the train/test inputs and targets for the single economy debate.
# NOTE(review): prepare_training_df, make_x_train, make_y_train, make_x_test and
# make_y_test are not defined or imported in the visible notebook; unless they
# are provided elsewhere this cell fails with NameError on a fresh kernel.
economy_debate_training_df = prepare_training_df(economy_debate_embeddings_data)
economy_debate_x_train = make_x_train(economy_debate_training_df)
economy_debate_y_train = make_y_train(economy_debate_training_df)
economy_debate_x_test = make_x_test(economy_debate_training_df)
economy_debate_y_test = make_y_test(economy_debate_training_df)

#### Category Training

In [None]:
# Category Model
category_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
category_autoencoder_model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
  loss="mse",
  metrics=['accuracy']
)
category_autoencoder_model.fit(
  x=economy_x_train,
  y=economy_y_train,
  batch_size=1,
  epochs=20,
  validation_data=(economy_x_test, economy_y_test)
)

#### Debate Training

In [None]:
# Debate Model
debate_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
debate_autoencoder_model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
  loss="mse",
  metrics=['accuracy']
)
debate_autoencoder_model.fit(
  x=economy_debate_x_train,
  y=economy_debate_y_train,
  batch_size=1,
  epochs=5
)

## Final Export

In [None]:
# Export this notebook's artefacts. The exporter defined in this notebook is
# export_nomic_autoencoder; export_ada_autoencoder does not exist here.
export_nomic_autoencoder()

Zip file created at: current-data-dump/ada-autoencoder

File: current-data-dump/ada-autoencoder uploaded at osfstorage
