<a href="https://colab.research.google.com/github/cchang-vassar/Semantic-Relations-in-Vector-Embeddings/blob/main/study10_%5Bada003%5Dautoencoder_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [ada-003] Autoencoder: Generate Corresponding Embedding

## Set Up

### Imports

In [1]:
!pip install tensorflow



In [2]:
import os
import subprocess
import zipfile
import shutil
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np
from google.colab import userdata
from scipy import spatial
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from plotnine import ggplot, geom_line, aes, ggsave, labs, theme, element_text, guides, guide_legend

### OSF Setup

In [3]:
!pip install osfclient

Collecting osfclient
  Downloading osfclient-0.0.5-py2.py3-none-any.whl (39 kB)
Installing collected packages: osfclient
Successfully installed osfclient-0.0.5


In [4]:
import osfclient.cli

In [5]:
from osfclient.api import OSF
from osfclient.models import Project, Storage
from io import BytesIO

In [6]:
# Copy the OSF username from the Colab `userdata` store into the process
# environment, then keep a module-level copy.
# presumably the `osf` CLI reads this environment variable — TODO confirm
os.environ["OSF_USERNAME"] = userdata.get("OSF_USERNAME")
OSF_USERNAME = os.environ["OSF_USERNAME"]

In [7]:
# Copy the OSF password from the Colab `userdata` store into the process
# environment, then keep a module-level copy. Never print this value.
os.environ["OSF_PASSWORD"] = userdata.get("OSF_PASSWORD")
OSF_PASSWORD = os.environ["OSF_PASSWORD"]

In [8]:
# Copy the OSF token from the Colab `userdata` store into the process
# environment, then keep a module-level copy. Never print this value.
os.environ["OSF_TOKEN"] = userdata.get("OSF_TOKEN")
OSF_TOKEN = os.environ["OSF_TOKEN"]

In [9]:
# Copy the OSF project id from the Colab `userdata` store into the process
# environment, then keep a module-level copy.
# NOTE(review): the osf commands in this notebook hard-code project 'sakjg'
# rather than using this value — confirm the two agree.
os.environ["OSF_PROJECT_ID"] = userdata.get("OSF_PROJECT_ID")
OSF_PROJECT_ID = os.environ["OSF_PROJECT_ID"]

## Data

### Import corpora data from OSF

In [10]:
!osf -p sakjg fetch osfstorage/corpora/arguana_corpus.zip

100% 121M/121M [00:05<00:00, 22.3Mbytes/s]


In [11]:
!osf -p sakjg fetch osfstorage/data-dump/ada003-autoencoder/ada_embeddings_dump.zip

100% 53.4M/53.4M [00:02<00:00, 24.8Mbytes/s]


In [12]:
# Unpack the fetched corpus archive into a local folder and list what was extracted.
corpora_file_path = 'arguana_corpus.zip'
# `output_folder_path` is reassigned by later cells; it is only valid for this cell here.
output_folder_path = 'arguana-corpus'
os.makedirs(output_folder_path, exist_ok=True)

with zipfile.ZipFile(corpora_file_path, 'r') as zip_ref:
  zip_ref.extractall(output_folder_path)

# Top-level entries of the extraction folder (not a recursive listing).
extracted_files = os.listdir(output_folder_path)
print("Files extracted:", extracted_files)

Files extracted: ['__MACOSX', 'arguana_corpus']


In [13]:
# Unpack the fetched embeddings archive; later cells read the pickles from this folder.
embeddings_dump_file_path = 'ada_embeddings_dump.zip'
# Rebinds `output_folder_path` (previously the corpus folder).
output_folder_path = 'current-data-dump/embeddings-dump'
os.makedirs(output_folder_path, exist_ok=True)

with zipfile.ZipFile(embeddings_dump_file_path, 'r') as zip_ref:
  zip_ref.extractall(output_folder_path)

# Top-level entries of the extraction folder (not a recursive listing).
extracted_files = os.listdir(output_folder_path)
print("Files extracted:", extracted_files)

Files extracted: ['global_embeddings.pkl', 'economy']


### Functions for preparing training data

In [14]:
def prepare_training_df(data: pd.DataFrame) -> pd.DataFrame:
  """Keep only rows that form adjacent 'point' -> 'counter' pairs.

  A 'point' row is kept only when the row directly after it is a 'counter',
  and a 'counter' row is kept only when the row directly before it is a
  'point'. Rows of any other type are left untouched. Adjacency is judged by
  row position, so the frame does not need a default RangeIndex.

  A 'counter' in the very first row has no preceding 'point' and is dropped
  too; keeping it would shift every later pair by one row and break the
  even/odd row split used to build the x/y sets.

  Args:
    data: Frame with a 'type' column plus the embedding columns.

  Returns:
    A new frame holding only the numeric columns of the surviving rows,
    re-indexed from 0. The input frame is not modified.
  """
  is_point = data['type'] == 'point'
  is_counter = data['type'] == 'counter'
  # Positional neighbours; the first/last row has no neighbour, hence fill_value=False.
  followed_by_counter = is_counter.shift(-1, fill_value=False)
  preceded_by_point = is_point.shift(1, fill_value=False)
  unpaired = (is_point & ~followed_by_counter) | (is_counter & ~preceded_by_point)
  # Mask by position (plain array) so index labels never take part in alignment.
  paired = data.loc[(~unpaired).to_numpy()]
  return paired.select_dtypes(include=[np.number]).reset_index(drop=True)

In [15]:
def prepare_training_df_shuffled(data: pd.DataFrame) -> pd.DataFrame:
  """Keep adjacent 'point' -> 'counter' pairs, retaining every column.

  Same row filter as `prepare_training_df`, but the non-numeric columns are
  kept so that callers can still group the rows by those columns afterwards.

  A 'point' row is kept only when the row directly after it is a 'counter',
  and a 'counter' row is kept only when the row directly before it is a
  'point'. Rows of any other type are left untouched. Adjacency is judged by
  row position. A 'counter' in the very first row has no preceding 'point'
  and is dropped too, so the kept rows stay aligned in pairs.

  Args:
    data: Frame with a 'type' column plus the embedding columns.

  Returns:
    A new frame with all original columns for the surviving rows,
    re-indexed from 0. The input frame is not modified.
  """
  is_point = data['type'] == 'point'
  is_counter = data['type'] == 'counter'
  # Positional neighbours; the first/last row has no neighbour, hence fill_value=False.
  followed_by_counter = is_counter.shift(-1, fill_value=False)
  preceded_by_point = is_point.shift(1, fill_value=False)
  unpaired = (is_point & ~followed_by_counter) | (is_counter & ~preceded_by_point)
  # Mask by position (plain array) so index labels never take part in alignment.
  return data.loc[(~unpaired).to_numpy()].reset_index(drop=True)

In [16]:
def make_x_train(data: pd.DataFrame) -> pd.DataFrame:
  """Return the even-labelled rows from the training portion of `data`.

  The training portion is the first 80% of the rows, rounded down to an even
  row count. Rows are selected by the parity of their index label, and the
  result is re-indexed from 0.
  """
  n_train = int(0.8 * data.shape[0])
  n_train -= n_train % 2  # round down to an even number of rows
  head = data.iloc[:n_train, :]
  even_labels = head.index % 2 == 0
  return head[even_labels].reset_index(drop=True)

def make_y_train(data: pd.DataFrame) -> pd.DataFrame:
  """Return the odd-labelled rows from the training portion of `data`.

  The training portion is the first 80% of the rows, rounded down to an even
  row count. Rows are selected by the parity of their index label, and the
  result is re-indexed from 0.
  """
  n_train = int(0.8 * data.shape[0])
  n_train -= n_train % 2  # round down to an even number of rows
  head = data.iloc[:n_train, :]
  odd_labels = head.index % 2 != 0
  return head[odd_labels].reset_index(drop=True)

def make_x_test(data: pd.DataFrame) -> pd.DataFrame:
  """Return the even-labelled rows from the held-out portion of `data`.

  The held-out portion is everything after the first 80% of the rows, where
  that 80% boundary is rounded down to an even row count. Rows are selected
  by the parity of their index label, and the result is re-indexed from 0.
  """
  n_train = int(0.8 * data.shape[0])
  n_train -= n_train % 2  # round down to an even number of rows
  tail = data.iloc[n_train:, :]
  even_labels = tail.index % 2 == 0
  return tail[even_labels].reset_index(drop=True)

def make_y_test(data: pd.DataFrame) -> pd.DataFrame:
  """Return the odd-labelled rows from the held-out portion of `data`.

  The held-out portion is everything after the first 80% of the rows, where
  that 80% boundary is rounded down to an even row count. Rows are selected
  by the parity of their index label, and the result is re-indexed from 0.
  """
  n_train = int(0.8 * data.shape[0])
  n_train -= n_train % 2  # round down to an even number of rows
  tail = data.iloc[n_train:, :]
  odd_labels = tail.index % 2 != 0
  return tail[odd_labels].reset_index(drop=True)

### Make global data

In [17]:
# Load the global embeddings frame extracted from the embeddings archive.
# NOTE(review): unpickling can execute arbitrary code — only load archives from a trusted source.
global_embeddings_data = pd.read_pickle("current-data-dump/embeddings-dump/global_embeddings.pkl")

In [18]:
# Numeric-only frame of adjacent point/counter rows; also read by the custom metric below.
global_training_df = prepare_training_df(global_embeddings_data)

In [19]:
# Model inputs: even-labelled rows of the first ~80% of the paired frame.
global_x_train = make_x_train(global_training_df)

In [20]:
# Model targets: odd-labelled rows of the first ~80% of the paired frame.
global_y_train = make_y_train(global_training_df)

In [21]:
# Validation inputs: even-labelled rows of the remaining ~20% of the paired frame.
global_x_test = make_x_test(global_training_df)

In [22]:
# Validation targets: odd-labelled rows of the remaining ~20% of the paired frame.
global_y_test = make_y_test(global_training_df)

In [23]:
# All target rows (train followed by test) stacked row-wise; index labels are not reset.
# NOTE(review): not referenced again in the visible part of this notebook.
global_y_train_test = pd.concat([global_y_train, global_y_test], axis=0)

### Make global data shuffled

In [24]:
# Same pair filter as above, but non-numeric columns are kept so rows can be grouped by 'topic' later.
global_training_df_shuffled = prepare_training_df_shuffled(global_embeddings_data)

In [25]:
# Training targets with their metadata columns still attached; shuffled within topic in the next cell.
global_y_train_shuffled = make_y_train(global_training_df_shuffled)

In [26]:
# Shuffle the training targets within each topic, then keep only the numeric
# (embedding) columns. Groups are emitted in first-appearance order
# (sort=False), so row i keeps the topic of input row i only if each topic's
# rows are contiguous in the data — TODO confirm.
# NOTE: this cell consumes the 'topic' column and overwrites its own input;
# re-run the previous cell before running it again.
SHUFFLE_SEED = 42  # fixed seed so the shuffled control condition is reproducible
global_y_train_shuffled = (
  global_y_train_shuffled
  .groupby(['topic'], sort=False)
  .sample(frac=1, random_state=SHUFFLE_SEED)
  .reset_index(drop=True)
  .select_dtypes(include=[np.number])
)

### Save global training df

In [28]:
# Persist the paired training frame next to the other autoencoder artefacts.
global_training_df_folder_path = 'current-data-dump/ada-autoencoder/'
global_training_df_file_path = f'{global_training_df_folder_path}global_training_df.pkl'
os.makedirs(global_training_df_folder_path, exist_ok=True)
with open(global_training_df_file_path, 'wb') as file:
  pickle.dump(global_training_df, file)
  # Despite the "uploaded" wording, this only writes a local file; nothing in this cell uploads it.
  print(f"File uploaded to {global_training_df_file_path}")

File uploaded to current-data-dump/ada-autoencoder/global_training_df.pkl


## Model

### Architecture

In [29]:
# Layers
# Input, hidden and output layers all have width 1536, so there is no bottleneck.
# presumably 1536 is the embedding dimension — TODO confirm
input_layer = tf.keras.layers.Input(shape=(1536, ), name="Input")
hidden_layer = tf.keras.layers.Dense(units=1536, activation="relu", name="Hidden")(input_layer)
# Linear output so predictions can take negative values.
output_layer = tf.keras.layers.Dense(units=1536, activation="linear", name="Output")(hidden_layer)

In [30]:
# Model
# Template architecture; the training sections below clone it rather than fit it directly.
autoencoder_model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
autoencoder_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 1536)]            0         
                                                                 
 Hidden (Dense)              (None, 1536)              2360832   
                                                                 
 Output (Dense)              (None, 1536)              2360832   
                                                                 
Total params: 4721664 (18.01 MB)
Trainable params: 4721664 (18.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Metric

In [31]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_global_y_train(y_true, y_pred):
  """Count batch entries whose prediction retrieves the same candidate as the target.

  Every row of the notebook-level `global_training_df` is a candidate. The
  prediction and the target are each scored against all candidates with a
  cosine-style similarity, and the best-scoring candidate index is taken per
  batch entry. The return value is the number of entries where both indices
  agree, so with batch_size=1 it is 0 or 1 per step.

  Note: `tf.norm(batch)` is the norm of the whole batch, not a per-row norm.
  It is a single scalar, so it does not change which candidate wins.
  """
  candidates = tf.cast(global_training_df, dtype=tf.float32)
  candidate_norms = tf.norm(candidates, axis=1)

  def best_candidate(batch):
    # Similarity matrix has shape (n_candidates, batch); argmax runs over candidates.
    scale = tf.reshape(tf.norm(batch) * candidate_norms, [-1, 1])
    similarity = tf.matmul(candidates, batch, transpose_b=True) / scale
    return tf.math.argmax(similarity)

  same_choice = tf.equal(best_candidate(y_pred), best_candidate(y_true))
  return tf.math.count_nonzero(same_choice)

### Global Training

In [32]:
# Global Model
# Clone the template so this run trains its own copy of the architecture.
global_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_autoencoder_model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
  # Keras' cosine-similarity loss; the retrieval metric below is what the plots call "accuracy".
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_y_train]
)

In [33]:
# Saves the full model (save_weights_only=False) after every epoch (save_best_only=False),
# overwriting the same file each time; the "weights" in the filename is a misnomer.
checkpoint_callback = ModelCheckpoint(filepath='global_autoencoder_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# append=True: re-running this cell adds rows to the existing CSV in the working directory.
csv_logger_callback = CSVLogger(filename='global_training_log.csv', separator=',', append=True)
global_history = global_autoencoder_model.fit(
  x=global_x_train,
  y=global_y_train,
  # One sample per step; the custom metric then yields 0 or 1 per step.
  batch_size=1,
  epochs=20,
  validation_data = (global_x_test, global_y_test),
  callbacks=[checkpoint_callback, csv_logger_callback]
)

Epoch 1/20
Epoch 1: saving model to global_autoencoder_weights.keras
Epoch 2/20
Epoch 2: saving model to global_autoencoder_weights.keras
Epoch 3/20
Epoch 3: saving model to global_autoencoder_weights.keras
Epoch 4/20
Epoch 4: saving model to global_autoencoder_weights.keras
Epoch 5/20
Epoch 5: saving model to global_autoencoder_weights.keras
Epoch 6/20
Epoch 6: saving model to global_autoencoder_weights.keras
Epoch 7/20
Epoch 7: saving model to global_autoencoder_weights.keras
Epoch 8/20
Epoch 8: saving model to global_autoencoder_weights.keras
Epoch 9/20
Epoch 9: saving model to global_autoencoder_weights.keras
Epoch 10/20
Epoch 10: saving model to global_autoencoder_weights.keras
Epoch 11/20
Epoch 11: saving model to global_autoencoder_weights.keras
Epoch 12/20
Epoch 12: saving model to global_autoencoder_weights.keras
Epoch 13/20
Epoch 13: saving model to global_autoencoder_weights.keras
Epoch 14/20
Epoch 14: saving model to global_autoencoder_weights.keras
Epoch 15/20
Epoch 15: sa

In [34]:
# Tabulate the recorded training history (loss and metric values) for export.
global_history_df = pd.DataFrame(global_history.history)

In [35]:
# Write the history table into the export folder. The frame's index is written too,
# which is why the plotting cells later read an 'Unnamed: 0' column.
output_folder_path = 'current-data-dump/ada-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
global_history_df.to_csv(f'{output_folder_path}global_training_log.csv')

In [36]:
# Save the trained unshuffled model into the export folder.
output_folder_path = 'current-data-dump/ada-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
global_autoencoder_model.save(f'{output_folder_path}global_autoencoder_model.keras')

### Global Shuffled Training

In [37]:
# Global Shuffled Model
# Control run: same architecture, optimizer, loss and metric as the global model,
# but trained against the within-topic shuffled targets.
global_shuffled_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_shuffled_autoencoder_model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
  loss="cosine_similarity",
  metrics=[metric_choose_argument_global_y_train]
)

# Validation still uses the unshuffled test pairs; no checkpoint or CSV callbacks are attached here.
global_shuffled_history = global_shuffled_autoencoder_model.fit(
  x=global_x_train,
  y=global_y_train_shuffled,
  batch_size=1,
  epochs=20,
  validation_data = (global_x_test, global_y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [44]:
# Tabulate the shuffled run's recorded training history for export.
global_shuffled_history_df = pd.DataFrame(global_shuffled_history.history)

In [45]:
# Write the shuffled run's history table (index included) into the export folder.
output_folder_path = 'current-data-dump/ada-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
global_shuffled_history_df.to_csv(f'{output_folder_path}global_shuffled_training_log.csv')

In [46]:
# Save the trained shuffled-control model into the export folder.
output_folder_path = 'current-data-dump/ada-autoencoder/'
os.makedirs(output_folder_path, exist_ok=True)
global_shuffled_autoencoder_model.save(f'{output_folder_path}global_shuffled_autoencoder_model.keras')

## Export data

In [47]:
def export_ada_autoencoder():
  """Zip the ada-autoencoder directory and upload the archive to OSF.

  The archive is written next to the directory as
  'current-data-dump/ada-autoencoder.zip' and uploaded with the `osf` CLI to
  'data-dump/ada003-autoencoder/ada_autoencoder.zip' in project 'sakjg',
  overwriting any existing remote file (--force).

  Success is reported only when the CLI exits with status 0. If the CLI
  cannot be started or exits with an error, the failure is printed and the
  function returns without raising. Returns None in every case.
  """
  ada_autoencoder_file_path = 'current-data-dump/ada-autoencoder'
  # make_archive appends '.zip' itself and returns the path of the file it wrote.
  archive_path = shutil.make_archive(ada_autoencoder_file_path, 'zip', ada_autoencoder_file_path)
  print(f"Zip file created at: {archive_path}")
  # Argument list without a shell: no quoting issues, and the exit status is the CLI's own.
  command = [
    "osf", "-p", "sakjg", "upload", "--force",
    archive_path, "data-dump/ada003-autoencoder/ada_autoencoder.zip",
  ]
  try:
    result = subprocess.run(command, capture_output=True, text=True)
  except OSError as error:
    # e.g. the osf client is not installed in this environment
    print(f"Could not run the osf client: {error}")
    return
  print(result.stderr)
  if result.returncode != 0:
    print(f"Upload of {archive_path} failed with exit code {result.returncode}")
    return
  print(f"File: {archive_path} uploaded at osfstorage")

## Import data

In [48]:
def import_ada_autoencoder():
  """Fetch ada_autoencoder.zip from OSF and unpack it into the export folder.

  Runs `osf fetch --force` for project 'sakjg', which writes
  'ada_autoencoder.zip' into the working directory, then extracts it into
  'current-data-dump/ada-autoencoder' and prints the extracted entries.

  Raises:
    RuntimeError: if the `osf` CLI exits with a non-zero status. The archive
      is not extracted in that case, so a stale local zip is never unpacked
      by mistake.
  """
  ada_autoencoder_file_path_zip = 'ada_autoencoder.zip'
  ada_autoencoder_file_path = 'current-data-dump/ada-autoencoder'
  # Argument list without a shell; output is captured so stderr can be reported on failure.
  result = subprocess.run(
    ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/data-dump/ada003-autoencoder/ada_autoencoder.zip"],
    capture_output=True,
    text=True,
  )
  if result.returncode != 0:
    raise RuntimeError(
      f"osf fetch failed with exit code {result.returncode}: {result.stderr.strip()}"
    )
  print("ada_autoencoder.zip successfully imported")
  os.makedirs(ada_autoencoder_file_path, exist_ok=True)
  with zipfile.ZipFile(ada_autoencoder_file_path_zip, 'r') as zip_ref:
    zip_ref.extractall(ada_autoencoder_file_path)
  extracted_files = os.listdir(ada_autoencoder_file_path)
  print("Files extracted:", extracted_files)

## Load global training history

In [57]:
# Pull the previously exported artefacts so the plotting cells can run without retraining.
# NOTE(review): extraction overwrites same-named files already in the export folder.
import_ada_autoencoder()

ada_autoencoder.zip successfully imported
Files extracted: ['global_autoencoder_model.keras', 'global_shuffled_autoencoder_model.keras', 'global_shuffled_training_log.csv', 'global_shuffled_training_plot.png', 'global_training_plot.png', 'combined_global_training_plot.png', 'global_training_df.pkl', 'global_training_log.csv']


#### Unshuffled training history

In [50]:
# Access training history
# Long format: one row per (epoch, dataset) holding the retrieval accuracy of
# the unshuffled run, tagged with shuffled=False for the combined plot.
loaded_global_history = (
  pd.read_csv("current-data-dump/ada-autoencoder/global_training_log.csv")
  .melt(
    id_vars='Unnamed: 0',
    value_vars=['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'],
    var_name='dataset',
    value_name='accuracy',
  )
  .replace(
    ['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'],
    ['training set', 'validation set'],
  )
  # 'Unnamed: 0' is the index column that was written alongside the history.
  .rename(columns={'Unnamed: 0': 'epoch'})
  .assign(shuffled=False)
)

In [51]:
# Learning curve of the unshuffled run: one line per dataset (training vs validation).
global_training_plot = ggplot(loaded_global_history, aes(x='epoch', y='accuracy', linetype='dataset')) + geom_line() + labs(title='Learning Curve of Model Trained on Unshuffled Data', x='Epoch', y='Accuracy')
# Written into the export folder so it is included in the exported archive.
ggsave(global_training_plot, "current-data-dump/ada-autoencoder/global_training_plot.png")



#### Shuffled training history

In [52]:
# Access training history
# Long format: one row per (epoch, dataset) holding the retrieval accuracy of
# the run trained on within-topic shuffled targets.
loaded_global_shuffled_history = pd.read_csv("current-data-dump/ada-autoencoder/global_shuffled_training_log.csv")
loaded_global_shuffled_history = pd.melt(loaded_global_shuffled_history, id_vars='Unnamed: 0', value_vars=['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'], var_name='dataset', value_name='accuracy')
loaded_global_shuffled_history = loaded_global_shuffled_history.replace(['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'], ['training set', 'validation set'])
# 'Unnamed: 0' is the index column that was written alongside the history.
loaded_global_shuffled_history = loaded_global_shuffled_history.rename(columns={'Unnamed: 0': 'epoch'})
# This frame is the shuffled run. The combined plot colours lines by this flag,
# so it must be True here to tell the two runs apart.
loaded_global_shuffled_history['shuffled'] = True

In [53]:
# Learning curve of the shuffled-control run: one line per dataset (training vs validation).
global_shuffled_training_plot = ggplot(loaded_global_shuffled_history, aes(x='epoch', y='accuracy', linetype='dataset')) + geom_line() + labs(title='Learning Curve of Model Trained on Shuffled Data', x='Epoch', y='Accuracy')
# Written into the export folder so it is included in the exported archive.
ggsave(global_shuffled_training_plot, "current-data-dump/ada-autoencoder/global_shuffled_training_plot.png")



## Combined Training Plots

In [54]:
# Stack both long-format histories; the 'shuffled' column identifies which run a row came from.
# Index labels repeat after the concat because they are not reset.
combined_global_training_df = pd.concat([loaded_global_history, loaded_global_shuffled_history])

In [55]:
# Poster-sized comparison plot: linetype separates training/validation and
# colour separates the runs via the 'shuffled' column.
combined_global_plot = (
  ggplot(combined_global_training_df, aes(x='epoch', y='accuracy', linetype='dataset', color='shuffled')) +
  geom_line(size=2) +
  labs(title='Learning Curve of Model Trained on Unshuffled vs. Within-Topic Shuffled Data', x='Epoch', y='Accuracy') +
  # Large figure and font sizes — presumably sized for a poster; TODO confirm.
  theme(
    figure_size=(16,24),
    axis_title=element_text(size=32),
    axis_text=element_text(size=24),
    legend_title=element_text(size=32, lineheight=1.5),
    legend_text=element_text(size=24, lineheight=1.5),
    plot_title=element_text(size=40, wrap=True, lineheight=1.5),
    legend_position="bottom",
    legend_key_width=64
  ) +
  # NOTE(review): this plot maps linetype and color, not fill, so this guide
  # likely has no visible effect — confirm whether color/linetype was intended.
  guides(fill = guide_legend(byrow = True))
)
ggsave(combined_global_plot, "current-data-dump/ada-autoencoder/combined_global_training_plot.png")



## Category and Debate Training

#### Category Data (Economy)

In [None]:
# Load the category-level embeddings frame for the 'economy' folder.
# NOTE(review): unpickling can execute arbitrary code — only load archives from a trusted source.
economy_embeddings_data = pd.read_pickle("current-data-dump/embeddings-dump/economy/economy_embeddings.pkl")

In [None]:
# Same pipeline as the global data: filter to point/counter pairs, then split
# into even-labelled inputs and odd-labelled targets for train (~80%) and test.
economy_training_df = prepare_training_df(economy_embeddings_data)
economy_x_train = make_x_train(economy_training_df)
economy_y_train = make_y_train(economy_training_df)
economy_x_test = make_x_test(economy_training_df)
economy_y_test = make_y_test(economy_training_df)

#### Debate Data (Economy)

In [None]:
# Load the embeddings frame of a single debate from the 'economy' folder.
# NOTE(review): unpickling can execute arbitrary code — only load archives from a trusted source.
economy_debate_embeddings_data = pd.read_pickle("current-data-dump/embeddings-dump/economy/business_economy_general_house_would_prohibit_retailers_selling_certain_items_embeddings.pkl")

In [None]:
# Same pipeline as the global data, applied to the single-debate frame.
economy_debate_training_df = prepare_training_df(economy_debate_embeddings_data)
economy_debate_x_train = make_x_train(economy_debate_training_df)
economy_debate_y_train = make_y_train(economy_debate_training_df)
# The debate training cell below passes no validation_data, so it never uses this test split.
economy_debate_x_test = make_x_test(economy_debate_training_df)
economy_debate_y_test = make_y_test(economy_debate_training_df)

#### Category Training

In [None]:
# Category Model
# Same architecture as the global model, trained on the economy category only.
category_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
category_autoencoder_model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
  # NOTE(review): the global runs use loss="cosine_similarity" and the custom
  # retrieval metric; this cell uses MSE and the built-in 'accuracy', so its
  # numbers are not comparable with the global runs — confirm this is intended.
  loss="mse",
  metrics=['accuracy']
)
category_autoencoder_model.fit(
  x=economy_x_train,
  y=economy_y_train,
  batch_size=1,
  epochs=20,
  validation_data=(economy_x_test, economy_y_test)
)

#### Debate Training

In [None]:
# Debate Model
# Same architecture as the global model, trained on a single debate.
debate_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
debate_autoencoder_model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
  # NOTE(review): MSE + built-in 'accuracy' differs from the global runs'
  # cosine-similarity loss and custom metric — confirm this is intended.
  loss="mse",
  metrics=['accuracy']
)
# No validation_data is passed and only 5 epochs are run, unlike the category run.
debate_autoencoder_model.fit(
  x=economy_debate_x_train,
  y=economy_debate_y_train,
  batch_size=1,
  epochs=5
)

## Final Export

In [56]:
# Zip the export folder (models, logs, plots, training frame) and upload it to OSF storage.
export_ada_autoencoder()

Zip file created at: current-data-dump/ada-autoencoder

File: current-data-dump/ada-autoencoder uploaded at osfstorage
