<a href="https://colab.research.google.com/github/cchang-vassar/Semantic-Relations-in-Vector-Embeddings/blob/main/study6_nomic_autoencoder_choose.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [nomic] Autoencoder: Choose Corresponding Embedding

Given the embedding of an argument, can a model be trained to pick the embedding of its counterargument out of a list of candidate embeddings?

## Set Up

### Imports

In [None]:
# General imports
import os
import subprocess
import zipfile
import shutil
import time
from google.colab import userdata
import pickle
import statistics
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model
from scipy import spatial
from tenacity import (
  retry,
  stop_after_attempt,
  wait_random_exponential
)

### Nomic Setup

In [None]:
!pip install nomic



In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/156.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m112.6/156.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install einops

Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0


In [None]:
import nomic
from sentence_transformers import SentenceTransformer

### OSF Setup

In [None]:
!pip install osfclient

Collecting osfclient
  Downloading osfclient-0.0.5-py2.py3-none-any.whl (39 kB)
Installing collected packages: osfclient
Successfully installed osfclient-0.0.5


In [None]:
# Read the OSF username through google.colab's `userdata` helper, then publish it
# under the same name in the process environment.
OSF_USERNAME = userdata.get("OSF_USERNAME")
os.environ["OSF_USERNAME"] = OSF_USERNAME

In [None]:
# Read the OSF password through google.colab's `userdata` helper, then publish it
# under the same name in the process environment.
OSF_PASSWORD = userdata.get("OSF_PASSWORD")
os.environ["OSF_PASSWORD"] = OSF_PASSWORD

In [None]:
# Read the OSF token through google.colab's `userdata` helper, then publish it
# under the same name in the process environment.
OSF_TOKEN = userdata.get("OSF_TOKEN")
os.environ["OSF_TOKEN"] = OSF_TOKEN

In [None]:
# Read the OSF project id through google.colab's `userdata` helper, then publish it
# under the same name in the process environment.
OSF_PROJECT_ID = userdata.get("OSF_PROJECT_ID")
os.environ["OSF_PROJECT_ID"] = OSF_PROJECT_ID

## Load Corpora Data

In [None]:
# Download the GPR corpus archive from OSF project "sakjg" and unpack it locally.
gpr_corpus_file_path_zip = 'gpr_corpus.zip'
gpr_corpus_file_path = 'corpora/gpr-corpus'
# check=True raises CalledProcessError when the osf CLI exits non-zero, so the
# success message below is only printed after a fetch that actually worked.
subprocess.run(
  ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/corpora/gpr_corpus.zip"],
  check=True,
)
print("gpr_corpus.zip successfully imported")
with zipfile.ZipFile(gpr_corpus_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(gpr_corpus_file_path)
# List what landed in the extraction folder as a quick sanity check.
extracted_files = os.listdir(gpr_corpus_file_path)
print("Files extracted:", extracted_files)

gpr_corpus.zip successfully imported
Files extracted: ['gpr_corpus', '__MACOSX']


In [None]:
# Download the EACL corpus archive from OSF project "sakjg" and unpack it locally.
eacl_corpus_file_path_zip = 'eacl_corpus.zip'
eacl_corpus_file_path = 'corpora/eacl-corpus'
# check=True raises CalledProcessError when the osf CLI exits non-zero, so the
# success message below is only printed after a fetch that actually worked.
subprocess.run(
  ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/corpora/eacl_corpus.zip"],
  check=True,
)
print("eacl_corpus.zip successfully imported")
with zipfile.ZipFile(eacl_corpus_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(eacl_corpus_file_path)
# List what landed in the extraction folder as a quick sanity check.
extracted_files = os.listdir(eacl_corpus_file_path)
print("Files extracted:", extracted_files)

eacl_corpus.zip successfully imported
Files extracted: ['eacl_corpus', '__MACOSX']


## Data

### GPR 55

In [None]:
# Load the GPR-KB-55 table from the corpus extracted under corpora/gpr-corpus.
gpr_df = pd.read_csv("corpora/gpr-corpus/gpr_corpus/GPR-KB-55/GPR-KB-55.csv")

In [None]:
# Number of embedding columns ("clustering_0", "clustering_1", ...) created per text.
DIM_EMBEDDING = 768
# Sentence-transformers handle for the nomic-embed-text-v1 checkpoint; it is the
# `model` that the embedding helpers in this notebook call `.encode` on.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

def gpr_get_embeddings(gpr_df) -> pd.DataFrame:
  """Attach "clustering" embeddings to a set of argument texts.

  Args:
    gpr_df: Either a DataFrame with an 'argument' column, or a Series / list of
      raw texts (for example ``gpr_df['claim']``).

  Returns:
    A DataFrame with a fresh 0..n-1 index holding the input column(s) followed by
    ``clustering_0`` .. ``clustering_{DIM_EMBEDDING - 1}``.
  """
  if isinstance(gpr_df, pd.DataFrame):
    source_df = gpr_df.reset_index(drop=True)
    arguments_list = list(source_df['argument'])
  else:
    # A bare column (or list) of texts: keep it as the single source column.
    texts = pd.Series(gpr_df).reset_index(drop=True)
    if texts.name is None:
      texts = texts.rename('argument')
    source_df = texts.to_frame()
    arguments_list = list(texts)

  # Every text is sent to the model with the 'clustering: ' task prefix.
  clustering_embeddings = model.encode(['clustering: ' + argument for argument in arguments_list])
  clustering_df = pd.DataFrame(clustering_embeddings, columns=[f"clustering_{i}" for i in range(DIM_EMBEDDING)])
  # Both frames share a 0..n-1 index, so the column-wise concat lines up row for row.
  return pd.concat([source_df, clustering_df], axis=1)


# The GPR data cell calls the helper under this name.
gpr_get_embeddings_df = gpr_get_embeddings

In [None]:
# Embed claims and rebuttals separately; the numeric columns of each become the
# model inputs (x) and the expected outputs (y).
gpr_claims_df = gpr_get_embeddings_df(gpr_df['claim'])
gpr_rebuttals_df = gpr_get_embeddings_df(gpr_df['rebuttal'])
gpr_x_test, gpr_y_test = (
  frame.select_dtypes(include=[np.number])
  for frame in (gpr_claims_df, gpr_rebuttals_df)
)
# Claims stacked on top of rebuttals, re-indexed 0..n-1; its numeric columns are
# the candidate pool searched by metric_choose_argument_gpr.
gpr_combined = pd.concat([gpr_claims_df, gpr_rebuttals_df]).reset_index(drop=True)
gpr_combined_nums = gpr_combined.select_dtypes(include=[np.number])

In [None]:
def metric_choose_argument_gpr(y_true, y_pred):
  """Count how often y_pred and y_true select the same row of gpr_combined_nums.

  Each input is compared against every row of `gpr_combined_nums` by cosine
  similarity; the index of the most similar row is taken for the prediction and
  for the target, and the number of positions where the two indices agree is
  returned as a tensor.
  """
  candidates = tf.cast(gpr_combined_nums, dtype=tf.float32)
  candidate_norms = tf.norm(candidates, axis=1)

  def _closest_row(query):
    # Dot products divided by the product of norms, one row per candidate.
    scale = tf.reshape(tf.norm(query) * candidate_norms, [-1, 1])
    return tf.math.argmax(tf.matmul(candidates, query, transpose_b=True) / scale)

  predicted_choice = _closest_row(y_pred)
  expected_choice = _closest_row(y_true)
  return tf.math.count_nonzero(tf.equal(predicted_choice, expected_choice))

### EACL

In [None]:
# Load the EACL claim-stance table, keeping only the topic, stance and claim-text columns.
eacl_df = pd.read_csv("corpora/eacl-corpus/eacl_corpus/claim_stance_dataset.csv").loc[
  :, ['topicId', 'topicText', 'claims.stance', 'claims.claimCorrectedText']
]

In [None]:
# Per topic: total number of claims, and how many of them are PRO / CON.
topic_lens, pro_lens, con_lens = [], [], []
for topic in eacl_df['topicId'].unique():
  topic_rows = eacl_df.loc[eacl_df['topicId'] == topic]
  stance_column = topic_rows['claims.stance']
  topic_lens.append(len(topic_rows))
  pro_lens.append(len(topic_rows[stance_column == "PRO"]))
  con_lens.append(len(topic_rows[stance_column == "CON"]))

In [None]:
DIM_EMBEDDING = 768


# NOTE(review): the long back-off looks sized for a rate-limited remote API; with a
# locally loaded model it mostly delays real errors - confirm it is still wanted.
@retry(wait=wait_random_exponential(min=60, max=500), stop=stop_after_attempt(10))
def eacl_get_embeddings(arguments: list) -> pd.DataFrame:
  """Embed a batch of argument strings.

  Args:
    arguments: The texts to embed; each is sent to the model with the
      'clustering: ' prefix.

  Returns:
    A (len(arguments) x DIM_EMBEDDING) DataFrame with a 0..n-1 index and columns
    ``clustering_0`` .. ``clustering_{DIM_EMBEDDING - 1}``. Only the embeddings
    are returned; the caller joins them back onto its own frame.
  """
  embedding_columns = [f"clustering_{i}" for i in range(DIM_EMBEDDING)]
  if len(arguments) == 0:
    # Nothing to encode: hand back an empty frame with the expected columns.
    return pd.DataFrame(columns=embedding_columns)
  clustering_embeddings = model.encode(['clustering: ' + argument for argument in arguments])
  return pd.DataFrame(clustering_embeddings, columns=embedding_columns)

In [None]:
# Maximum number of texts handed to eacl_get_embeddings per call.
API_LIMIT = 1000

def eacl_get_embeddings_df(eacl_df: pd.DataFrame) -> pd.DataFrame:
  """Return eacl_df with embedding columns appended.

  The texts in 'claims.claimCorrectedText' are embedded in chunks of at most
  API_LIMIT via `eacl_get_embeddings`, and the chunk results are stacked in order.

  Args:
    eacl_df: Frame containing a 'claims.claimCorrectedText' column.

  Returns:
    A frame with a fresh 0..n-1 index: the columns of `eacl_df` followed by the
    embedding columns, row i of the embeddings belonging to row i of the input.
  """
  arguments_list = list(eacl_df['claims.claimCorrectedText'])

  # Collect every chunk first and concatenate once, instead of re-copying a
  # growing frame on each iteration.
  embedding_chunks = [
    eacl_get_embeddings(arguments_list[start:start + API_LIMIT])
    for start in range(0, len(arguments_list), API_LIMIT)
  ]
  if embedding_chunks:
    embeddings_df = pd.concat(embedding_chunks, axis=0, ignore_index=True)
  else:
    embeddings_df = pd.DataFrame()

  # The embeddings are indexed 0..n-1, so the input must be too or the
  # column-wise concat would pair up the wrong rows.
  return pd.concat([eacl_df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# Embed every EACL claim text and keep the result alongside the original columns.
eacl_embeddings_df = eacl_get_embeddings_df(eacl_df)

In [None]:
# Keep the numeric columns, then remove topicId so that only the vector columns
# remain (assuming topicId is the only numeric non-embedding column).
eacl_nums_df = eacl_embeddings_df.select_dtypes(include='number')
eacl_vectors_df = eacl_nums_df.drop(columns='topicId')

## Autoencoder Model

### Import model from OSF

In [None]:
# Download the saved autoencoder bundle from OSF project "sakjg" and unpack it locally.
nomic_autoencoder_file_path_zip = 'nomic_autoencoder.zip'
nomic_autoencoder_file_path = 'current-data-dump/nomic-autoencoder'
# check=True raises CalledProcessError when the osf CLI exits non-zero, so the
# success message below is only printed after a fetch that actually worked.
subprocess.run(
  ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/data-dump/nomic-autoencoder/nomic_autoencoder.zip"],
  check=True,
)
print("nomic_autoencoder.zip successfully imported")
with zipfile.ZipFile(nomic_autoencoder_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(nomic_autoencoder_file_path)
# List what landed in the extraction folder as a quick sanity check.
extracted_files = os.listdir(nomic_autoencoder_file_path)
print("Files extracted:", extracted_files)

ada_autoencoder.zip successfully imported
Files extracted: ['global_training_log.csv', 'global_training_plot.png', 'global_shuffled_autoencoder_model.keras', 'global_shuffled_training_log.csv', 'global_shuffled_training_plot.png', '.ipynb_checkpoints', 'global_training_df.pkl', 'combined_global_training_plot.png', 'global_autoencoder_model.keras']


In [None]:
# Frame that metric_choose_argument_global_y_train uses as its pool of candidate rows.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code -
# only load archives that come from a trusted source.
global_training_df = pd.read_pickle('current-data-dump/nomic-autoencoder/global_training_df.pkl')

### Metric

In [None]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_global_y_train(y_true, y_pred):
  """Count how often y_pred and y_true select the same row of global_training_df.

  Each input is compared against every row of `global_training_df` by cosine
  similarity; the index of the most similar row is taken for the prediction and
  for the target, and the number of positions where the two indices agree is
  returned as a tensor.
  """
  pool = tf.cast(global_training_df, dtype=tf.float32)
  pool_norms = tf.norm(pool, axis=1)

  def _closest_row(query):
    # Dot products divided by the product of norms, one row per pool entry.
    scale = tf.reshape(tf.norm(query) * pool_norms, [-1, 1])
    return tf.math.argmax(tf.matmul(pool, query, transpose_b=True) / scale)

  predicted_choice = _closest_row(y_pred)
  expected_choice = _closest_row(y_true)
  return tf.math.count_nonzero(tf.equal(predicted_choice, expected_choice))

## Load saved model

In [None]:
# Load the saved autoencoder from the extracted nomic-autoencoder folder.
# NOTE(review): presumably the saved model references
# metric_choose_argument_global_y_train, which would be why that function is
# registered as Keras-serializable before this call - confirm.
global_autoencoder_model = tf.keras.models.load_model('current-data-dump/nomic-autoencoder/global_autoencoder_model.keras')

## GPR predict

In [None]:
# Run the autoencoder on the GPR claim embeddings and wrap the output in a frame
# whose column labels are the stringified positions "0", "1", ...
global_autoencoder_gpr_predictions = global_autoencoder_model.predict(gpr_x_test)
global_autoencoder_gpr_predictions_df = pd.DataFrame(global_autoencoder_gpr_predictions).rename(columns=str)



In [None]:
# Score each GPR pair on its own: a pair is a success when the metric reports
# exactly one match for its (target, prediction) row.
successes = 0
for i in range(len(gpr_y_test)):
  gpr_y_test_tf = tf.reshape(tf.convert_to_tensor(gpr_y_test.loc[i], dtype=tf.float32), (1, -1))
  gpr_pred_tf = tf.reshape(tf.convert_to_tensor(global_autoencoder_gpr_predictions_df.loc[i], dtype=tf.float32), (1, -1))
  successes += int(metric_choose_argument_gpr(gpr_y_test_tf, gpr_pred_tf).numpy() == 1)

In [None]:
# Share of GPR pairs counted as successes, expressed as a percentage.
gpr_success_rate = successes / len(gpr_y_test) * 100

## EACL Predict

In [None]:
# Run the autoencoder on the EACL claim embeddings and wrap the output in a frame
# whose column labels are the stringified positions "0", "1", ...
global_autoencoder_eacl_predictions = global_autoencoder_model.predict(eacl_vectors_df)
global_autoencoder_eacl_predictions_df = pd.DataFrame(global_autoencoder_eacl_predictions).rename(columns=str)



In [None]:
# Number of most-similar EACL embeddings inspected per prediction (k for tf.math.top_k).
eacl_topk = 10

In [None]:
# EACL embeddings (with their row norms) and the model's predictions as float32 tensors.
eacl_embeddings_df_32 = tf.cast(eacl_vectors_df, dtype=tf.float32)
eacl_embeddings_norm = tf.norm(eacl_embeddings_df_32, axis=1)
global_autoencoder_eacl_predictions_tf = tf.constant(global_autoencoder_eacl_predictions_df.to_numpy(), dtype=tf.float32)
# Topic and stance of each claim, in the same row order as the embeddings.
eacl_topics = eacl_embeddings_df['topicId'].tolist()
eacl_stances = eacl_embeddings_df['claims.stance'].tolist()
# Receives one top-k percentage per prediction from the evaluation loop.
pred_topk = []

In [None]:
# For every predicted vector, measure what percentage of its eacl_topk most similar
# EACL embeddings (by cosine similarity) are opposite-stance claims on the same topic.
for i, row in enumerate(global_autoencoder_eacl_predictions_tf):
  y_pred = tf.reshape(row, [1, -1])
  target_topic = eacl_topics[i]
  # A 'CON' claim looks for 'PRO' neighbours; any other stance looks for 'CON'.
  target_type = 'CON' if eacl_stances[i] != 'CON' else 'PRO'

  scale = tf.reshape((tf.norm(y_pred) * eacl_embeddings_norm), [-1, 1])
  cos_sim_pred = tf.matmul(eacl_embeddings_df_32, y_pred, transpose_b=True) / scale
  top_k_sim_pred = tf.math.top_k(tf.reshape(cos_sim_pred, [-1]), k=eacl_topk).indices.numpy()

  successes = sum(
    1
    for index in top_k_sim_pred
    if eacl_topics[index] == target_topic and eacl_stances[index] == target_type
  )
  pred_topk.append(successes / eacl_topk * 100)

In [None]:
# Mean of the per-prediction top-k percentages collected in pred_topk.
eacl_topk_success_rate = statistics.mean(pred_topk)

## Export Values

In [None]:
# Collect the headline numbers in a one-row frame. DataFrame.append was removed in
# pandas 2.0, so the row is handed to the constructor instead of being appended.
result_df = pd.DataFrame(
  [{'gpr_success_rate': gpr_success_rate, 'eacl_topk_success_rate': eacl_topk_success_rate, 'eacl_topk': eacl_topk}],
  columns=['gpr_success_rate', 'eacl_topk_success_rate', 'eacl_topk'],
)
results_folder_path = 'current-data-dump/nomic-autoencoder/nomic-autoencoder-predictions/'
os.makedirs(results_folder_path, exist_ok=True)
results_file_path = f'{results_folder_path}novel_corpora_prediction.pkl'
# Pickle the frame into the local results folder; the OSF upload happens in a later cell.
with open(results_file_path, 'wb') as file:
  pickle.dump(result_df, file)
print(f"File uploaded to {results_file_path}")

File uploaded to current_data_dump/ada_autoencoder_predictions/novel_corpora_prediction.pkl


  result_df = result_df.append({'gpr_success_rate': gpr_success_rate, 'eacl_topk_success_rate': eacl_topk_success_rate, 'eacl_topk': eacl_topk}, ignore_index=True)


In [None]:
# Push the local predictions folder to OSF project "sakjg".
nomic_autoencoder_file_path = 'current-data-dump/nomic-autoencoder/nomic-autoencoder-predictions'
# Pass the command as an argument list (no shell) and keep stderr for display.
result = subprocess.run(
  [
    "osf", "-p", "sakjg", "upload", "-r", "--force",
    f"{nomic_autoencoder_file_path}/",
    "data-dump/nomic-autoencoder/nomic-autoencoder-predictions",
  ],
  capture_output=True,
  text=True,
)
print(result.stderr)
# Only claim success when the osf CLI actually exited cleanly.
if result.returncode == 0:
  print(f"File: {nomic_autoencoder_file_path} uploaded at osfstorage")
else:
  print(f"Upload of {nomic_autoencoder_file_path} failed with exit code {result.returncode}")


File: /content/current_data_dump/ada_autoencoder_predictions uploaded at osfstorage
