<a href="https://colab.research.google.com/github/cchang-vassar/Semantic-Relations-in-Vector-Embeddings/blob/main/study3_2_%5Bada003%5Dautoencoder_choose.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [ada-003] Autoencoder: Choose Corresponding Embedding

Given the embedding of an argument, can a model be trained to pick the embedding of its counterargument out of a list of candidate embeddings?

## Set Up

### Imports

In [None]:
# General imports
import os
import subprocess
import zipfile
import shutil
import time
from google.colab import userdata
import pickle
import statistics
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model
from scipy import spatial
from tenacity import (
  retry,
  stop_after_attempt,
  wait_random_exponential
)

### OpenAI Setup

In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.25.1-py3-none-any.whl (312 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/312.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/312.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m307.2/312.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.9/312.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m2.5 MB/s[

In [None]:
import openai
from openai import OpenAI
# Copy the API key from the Colab secret store into the process environment.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [None]:
# Shared OpenAI client; every embedding helper in this notebook calls client.embeddings.create.
client = OpenAI()

### OSF Setup

In [None]:
!pip install osfclient

Collecting osfclient
  Downloading osfclient-0.0.5-py2.py3-none-any.whl (39 kB)
Installing collected packages: osfclient
Successfully installed osfclient-0.0.5


In [None]:
# Read the secret once, export it to the environment and keep a module-level copy.
OSF_USERNAME = userdata.get("OSF_USERNAME")
os.environ["OSF_USERNAME"] = OSF_USERNAME

In [None]:
# Read the secret once, export it to the environment and keep a module-level copy.
OSF_PASSWORD = userdata.get("OSF_PASSWORD")
os.environ["OSF_PASSWORD"] = OSF_PASSWORD

In [None]:
# Read the secret once, export it to the environment and keep a module-level copy.
OSF_TOKEN = userdata.get("OSF_TOKEN")
os.environ["OSF_TOKEN"] = OSF_TOKEN

In [None]:
# Read the secret once, export it to the environment and keep a module-level copy.
OSF_PROJECT_ID = userdata.get("OSF_PROJECT_ID")
os.environ["OSF_PROJECT_ID"] = OSF_PROJECT_ID

## Load Corpora Data

### GPR

In [None]:
# Download the GPR corpus archive from OSF storage and unpack it locally.
gpr_corpus_file_path_zip = 'gpr_corpus.zip'
gpr_corpus_file_path = 'corpora/gpr-corpus'
# check=True raises CalledProcessError when the fetch fails, so the success
# message is only printed after the download actually worked.
subprocess.run(
  ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/corpora/gpr_corpus.zip"],
  check=True,
)
print("gpr_corpus.zip successfully imported")
with zipfile.ZipFile(gpr_corpus_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(gpr_corpus_file_path)
extracted_files = os.listdir(gpr_corpus_file_path)
print("Files extracted:", extracted_files)

gpr_corpus.zip successfully imported
Files extracted: ['gpr_corpus', '__MACOSX']


### EACL

In [None]:
# Download the EACL corpus archive from OSF storage and unpack it locally.
eacl_corpus_file_path_zip = 'eacl_corpus.zip'
eacl_corpus_file_path = 'corpora/eacl-corpus'
# check=True raises CalledProcessError when the fetch fails, so the success
# message is only printed after the download actually worked.
subprocess.run(
  ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/corpora/eacl_corpus.zip"],
  check=True,
)
print("eacl_corpus.zip successfully imported")
with zipfile.ZipFile(eacl_corpus_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(eacl_corpus_file_path)
extracted_files = os.listdir(eacl_corpus_file_path)
print("Files extracted:", extracted_files)

eacl_corpus.zip successfully imported
Files extracted: ['__MACOSX', 'eacl_corpus']


### persuade_corpus

In [None]:
# Download the PERSUADE corpus archive from OSF storage and unpack it locally.
persuade_corpus_file_path_zip = 'persuade_corpus.zip'
persuade_corpus_file_path = 'corpora/persuade-corpus'
# check=True raises CalledProcessError when the fetch fails, so the success
# message is only printed after the download actually worked.
subprocess.run(
  ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/corpora/persuade_corpus.zip"],
  check=True,
)
print("persuade_corpus.zip successfully imported")
with zipfile.ZipFile(persuade_corpus_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(persuade_corpus_file_path)
extracted_files = os.listdir(persuade_corpus_file_path)
print("Files extracted:", extracted_files)

persuade_corpus.zip successfully imported
Files extracted: ['persuade_corpus', '__MACOSX']


### SciFact

In [None]:
# Download the SciFact corpus archive from OSF storage and unpack it locally.
scifact_corpus_file_path_zip = 'scifact_corpus.zip'
scifact_corpus_file_path = 'corpora/scifact-corpus'
# check=True raises CalledProcessError when the fetch fails, so the success
# message is only printed after the download actually worked.
subprocess.run(
  ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/corpora/scifact_corpus.zip"],
  check=True,
)
print("scifact_corpus.zip successfully imported")
with zipfile.ZipFile(scifact_corpus_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(scifact_corpus_file_path)
extracted_files = os.listdir(scifact_corpus_file_path)
print("Files extracted:", extracted_files)

scifact_corpus.zip successfully imported
Files extracted: ['__MACOSX', 'scifact_corpus']


## Data

### GPR 55

In [None]:
# Load the GPR-KB-55 table; its 'claim' and 'rebuttal' columns are embedded below.
gpr_df = pd.read_csv("corpora/gpr-corpus/gpr_corpus/GPR-KB-55/GPR-KB-55.csv")

In [None]:
DIM_EMBEDDING = 1536

def gpr_get_embeddings_df(gpr_df: pd.DataFrame) -> pd.DataFrame:
  """Embed the texts in `gpr_df` and return them side by side with their embeddings.

  Args:
    gpr_df: The texts to embed. `list(gpr_df)` is sent to the API unchanged;
      this notebook passes a single text column (e.g. ``gpr_df['claim']``),
      for which that is the list of its values.

  Returns:
    `gpr_df` (re-indexed 0..n-1) followed by DIM_EMBEDDING columns named
    "0", "1", ... holding the `text-embedding-ada-002` embedding of each text.
  """
  arguments_list = list(gpr_df)

  # All texts go out in one request; no batching or retry is done here.
  claims_embeddings = client.embeddings.create(input=arguments_list, model="text-embedding-ada-002")
  claims_embeddings_data = [item.embedding for item in claims_embeddings.data]
  claims_embeddings_df = pd.DataFrame(claims_embeddings_data, columns=[str(i) for i in range(DIM_EMBEDDING)])

  # axis=1 concat aligns on index labels, so give the input the same 0..n-1
  # index as the freshly built embedding frame to pair rows by position.
  return pd.concat([gpr_df.reset_index(drop=True), claims_embeddings_df], axis=1)

In [None]:
# Embed claims and rebuttals separately (two API calls).
gpr_claims_df = gpr_get_embeddings_df(gpr_df['claim'])
gpr_rebuttals_df = gpr_get_embeddings_df(gpr_df['rebuttal'])

# The numeric columns are the embedding dimensions; the text column is dropped.
gpr_x_test = gpr_claims_df.select_dtypes(include=[np.number])
gpr_y_test = gpr_rebuttals_df.select_dtypes(include=[np.number])

# Candidate pool for the metric: claims stacked on top of rebuttals.
gpr_combined = pd.concat([gpr_claims_df, gpr_rebuttals_df], ignore_index=True)
gpr_combined_nums = gpr_combined.select_dtypes(include=[np.number])

In [None]:
# Character length of the shortest rebuttal.
min(len(rebuttal) for rebuttal in gpr_rebuttals_df['rebuttal'])

73

In [None]:
def metric_choose_argument_gpr(y_true, y_pred):
  """Count predictions that select the same GPR candidate as their target.

  Every row of `gpr_combined_nums` (claims and rebuttals) is a candidate.
  `y_pred` and `y_true` are each scored against all candidates and the
  best-scoring candidate index is taken (tf.math.argmax without an axis).
  The return value is the number of positions where both indices agree.
  """
  candidates = tf.cast(gpr_combined_nums, dtype=tf.float32)
  candidate_norms = tf.norm(candidates, axis=1)

  def _best_candidate(vectors):
    # tf.norm(vectors) has no axis, so it is a single scalar for the whole
    # tensor; being positive it rescales the scores without changing the argmax.
    scale = tf.reshape(tf.norm(vectors) * candidate_norms, [-1, 1])
    scores = tf.matmul(candidates, vectors, transpose_b=True) / scale
    return tf.math.argmax(scores)

  agreement = tf.equal(_best_candidate(y_pred), _best_candidate(y_true))
  return tf.math.count_nonzero(agreement)

### EACL

In [None]:
# Load the claim-stance dataset and keep only the columns used in this notebook.
eacl_columns = ['topicId', 'topicText', 'claims.stance', 'claims.claimCorrectedText']
eacl_df = pd.read_csv("corpora/eacl-corpus/eacl_corpus/claim_stance_dataset.csv")[eacl_columns]

In [None]:
# Per topic: total number of claims and how many of them are PRO / CON.
topic_lens, pro_lens, con_lens = [], [], []
for topic in eacl_df['topicId'].unique():
  stances = eacl_df.loc[eacl_df['topicId'] == topic, 'claims.stance']
  topic_lens.append(len(stances))
  pro_lens.append(int((stances == "PRO").sum()))
  con_lens.append(int((stances == "CON").sum()))

In [None]:
DIM_EMBEDDING = 1536

@retry(wait=wait_random_exponential(min=60, max=500), stop=stop_after_attempt(10))
def eacl_get_embeddings(arguments: list) -> pd.DataFrame:
  """Embed a batch of texts with `text-embedding-ada-002`.

  The call is retried (random exponential back-off, at most 10 attempts)
  whenever anything inside this function raises.

  Args:
    arguments: The texts to embed in a single API request.

  Returns:
    A DataFrame with one row per text and DIM_EMBEDDING columns named
    "0", "1", ..., indexed 0..len(arguments)-1.
  """
  embeddings = client.embeddings.create(input=arguments, model="text-embedding-ada-002")
  embeddings_data = [item.embedding for item in embeddings.data]
  # A frame built from a list of rows already has a 0..n-1 index.
  return pd.DataFrame(embeddings_data, columns=[str(i) for i in range(DIM_EMBEDDING)])

In [None]:
API_LIMIT = 1000

def eacl_get_embeddings_df(eacl_df: pd.DataFrame) -> pd.DataFrame:
  """Append the embedding of each claim to `eacl_df`.

  Args:
    eacl_df: Frame with a 'claims.claimCorrectedText' column holding the texts.

  Returns:
    `eacl_df` (re-indexed 0..n-1) followed by the embedding columns returned
    by `eacl_get_embeddings`, one row per claim.
  """
  arguments_list = list(eacl_df['claims.claimCorrectedText'])

  # Request embeddings API_LIMIT texts at a time and stitch the chunks together
  # once, instead of re-copying a growing frame on every iteration.
  chunks = [
    eacl_get_embeddings(arguments_list[start:start + API_LIMIT])
    for start in range(0, len(arguments_list), API_LIMIT)
  ]
  embeddings_df = pd.concat(chunks, axis=0, ignore_index=True) if chunks else pd.DataFrame()

  # axis=1 concat aligns on index labels; reset the input index so rows are
  # paired by position even if eacl_df was filtered beforehand.
  return pd.concat([eacl_df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# Embed every EACL claim (OpenAI API calls in chunks of API_LIMIT texts).
eacl_embeddings_df = eacl_get_embeddings_df(eacl_df)

In [None]:
# Numeric columns are the embedding dimensions plus topicId; drop the latter
# so only the embedding vectors remain.
eacl_nums_df = eacl_embeddings_df.select_dtypes(include=[np.number])
eacl_vectors_df = eacl_nums_df.drop(columns='topicId')

### Persuade Corpus

In [None]:
# Paths are relative to the working directory, matching where the archive was
# extracted ('corpora/persuade-corpus') instead of assuming Colab's /content.
persuade_corpus = pd.read_csv('corpora/persuade-corpus/persuade_corpus/persuade_corpus_1.0.csv')
persuade_source = pd.read_csv('corpora/persuade-corpus/persuade_corpus/persuade_2.0_human_scores_demo_id_github.csv')

In [None]:
# Keep only the "Evidence" discourse elements and drop columns not used below.
persuade_is_evidence = persuade_corpus['discourse_type'] == "Evidence"
persuade_corpus_unused_columns = ['competition_set', 'full_text', 'discourse_id', 'discourse_start', 'discourse_end', 'discourse_type_num']
persuade_corpus = persuade_corpus[persuade_is_evidence].drop(columns=persuade_corpus_unused_columns)

In [None]:
# Drop the columns that are not used in this notebook; 'essay_id_comp' and
# 'prompt_name' are read in the next cell.
persuade_source_unused_columns = [
  'full_text', 'holistic_essay_score', 'word_count', 'task', 'assignment',
  'source_text', 'gender', 'grade_level', 'ell_status', 'race_ethnicity',
  'economically_disadvantaged', 'student_disability_status',
]
persuade_source = persuade_source.drop(columns=persuade_source_unused_columns)

In [None]:
# Pair each essay id with the name of the prompt it answers (lazy zip; turned into a dict next).
persuade_source_dict = zip(persuade_source['essay_id_comp'], persuade_source['prompt_name'])

In [None]:
# Materialise the pairs as a lookup: essay_id_comp -> prompt_name.
persuade_source_dict = dict(persuade_source_dict)

In [None]:
# Tag every evidence row with its essay's prompt name; "" marks essays with no known prompt.
persuade_corpus['argument'] = [persuade_source_dict.get(essay_id, "") for essay_id in persuade_corpus['essay_id_comp']]

In [None]:
# Discard evidence whose essay had no prompt name ("" was used as the placeholder above).
persuade_corpus = persuade_corpus[persuade_corpus['argument'] != ""]

In [None]:
# Remove the helper columns and rename the text column to 'evidence'.
persuade_corpus = (
  persuade_corpus
  .drop(columns=['discourse_type', 'essay_id_comp'])
  .rename(columns={'discourse_text': 'evidence'})
)

In [None]:
# Rows were filtered above; renumber 0..n-1 so the frame lines up with the
# embedding frame it is concatenated with (axis=1) later.
persuade_corpus = persuade_corpus.reset_index(drop=True)

In [None]:
# Maps each prompt name to a hand-written argument statement for that prompt.
# The values are the texts that get embedded below, so their exact wording
# (including spelling) affects the resulting embeddings.
persuade_argument_dict = {
    'Phones and driving': 'Drivers should not be allowed to use phones while driving',
    'Car-free cities': 'We should develop cities to be car-free from now on',
    'Summer projects': 'Summer projects are valuable learning opportunities for students',
    '"A Cowboy Who Rode the Waves"': "The Seagoing Cowboys' work is adventurous, meaningful and transformative",
    'Mandatory extracurricular activities': "Extracurricular activities play an irreplacable role in students' education",
    'Exploring Venus': 'Venus is a challenging but rewarding planet to explore',
    'Facial action coding system': 'Having a large-scale software that analyzes and codifies human facial expressions is pointless',
    'The Face on Mars': 'The face on Mars suggests alien activity in the universe',
    'Community service': 'Doing community service is important to both societal and personal benefit',
    'Grades for extracurricular activities': 'Extracurricular activities should not be graded',
    'Driverless cars': 'Driverless cars are the future and should be fully embraced',
    'Does the electoral college work?': 'The electoral college does not work',
    'Cell phones at school': 'Students should be allowed to bring cell phones to school',
    'Distance learning': 'Online classes are the bane of real education',
    'Seeking multiple opinions': 'It is always a good idea to seek the opinions of multiple people'
}

In [None]:
# Replace each prompt name with its argument statement. A prompt name missing
# from persuade_argument_dict raises KeyError.
persuade_argument_statements = []
for prompt_name in persuade_corpus['argument']:
  persuade_argument_statements.append(persuade_argument_dict[prompt_name])
persuade_corpus['argument'] = persuade_argument_statements

In [None]:
# One row per argument statement, in the insertion order of persuade_argument_dict.
persuade_arguments_df = pd.DataFrame(persuade_argument_dict.values(), columns=['argument'])

In [None]:
DIM_EMBEDDING = 1536

@retry(wait=wait_random_exponential(min=60, max=500), stop=stop_after_attempt(10))
def persuade_get_embeddings(arguments: list) -> pd.DataFrame:
  """Embed a batch of texts with `text-embedding-ada-002`.

  The call is retried (random exponential back-off, at most 10 attempts)
  whenever anything inside this function raises.

  Args:
    arguments: The texts to embed in a single API request.

  Returns:
    A DataFrame with one row per text and DIM_EMBEDDING columns named
    "0", "1", ..., indexed 0..len(arguments)-1.
  """
  embeddings = client.embeddings.create(input=arguments, model="text-embedding-ada-002")
  embeddings_data = [item.embedding for item in embeddings.data]
  # A frame built from a list of rows already has a 0..n-1 index.
  return pd.DataFrame(embeddings_data, columns=[str(i) for i in range(DIM_EMBEDDING)])

In [None]:
API_LIMIT = 1000

def persuade_get_embeddings_df(persuade_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
  """Append the embedding of each text in `persuade_df[column_name]` to the frame.

  Args:
    persuade_df: Frame holding the texts.
    column_name: Name of the column whose values are embedded.

  Returns:
    `persuade_df` (re-indexed 0..n-1) followed by the embedding columns
    returned by `persuade_get_embeddings`, one row per text.
  """
  arguments_list = list(persuade_df[column_name])

  # Request embeddings API_LIMIT texts at a time and stitch the chunks together
  # once, instead of re-copying a growing frame on every iteration.
  chunks = [
    persuade_get_embeddings(arguments_list[start:start + API_LIMIT])
    for start in range(0, len(arguments_list), API_LIMIT)
  ]
  embeddings_df = pd.concat(chunks, axis=0, ignore_index=True) if chunks else pd.DataFrame()

  # axis=1 concat aligns on index labels; reset the input index so rows are
  # paired by position even if persuade_df was filtered beforehand.
  return pd.concat([persuade_df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# Embed the argument statements (one row per entry of persuade_argument_dict).
persuade_arguments_embeddings_df = persuade_get_embeddings_df(persuade_arguments_df, 'argument')

In [None]:
# Keep only the numeric (embedding) columns; the 'argument' text column is dropped.
persuade_arguments_vector_df = persuade_arguments_embeddings_df.select_dtypes(include=[np.number])

In [None]:
# Embed every evidence text (OpenAI API calls in chunks of API_LIMIT texts).
persuade_evidence_embeddings_df = persuade_get_embeddings_df(persuade_corpus, 'evidence')

In [None]:
# Keep only the numeric columns. NOTE(review): this assumes persuade_corpus has
# no numeric columns besides the embeddings at this point - confirm.
persuade_evidence_vector_df = persuade_evidence_embeddings_df.select_dtypes(include=[np.number])

In [None]:
from collections import Counter
# Number of evidence rows per argument statement.
argument_evidence_counts = Counter(persuade_evidence_embeddings_df['argument'])

In [None]:
# Size of the largest evidence group, i.e. the most rows any single argument can match.
max(argument_evidence_counts.values())

7518

### SciFact Corpus

In [None]:
DIM_EMBEDDING = 1536

@retry(wait=wait_random_exponential(min=60, max=500), stop=stop_after_attempt(10))
def scifact_get_embeddings(arguments: list) -> pd.DataFrame:
  """Embed a batch of texts with `text-embedding-ada-002`.

  The call is retried (random exponential back-off, at most 10 attempts)
  whenever anything inside this function raises.

  Args:
    arguments: The texts to embed in a single API request.

  Returns:
    A DataFrame with one row per text and DIM_EMBEDDING columns named
    "0", "1", ..., indexed 0..len(arguments)-1.
  """
  embeddings = client.embeddings.create(input=arguments, model="text-embedding-ada-002")
  embeddings_data = [item.embedding for item in embeddings.data]
  # A frame built from a list of rows already has a 0..n-1 index.
  return pd.DataFrame(embeddings_data, columns=[str(i) for i in range(DIM_EMBEDDING)])

In [None]:
API_LIMIT = 1000

def scifact_get_embeddings_df(scifact_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
  """Append the embedding of each text in `scifact_df[column_name]` to the frame.

  Args:
    scifact_df: Frame holding the texts.
    column_name: Name of the column whose values are embedded.

  Returns:
    `scifact_df` (re-indexed 0..n-1) followed by the embedding columns
    returned by `scifact_get_embeddings`, one row per text.
  """
  arguments_list = list(scifact_df[column_name])

  # Request embeddings API_LIMIT texts at a time and stitch the chunks together
  # once, instead of re-copying a growing frame on every iteration.
  chunks = [
    scifact_get_embeddings(arguments_list[start:start + API_LIMIT])
    for start in range(0, len(arguments_list), API_LIMIT)
  ]
  embeddings_df = pd.concat(chunks, axis=0, ignore_index=True) if chunks else pd.DataFrame()

  # axis=1 concat aligns on index labels; reset the input index so rows are
  # paired by position even if scifact_df was filtered beforehand.
  return pd.concat([scifact_df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# Path is relative to the working directory, matching where the archive was
# extracted ('corpora/scifact-corpus') instead of assuming Colab's /content.
scifact_corpus = pd.read_json('corpora/scifact-corpus/scifact_corpus/corpus.jsonl', lines=True)

In [None]:
# Flatten the abstracts: each abstract is iterated sentence by sentence and
# every sentence becomes one candidate evidence text.
scifact_evidence_corpus = [
  sentence
  for abstract in scifact_corpus['abstract']
  for sentence in abstract
]

In [None]:
# Embed every evidence sentence; the result keeps the text in an 'evidence' column next to its embedding.
scifact_evidence_embeddings_corpus = scifact_get_embeddings_df(pd.DataFrame(scifact_evidence_corpus, columns=['evidence']), 'evidence')

In [None]:
# Keep only the numeric (embedding) columns; the 'evidence' text column is dropped.
scifact_evidence_embeddings = scifact_evidence_embeddings_corpus.select_dtypes(include=[np.number])

In [None]:
# Path is relative to the working directory, matching where the archive was
# extracted ('corpora/scifact-corpus') instead of assuming Colab's /content.
scifact_test = pd.read_json('corpora/scifact-corpus/scifact_corpus/claims_test.jsonl', lines=True)

In [None]:
# Drop the 'id' column so it is not picked up by select_dtypes(np.number) below.
# Re-running this cell without reloading scifact_test raises KeyError.
scifact_test = scifact_test.drop(columns=['id'])

In [None]:
# Embed every test claim.
scifact_test_embeddings_corpus = scifact_get_embeddings_df(scifact_test, 'claim')

In [None]:
# Keep only the numeric columns of the claim frame as model input.
scifact_test_embeddings = scifact_test_embeddings_corpus.select_dtypes(include=[np.number])

## Autoencoder Model

### Counterargument model

In [None]:
# Download the counterargument autoencoder bundle from OSF storage and unpack it.
ada_autoencoder_file_path_zip = 'ada_autoencoder.zip'
ada_autoencoder_file_path = 'current-data-dump/ada-autoencoder'
# check=True raises CalledProcessError when the fetch fails, so the success
# message is only printed after the download actually worked.
subprocess.run(
  ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/data-dump/ada003-autoencoder/ada_autoencoder.zip"],
  check=True,
)
print("ada_autoencoder.zip successfully imported")
with zipfile.ZipFile(ada_autoencoder_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(ada_autoencoder_file_path)
extracted_files = os.listdir(ada_autoencoder_file_path)
print("Files extracted:", extracted_files)

ada_autoencoder.zip successfully imported
Files extracted: ['global_training_plot.png', 'x_test.pkl', 'y_test.pkl', 'y_teset.pkl', 'global_shuffled_training_plot.png', 'global_shuffled_training_log.csv', 'training_df.pkl', 'combined_global_training_plot.png', 'global_training_df.pkl', 'global_shuffled_autoencoder_model.keras', 'x_train.pkl', 'y_train.pkl', 'global_autoencoder_model.keras', 'global_training_log.csv']


In [None]:
# Load the train/test splits saved with the counterargument model.
# Unpickling executes code from the file, so only load bundles you trust.
x_train, y_train, x_test, y_test = (
  pd.read_pickle(pickle_path)
  for pickle_path in (
    'current-data-dump/ada-autoencoder/x_train.pkl',
    'current-data-dump/ada-autoencoder/y_train.pkl',
    'current-data-dump/ada-autoencoder/x_test.pkl',
    'current-data-dump/ada-autoencoder/y_test.pkl',
  )
)

### Evidence model

In [None]:
# Download the evidence autoencoder bundle from OSF storage and unpack it.
ada_autoencoder_file_path_zip = 'ada003_evidence_autoencoder.zip'
ada_autoencoder_file_path = 'current-data-dump/ada-evidence-autoencoder'
# check=True raises CalledProcessError when the fetch fails, so the success
# message is only printed after the download actually worked.
subprocess.run(
  ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/data-dump/ada003-evidence-autoencoder/ada003_evidence_autoencoder.zip"],
  check=True,
)
print("ada003_evidence_autoencoder.zip successfully imported")
with zipfile.ZipFile(ada_autoencoder_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(ada_autoencoder_file_path)
extracted_files = os.listdir(ada_autoencoder_file_path)
print("Files extracted:", extracted_files)

ada003_evidence_autoencoder.zip successfully imported
Files extracted: ['global_x_test.pkl', 'global_shuffled_training_log.csv', 'global_shuffled_autoencoder_model.keras', 'global_y_test.pkl', 'global_y_train.pkl', 'global_autoencoder_model.keras', 'global_x_train.pkl', 'global_training_log.csv']


In [None]:
# Load the train/test splits saved with the evidence model.
# Unpickling executes code from the file, so only load bundles you trust.
x_evidence_train, y_evidence_train, x_evidence_test, y_evidence_test = (
  pd.read_pickle(pickle_path)
  for pickle_path in (
    'current-data-dump/ada-evidence-autoencoder/global_x_train.pkl',
    'current-data-dump/ada-evidence-autoencoder/global_y_train.pkl',
    'current-data-dump/ada-evidence-autoencoder/global_x_test.pkl',
    'current-data-dump/ada-evidence-autoencoder/global_y_test.pkl',
  )
)

### SciFact models

In [None]:
# Download the SciFact autoencoder bundle from OSF storage and unpack it.
ada_autoencoder_file_path_zip = 'ada003_scifact_autoencoder.zip'
ada_autoencoder_file_path = 'current-data-dump/ada-scifact-autoencoder'
# check=True raises CalledProcessError when the fetch fails, so the success
# message is only printed after the download actually worked.
subprocess.run(
  ["osf", "-p", "sakjg", "fetch", "--force", "osfstorage/data-dump/ada003-scifact-autoencoder/ada003_scifact_autoencoder.zip"],
  check=True,
)
print("ada003_scifact_autoencoder.zip successfully imported")
with zipfile.ZipFile(ada_autoencoder_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(ada_autoencoder_file_path)
extracted_files = os.listdir(ada_autoencoder_file_path)
print("Files extracted:", extracted_files)

ada003_scifact_autoencoder.zip successfully imported
Files extracted: ['y_train_combined.pkl', 'combined_training_log.csv', 'combined_shuffled_training_log.csv', 'counter_shuffled_autoencoder_model.keras', 'pro_autoencoder_model.keras', 'all_shuffled_training_plot.png', 'counter_shuffled_training_plot.png', 'y_test_counter.pkl', 'pro_shuffled_training_log.csv', 'pro_shuffled_autoencoder_model.keras', 'x_train_pro.pkl', 'pro_training_log.csv', 'x_test_combined.pkl', 'counter_training_plot.png', 'x_test_pro.pkl', 'pro_shuffled_training_plot.png', 'x_train_combined.pkl', 'y_test_combined.pkl', 'all_training_plot.png', 'x_test_counter.pkl', 'y_test_pro.pkl', 'combined_counter_training_plot.png', 'counter_training_log.csv', 'counter_shuffled_training_log.csv', 'combined_pro_training_plot.png', 'combined_autoencoder_model.keras', 'combined_shuffled_autoencoder_model.keras', 'y_train_counter.pkl', 'y_train_pro.pkl', 'combined_training_plot.png', 'counter_autoencoder_model.keras', 'pro_trainin

In [None]:
# Load the train/test splits saved with the combined SciFact model.
# Unpickling executes code from the file, so only load bundles you trust.
x_scifact_train, y_scifact_train, x_scifact_test, y_scifact_test = (
  pd.read_pickle(pickle_path)
  for pickle_path in (
    'current-data-dump/ada-scifact-autoencoder/x_train_combined.pkl',
    'current-data-dump/ada-scifact-autoencoder/y_train_combined.pkl',
    'current-data-dump/ada-scifact-autoencoder/x_test_combined.pkl',
    'current-data-dump/ada-scifact-autoencoder/y_test_combined.pkl',
  )
)

In [None]:
# Load the train/test splits saved with the "pro" SciFact model.
# Unpickling executes code from the file, so only load bundles you trust.
x_scifact_pro_train, y_scifact_pro_train, x_scifact_pro_test, y_scifact_pro_test = (
  pd.read_pickle(pickle_path)
  for pickle_path in (
    'current-data-dump/ada-scifact-autoencoder/x_train_pro.pkl',
    'current-data-dump/ada-scifact-autoencoder/y_train_pro.pkl',
    'current-data-dump/ada-scifact-autoencoder/x_test_pro.pkl',
    'current-data-dump/ada-scifact-autoencoder/y_test_pro.pkl',
  )
)

In [None]:
# Load the train/test splits saved with the "counter" SciFact model.
# Unpickling executes code from the file, so only load bundles you trust.
x_scifact_counter_train, y_scifact_counter_train, x_scifact_counter_test, y_scifact_counter_test = (
  pd.read_pickle(pickle_path)
  for pickle_path in (
    'current-data-dump/ada-scifact-autoencoder/x_train_counter.pkl',
    'current-data-dump/ada-scifact-autoencoder/y_train_counter.pkl',
    'current-data-dump/ada-scifact-autoencoder/x_test_counter.pkl',
    'current-data-dump/ada-scifact-autoencoder/y_test_counter.pkl',
  )
)

### Metric

In [None]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_global_y_train(y_true, y_pred):
  """Count predictions that select the same candidate as their target.

  The candidates are the rows of x_train, y_train, x_test and y_test stacked
  together. `y_pred` and `y_true` are each scored against all candidates and
  the best-scoring candidate index is taken (tf.math.argmax without an axis).
  The return value is the number of positions where both indices agree.
  """
  candidates = tf.cast(pd.concat([x_train, y_train, x_test, y_test]), dtype=tf.float32)

  def _best_candidate(vectors):
    # tf.norm(vectors) has no axis, so it is a single scalar for the whole tensor.
    scale = tf.reshape(tf.norm(vectors) * tf.norm(candidates, axis=1), [-1, 1])
    scores = tf.matmul(candidates, vectors, transpose_b=True) / scale
    return tf.math.argmax(scores)

  agreement = tf.equal(_best_candidate(y_pred), _best_candidate(y_true))
  return tf.math.count_nonzero(agreement)

In [None]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_combined_y_train(y_true, y_pred):
  """Count predictions that select the same candidate as their target.

  The candidates are the rows of the combined SciFact splits (x/y train and
  test) stacked together. `y_pred` and `y_true` are each scored against all
  candidates and the best-scoring candidate index is taken (tf.math.argmax
  without an axis). The return value is the number of positions where both
  indices agree.
  """
  candidates = tf.cast(pd.concat([x_scifact_train, y_scifact_train, x_scifact_test, y_scifact_test]), dtype=tf.float32)

  def _best_candidate(vectors):
    # tf.norm(vectors) has no axis, so it is a single scalar for the whole tensor.
    scale = tf.reshape(tf.norm(vectors) * tf.norm(candidates, axis=1), [-1, 1])
    scores = tf.matmul(candidates, vectors, transpose_b=True) / scale
    return tf.math.argmax(scores)

  agreement = tf.equal(_best_candidate(y_pred), _best_candidate(y_true))
  return tf.math.count_nonzero(agreement)

In [None]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_pro_y_train(y_true, y_pred):
  """Count predictions that select the same candidate as their target.

  The candidates are the rows of the "pro" SciFact splits (x/y train and
  test) stacked together. `y_pred` and `y_true` are each scored against all
  candidates and the best-scoring candidate index is taken (tf.math.argmax
  without an axis). The return value is the number of positions where both
  indices agree.
  """
  candidates = tf.cast(pd.concat([x_scifact_pro_train, y_scifact_pro_train, x_scifact_pro_test, y_scifact_pro_test]), dtype=tf.float32)

  def _best_candidate(vectors):
    # tf.norm(vectors) has no axis, so it is a single scalar for the whole tensor.
    scale = tf.reshape(tf.norm(vectors) * tf.norm(candidates, axis=1), [-1, 1])
    scores = tf.matmul(candidates, vectors, transpose_b=True) / scale
    return tf.math.argmax(scores)

  agreement = tf.equal(_best_candidate(y_pred), _best_candidate(y_true))
  return tf.math.count_nonzero(agreement)

In [None]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_counter_y_train(y_true, y_pred):
  """Count predictions that select the same candidate as their target.

  The candidates are the rows of the "counter" SciFact splits (x/y train and
  test) stacked together. `y_pred` and `y_true` are each scored against all
  candidates and the best-scoring candidate index is taken (tf.math.argmax
  without an axis). The return value is the number of positions where both
  indices agree.
  """
  candidates = tf.cast(pd.concat([x_scifact_counter_train, y_scifact_counter_train, x_scifact_counter_test, y_scifact_counter_test]), dtype=tf.float32)

  def _best_candidate(vectors):
    # tf.norm(vectors) has no axis, so it is a single scalar for the whole tensor.
    scale = tf.reshape(tf.norm(vectors) * tf.norm(candidates, axis=1), [-1, 1])
    scores = tf.matmul(candidates, vectors, transpose_b=True) / scale
    return tf.math.argmax(scores)

  agreement = tf.equal(_best_candidate(y_pred), _best_candidate(y_true))
  return tf.math.count_nonzero(agreement)

## Load saved model

In [None]:
# Counterargument model. The metric functions registered above must be defined
# before loading; presumably the saved model refers to them by name - TODO confirm.
global_autoencoder_model = tf.keras.models.load_model('current-data-dump/ada-autoencoder/global_autoencoder_model.keras')

In [None]:
# Evidence model, loaded from the evidence autoencoder bundle extracted above.
global_evidence_autoencoder_model = tf.keras.models.load_model('current-data-dump/ada-evidence-autoencoder/global_autoencoder_model.keras')

In [None]:
# Combined SciFact model. The path is relative, like the other model loads and
# the extraction directory, instead of assuming Colab's /content.
global_scifact_autoencoder_model = tf.keras.models.load_model('current-data-dump/ada-scifact-autoencoder/combined_autoencoder_model.keras')

In [None]:
# "Pro" SciFact model. The path is relative, like the other model loads and
# the extraction directory, instead of assuming Colab's /content.
global_scifact_pro_autoencoder_model = tf.keras.models.load_model('current-data-dump/ada-scifact-autoencoder/pro_autoencoder_model.keras')

In [None]:
# "Counter" SciFact model. The path is relative, like the other model loads and
# the extraction directory, instead of assuming Colab's /content.
global_scifact_counter_autoencoder_model = tf.keras.models.load_model('current-data-dump/ada-scifact-autoencoder/counter_autoencoder_model.keras')

## GPR predict (Counterargument)

In [None]:
# Predict an output embedding for every GPR claim; column labels become strings
# ("0", "1", ...) like those of the embedding frames.
global_autoencoder_gpr_predictions = global_autoencoder_model.predict(gpr_x_test)
global_autoencoder_gpr_predictions_df = pd.DataFrame(global_autoencoder_gpr_predictions).rename(columns=str)



In [None]:
# Score one claim at a time: a success is a prediction whose best-matching
# candidate is the same as the best match of the true rebuttal embedding.
successes = 0
for i in range(len(gpr_y_test)):
  # The metric expects 2-D input, so each row is reshaped to (1, dim).
  gpr_y_test_tf = tf.reshape(tf.convert_to_tensor(gpr_y_test.loc[i], dtype=tf.float32), (1, -1))
  gpr_pred_tf = tf.reshape(tf.convert_to_tensor(global_autoencoder_gpr_predictions_df.loc[i], dtype=tf.float32), (1, -1))
  successes += int(metric_choose_argument_gpr(gpr_y_test_tf, gpr_pred_tf).numpy() == 1)

In [None]:
# Share of GPR claims scored as a success, in percent.
gpr_success_rate = successes / len(gpr_y_test) * 100

In [None]:
# Display the GPR success rate (percent).
gpr_success_rate

1.8181818181818181

## EACL Predict (Counterargument)

In [None]:
# Predict an output embedding for every EACL claim; column labels become strings
# ("0", "1", ...) like those of the embedding frames.
global_autoencoder_eacl_predictions = global_autoencoder_model.predict(eacl_vectors_df)
global_autoencoder_eacl_predictions_df = pd.DataFrame(global_autoencoder_eacl_predictions).rename(columns=str)



In [None]:
# Number of nearest candidates inspected per prediction.
eacl_topk = 1
# Collects one percentage per prediction; filled by the loop below.
pred_topk = []

In [None]:
# Candidate pool: all EACL claim embeddings as float32, with their row norms
# precomputed for the cosine-similarity denominator.
eacl_embeddings_df_32 = tf.cast(eacl_vectors_df, dtype=tf.float32)
global_autoencoder_eacl_predictions_tf = tf.constant(global_autoencoder_eacl_predictions_df.values, dtype=tf.float32)
eacl_embeddings_norm = tf.norm(eacl_embeddings_df_32, axis=1)
# Topic and stance of each candidate, in the same row order as the embeddings.
eacl_topics = list(eacl_embeddings_df['topicId'])
eacl_stances = list(eacl_embeddings_df['claims.stance'])

In [None]:
# For each predicted embedding, look at its eacl_topk most similar claims and
# record the percentage that share the input's topic but have the other stance.
for i, row in enumerate(global_autoencoder_eacl_predictions_tf):
  y_pred = tf.reshape(row, [1, -1])
  target_topic = eacl_topics[i]
  target_type = 'PRO' if eacl_stances[i] == 'CON' else 'CON'

  denominators = tf.reshape(tf.norm(y_pred) * eacl_embeddings_norm, [-1, 1])
  cos_sim_pred = tf.matmul(eacl_embeddings_df_32, y_pred, transpose_b=True) / denominators
  top_k_sim_pred = tf.math.top_k(tf.reshape(cos_sim_pred, [-1]), k=eacl_topk).indices.numpy()

  successes = sum(
    1
    for index in top_k_sim_pred
    if eacl_topics[index] == target_topic and eacl_stances[index] == target_type
  )
  pred_topk.append(successes / eacl_topk * 100)

In [None]:
# Mean of the per-prediction percentages.
eacl_topk_success_rate = statistics.mean(pred_topk)

In [None]:
# Display the EACL top-k success rate (percent).
eacl_topk_success_rate

3.4586466165413534

## Persuade Predict (Evidence)

In [None]:
# Predict an output embedding for every argument statement; column labels
# become strings ("0", "1", ...) like those of the embedding frames.
global_evidence_autoencoder_persuade_predictions = global_evidence_autoencoder_model.predict(persuade_arguments_vector_df)
global_evidence_autoencoder_persuade_predictions_df = pd.DataFrame(global_evidence_autoencoder_persuade_predictions).rename(columns=str)



In [None]:
# Number of nearest evidence rows inspected per prediction. The loop below
# divides by this value, so the reported percentage is relative to k.
persuade_topk = 50000
# Collects one percentage per prediction; filled by the loop below.
pred_topk=[]

In [None]:
# Candidate pool: all evidence embeddings as float32, with their row norms
# precomputed for the cosine-similarity denominator.
persuade_embeddings_df_32 = tf.cast(persuade_evidence_vector_df, dtype=tf.float32)
global_autoencoder_persuade_predictions_tf = tf.constant(global_evidence_autoencoder_persuade_predictions_df.values, dtype=tf.float32)
persuade_embeddings_norm = tf.norm(persuade_embeddings_df_32, axis=1)
# Argument statement of each evidence row, in the same row order as the embeddings.
persuade_topics = list(persuade_evidence_embeddings_df['argument'])

In [None]:
# For each argument's predicted embedding, take the k most similar evidence
# rows and record the percentage that belong to that same argument.
pred_topk = []  # start fresh so re-running this cell does not accumulate results

# Prediction i was produced from row i of persuade_arguments_df, so that frame
# (not the evidence frame) supplies the argument each prediction should match.
persuade_query_topics = list(persuade_arguments_df['argument'])

# tf.math.top_k requires k to be no larger than the number of candidates.
persuade_effective_topk = min(persuade_topk, len(persuade_topics))

for i, row in enumerate(global_autoencoder_persuade_predictions_tf):
  y_pred = tf.reshape(row, [1, -1])
  target_topic = persuade_query_topics[i]

  cos_sim_pred = tf.matmul(persuade_embeddings_df_32, y_pred, transpose_b=True) / tf.reshape((tf.norm(y_pred) * persuade_embeddings_norm), [-1, 1])
  top_k_sim_pred = tf.math.top_k(tf.reshape(cos_sim_pred, [-1]), k=persuade_effective_topk).indices.numpy()

  successes = sum(1 for index in top_k_sim_pred if persuade_topics[index] == target_topic)
  pred_topk.append(successes / persuade_effective_topk * 100)

In [None]:
# Mean of the per-prediction percentages.
persuade_topk_success_rate = statistics.mean(pred_topk)

In [None]:
# Display the Persuade top-k success rate (percent).
persuade_topk_success_rate

2.2654666666666667

## Scifact Predict (Evidence + Combined + Qualitative)

In [None]:
# Predict an output embedding for every SciFact test claim with the combined
# model; column labels become strings ("0", "1", ...).
global_autoencoder_scifact_predictions = global_scifact_autoencoder_model.predict(scifact_test_embeddings)
global_autoencoder_scifact_predictions_df = pd.DataFrame(global_autoencoder_scifact_predictions).rename(columns=str)



In [None]:
# Number of nearest evidence sentences retrieved per claim.
scifact_topk = 10
pred_topk=[]

In [None]:
# Candidate pool: all evidence-sentence embeddings as float32, with their row
# norms precomputed for the cosine-similarity denominator.
scifact_embeddings_df_32 = tf.cast(scifact_evidence_embeddings, dtype=tf.float32)
global_autoencoder_scifact_predictions_tf = tf.constant(global_autoencoder_scifact_predictions_df.values, dtype=tf.float32)
scifact_embeddings_norm = tf.norm(scifact_embeddings_df_32, axis=1)

In [None]:
# For each test claim, collect the texts of the scifact_topk evidence sentences
# most similar to the combined model's predicted embedding.
# Pull the text column once; indexing whole rows of the wide embedding frame
# inside the loop is needlessly slow.
scifact_evidence_texts = scifact_evidence_embeddings_corpus['evidence']

evidence_column = []
for row in global_autoencoder_scifact_predictions_tf:
  y_pred = tf.reshape(row, [1, -1])

  cos_sim_pred = tf.matmul(scifact_embeddings_df_32, y_pred, transpose_b=True) / tf.reshape((tf.norm(y_pred) * scifact_embeddings_norm), [-1, 1])
  top_k_sim_pred = tf.math.top_k(tf.reshape(cos_sim_pred, [-1]), k=scifact_topk).indices.numpy()

  # top_k returns indices ordered from most to least similar; keep that order.
  evidence_column.append(scifact_evidence_texts.iloc[top_k_sim_pred].tolist())

In [None]:
# Test claims with their retrieved evidence lists in a new 'topk' column;
# scifact_test itself is left unchanged.
scifact_topk_results = scifact_test.assign(topk=evidence_column)

In [None]:
# to_csv does not create missing directories, so make sure the target exists.
scifact_predictions_dir = 'current-data-dump/ada-scifact-autoencoder/ada_autoencoder_predictions'
os.makedirs(scifact_predictions_dir, exist_ok=True)
scifact_topk_results.to_csv(f'{scifact_predictions_dir}/scifact_combined_topk_results.csv')

OSError: Cannot save file into a non-existent directory: 'current-data-dump/ada-scifact-autoencoder/ada_autoencoder_predictions'

## Scifact Predict (Evidence + Pro + Qualitative)

In [None]:
# Predict an output embedding for every SciFact test claim with the "pro"
# model; column labels become strings ("0", "1", ...).
global_autoencoder_scifact_pro_predictions = global_scifact_pro_autoencoder_model.predict(scifact_test_embeddings)
global_autoencoder_scifact_pro_predictions_df = pd.DataFrame(global_autoencoder_scifact_pro_predictions).rename(columns=str)

In [None]:
# Number of nearest evidence sentences intended for the "pro" model.
scifact_pro_topk = 10
pred_pro_topk=[]

In [None]:
# The candidate matrix and its norms are not rebuilt here; the loop below reuses
# scifact_embeddings_df_32 and scifact_embeddings_norm from the combined section.
# scifact_embeddings_df_32 = tf.cast(scifact_evidence_embeddings, dtype=tf.float32)
global_autoencoder_scifact_pro_predictions_tf = tf.constant(global_autoencoder_scifact_pro_predictions_df.values, dtype=tf.float32)
# scifact_embeddings_norm = tf.norm(scifact_embeddings_df_32, axis=1)

In [None]:
# For each test claim, collect the texts of the scifact_pro_topk evidence
# sentences most similar to the "pro" model's predicted embedding.
# Pull the text column once; indexing whole rows of the wide embedding frame
# inside the loop is needlessly slow.
scifact_evidence_texts = scifact_evidence_embeddings_corpus['evidence']

evidence_pro_column = []
for row in global_autoencoder_scifact_pro_predictions_tf:
  y_pred = tf.reshape(row, [1, -1])

  cos_sim_pred = tf.matmul(scifact_embeddings_df_32, y_pred, transpose_b=True) / tf.reshape((tf.norm(y_pred) * scifact_embeddings_norm), [-1, 1])
  # Use this section's own k (scifact_pro_topk) rather than the combined one.
  top_k_sim_pred = tf.math.top_k(tf.reshape(cos_sim_pred, [-1]), k=scifact_pro_topk).indices.numpy()

  # top_k returns indices ordered from most to least similar; keep that order.
  evidence_pro_column.append(scifact_evidence_texts.iloc[top_k_sim_pred].tolist())

In [None]:
# Pair every test row with the evidence retrieved for its "pro" prediction;
# assign returns a new frame, so scifact_test stays untouched.
scifact_pro_topk_results = scifact_test.assign(topk=evidence_pro_column)

In [None]:
# to_csv raises OSError when the parent directory is missing, so create the
# predictions folder before writing.
scifact_predictions_folder_path = 'current-data-dump/ada-scifact-autoencoder/ada_autoencoder_predictions'
os.makedirs(scifact_predictions_folder_path, exist_ok=True)
scifact_pro_topk_results.to_csv(f'{scifact_predictions_folder_path}/scifact_pro_topk_results.csv')

## Scifact Predict (Evidence + Counter + Qualitative)

In [None]:
# Run the "counter" autoencoder on the test embeddings and wrap the output in
# a DataFrame whose column labels are strings ('0', '1', ...) instead of ints.
global_autoencoder_scifact_counter_predictions = global_scifact_counter_autoencoder_model.predict(scifact_test_embeddings)
global_autoencoder_scifact_counter_predictions_df = (
  pd.DataFrame(global_autoencoder_scifact_counter_predictions)
  .rename(columns=str)
)

In [None]:
# k for the "counter" retrieval step, plus an empty list that the retrieval
# loop re-creates for every prediction.
scifact_counter_topk, pred_counter_topk = 10, []

In [None]:
# Only the prediction tensor is built here. scifact_embeddings_df_32 and
# scifact_embeddings_norm are not recomputed in this section; the retrieval
# loop relies on the values already present in the kernel.
global_autoencoder_scifact_counter_predictions_tf = tf.constant(
  global_autoencoder_scifact_counter_predictions_df.values,
  dtype=tf.float32,
)

In [None]:
# For every "counter" prediction, score the evidence corpus by cosine
# similarity and collect the 'evidence' text of the scifact_counter_topk
# best-scoring rows.
evidence_counter_column = []
for row in global_autoencoder_scifact_counter_predictions_tf:
  y_pred = tf.reshape(row, [1, -1])

  # Dot product of every corpus embedding with the prediction, divided by the
  # product of their norms (scifact_embeddings_norm is presumably the per-row
  # norm of scifact_embeddings_df_32 -- confirm against the cell that builds it).
  cos_sim_pred = tf.matmul(scifact_embeddings_df_32, y_pred, transpose_b=True) / tf.reshape((tf.norm(y_pred) * scifact_embeddings_norm), [-1, 1])
  # Use this section's own k; the previous code read scifact_topk from the
  # combined section, leaving scifact_counter_topk without any effect.
  top_k_sim_pred = tf.math.top_k(tf.reshape(cos_sim_pred, [-1]), k=scifact_counter_topk).indices.numpy()

  pred_counter_topk = [
    scifact_evidence_embeddings_corpus.iloc[index]['evidence']
    for index in top_k_sim_pred
  ]
  evidence_counter_column.append(pred_counter_topk)

In [None]:
# Pair every test row with the evidence retrieved for its "counter"
# prediction; assign returns a new frame, so scifact_test stays untouched.
scifact_counter_topk_results = scifact_test.assign(topk=evidence_counter_column)

In [None]:
# to_csv raises OSError when the parent directory is missing, so create the
# predictions folder before writing.
scifact_predictions_folder_path = 'current-data-dump/ada-scifact-autoencoder/ada_autoencoder_predictions'
os.makedirs(scifact_predictions_folder_path, exist_ok=True)
scifact_counter_topk_results.to_csv(f'{scifact_predictions_folder_path}/scifact_counter_topk_results.csv')

## Export Values

In [None]:
# Collect the three metrics into a one-row DataFrame and pickle it under the
# predictions folder (created on demand).
result_df = pd.DataFrame([{
  'gpr_success_rate': gpr_success_rate,
  'eacl_topk_success_rate': eacl_topk_success_rate,
  'eacl_topk': eacl_topk,
}])
results_folder_path = 'current-data-dump/ada-autoencoder/ada_autoencoder_predictions/'
results_file_path = f'{results_folder_path}novel_corpora_prediction.pkl'
os.makedirs(results_folder_path, exist_ok=True)
with open(results_file_path, 'wb') as file:
  pickle.dump(result_df, file)
  print(f"File uploaded to {results_file_path}")

In [None]:
# Store the single success-rate metric as a one-row DataFrame and pickle it
# under the evidence predictions folder (created on demand).
evidence_result_df = pd.DataFrame([{
  'persuade_topk_success_rate': persuade_topk_success_rate,
}])
evidence_results_folder_path = 'current-data-dump/ada-evidence-autoencoder/ada_autoencoder_predictions/'
evidence_results_file_path = f'{evidence_results_folder_path}persuade_corpus_prediction.pkl'
os.makedirs(evidence_results_folder_path, exist_ok=True)
with open(evidence_results_file_path, 'wb') as file:
  pickle.dump(evidence_result_df, file)
  print(f"File uploaded to {evidence_results_file_path}")

In [None]:
# Recursively upload the local predictions folder to the OSF project via the osf CLI.
ada_autoencoder_file_path = 'current-data-dump/ada-autoencoder/ada_autoencoder_predictions'
# With shell=True the command should be one string; a one-element list only
# happens to work on POSIX shells.
upload_command = f"osf -p sakjg upload -r --force {ada_autoencoder_file_path}/ data-dump/ada003-autoencoder/ada-autoencoder-predictions"
result = subprocess.run(upload_command, shell=True, capture_output=True, text=True)
print(result.stderr)
# Report success only when the CLI exited cleanly; previously the message was
# printed even if the upload had failed.
if result.returncode == 0:
  print(f"File: {ada_autoencoder_file_path} uploaded at osfstorage")
else:
  print(f"Upload of {ada_autoencoder_file_path} failed (exit code {result.returncode})")

In [None]:
# Recursively upload the local evidence predictions folder to the OSF project via the osf CLI.
ada_evidence_autoencoder_file_path = 'current-data-dump/ada-evidence-autoencoder/ada_autoencoder_predictions'
# With shell=True the command should be one string; a one-element list only
# happens to work on POSIX shells.
upload_command = f"osf -p sakjg upload -r --force {ada_evidence_autoencoder_file_path}/ data-dump/ada003-evidence-autoencoder/ada-autoencoder-predictions"
result = subprocess.run(upload_command, shell=True, capture_output=True, text=True)
print(result.stderr)
# Report success only when the CLI exited cleanly; previously the message was
# printed even if the upload had failed.
if result.returncode == 0:
  print(f"File: {ada_evidence_autoencoder_file_path} uploaded at osfstorage")
else:
  print(f"Upload of {ada_evidence_autoencoder_file_path} failed (exit code {result.returncode})")

In [None]:
# Recursively upload the local scifact predictions folder to the OSF project via the osf CLI.
ada_scifact_autoencoder_file_path = 'current-data-dump/ada-scifact-autoencoder/ada_autoencoder_predictions'
# With shell=True the command should be one string; a one-element list only
# happens to work on POSIX shells.
upload_command = f"osf -p sakjg upload -r --force {ada_scifact_autoencoder_file_path}/ data-dump/ada003-scifact-autoencoder/ada-autoencoder-predictions"
result = subprocess.run(upload_command, shell=True, capture_output=True, text=True)
print(result.stderr)
# Report success only when the CLI exited cleanly; previously the message was
# printed even if the upload had failed.
if result.returncode == 0:
  print(f"File: {ada_scifact_autoencoder_file_path} uploaded at osfstorage")
else:
  print(f"Upload of {ada_scifact_autoencoder_file_path} failed (exit code {result.returncode})")

## Import

In [None]:
# Download the zipped autoencoder artifacts from OSF and unpack them locally.
ada003_autoencoder_file_path_zip = 'ada003_autoencoder.zip'
ada003_autoencoder_file_path = 'current-data-dump/ada003-autoencoder'
# The remote file is called ada_autoencoder.zip while the archive opened below
# is ada003_autoencoder.zip, so pass the local destination explicitly instead
# of relying on osf's default of saving under the remote file name.
# check=True raises CalledProcessError on a failed fetch, so the success
# message is never printed for a download that did not happen.
subprocess.run(
  f"osf -p sakjg fetch --force osfstorage/data-dump/ada003-autoencoder/ada_autoencoder.zip {ada003_autoencoder_file_path_zip}",
  shell=True,
  check=True,
)
print("ada003_autoencoder.zip successfully imported")
with zipfile.ZipFile(ada003_autoencoder_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(ada003_autoencoder_file_path)
extracted_files = os.listdir(ada003_autoencoder_file_path)
print("Files extracted:", extracted_files)

In [None]:
# Download the zipped evidence-autoencoder artifacts from OSF and unpack them locally.
ada003_autoencoder_file_path_zip = 'ada003_evidence_autoencoder.zip'
ada003_autoencoder_file_path = 'current-data-dump/ada003-evidence-autoencoder'
# Name the local destination explicitly so the fetched file and the archive
# opened below cannot drift apart. check=True raises CalledProcessError on a
# failed fetch, so the success message is only printed after a real download.
subprocess.run(
  f"osf -p sakjg fetch --force osfstorage/data-dump/ada003-evidence-autoencoder/ada003_evidence_autoencoder.zip {ada003_autoencoder_file_path_zip}",
  shell=True,
  check=True,
)
print("ada003_evidence_autoencoder.zip successfully imported")
with zipfile.ZipFile(ada003_autoencoder_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(ada003_autoencoder_file_path)
extracted_files = os.listdir(ada003_autoencoder_file_path)
print("Files extracted:", extracted_files)

In [None]:
# Download the zipped scifact-autoencoder artifacts from OSF and unpack them locally.
ada003_autoencoder_file_path_zip = 'ada003_scifact_autoencoder.zip'
ada003_autoencoder_file_path = 'current-data-dump/ada003-scifact-autoencoder'
# Name the local destination explicitly so the fetched file and the archive
# opened below cannot drift apart. check=True raises CalledProcessError on a
# failed fetch, so the success message is only printed after a real download.
subprocess.run(
  f"osf -p sakjg fetch --force osfstorage/data-dump/ada003-scifact-autoencoder/ada003_scifact_autoencoder.zip {ada003_autoencoder_file_path_zip}",
  shell=True,
  check=True,
)
print("ada003_scifact_autoencoder.zip successfully imported")
with zipfile.ZipFile(ada003_autoencoder_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(ada003_autoencoder_file_path)
extracted_files = os.listdir(ada003_autoencoder_file_path)
print("Files extracted:", extracted_files)