<a href="https://colab.research.google.com/github/cchang-vassar/Semantic-Relations-in-Vector-Embeddings/blob/main/study8_counterarg_retrieval_in_articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [ada-002] Autoencoder: Choose Corresponding Embedding from Article Database

Given an embedding, can a model be trained to choose the correct embeddings corresponding to its counterargument from a list of them tokenized from a database of articles?

## Set Up

### Imports

In [None]:
# General imports
import os
import subprocess
import zipfile
import shutil
import time
import re
from google.colab import userdata
import requests
from bs4 import BeautifulSoup
import re
import pickle
import statistics
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model
from scipy import spatial
from typing import Optional

### OpenAI Setup

In [145]:
!pip install openai

Collecting openai
  Downloading openai-1.14.2-py3-none-any.whl (262 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/262.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m174.1/262.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.4/262.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-p

In [146]:
import openai
from openai import OpenAI
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [147]:
client = OpenAI()

### OSF Setup

In [196]:
!pip install osfclient

Collecting osfclient
  Downloading osfclient-0.0.5-py2.py3-none-any.whl (39 kB)
Installing collected packages: osfclient
Successfully installed osfclient-0.0.5


In [197]:
os.environ["OSF_USERNAME"] = userdata.get("OSF_USERNAME")
OSF_USERNAME = os.environ["OSF_USERNAME"]

In [198]:
os.environ["OSF_PASSWORD"] = userdata.get("OSF_PASSWORD")
OSF_PASSWORD = os.environ["OSF_PASSWORD"]

In [199]:
os.environ["OSF_TOKEN"] = userdata.get("OSF_TOKEN")
OSF_TOKEN = os.environ["OSF_TOKEN"]

In [200]:
os.environ["OSF_PROJECT_ID"] = userdata.get("OSF_PROJECT_ID")
OSF_PROJECT_ID = os.environ["OSF_PROJECT_ID"]

## Data

### Functions to parse file data

In [None]:
def _tokenize_text_file(
    text_file_path: str
  ) -> list[str]:
  with open(text_file_path, 'r') as file:
    data = file.read()
  lines = [line.strip() for line in re.split(r'\n', data) if line.strip() and any(c.isalnum() for c in line)]
  return lines

In [None]:
def _tokenize_html_file(
    html_file_path: str,
    class_identifier: str = False,
    id_identifier: str = False
  ) -> list[str]:
  with open(html_file_path, 'r') as file:
    data = file.read()
  soup = BeautifulSoup(data, 'html.parser')
  if not class_identifier and not id_identifier:
    paragraphs = soup.find_all('p')
  else:
    paragraphs = soup.find_all(class_ = class_identifier, id = id_identifier)
  paragraph_list = []
  for paragraph in paragraphs:
    if len(paragraph.text) < 1:
      continue
    paragraph_list.append(paragraph.text)
  return paragraph_list

In [None]:
def _tokenize_html_file_wr(html_file_path: str) -> list[str]:
  with open(html_file_path, "r") as file:
    first_line = file.readline()
    class_identifier = re.search(r'class_identifier=(.+$)', first_line)
    class_identifier = class_identifier.group(1) if class_identifier else False
    second_line = file.readline()
    id_identifier = re.search(r'id_identifier=(.+$)', second_line)
    id_identifier = id_identifier.group(1) if id_identifier else False
    return _tokenize_html_file(html_file_path, class_identifier, id_identifier)

In [None]:
def tokenize_file(file_path: str) -> list[str]:
  if re.match(r'.+\.html', file_path):
    return _tokenize_html_file_wr(file_path)
  else:
    return _tokenize_text_file(file_path)

### Parse data

In [152]:
corpora_path = 'corpora/'
os.makedirs(corpora_path, exist_ok=True)

plastic_surgery_corpus_path = f'{corpora_path}plastic-surgery-corpus/'
os.makedirs(plastic_surgery_corpus_path, exist_ok=True)

write_data_path = 'current-data-dump/'
os.makedirs(write_data_path, exist_ok=True)

In [84]:
parsed_data_path = 'current-data-dump/'

In [183]:
plastic_surgery_parsed_data = pd.DataFrame()

for filename in os.listdir(plastic_surgery_corpus_path):
    file_path = os.path.join(plastic_surgery_corpus_path, filename)
    if os.path.isfile(file_path):
        arguments = pd.DataFrame(tokenize_file(file_path), columns=['paragraph'])
        arguments['file_path'] = filename
        plastic_surgery_parsed_data = pd.concat([plastic_surgery_parsed_data, arguments])
plastic_surgery_parsed_data = plastic_surgery_parsed_data.reset_index()

with open(f'{write_data_path}plastic_surgery_parsed_data.pkl', "wb") as file:
  pickle.dump(plastic_surgery_parsed_data, file)

<class 'bool'>
<class 'str'>
<class 'bool'>
<class 'bool'>
<class 'str'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>


## Embeddings

### Imports for getting embeddings

In [148]:
!pip install tenacity



In [149]:
import time
from tenacity import (
  retry,
  stop_after_attempt,
  wait_random_exponential
)

In [289]:
plastic_surgery_parsed_data = pd.read_pickle("/content/current-data-dump/plastic_surgery_parsed_data.pkl")

### Functions to get embeddings

In [187]:
DIM_EMBEDDING = 1536

@retry(wait=wait_random_exponential(min=60, max=500), stop=stop_after_attempt(10))
def _get_embeddings(paragraphs: list) -> list:
  """Convert an argument into a (1 x 1536) embedding df"""
  embeddings = client.embeddings.create(input=paragraphs, model="text-embedding-ada-002")
  embeddings_data = [embedding_data.embedding for embedding_data in embeddings.data]
  embeddings_df = pd.DataFrame(embeddings_data, columns=[f"{str(i)}" for i in range(DIM_EMBEDDING)])
  return embeddings_df.reset_index(drop=True)

In [221]:
API_LIMIT = 1000

def get_embeddings_df(paragraph_df: pd.DataFrame, corpus_name: str) -> pd.DataFrame:
  """Add embeddings column to a df"""
  embeddings_df = pd.DataFrame()
  paragraph_list = list(paragraph_df['paragraph'])
  total_len = len(paragraph_list)
  i = 0

  # Grab embeddings from paragraph column in chunks
  while i < total_len:
    embeddings = _get_embeddings(paragraph_list[i:min(total_len, i+API_LIMIT)])
    embeddings_df = pd.concat([embeddings_df, embeddings], axis=0, ignore_index=True)
    i = i + API_LIMIT
  paragraph_embeddings_df = pd.concat([paragraph_df, embeddings_df], axis=1)
  paragraph_embeddings_df.drop(columns=['index'], inplace=True)

  # Write embeddings df to file
  with open(f'{write_data_path}{corpus_name}_embeddings.pkl', "wb") as file:
    pickle.dump(paragraph_embeddings_df, file)

  return paragraph_embeddings_df

In [222]:
res = get_embeddings_df(plastic_surgery_parsed_data, "plastic_surgery_corpus")

## Model

### Import model

In [201]:
subprocess.run("osf -p sakjg fetch --force osfstorage/data-dump/ada-autoencoder/ada_autoencoder.zip", shell=True)
print("ada_autoencoder.zip successfully imported")
ada_autoencoder_file_path_zip = 'ada_autoencoder.zip'
ada_autoencoder_file_path = 'current-data-dump/ada-autoencoder'
os.makedirs(ada_autoencoder_file_path, exist_ok=True)
with zipfile.ZipFile(ada_autoencoder_file_path_zip, 'r') as zip_ref:
  zip_ref.extractall(ada_autoencoder_file_path)
extracted_files = os.listdir(ada_autoencoder_file_path)
print("Files extracted:", extracted_files)

ada_autoencoder.zip successfully imported
Files extracted: ['global_training_plot.png', 'global_shuffled_training_log.csv', 'combined_global_training_plot.png', 'global_autoencoder_model.keras', 'global_shuffled_autoencoder_model.keras', '.ipynb_checkpoints', 'global_shuffled_training_plot.png', 'global_training_df.pkl', 'global_training_log.csv']


In [205]:
ada_autoencoder_model = tf.keras.models.load_model('current-data-dump/ada-autoencoder/global_autoencoder_model.keras')

## Testing on novel arguments

In [224]:
global_df = pd.read_pickle("/content/current-data-dump/plastic_surgery_corpus_embeddings.pkl")
global_df = global_df.select_dtypes(include='number')

In [297]:
def generate_y_pred(y_test: list[str], global_df: pd.DataFrame, paragraph_df: pd.DataFrame) -> str:
  y_test_embeddings = _get_embeddings(y_test)
  y_pred = ada_autoencoder_model.predict(y_test_embeddings)
  y_pred_paragraph_indices = []
  global_df_32 = tf.cast(global_df, dtype=tf.float32)
  y_pred = tf.cast(y_pred, dtype=tf.float32)

  for pred in y_pred:
    pred = tf.reshape(pred, [1, -1])
    cos_sim_pred = tf.matmul(global_df_32, pred, transpose_b=True) / tf.reshape(tf.norm(pred) * tf.norm(global_df_32, axis=1), [-1, 1])
    top_k_sim_pred = tf.math.top_k(tf.reshape(cos_sim_pred, [-1]), k=1).indices.numpy()
    y_pred_paragraph_indices.append(top_k_sim_pred[0])
  return paragraph_df.iloc[y_pred_paragraph_indices]

### Pro plastic surgery

In [299]:
pro_plastic_surgery_y_test = _tokenize_text_file("/content/corpora-y-test/plastic_surgery_corpus/gpt_pro_arguments.txt")
con_plastic_surgery_y_test = _tokenize_text_file("/content/corpora-y-test/plastic_surgery_corpus/gpt_con_arguments.txt")

In [298]:
generate_y_pred(pro_plastic_surgery_y_test, global_df, plastic_surgery_parsed_data)



Unnamed: 0,index,paragraph,file_path
634,15,Plastic surgery was seen as an effective way t...,influences_on_decision_making_in_plastic_surge...
536,36,We have said that the primary reason to do pla...,plastic_surgery_and_the_teenage_patient.html
538,38,"For the former, the suboptimal result should b...",plastic_surgery_and_the_teenage_patient.html
634,15,Plastic surgery was seen as an effective way t...,influences_on_decision_making_in_plastic_surge...
263,54,The thing is there's no clear-cut line between...,reddit_cmv_plastic_surgery_is_unnecessary.html
509,9,The public is assaulted continually with a bar...,plastic_surgery_and_the_teenage_patient.html
263,54,The thing is there's no clear-cut line between...,reddit_cmv_plastic_surgery_is_unnecessary.html
302,93,"i wouldn't call them cosmetic then. ""cosmetic""...",reddit_cmv_plastic_surgery_is_unnecessary.html
903,199,Your approach aims for the patient receiving p...,reddit_cmv_if_you_have_money_you_should_get_pl...
263,54,The thing is there's no clear-cut line between...,reddit_cmv_plastic_surgery_is_unnecessary.html


In [300]:
generate_y_pred(con_plastic_surgery_y_test, global_df, plastic_surgery_parsed_data)



Unnamed: 0,index,paragraph,file_path
709,5,Plastic surgery can be dangerous and sometimes...,reddit_cmv_if_you_have_money_you_should_get_pl...
536,36,We have said that the primary reason to do pla...,plastic_surgery_and_the_teenage_patient.html
536,36,We have said that the primary reason to do pla...,plastic_surgery_and_the_teenage_patient.html
536,36,We have said that the primary reason to do pla...,plastic_surgery_and_the_teenage_patient.html
634,15,Plastic surgery was seen as an effective way t...,influences_on_decision_making_in_plastic_surge...
634,15,Plastic surgery was seen as an effective way t...,influences_on_decision_making_in_plastic_surge...
536,36,We have said that the primary reason to do pla...,plastic_surgery_and_the_teenage_patient.html
89,89,"To conclude, cosmetic surgery has some remedyi...",healing_childhood_psychological_trauma_and_imp...
903,199,Your approach aims for the patient receiving p...,reddit_cmv_if_you_have_money_you_should_get_pl...
894,190,The safety issue is very real. It's an industr...,reddit_cmv_if_you_have_money_you_should_get_pl...
