# [Nomic] Autoencoder: Choose Corresponding Embedding

Given an embedding, can a model be trained to choose the correct embeddings corresponding to its counterargument from a list of them?

### Nomic Setup

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nomic
from sentence_transformers import SentenceTransformer

2024-02-27 14:28:18.686130: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


## Data

#### GPR 55

In [34]:
gpr_df = pd.read_csv("../corpora/no_wavs/GPR-KB-55/GPR-KB-55.csv")

In [35]:
DIM_EMBEDDING = 768
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

def gpr_get_embeddings_df(gpr_df: pd.DataFrame) -> pd.DataFrame:
    """ Add embeddings column to a df"""
    gpr_embeddings_df = pd.DataFrame()
    arguments_list = list(gpr_df)

    claims_embeddings = model.encode(['search_document: ' + argument for argument in arguments_list])
    claims_embeddings_df = pd.DataFrame(claims_embeddings, columns=[f"{str(i)}" for i in range(DIM_EMBEDDING)])
    claims_embeddings_df = claims_embeddings_df.reset_index(drop=True)
    return claims_embeddings_df

You try to use a model that was created with version 2.4.0.dev0, however, your version is 2.3.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



<All keys matched successfully>


In [36]:
gpr_claims_df = gpr_get_embeddings_df(gpr_df['claim'])
gpr_rebuttals_df = gpr_get_embeddings_df(gpr_df['rebuttal'])

gpr_combined = pd.concat([gpr_claims_df, gpr_rebuttals_df])
gpr_combined['pair_id'] = gpr_combined.index
gpr_combined = gpr_combined.reset_index(drop=True)

gpr_combined_nums = gpr_combined.select_dtypes(include=[np.number])

gpr_x_train = gpr_claims_df.select_dtypes(include=[np.number])
gpr_x_train = gpr_x_train.loc[:len(gpr_x_train) * 0.8-1]
gpr_x_train['pair_id'] = gpr_x_train.index
gpr_y_train = gpr_rebuttals_df.select_dtypes(include=[np.number])
gpr_y_train = gpr_y_train.loc[:len(gpr_y_train) * 0.8-1]
gpr_y_train['pair_id'] = gpr_y_train.index

gpr_x_test = gpr_claims_df.select_dtypes(include=[np.number])
gpr_x_test = gpr_x_test.loc[len(gpr_x_test) * 0.8:]
gpr_x_test['pair_id'] = gpr_x_test.index
gpr_y_test = gpr_rebuttals_df.select_dtypes(include=[np.number])
gpr_y_test = gpr_y_test.loc[len(gpr_y_test) * 0.8:]
gpr_y_test['pair_id'] = gpr_y_test.index

In [17]:
def metric_choose_argument_gpr(y_true, y_pred):
    """ See if the output vector is closest to the rebuttal to the claim"""
    gpr_training_df_32 = tf.cast(gpr_combined_nums, dtype=tf.float32)
    
    cos_sim_pred = tf.matmul(gpr_training_df_32, y_pred, transpose_b=True) / tf.reshape(tf.norm(y_pred) * tf.norm(gpr_training_df_32, axis=1), [-1, 1])
    cos_sim_true = tf.matmul(gpr_training_df_32, y_true, transpose_b=True) / tf.reshape(tf.norm(y_true) * tf.norm(gpr_training_df_32, axis=1), [-1, 1])

    max_cos_sim_pred = tf.math.argmax(cos_sim_pred)
    max_cos_sim_true = tf.math.argmax(cos_sim_true)

    return tf.math.count_nonzero(tf.equal(max_cos_sim_pred, max_cos_sim_true))

#### IBM EACL

In [29]:
eacl_df = pd.read_csv("../corpora/IBM_EACL/claim_stance_dataset.csv")
eacl_df = eacl_df[['topicId', 'topicText', 'claims.stance', 'claims.claimCorrectedText']]

In [30]:
topic_lens = []
pro_lens = []
con_lens = []
for topic in eacl_df['topicId'].unique():
    topic_rows = eacl_df[eacl_df['topicId'] == topic]
    topic_lens.append(len(topic_rows))
    pro_lens.append(len(topic_rows[topic_rows['claims.stance'] == "PRO"]))
    con_lens.append(len(topic_rows[topic_rows['claims.stance'] == "CON"]))

In [69]:
DIM_EMBEDDING = 768

def eacl_get_embeddings(arguments: []) -> []:
    """ Convert an argument into a (1 x 768) embedding df"""
    embeddings = model.encode(['search_document: ' + argument for argument in arguments])
    embeddings_df = pd.DataFrame(embeddings, columns=[f"{str(i)}" for i in range(DIM_EMBEDDING)])
    return embeddings_df.reset_index(drop=True)

In [72]:
def eacl_get_embeddings_df(eacl_df: pd.DataFrame) -> pd.DataFrame:
    """ Add embeddings column to a df"""
    arguments_list = list(eacl_df['claims.claimCorrectedText'])

    embeddings = eacl_get_embeddings(arguments_list)
    eacl_embeddings_df = pd.concat([eacl_df, embeddings], axis=1)
    return embeddings

In [77]:
eacl_embeddings_df = eacl_get_embeddings_df(eacl_df) 

In [76]:
eacl_embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.049874,0.013107,-0.007577,-0.024254,0.067783,0.009876,0.012079,0.017946,0.000345,0.007255,...,0.026894,-0.017732,0.016788,-0.020725,0.034759,0.058129,0.022553,0.014465,-0.012663,-0.044741
1,0.055980,0.013979,-0.013814,0.017152,0.011371,0.040922,-0.005741,-0.018229,0.020254,-0.036058,...,0.022323,-0.027315,0.019291,-0.039770,0.019973,0.049177,0.027728,-0.019229,-0.021458,-0.019334
2,0.037693,0.025649,0.001994,0.011096,0.056873,0.038882,-0.053314,-0.003000,-0.016507,-0.010372,...,-0.009261,-0.026295,0.017394,-0.003609,0.034767,0.025998,0.036734,0.008071,0.003509,-0.022177
3,0.048372,0.015080,-0.010250,-0.020087,0.050388,0.035655,0.023355,0.004694,-0.006841,0.012472,...,-0.002624,-0.029527,0.030315,-0.022738,0.053005,0.052406,0.063864,-0.005731,0.002890,-0.044401
4,0.047272,0.027109,0.002749,0.008166,0.055120,0.023512,-0.036042,0.002162,0.012324,-0.006414,...,-0.000081,-0.037213,0.007959,-0.011588,0.043315,0.046284,0.001510,0.009268,-0.038937,-0.023647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2389,0.006033,0.051526,0.004067,-0.003753,0.024503,0.043041,-0.049997,-0.021701,-0.017122,-0.008775,...,0.056819,0.030391,0.014751,0.010741,0.038214,0.030835,0.006191,-0.009133,-0.049982,-0.008307
2390,0.009622,0.070927,0.004357,-0.016953,0.020167,0.073462,-0.023299,-0.018248,0.000063,-0.005787,...,0.049184,-0.006163,-0.035925,0.003969,0.072928,0.008629,0.013976,0.000166,-0.049768,-0.026312
2391,-0.011114,0.028073,-0.001602,0.001067,0.017298,0.042153,0.003805,0.018086,0.009299,0.031523,...,0.052650,0.001413,-0.016029,0.008743,0.027783,0.039727,-0.022012,0.012638,-0.040540,-0.024392
2392,-0.048299,0.071711,0.000894,-0.003652,0.043011,0.031416,-0.064888,0.008671,-0.018545,0.029046,...,0.055278,0.033350,0.032543,0.037972,0.040877,0.026795,0.000523,0.026579,-0.039847,-0.051416


In [75]:
eacl_nums_df = eacl_embeddings_df.select_dtypes(include=[np.number])
eacl_vectors_df = eacl_nums_df.drop('topicId', axis=1)

KeyError: "['topicId'] not found in axis"

## Model

In [19]:
"""
global_metric
"""
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_global_y_train(y_true, y_pred):
    global_training_df_32 = tf.cast(global_training_df, dtype=tf.float32)
    
    cos_sim_pred = tf.matmul(global_training_df_32, y_pred, transpose_b=True) / tf.reshape(tf.norm(y_pred) * tf.norm(global_training_df_32, axis=1), [-1, 1])
    cos_sim_true = tf.matmul(global_training_df_32, y_true, transpose_b=True) / tf.reshape(tf.norm(y_true) * tf.norm(global_training_df_32, axis=1), [-1, 1])

    max_cos_sim_pred = tf.math.argmax(cos_sim_pred)
    max_cos_sim_true = tf.math.argmax(cos_sim_true)

    return tf.math.count_nonzero(tf.equal(max_cos_sim_pred, max_cos_sim_true))

In [20]:
global_autoencoder_model = tf.keras.saving.load_model('global_autoencoder_model.keras')