# Study 3: Choose Corresponding Embedding

Given the embedding of an argument, can a model be trained to pick the embedding of its counterargument out of a list of candidate embeddings?

### OpenAI Setup

In [17]:
import openai
from openai import OpenAI

In [18]:
client = OpenAI()

### Imports

In [6]:
import os
import statistics
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import spatial
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential
)

In [7]:
from tensorflow.keras.models import load_model

## Model Custom Metric

In [10]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_global_y_train(y_true, y_pred):
    """Global metric: count predictions whose nearest candidate equals the target's.

    Every row of the module-level ``global_training_df`` is scored against
    ``y_pred`` and against ``y_true`` with a cosine-style similarity. The
    highest-scoring row index is taken for each, and the number of positions
    where the two indices agree is returned.

    Args:
        y_true: target embedding(s); callers in this notebook pass a
            (1, 1536) float32 tensor.
        y_pred: predicted embedding(s), same layout as ``y_true``.

    Returns:
        Integer tensor holding the number of matching arg-max indices.
    """
    candidates = tf.cast(global_training_df, dtype=tf.float32)
    candidate_norms = tf.norm(candidates, axis=1)

    def _closest_candidate(vectors):
        # tf.norm without an axis is the norm of the whole tensor, so the
        # denominator is one scalar times the per-candidate norms.
        dots = tf.matmul(candidates, vectors, transpose_b=True)
        scale = tf.reshape(tf.norm(vectors) * candidate_norms, [-1, 1])
        return tf.math.argmax(dots / scale)

    same_choice = tf.equal(_closest_candidate(y_pred), _closest_candidate(y_true))
    return tf.math.count_nonzero(same_choice)

## Data

#### GPR 55

In [14]:
gpr_df = pd.read_csv("../corpora/no_wavs/GPR-KB-55/GPR-KB-55.csv")

In [15]:
# Number of embedding columns built from each vector returned by the model below.
DIM_EMBEDDING = 1536

def gpr_get_embeddings_df(gpr_df: pd.Series) -> pd.DataFrame:
    """Embed a column of texts and return it side by side with its embeddings.

    Args:
        gpr_df: the texts to embed. Callers in this notebook pass a single
            column such as ``gpr_df['claim']``.

    Returns:
        A DataFrame indexed 0..n-1 whose first column is the input text and
        whose remaining ``DIM_EMBEDDING`` columns, named "0", "1", ..., hold
        the embedding of that row's text.
    """
    # The embeddings frame is indexed 0..n-1; put the texts on the same index
    # so the column-wise concat pairs rows by position rather than by label.
    texts = gpr_df.reset_index(drop=True)
    arguments_list = list(texts)

    # One request for the whole column, using the module-level OpenAI client.
    response = client.embeddings.create(input=arguments_list, model="text-embedding-ada-002")
    vectors = [item.embedding for item in response.data]
    embeddings_df = pd.DataFrame(vectors, columns=[str(i) for i in range(DIM_EMBEDDING)])

    return pd.concat([texts, embeddings_df], axis=1)

In [19]:
# Embed every claim and every rebuttal; row i of both frames is the same pair.
gpr_claims_df = gpr_get_embeddings_df(gpr_df['claim'])
gpr_rebuttals_df = gpr_get_embeddings_df(gpr_df['rebuttal'])

# Candidate pool used by metric_choose_argument_gpr: claims followed by rebuttals.
gpr_combined = pd.concat([gpr_claims_df, gpr_rebuttals_df])
gpr_combined = gpr_combined.reset_index(drop=True)

gpr_combined_nums = gpr_combined.select_dtypes(include=[np.number])

# Keep only the numeric (embedding) columns; the text column is dropped.
gpr_claim_vectors = gpr_claims_df.select_dtypes(include=[np.number])
gpr_rebuttal_vectors = gpr_rebuttals_df.select_dtypes(include=[np.number])

# 80/20 split by position. A single integer boundary is used for both halves:
# slicing with the float labels `len * 0.8 - 1` and `len * 0.8` silently drops
# one row from both splits whenever `len * 0.8` is not a whole number.
gpr_split = (len(gpr_claim_vectors) * 4) // 5

gpr_x_train = gpr_claim_vectors.iloc[:gpr_split]
gpr_y_train = gpr_rebuttal_vectors.iloc[:gpr_split]

gpr_x_test = gpr_claim_vectors.iloc[gpr_split:].reset_index(drop=True)
gpr_y_test = gpr_rebuttal_vectors.iloc[gpr_split:].reset_index(drop=True)

In [20]:
def metric_choose_argument_gpr(y_true, y_pred):
    """Check whether the prediction's nearest GPR vector is the target's nearest.

    Each row of the module-level ``gpr_combined_nums`` (claim and rebuttal
    embeddings) is scored against ``y_pred`` and ``y_true`` with a
    cosine-style similarity; the best-scoring row index is taken for each.

    Args:
        y_true: embedding of the true rebuttal; callers in this notebook pass
            a (1, dim) float32 tensor.
        y_pred: embedding produced by the model, same layout.

    Returns:
        Integer tensor counting the positions where both arg-max indices agree
        (1 for a hit, 0 for a miss when a single vector is passed).
    """
    corpus = tf.cast(gpr_combined_nums, dtype=tf.float32)
    corpus_norms = tf.norm(corpus, axis=1)

    # Similarity of every corpus row to the prediction and to the target.
    pred_scores = tf.matmul(corpus, y_pred, transpose_b=True) / tf.reshape(
        tf.norm(y_pred) * corpus_norms, [-1, 1]
    )
    true_scores = tf.matmul(corpus, y_true, transpose_b=True) / tf.reshape(
        tf.norm(y_true) * corpus_norms, [-1, 1]
    )

    closest_to_pred = tf.math.argmax(pred_scores)
    closest_to_true = tf.math.argmax(true_scores)
    return tf.math.count_nonzero(tf.equal(closest_to_pred, closest_to_true))

#### IBM EACL

In [22]:
eacl_df = pd.read_csv("../corpora/IBM_EACL/claim_stance_dataset.csv")
eacl_df = eacl_df[['topicId', 'topicText', 'claims.stance', 'claims.claimCorrectedText']]

In [23]:
# Per-topic claim counts, in order of first appearance of each topicId:
# total claims, PRO claims and CON claims.
topic_lens, pro_lens, con_lens = [], [], []
for topic_id in eacl_df['topicId'].unique():
    stances = eacl_df.loc[eacl_df['topicId'] == topic_id, 'claims.stance']
    topic_lens.append(len(stances))
    pro_lens.append(len(stances[stances == "PRO"]))
    con_lens.append(len(stances[stances == "CON"]))

In [35]:
# Number of embedding columns built from each vector returned by the model below.
DIM_EMBEDDING = 1536

# Any exception triggers a retry: random exponential back-off between 60 and
# 500 seconds, giving up after 10 attempts.
@retry(wait=wait_random_exponential(min=60, max=500), stop=stop_after_attempt(10))
def eacl_get_embeddings(arguments: list) -> pd.DataFrame:
    """Embed a batch of argument texts in a single API request.

    Args:
        arguments: list of texts to embed.

    Returns:
        A DataFrame indexed 0..n-1 with one row per input text and
        ``DIM_EMBEDDING`` columns named "0", "1", ...
    """
    response = client.embeddings.create(input=arguments, model="text-embedding-ada-002")
    vectors = [item.embedding for item in response.data]
    # A freshly built DataFrame already carries a 0..n-1 index.
    return pd.DataFrame(vectors, columns=[str(i) for i in range(DIM_EMBEDDING)])

In [25]:
# Maximum number of texts sent to the embeddings endpoint per request.
API_LIMIT = 1000

def eacl_get_embeddings_df(eacl_df: pd.DataFrame) -> pd.DataFrame:
    """Append embedding columns for ``claims.claimCorrectedText`` to a frame.

    The texts are embedded in chunks of at most ``API_LIMIT`` through
    ``eacl_get_embeddings`` and the results are stacked in input order.

    Args:
        eacl_df: frame containing a ``claims.claimCorrectedText`` column.

    Returns:
        A DataFrame indexed 0..n-1 holding the original columns followed by
        the embedding columns of each row's claim text.
    """
    arguments_list = list(eacl_df['claims.claimCorrectedText'])

    # Collect the chunks first and concatenate once; growing a frame with
    # pd.concat inside the loop re-copies all earlier rows on every pass.
    chunks = [
        eacl_get_embeddings(arguments_list[start:start + API_LIMIT])
        for start in range(0, len(arguments_list), API_LIMIT)
    ]
    # pd.concat raises on an empty list, so an empty input yields no columns.
    embeddings_df = pd.concat(chunks, axis=0, ignore_index=True) if chunks else pd.DataFrame()

    # Align on position: the embeddings are indexed 0..n-1 regardless of the
    # labels the caller's frame carries.
    return pd.concat([eacl_df.reset_index(drop=True), embeddings_df], axis=1)

In [26]:
eacl_embeddings_df = eacl_get_embeddings_df(eacl_df)

In [27]:
eacl_nums_df = eacl_embeddings_df.select_dtypes(include=[np.number])
eacl_vectors_df = eacl_nums_df.drop('topicId', axis=1)

## Load model

In [28]:
global_autoencoder_model = tf.keras.models.load_model('global_autoencoder_model.keras')

## GPR predict 

In [29]:
global_autoencoder_gpr_predictions = global_autoencoder_model.predict(gpr_x_train)
global_autoencoder_gpr_predictions_df = pd.DataFrame(global_autoencoder_gpr_predictions)
global_autoencoder_gpr_predictions_df.columns = [str(i) for i in global_autoencoder_gpr_predictions_df.columns]



In [30]:
# Count the GPR training pairs for which the predicted vector and the true
# rebuttal vector select the same nearest candidate (metric returns 1).
successes = 0
for i in range(len(gpr_y_train)):
    target_vec = tf.reshape(
        tf.convert_to_tensor(gpr_y_train.loc[i], dtype=tf.float32), (1, -1)
    )
    predicted_vec = tf.reshape(
        tf.convert_to_tensor(global_autoencoder_gpr_predictions_df.loc[i], dtype=tf.float32), (1, -1)
    )
    successes += int(metric_choose_argument_gpr(target_vec, predicted_vec).numpy() == 1)

In [31]:
successes

7

## EACL Predict

In [32]:
global_autoencoder_eacl_predictions = global_autoencoder_model.predict(eacl_vectors_df)
global_autoencoder_eacl_predictions_df = pd.DataFrame(global_autoencoder_eacl_predictions)
global_autoencoder_eacl_predictions_df.columns = [str(i) for i in global_autoencoder_eacl_predictions_df.columns]
global_autoencoder_eacl_predictions_df



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,-0.234743,-0.806062,1.213010,-2.386355,-1.642136,0.362799,-0.828601,-0.817532,-0.548924,-1.505051,...,0.347490,0.234846,0.677391,-0.980352,-1.095090,-0.218731,0.321748,-1.225204,-0.444810,-0.301135
1,-0.566521,-0.412140,1.173303,-1.985332,-1.037457,0.217972,-0.845443,-0.822015,-1.104964,-1.020254,...,0.936049,0.825018,1.478796,-1.417859,-1.287977,0.166920,0.976596,-1.626607,-0.360272,-0.723820
2,-0.865971,-1.088447,0.989519,-1.992059,-1.632770,0.426220,-1.423446,-0.268294,-1.429835,-1.564326,...,0.861392,0.704746,0.821876,-1.866559,-1.610096,0.453612,-0.044070,-1.098462,-0.431195,-0.432478
3,-0.285841,-1.078161,1.550538,-2.921949,-2.363046,0.864056,-0.988842,-1.087094,-1.144951,-1.663319,...,0.581102,0.115377,0.999071,-1.209213,-0.905579,-0.047955,-0.216754,-1.494212,-0.693178,-0.366461
4,-0.363462,-0.879038,1.130493,-1.861199,-1.281303,0.497673,-1.356970,-1.102218,-0.496216,-1.627935,...,1.538122,0.588024,0.520973,-1.545406,-1.705850,0.040309,0.532974,-1.503863,-0.458906,-0.311404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2389,-0.048363,-0.139224,-0.450628,-1.276113,-0.707823,0.442836,-1.184884,-0.092617,-0.639359,-0.362929,...,0.435736,0.404843,1.110425,-0.812194,-1.444957,-0.316082,0.695999,-0.296916,0.215947,-0.074273
2390,-0.642618,-0.262142,-0.505492,-0.697717,-0.850350,0.426705,-0.750801,-0.009065,-0.653379,-0.695532,...,0.734561,-0.035876,0.560293,-1.112106,-1.784598,-0.284791,0.924373,-0.093671,-1.009828,-0.634742
2391,-0.267613,-0.203720,-0.782190,-1.636763,-0.828422,0.758681,-1.056346,-0.402965,-0.671958,-0.634652,...,0.080082,0.707256,0.503449,-1.156869,-1.185972,0.111946,-0.485445,-0.126558,-0.210100,0.093084
2392,-0.282742,-0.320851,-0.960387,-1.226236,-1.301866,0.649752,-0.760150,-0.355770,-0.319835,-0.526426,...,0.849300,0.356356,0.046205,-1.044858,-1.629824,-0.510141,0.605330,-0.475057,-0.338870,-0.175601


In [33]:
# For every predicted vector, look at its TOP_K most similar EACL claim
# embeddings and record the percentage that share the source claim's topic
# but take the opposite stance.
TOP_K = 200

eacl_embeddings_df_32 = tf.cast(eacl_vectors_df, dtype=tf.float32)
# Loop-invariant pieces, computed once instead of once per prediction.
eacl_embedding_norms = tf.norm(eacl_embeddings_df_32, axis=1)
eacl_topic_ids = eacl_embeddings_df['topicId'].to_numpy()
eacl_stances = eacl_embeddings_df['claims.stance'].to_numpy()
pred_topk = []

for i, row in global_autoencoder_eacl_predictions_df.iterrows():
    y_pred = tf.reshape(row.values, [1, -1])
    target_topic = eacl_topic_ids[i]
    # A counterargument has the opposite stance of the source claim.
    target_type = 'PRO' if eacl_stances[i] == 'CON' else 'CON'

    cos_sim_pred = tf.matmul(eacl_embeddings_df_32, y_pred, transpose_b=True) / tf.reshape(
        tf.norm(y_pred) * eacl_embedding_norms, [-1, 1]
    )
    top_k_sim_pred = tf.math.top_k(tf.reshape(cos_sim_pred, [-1]), k=TOP_K).indices.numpy()

    # top_k returns row positions, so index the metadata arrays positionally.
    matches = (eacl_topic_ids[top_k_sim_pred] == target_topic) & (eacl_stances[top_k_sim_pred] == target_type)
    successes = int(matches.sum())
    pred_topk.append(successes / TOP_K * 100)

In [34]:
statistics.mean(pred_topk)

12.064118629908103

## Predict

#### Global Predict

In [267]:
# Global model predictions
# NOTE(review): global_x_test is not defined anywhere in this notebook's visible
# cells; it is presumably carried over from another session. Confirm.
global_autoencoder_predictions = global_autoencoder_model.predict(global_x_test)
global_autoencoder_predictions_df = pd.DataFrame(global_autoencoder_predictions)
# Use string column labels ("0", "1", ...), as the embedding frames do.
global_autoencoder_predictions_df.columns = [str(i) for i in global_autoencoder_predictions_df.columns]

# Persist the predictions. exist_ok avoids the check-then-create race and makes
# the cell safe to re-run.
output_folder = '../data_dump/autoencoder_predictions_dump/global/'
output_file_path = f'{output_folder}global_predictions.pkl'
os.makedirs(output_folder, exist_ok=True)
global_autoencoder_predictions_df.to_pickle(output_file_path)



In [282]:
# Count the test pairs for which the global model's prediction and the true
# counter-embedding select the same nearest candidate (metric returns 1).
successes = 0
for i, y_pred in global_autoencoder_predictions_df.iterrows():
    expected = tf.cast(tf.reshape(global_y_test.loc[i], [1, 1536]), dtype=tf.float32)
    predicted = tf.cast(tf.reshape(y_pred, [1, 1536]), dtype=tf.float32)
    if metric_choose_argument_global_y_train(expected, predicted).numpy() == 1:
        successes += 1

In [97]:
successes

NameError: name 'successes' is not defined

In [284]:
len(global_autoencoder_predictions_df)

813

#### Global Shuffled Predict

In [35]:
# Global shuffled model predictions
# NOTE(review): global_shuffled_autoencoder_model and global_x_test are not
# defined anywhere in this notebook's visible cells. Confirm where they come from.
global_shuffled_autoencoder_predictions = global_shuffled_autoencoder_model.predict(global_x_test)
global_shuffled_autoencoder_predictions_df = pd.DataFrame(global_shuffled_autoencoder_predictions)
# Use string column labels ("0", "1", ...), as the embedding frames do.
global_shuffled_autoencoder_predictions_df.columns = [str(i) for i in global_shuffled_autoencoder_predictions_df.columns]

# Persist the predictions. exist_ok avoids the check-then-create race and makes
# the cell safe to re-run.
output_folder = '../data_dump/autoencoder_predictions_dump/global/'
output_file_path = f'{output_folder}global_shuffled_predictions.pkl'
os.makedirs(output_folder, exist_ok=True)
global_shuffled_autoencoder_predictions_df.to_pickle(output_file_path)



In [36]:
global_shuffled_autoencoder_predictions_df = pd.read_pickle("../data_dump/autoencoder_predictions_dump/global/global_shuffled_predictions.pkl")

In [37]:
# Same hit count as for the global model, but for the model trained on
# shuffled pairs.
successes_shuffled = 0
for i, y_pred in global_shuffled_autoencoder_predictions_df.iterrows():
    target_row = tf.cast(tf.reshape(global_y_test.loc[i], [1, 1536]), dtype=tf.float32)
    predicted_row = tf.cast(tf.reshape(y_pred, [1, 1536]), dtype=tf.float32)
    hit = metric_choose_argument_global_y_train(target_row, predicted_row).numpy()
    successes_shuffled += 1 if hit == 1 else 0

In [38]:
successes_shuffled

51

#### Category Predict

In [None]:
# Category model predictions
category_autoencoder_predictions = category_autoencoder_model.predict(economy_x_test)
category_autoencoder_predictions_df = pd.DataFrame(category_autoencoder_predictions)
category_autoencoder_predictions_df.columns = [str(i) for i in category_autoencoder_predictions_df.columns]

#### Debate Predict

In [None]:
# Debate model predictions
debate_autoencoder_predictions = debate_autoencoder_model.predict(economy_debate_x_test)
debate_autoencoder_predictions_df = pd.DataFrame(debate_autoencoder_predictions)
debate_autoencoder_predictions_df.columns = [str(i) for i in debate_autoencoder_predictions_df.columns]

## Combine Point-Counter Dataframes

#### Global point-counter df

In [None]:
# Stack the test points and their true counters into one long frame. Rows are
# tagged with the pair they belong to (original row label as a string), their
# role ('point' / 'counter') and their origin ('test').
global_x_test_df = (
    global_x_test.copy()
    .astype(np.float32)
    .assign(pair_id=lambda df: df.index.astype(str), type='point', pred_test='test')
)

global_y_test_df = (
    global_y_test.copy()
    .astype(np.float32)
    .assign(pair_id=lambda df: df.index.astype(str), type='counter', pred_test='test')
)

global_x_y_test_combined_df = pd.concat(
    [global_x_test_df, global_y_test_df], axis=0, ignore_index=True
)

In [None]:
# Stack the test points and the global autoencoder's predicted counters.
global_x_test_df_copy = (
    global_x_test.copy()
    .astype(np.float32)
    .assign(pair_id=lambda df: df.index.astype(str), type='point', pred_test='pred')
)

# The predictions frame is tagged in place on purpose: the three-way combined
# frame built in a later cell reuses it with these columns attached.
global_autoencoder_predictions_df['pair_id'] = global_autoencoder_predictions_df.index.astype(str)
global_autoencoder_predictions_df['type'] = 'counter'
global_autoencoder_predictions_df['pred_test'] = 'pred'

global_pred_test_combined_df = pd.concat(
    [global_x_test_df_copy, global_autoencoder_predictions_df], axis=0, ignore_index=True
)

In [None]:
# Combine global_x_test and global_y_test and global autoencoder predictions
global_combined_df = pd.concat([global_x_test_df, global_y_test_df, global_autoencoder_predictions_df], axis=0, ignore_index=True)

#### Global shuffled point-counter df

In [None]:
# Combine global_x_test and global_y_test_shuffled
global_x_test_df = global_x_test.copy().astype(np.float32)
global_x_test_df['pair_id'] = global_x_test_df.index.astype(str)
global_x_test_df['type'] = 'point'
global_x_test_df['pred_test'] = 'test'

global_y_test_shuffled_df = global_y_test_shuffled.copy().astype(np.float32)
global_y_test_shuffled_df['pair_id'] = global_y_test_shuffled_df.index.astype(str)
global_y_test_shuffled_df['type'] = 'counter'
global_y_test_shuffled_df['pred_test'] = 'test'

global_x_y_test_combined_shuffled_df = pd.concat([global_x_test_df, global_y_test_shuffled_df], axis=0, ignore_index=True)

In [None]:
# Combine global_x_test and global shuffled autoencoder predictions
global_x_test_df_copy = global_x_test.copy().astype(np.float32)
global_x_test_df_copy['pair_id'] = global_x_test_df_copy.index.astype(str)
global_x_test_df_copy['type'] = 'point'
global_x_test_df_copy['pred_test'] = 'pred'
global_shuffled_autoencoder_predictions_df['pair_id'] = global_shuffled_autoencoder_predictions_df.index.astype(str)
global_shuffled_autoencoder_predictions_df['type'] = 'counter'
global_shuffled_autoencoder_predictions_df['pred_test'] = 'pred'
global_pred_test_combined_shuffled_df = pd.concat([global_x_test_df_copy, global_shuffled_autoencoder_predictions_df], axis=0, ignore_index=True)

In [None]:
# Combine global_x_test, global_y_test_shuffled and the global shuffled autoencoder predictions
global_combined_shuffled_df = pd.concat([global_x_test_df, global_y_test_shuffled_df, global_shuffled_autoencoder_predictions_df], axis=0, ignore_index=True)

#### Category point-counter df

In [None]:
# Combine economy_x_test and economy_y_test
economy_x_test_df = economy_x_test.copy().astype(np.float32)
economy_x_test_df['pair_id'] = economy_x_test_df.index.astype(str)
economy_x_test_df['type'] = 'point'
economy_x_test_df['pred_test'] = 'test'

economy_y_test_df = economy_y_test.copy().astype(np.float32)
economy_y_test_df['pair_id'] = economy_y_test_df.index.astype(str)
economy_y_test_df['type'] = 'counter'
economy_y_test_df['pred_test'] = 'test'

economy_x_y_test_combined_df = pd.concat([economy_x_test_df, economy_y_test_df], axis=0, ignore_index=True)

In [None]:
# Combine economy_x_test and economy autoencoder predictions
economy_x_test_df_copy = economy_x_test.copy().astype(np.float32)
economy_x_test_df_copy['pair_id'] = economy_x_test_df_copy.index.astype(str)
economy_x_test_df_copy['type'] = 'point'
economy_x_test_df_copy['pred_test'] = 'pred'
category_autoencoder_predictions_df['pair_id'] = category_autoencoder_predictions_df.index.astype(str)
category_autoencoder_predictions_df['type'] = 'counter'
category_autoencoder_predictions_df['pred_test'] = 'pred'
economy_pred_test_combined_df = pd.concat([economy_x_test_df_copy, category_autoencoder_predictions_df], axis=0, ignore_index=True)

In [None]:
# Combine economy_x_test and economy_y_test and economy autoencoder predictions
economy_combined_df = pd.concat([economy_x_test_df, economy_y_test_df, category_autoencoder_predictions_df], axis=0, ignore_index=True)

#### Debate point-counter df

In [None]:
# Combine economy_debate_x_test and economy_debate_y_test
economy_debate_x_test_df = economy_debate_x_test.copy().astype(np.float32)
economy_debate_x_test_df['pair_id'] = economy_debate_x_test_df.index.astype(str)
economy_debate_x_test_df['type'] = 'point'
economy_debate_x_test_df['pred_test'] = 'test'

economy_debate_y_test_df = economy_debate_y_test.copy().astype(np.float32)
economy_debate_y_test_df['pair_id'] = economy_debate_y_test_df.index.astype(str)
economy_debate_y_test_df['type'] = 'counter'
economy_debate_y_test_df['pred_test'] = 'test'

economy_debate_x_y_test_combined_df = pd.concat([economy_debate_x_test_df, economy_debate_y_test_df], axis=0, ignore_index=True)

In [None]:
# Combine economy_debate_x_test and debate autoencoder predictions
economy_debate_x_test_df_copy = economy_debate_x_test.copy().astype(np.float32)
economy_debate_x_test_df_copy['pair_id'] = economy_debate_x_test_df_copy.index.astype(str)
economy_debate_x_test_df_copy['type'] = 'point'
economy_debate_x_test_df_copy['pred_test'] = 'pred'
debate_autoencoder_predictions_df['pair_id'] = debate_autoencoder_predictions_df.index.astype(str)
debate_autoencoder_predictions_df['type'] = 'counter'
debate_autoencoder_predictions_df['pred_test'] = 'pred'
economy_debate_pred_test_combined_df = pd.concat([economy_debate_x_test_df_copy, debate_autoencoder_predictions_df], axis=0, ignore_index=True)

In [None]:
# Combine economy_debate_x_test, economy_debate_y_test and the debate autoencoder predictions
economy_debate_combined_df = pd.concat([economy_debate_x_test_df, economy_debate_y_test_df, debate_autoencoder_predictions_df], axis=0, ignore_index=True)

#### Global tsne df

In [None]:
# TSNE global_x_y_test
global_x_y_test_combined_tsne_df = tsne_embeddings(global_x_y_test_combined_df)

In [None]:
# TSNE global autoencoder predictions
global_pred_test_combined_tsne_df = tsne_embeddings(global_pred_test_combined_df)

In [None]:
# TSNE global combined df
global_combined_tsne_df = tsne_embeddings(global_combined_df)

In [None]:
# Combine both prediction and test tsne df
global_both_tsne_df = pd.concat([global_x_y_test_combined_tsne_df, global_pred_test_combined_tsne_df], axis=0)

#### Global shuffled tsne df

In [None]:
# TSNE global_x_y_shuffled_test
global_x_y_test_combined_shuffled_tsne_df = tsne_embeddings(global_x_y_test_combined_shuffled_df)

In [None]:
# TSNE global shuffled autoencoder predictions
global_pred_test_combined_shuffled_tsne_df = tsne_embeddings(global_pred_test_combined_shuffled_df)

In [None]:
# TSNE global shuffled combined df
global_combined_shuffled_tsne_df = tsne_embeddings(global_combined_shuffled_df)

In [None]:
# Combine both prediction and test shuffled tsne df
global_both_shuffled_tsne_df = pd.concat([global_x_y_test_combined_shuffled_tsne_df, global_pred_test_combined_shuffled_tsne_df], axis=0)

#### Category tsne df

In [None]:
# TSNE economy_x_y_test
economy_x_y_test_combined_tsne_df = tsne_embeddings(economy_x_y_test_combined_df)

In [None]:
# TSNE category autoencoder predictions
economy_pred_test_combined_tsne_df = tsne_embeddings(economy_pred_test_combined_df)

In [None]:
# TSNE economy combined df
economy_combined_tsne_df = tsne_embeddings(economy_combined_df)

In [None]:
# Combine both prediction and test tsne df
economy_both_tsne_df = pd.concat([economy_x_y_test_combined_tsne_df, economy_pred_test_combined_tsne_df], axis=0)

#### Debate tsne df

In [None]:
# TSNE economy_debate_x_y_test
economy_debate_x_y_test_combined_tsne_df = tsne_embeddings(economy_debate_x_y_test_combined_df)

In [None]:
# TSNE debate autoencoder predictions
economy_debate_pred_test_combined_tsne_df = tsne_embeddings(economy_debate_pred_test_combined_df)

In [None]:
# TSNE debate combined df
economy_debate_combined_tsne_df = tsne_embeddings(economy_debate_combined_df)

In [None]:
# Combine both prediction and test tsne df
economy_debate_both_tsne_df = pd.concat([economy_debate_x_y_test_combined_tsne_df, economy_debate_pred_test_combined_tsne_df], axis=0)

## PCA Dataframes

#### Global pca df

In [None]:
# PCA global_x_y_test
global_x_y_test_combined_pca_df = pca_embeddings(global_x_y_test_combined_df)

In [None]:
# PCA global autoencoder predictions
global_pred_test_combined_pca_df = pca_embeddings(global_pred_test_combined_df)

In [None]:
# PCA global combined df
global_combined_pca_df = pca_embeddings(global_combined_df)

In [None]:
# Combine both prediction and test pca df
global_both_pca_df = pd.concat([global_x_y_test_combined_pca_df, global_pred_test_combined_pca_df], axis=0)

#### Category pca df

In [None]:
# PCA economy_x_y_test
economy_x_y_test_combined_pca_df = pca_embeddings(economy_x_y_test_combined_df)

In [None]:
# PCA category autoencoder predictions
economy_pred_test_combined_pca_df = pca_embeddings(economy_pred_test_combined_df)

In [None]:
# PCA economy combined df
economy_combined_pca_df = pca_embeddings(economy_combined_df)

In [None]:
# Combine both prediction and test pca df
economy_both_pca_df = pd.concat([economy_x_y_test_combined_pca_df, economy_pred_test_combined_pca_df], axis=0)

#### Debate pca df

In [None]:
# PCA economy_debate_x_y_test
economy_debate_x_y_test_combined_pca_df = pca_embeddings(economy_debate_x_y_test_combined_df)

In [None]:
# PCA debate autoencoder predictions
economy_debate_pred_test_combined_pca_df = pca_embeddings(economy_debate_pred_test_combined_df)

In [None]:
# PCA economy debate combined df
economy_debate_combined_pca_df = pca_embeddings(economy_debate_combined_df)

In [None]:
# Combine both prediction and test pca df
economy_debate_both_pca_df = pd.concat([economy_debate_x_y_test_combined_pca_df, economy_debate_pred_test_combined_pca_df], axis=0)

## Plot Data to Compare

#### Plot Functions

In [None]:
# Plot for prediction or test
def pred_test_plot(
        analysis_type: AnalysisType,
        pred_test_data: pd.DataFrame,
        processing_unit: ProcessingUnit
    ):
    """Scatter-plot 2-D projected points, save the figure as PNG and print it.

    Points are coloured by their ``type`` column, and rows sharing a
    ``pair_id`` are joined by a black line.

    Args:
        analysis_type: projection that produced the data; its ``.value`` is
            used in the axis labels, the title and the output path.
        pred_test_data: frame with ``x``, ``y``, ``type``, ``pair_id`` and
            ``pred_test`` columns. The ``pred_test`` value of the first row
            labels the plot and the file, so the frame must not be empty.
        processing_unit: its ``.value`` prefixes the output file name.
    """
    plot_analysis_type = analysis_type.value.upper()
    # The whole frame is expected to carry a single pred_test value.
    pred_test = pred_test_data['pred_test'].iloc[0]
    gg = (
        ggplot(pred_test_data, aes(x='x', y='y', color='type', group='pair_id')) +
        geom_point(size=2) +
        geom_line(color='black', size=0.5) +
        labs(
            title=f'{plot_analysis_type} Plot: {pred_test}',
            x=f'{plot_analysis_type}_x',
            y=f'{plot_analysis_type}_y'
        ) +
        theme(
            axis_title=element_text(margin={'t': 20}),
            figure_size=(8, 8),
        )
    )

    output_folder = f'../data_dump/autoencoder_{analysis_type.value}_plots_dump/'
    output_file_path = f'{output_folder}{processing_unit.value}_autoencoder_{pred_test}_{analysis_type.value}_plot.png'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)
    ggsave(gg, output_file_path)
    print(gg)

In [None]:
# Both plot for prediction and test
def both_pred_test_plot(
        analysis_type: AnalysisType,
        both_data_df: pd.DataFrame,
        processing_unit: ProcessingUnit
    ):
    """Plot test pairs and predicted pairs together, save as PNG and print.

    Points are coloured by ``pred_test`` and shaped by ``type``. A black line
    joins rows that share both ``pair_id`` and ``pred_test``, so test pairs
    and predicted pairs are connected separately.

    Args:
        analysis_type: projection that produced the data; its ``.value`` is
            used in the axis labels, the title and the output path.
        both_data_df: frame with ``x``, ``y``, ``type``, ``pair_id`` and
            ``pred_test`` columns. It is not modified.
        processing_unit: its ``.value`` prefixes the output file name.
    """
    plot_analysis_type = analysis_type.value.upper()
    # Build the grouping key on a copy so the caller's frame does not gain an
    # extra 'interaction' column as a side effect of plotting.
    plot_df = both_data_df.assign(
        interaction=both_data_df['pair_id'] + '_' + both_data_df['pred_test']
    )
    gg = (
        ggplot(plot_df, aes(x='x', y='y', color='pred_test', shape='type', group='interaction')) +
        geom_point(size=2) +
        geom_line(color='black', size=0.5) +
        labs(
            title=f'{plot_analysis_type} Combined Plot for Both Prediction and Test',
            x=f'{plot_analysis_type}_x',
            y=f'{plot_analysis_type}_y'
        ) +
        theme(
            axis_title=element_text(margin={'t': 20}),
            figure_size=(8, 8),
        )
    )

    output_folder = f'../data_dump/autoencoder_{analysis_type.value}_plots_dump/'
    output_file_path = f'{output_folder}{processing_unit.value}_autoencoder_combined_pred_test_{analysis_type.value}_plot.png'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)
    ggsave(gg, output_file_path)
    print(gg)

In [None]:
# Combined plot for x_test + y_test + prediction
def combined_pred_test_plot(
        analysis_type: AnalysisType,
        combined_df: pd.DataFrame,
        processing_unit: ProcessingUnit
    ):
    """Plot points, true counters and predicted counters in one figure.

    Points are coloured by ``pred_test`` and shaped by ``type``; all rows that
    share a ``pair_id`` are joined by a black line. The figure is saved as PNG
    and printed.

    Args:
        analysis_type: projection that produced the data; its ``.value`` is
            used in the axis labels, the title and the output path.
        combined_df: frame with ``x``, ``y``, ``type``, ``pair_id`` and
            ``pred_test`` columns.
        processing_unit: its ``.value`` prefixes the output file name.
    """
    plot_analysis_type = analysis_type.value.upper()
    gg = (
        ggplot(combined_df, aes(x='x', y='y', color='pred_test', shape='type', group='pair_id')) +
        geom_point(size=2) +
        geom_line(color='black', size=0.5) +
        labs(
            title=f'{plot_analysis_type} Combined Plot for Prediction vs Test',
            x=f'{plot_analysis_type}_x',
            y=f'{plot_analysis_type}_y'
        ) +
        theme(
            axis_title=element_text(margin={'t': 20}),
            figure_size=(8, 8),
        )
    )

    output_folder = f'../data_dump/autoencoder_{analysis_type.value}_plots_dump/'
    output_file_path = f'{output_folder}{processing_unit.value}_autoencoder_all_{analysis_type.value}_plot.png'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)
    ggsave(gg, output_file_path)
    print(gg)

#### Global Prediction PCA Plots

In [None]:
# Plot PCA test
pred_test_plot(AnalysisType.PCA, global_x_y_test_combined_pca_df, ProcessingUnit.GLOBAL)

In [None]:
# Plot PCA autoencoder predictions
pred_test_plot(AnalysisType.PCA, global_pred_test_combined_pca_df, ProcessingUnit.GLOBAL)

In [None]:
# Plot both PCA pred and test
both_pred_test_plot(AnalysisType.PCA, global_both_pca_df, ProcessingUnit.GLOBAL)

In [None]:
# Plot combined PCA pred and test
combined_pred_test_plot(AnalysisType.PCA, global_combined_pca_df, ProcessingUnit.GLOBAL)

#### Category Prediction PCA Plots

In [None]:
# Plot PCA test
pred_test_plot(AnalysisType.PCA, economy_x_y_test_combined_pca_df, ProcessingUnit.CATEGORY)

In [None]:
# Plot PCA autoencoder predictions
pred_test_plot(AnalysisType.PCA, economy_pred_test_combined_pca_df, ProcessingUnit.CATEGORY)

In [None]:
# Plot both pred and test
both_pred_test_plot(AnalysisType.PCA, economy_both_pca_df, ProcessingUnit.CATEGORY)

In [None]:
# Plot combined pred vs test
combined_pred_test_plot(AnalysisType.PCA, economy_combined_pca_df, ProcessingUnit.CATEGORY)

#### Debate Prediction PCA Plots

In [None]:
# Plot PCA test
pred_test_plot(AnalysisType.PCA, economy_debate_x_y_test_combined_pca_df, ProcessingUnit.DEBATE)

In [None]:
# Plot PCA autoencoder predictions
pred_test_plot(AnalysisType.PCA, economy_debate_pred_test_combined_pca_df, ProcessingUnit.DEBATE)

In [None]:
# Plot both pred and test
both_pred_test_plot(AnalysisType.PCA, economy_debate_both_pca_df, ProcessingUnit.DEBATE)

In [None]:
# Plot combined pred vs test
combined_pred_test_plot(AnalysisType.PCA, economy_debate_combined_pca_df, ProcessingUnit.DEBATE)