# Autoencoder: Generate Corresponding Embedding

### Imports

In [1]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
from scipy import spatial
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from plotnine import ggplot, geom_line, aes, ggsave, labs, theme, element_text, guides, guide_legend

2024-02-26 22:58:31.176978: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data

In [2]:
# Drop rows that do not follow 'point' -> 'counter' pattern
def prepare_training_df(data: pd.DataFrame) -> pd.DataFrame:
    """Keep only well-formed 'point' -> 'counter' row pairs, numeric columns only.

    A 'point' row is dropped unless the row directly after it is a 'counter';
    a 'counter' row is dropped unless the row directly before it is a 'point'.
    Rows whose 'type' is neither value are left in place.

    Args:
        data: Frame with a 'type' column. Neighbours are determined by row
            position, so the frame does not need a 0..n-1 index.

    Returns:
        A new frame restricted to numeric columns and re-indexed from 0.
    """
    is_point = (data['type'] == 'point').to_numpy(dtype=bool)
    is_counter = (data['type'] == 'counter').to_numpy(dtype=bool)

    # Shift the flags by one position. The padding value is False, so a point
    # in the last row and a counter in the first row both count as unpaired.
    next_is_counter = np.zeros_like(is_counter)
    next_is_counter[:-1] = is_counter[1:]
    prev_is_point = np.zeros_like(is_point)
    prev_is_point[1:] = is_point[:-1]

    unpaired = (is_point & ~next_is_counter) | (is_counter & ~prev_is_point)
    data = data.loc[~unpaired]
    data = data.select_dtypes(include=[np.number])
    return data.reset_index(drop=True)

In [3]:
# Drop rows that do not follow 'point' -> 'counter' pattern, keeping the non-numeric columns
def prepare_training_df_shuffled(data: pd.DataFrame) -> pd.DataFrame:
    """Keep only well-formed 'point' -> 'counter' row pairs, with all columns.

    Same pairing rule as the numeric-only variant: a 'point' must be directly
    followed by a 'counter' and a 'counter' directly preceded by a 'point'.
    Non-numeric columns are kept so callers can still group on them.

    Args:
        data: Frame with a 'type' column. Neighbours are determined by row
            position, so the frame does not need a 0..n-1 index.

    Returns:
        A new frame with every original column, re-indexed from 0.
    """
    is_point = (data['type'] == 'point').to_numpy(dtype=bool)
    is_counter = (data['type'] == 'counter').to_numpy(dtype=bool)

    # Shift the flags by one position. The padding value is False, so a point
    # in the last row and a counter in the first row both count as unpaired.
    next_is_counter = np.zeros_like(is_counter)
    next_is_counter[:-1] = is_counter[1:]
    prev_is_point = np.zeros_like(is_point)
    prev_is_point[1:] = is_point[:-1]

    unpaired = (is_point & ~next_is_counter) | (is_counter & ~prev_is_point)
    return data.loc[~unpaired].reset_index(drop=True)

In [4]:
# Make training and testing datasets
def make_x_train(data: pd.DataFrame) -> pd.DataFrame:
    """Return the even-labelled rows from the first 80% of ``data``.

    The split point is rounded down to an even number of rows. The result is
    re-indexed from 0.
    """
    boundary = int(0.8 * len(data))
    boundary -= boundary % 2  # round down to an even row count
    head = data.iloc[:boundary]
    on_even_label = head.index % 2 == 0
    return head.loc[on_even_label].reset_index(drop=True)
    
def make_y_train(data: pd.DataFrame) -> pd.DataFrame:
    """Return the odd-labelled rows from the first 80% of ``data``.

    The split point is rounded down to an even number of rows. The result is
    re-indexed from 0.
    """
    boundary = int(0.8 * len(data))
    boundary -= boundary % 2  # round down to an even row count
    head = data.iloc[:boundary]
    on_odd_label = head.index % 2 != 0
    return head.loc[on_odd_label].reset_index(drop=True)

def make_x_test(data: pd.DataFrame) -> pd.DataFrame:
    """Return the even-labelled rows from the rows after the 80% split point.

    The split point is rounded down to an even number of rows. The result is
    re-indexed from 0.
    """
    boundary = int(0.8 * len(data))
    boundary -= boundary % 2  # round down to an even row count
    tail = data.iloc[boundary:]
    on_even_label = tail.index % 2 == 0
    return tail.loc[on_even_label].reset_index(drop=True)

def make_y_test(data: pd.DataFrame) -> pd.DataFrame:
    """Return the odd-labelled rows from the rows after the 80% split point.

    The split point is rounded down to an even number of rows. The result is
    re-indexed from 0.
    """
    boundary = int(0.8 * len(data))
    boundary -= boundary % 2  # round down to an even row count
    tail = data.iloc[boundary:]
    on_odd_label = tail.index % 2 != 0
    return tail.loc[on_odd_label].reset_index(drop=True)

#### Global data

In [5]:
# Load the pickled global embeddings table (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code; only load dumps from a trusted source.
global_embeddings_data = pd.read_pickle("../data_dump/embeddings_dump/global_embeddings.pkl")

In [6]:
# Numeric-only frame of point/counter rows; also read as a global by metric_choose_argument_global_y_train.
global_training_df = prepare_training_df(global_embeddings_data)

In [7]:
# Model inputs: even-labelled rows of the first 80% of the frame.
global_x_train = make_x_train(global_training_df)

In [8]:
# Model targets: odd-labelled rows of the first 80% of the frame.
global_y_train = make_y_train(global_training_df)

In [9]:
# Held-out inputs: even-labelled rows after the 80% split point.
global_x_test = make_x_test(global_training_df)

In [10]:
# Held-out targets: odd-labelled rows after the 80% split point.
global_y_test = make_y_test(global_training_df)

In [11]:
# Stack train and test targets row-wise. ignore_index is not set, so the 0-based
# labels of both parts are kept and repeat in the result.
global_y_train_test = pd.concat([global_y_train, global_y_test], axis=0)

#### Global data shuffled

In [12]:
# Same pair filtering as global_training_df, but non-numeric columns are kept
# (the 'topic' column is needed for the within-topic shuffle below).
global_training_df_shuffled = prepare_training_df_shuffled(global_embeddings_data)

In [13]:
# Training targets with all columns still attached; shuffled in the next cell.
global_y_train_shuffled = make_y_train(global_training_df_shuffled)

In [14]:
# Randomly reorder the target rows inside each 'topic' group, then keep only the
# numeric columns. sample() is called without random_state, so the order differs
# between runs.
# NOTE(review): pairing with global_x_train is by row position, so this presumably
# relies on rows of one topic being contiguous in the source data - confirm.
global_y_train_shuffled = global_y_train_shuffled.groupby(['topic'], sort=False)
global_y_train_shuffled = global_y_train_shuffled.sample(frac=1).reset_index(drop=True)
global_y_train_shuffled = global_y_train_shuffled.select_dtypes(include=[np.number])

#### Category Data (Economy)

In [15]:
# Load the pickled embeddings for the economy category.
# NOTE(review): unpickling can execute arbitrary code; only load dumps from a trusted source.
economy_embeddings_data = pd.read_pickle("../data_dump/embeddings_dump/economy/economy_embeddings.pkl")

In [16]:
# Filter to point/counter pairs, then split 80/20 into inputs (x) and targets (y).
economy_training_df = prepare_training_df(economy_embeddings_data)
economy_x_train = make_x_train(economy_training_df)
economy_y_train = make_y_train(economy_training_df)
economy_x_test = make_x_test(economy_training_df)
economy_y_test = make_y_test(economy_training_df)

#### Debate Data (Economy)

In [15]:
# Load the pickled embeddings for a single economy debate.
# NOTE(review): unpickling can execute arbitrary code; only load dumps from a trusted source.
economy_debate_embeddings_data = pd.read_pickle("../data_dump/embeddings_dump/economy/business_economy_general_house_would_prohibit_retailers_selling_certain_items_embeddings.pkl")

In [16]:
# Filter to point/counter pairs, then split 80/20 into inputs (x) and targets (y).
economy_debate_training_df = prepare_training_df(economy_debate_embeddings_data)
economy_debate_x_train = make_x_train(economy_debate_training_df)
economy_debate_y_train = make_y_train(economy_debate_training_df)
economy_debate_x_test = make_x_test(economy_debate_training_df)
economy_debate_y_test = make_y_test(economy_debate_training_df)

## Model

In [17]:
# Layers
# Input, one ReLU hidden layer and a linear output layer, all 1536 wide
# (the hidden layer is as wide as the input, so there is no bottleneck).
input_layer = tf.keras.layers.Input(shape=(1536, ), name="Input")
hidden_layer = tf.keras.layers.Dense(units=1536, activation="relu", name="Hidden")(input_layer)
output_layer = tf.keras.layers.Dense(units=1536, activation="linear", name="Output")(hidden_layer)

In [18]:
# Model
# Template network; the training cells clone it rather than fitting it directly.
autoencoder_model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
autoencoder_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 1536)]            0         
                                                                 
 Hidden (Dense)              (None, 1536)              2360832   
                                                                 
 Output (Dense)              (None, 1536)              2360832   
                                                                 
Total params: 4721664 (18.01 MB)
Trainable params: 4721664 (18.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
@tf.keras.saving.register_keras_serializable()
def metric_choose_argument_global_y_train(y_true, y_pred):
    """Retrieval hit rate of the predictions against the global candidate pool.

    Every row of ``y_pred`` and of ``y_true`` is compared by cosine similarity
    with every row of the module-level ``global_training_df``. A sample is a
    hit when the prediction and the target select the same most-similar row.

    Args:
        y_true: Target vectors, one per row.
        y_pred: Predicted vectors, same shape as ``y_true``.

    Returns:
        Scalar float32 tensor: the fraction of rows that are hits. For a batch
        of one row this is 0.0 or 1.0, matching the previous hit count.
    """
    # NOTE(review): despite the function name, the candidate pool is
    # global_training_df (all rows), not global_y_train - confirm this is intended.
    # l2_normalize is applied per row, so the matmul yields cosine similarities
    # for any batch size and zero vectors do not divide by zero.
    candidates = tf.math.l2_normalize(tf.cast(global_training_df, dtype=tf.float32), axis=1)
    pred_unit = tf.math.l2_normalize(tf.cast(y_pred, dtype=tf.float32), axis=1)
    true_unit = tf.math.l2_normalize(tf.cast(y_true, dtype=tf.float32), axis=1)

    # Shape (candidates, batch): column j holds the similarities for sample j.
    cos_sim_pred = tf.matmul(candidates, pred_unit, transpose_b=True)
    cos_sim_true = tf.matmul(candidates, true_unit, transpose_b=True)

    max_cos_sim_pred = tf.math.argmax(cos_sim_pred, axis=0)
    max_cos_sim_true = tf.math.argmax(cos_sim_true, axis=0)

    hits = tf.cast(tf.equal(max_cos_sim_pred, max_cos_sim_true), tf.float32)
    return tf.reduce_mean(hits)

#### Global Training

In [20]:
# Global Model
# clone_model builds a new model with the same architecture and newly created
# weights, so each experiment starts from its own initialisation.
# The "cosine_similarity" loss is minimised; per the Keras docs, lower (more
# negative) values mean the prediction points closer to the target.
global_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_autoencoder_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="cosine_similarity",
    metrics=[metric_choose_argument_global_y_train]
)

In [21]:
# Save after every epoch to the same path (save_best_only=False), so only the latest
# epoch survives. Despite the file name, the full model is saved (save_weights_only=False).
checkpoint_callback = ModelCheckpoint(filepath='global_autoencoder_weights.keras', save_best_only=False, save_weights_only=False, verbose=1)
# Appends one row per epoch to global_training_log.csv.
# NOTE(review): a later cell writes global_history_df to the same path with to_csv,
# which replaces what this logger wrote.
csv_logger_callback = CSVLogger(filename='global_training_log.csv', separator=',', append=True)
# batch_size=1: the custom metric is evaluated on one pair at a time.
global_history = global_autoencoder_model.fit(
    x=global_x_train,
    y=global_y_train,
    batch_size=1,
    epochs=20,
    validation_data = (global_x_test, global_y_test),
    callbacks=[checkpoint_callback, csv_logger_callback]
)

Epoch 1/20
Epoch 1: saving model to global_autoencoder_weights.keras
Epoch 2/20
Epoch 2: saving model to global_autoencoder_weights.keras
Epoch 3/20
Epoch 3: saving model to global_autoencoder_weights.keras
Epoch 4/20
Epoch 4: saving model to global_autoencoder_weights.keras
Epoch 5/20
Epoch 5: saving model to global_autoencoder_weights.keras
Epoch 6/20
Epoch 6: saving model to global_autoencoder_weights.keras
Epoch 7/20
Epoch 7: saving model to global_autoencoder_weights.keras
Epoch 8/20
Epoch 8: saving model to global_autoencoder_weights.keras
Epoch 9/20
Epoch 9: saving model to global_autoencoder_weights.keras
Epoch 10/20
Epoch 10: saving model to global_autoencoder_weights.keras
Epoch 11/20
Epoch 11: saving model to global_autoencoder_weights.keras
Epoch 12/20
Epoch 12: saving model to global_autoencoder_weights.keras
Epoch 13/20
Epoch 13: saving model to global_autoencoder_weights.keras
Epoch 14/20
Epoch 14: saving model to global_autoencoder_weights.keras
Epoch 15/20
Epoch 15: sa

In [91]:
# One row per epoch, one column per logged loss/metric.
global_history_df = pd.DataFrame(global_history.history)

In [92]:
# The index is written as an unnamed first column; the loading cell reads it
# back as 'Unnamed: 0' and uses it as the epoch number.
global_history_df.to_csv("./global_training_log.csv")

In [114]:
# Persist the trained global model in the .keras format.
global_autoencoder_model.save('global_autoencoder_model.keras')

In [3]:
# Load the saved training log and reshape it to long form (one row per epoch
# and dataset) for plotting.
loaded_global_history = (
    pd.read_csv("./global_training_log.csv")
    .melt(
        id_vars='Unnamed: 0',
        value_vars=['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'],
        var_name='dataset',
        value_name='accuracy',
    )
    .replace(
        ['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'],
        ['training set', 'validation set'],
    )
    .rename(columns={'Unnamed: 0': 'epoch'})
)
loaded_global_history['shuffled'] = False

In [172]:
loaded_global_history

Unnamed: 0,epoch,dataset,accuracy,shuffled
0,0,training set,0.040283,False
1,1,training set,0.124231,False
2,2,training set,0.2008,False
3,3,training set,0.281058,False
4,4,training set,0.382226,False
5,5,training set,0.47294,False
6,6,training set,0.554428,False
7,7,training set,0.624231,False
8,8,training set,0.688192,False
9,9,training set,0.735855,False


In [154]:
# Accuracy per epoch, one line type per dataset (training vs. validation).
global_training_plot = ggplot(loaded_global_history, aes(x='epoch', y='accuracy', linetype='dataset')) + geom_line() + labs(title='Learning Curve of Model Trained on Unshuffled Data', x='Epoch', y='Accuracy')
ggsave(global_training_plot, "../data_dump/training_plots_dump/global_training_plot.png")



#### Global Shuffled Training

In [None]:
# Global Shuffled Model
# Control run: same architecture and settings as the global model, but trained on
# targets shuffled within topic. Validation still uses the unshuffled test pairs.

global_shuffled_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
global_shuffled_autoencoder_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="cosine_similarity",
    metrics=[metric_choose_argument_global_y_train]
)

# NOTE(review): no callbacks are passed here, so this cell does not write the
# global_training_shuffled_log.csv that a later cell reads - confirm where it comes from.
global_shuffled_history = global_shuffled_autoencoder_model.fit(
    x=global_x_train,
    y=global_y_train_shuffled,
    batch_size=1,
    epochs=20,
    validation_data = (global_x_test, global_y_test)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20

In [174]:
# Load the per-epoch log of the shuffled run.
# NOTE(review): no visible cell writes this file - confirm how it is produced.
loaded_global_shuffled_history = pd.read_csv("./global_training_shuffled_log.csv")

In [175]:
# Keep rows labelled 0..19 (label slicing is inclusive), then reshape to long
# form (one row per epoch and dataset) for plotting.
loaded_global_shuffled_history = (
    loaded_global_shuffled_history.loc[0:19]
    .melt(
        id_vars='epoch',
        value_vars=['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'],
        var_name='dataset',
        value_name='accuracy',
    )
    .replace(
        ['metric_choose_argument_global_y_train', 'val_metric_choose_argument_global_y_train'],
        ['training set', 'validation set'],
    )
)
loaded_global_shuffled_history['shuffled'] = True

In [176]:
loaded_global_shuffled_history

Unnamed: 0,epoch,dataset,accuracy,shuffled
0,0,training set,0.012608,True
1,1,training set,0.035363,True
2,2,training set,0.048893,True
3,3,training set,0.073801,True
4,4,training set,0.108549,True
5,5,training set,0.154367,True
6,6,training set,0.217405,True
7,7,training set,0.273063,True
8,8,training set,0.338561,True
9,9,training set,0.405904,True


In [170]:
# Accuracy per epoch for the shuffled run, one line type per dataset.
global_training_shuffled_plot = ggplot(loaded_global_shuffled_history, aes(x='epoch', y='accuracy', linetype='dataset')) + geom_line() + labs(title='Learning Curve of Model Trained on Within-Topic Shuffled Data', x='Epoch', y='Accuracy')
ggsave(global_training_shuffled_plot, "../data_dump/training_plots_dump/global_shuffled_training_plot.png")



In [178]:
# Stack both long-form logs; the 'shuffled' column tells the runs apart.
# Row labels are not reset, so they repeat across the two parts.
combined_global_training_df = pd.concat([loaded_global_history, loaded_global_shuffled_history])
combined_global_training_df

Unnamed: 0,epoch,dataset,accuracy,shuffled
0,0,training set,0.040283,False
1,1,training set,0.124231,False
2,2,training set,0.200800,False
3,3,training set,0.281058,False
4,4,training set,0.382226,False
...,...,...,...,...
35,15,validation set,0.071341,True
36,16,validation set,0.073801,True
37,17,validation set,0.066421,True
38,18,validation set,0.066421,True


In [218]:
# Both runs in one chart: line type encodes the dataset, colour encodes the run.
# NOTE(review): guides() targets the 'fill' aesthetic, but only 'linetype' and
# 'color' are mapped in aes(); the byrow setting may have no effect - confirm.
combined_global_plot = (
    ggplot(combined_global_training_df, aes(x='epoch', y='accuracy', linetype='dataset', color='shuffled')) +
    geom_line(size=2) +
    labs(title='Learning Curve of Model Trained on Unshuffled vs. Within-Topic Shuffled Data', x='Epoch', y='Accuracy') +
    theme(
        figure_size=(16,24),
        axis_title=element_text(size=32),
        axis_text=element_text(size=24),
        legend_title=element_text(size=32, lineheight=1.5),
        legend_text=element_text(size=24, lineheight=1.5),
        plot_title=element_text(size=40, wrap=True, lineheight=1.5),
        legend_position="bottom",
        legend_key_width=64
    ) +
    guides(fill = guide_legend(byrow = True))
)
ggsave(combined_global_plot, "../data_dump/training_plots_dump/combined_global_training_plot.png")



#### Category Training

In [None]:
# Category Model
# Trained on the economy category only.
# NOTE(review): this uses "mse" loss and the built-in 'accuracy' metric, unlike the
# global model (cosine-similarity loss, retrieval metric). 'accuracy' is presumably
# not meaningful for continuous embedding targets - confirm the intended setup.
category_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
category_autoencoder_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="mse",
    metrics=['accuracy']
)
category_autoencoder_model.fit(
    x=economy_x_train,
    y=economy_y_train,
    batch_size=1,
    epochs=20,
    validation_data=(economy_x_test, economy_y_test)
)

#### Debate Training

In [None]:
# Debate Model
# Trained on a single economy debate for 5 epochs, without validation data.
# NOTE(review): this uses "mse" loss and the built-in 'accuracy' metric, unlike the
# global model (cosine-similarity loss, retrieval metric) - confirm the intended setup.
debate_autoencoder_model = tf.keras.models.clone_model(autoencoder_model)
debate_autoencoder_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="mse",
    metrics=['accuracy']
)
debate_autoencoder_model.fit(
    x=economy_debate_x_train,
    y=economy_debate_y_train,
    batch_size=1,
    epochs=5
)

# Study 3: Choose Corresponding Embedding

Given the embedding of an argument, can a model be trained to choose the embedding of its counterargument from a list of candidate embeddings?

### OpenAI Setup

In [47]:
pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
import openai
from openai import OpenAI

In [17]:
# Shared API client used by the embedding helpers below.
# No key is passed here; presumably it is read from the environment - confirm.
client = OpenAI()

### Imports

In [196]:
import tensorflow as tf
import pandas as pd
import numpy as np
from scipy import spatial
import time
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential
)
import statistics

In [107]:
from tensorflow.keras.models import load_model

## Data

#### GPR 55

In [255]:
# GPR-KB-55 table; the 'claim' and 'rebuttal' columns are embedded below.
gpr_df = pd.read_csv("../no_wavs/GPR-KB-55/GPR-KB-55.csv")

In [256]:
DIM_EMBEDDING = 1536

def gpr_get_embeddings_df(gpr_df: pd.Series) -> pd.DataFrame:
    """Embed every text in ``gpr_df`` and return the texts next to their embeddings.

    All texts are sent to the embeddings endpoint in a single request using the
    module-level ``client``.

    Args:
        gpr_df: Series of texts (the callers pass one column of the GPR table).

    Returns:
        Frame with the original text column followed by ``DIM_EMBEDDING``
        columns named "0", "1", ..., indexed like ``gpr_df``.

    Raises:
        ValueError: If the number of returned embeddings differs from the
            number of texts.
    """
    arguments_list = list(gpr_df)

    claims_embeddings = client.embeddings.create(input=arguments_list, model="text-embedding-ada-002")
    claims_embeddings_data = [item.embedding for item in claims_embeddings.data]
    claims_embeddings_df = pd.DataFrame(claims_embeddings_data, columns=[str(i) for i in range(DIM_EMBEDDING)])

    # concat(axis=1) aligns on index labels, so give the embeddings the labels
    # of the input; row i then always sits next to text i.
    claims_embeddings_df.index = gpr_df.index
    return pd.concat([gpr_df, claims_embeddings_df], axis=1)

In [257]:
# Embed claims and rebuttals separately; row i of both frames belongs to the same pair.
gpr_claims_df = gpr_get_embeddings_df(gpr_df['claim'])
gpr_rebuttals_df = gpr_get_embeddings_df(gpr_df['rebuttal'])

# Candidate pool for retrieval: all claim and rebuttal embeddings stacked.
gpr_combined = pd.concat([gpr_claims_df, gpr_rebuttals_df])
gpr_combined = gpr_combined.reset_index(drop=True)

gpr_combined_nums = gpr_combined.select_dtypes(include=[np.number])

gpr_claims_nums = gpr_claims_df.select_dtypes(include=[np.number])
gpr_rebuttals_nums = gpr_rebuttals_df.select_dtypes(include=[np.number])

# Integer split point with positional slicing: every row lands in exactly one
# split, even when 80% of the row count is not a whole number.
gpr_train_size = int(len(gpr_claims_nums) * 0.8)

gpr_x_train = gpr_claims_nums.iloc[:gpr_train_size]
gpr_y_train = gpr_rebuttals_nums.iloc[:gpr_train_size]

gpr_x_test = gpr_claims_nums.iloc[gpr_train_size:].reset_index(drop=True)
gpr_y_test = gpr_rebuttals_nums.iloc[gpr_train_size:].reset_index(drop=True)

In [258]:
def metric_choose_argument_gpr(y_true, y_pred):
    """Retrieval hit rate of the predictions against the GPR candidate pool.

    Every row of ``y_pred`` and of ``y_true`` is compared by cosine similarity
    with every row of the module-level ``gpr_combined_nums``. A sample is a hit
    when the prediction and the target select the same most-similar row.

    Args:
        y_true: Target vectors, one per row.
        y_pred: Predicted vectors, same shape as ``y_true``.

    Returns:
        Scalar float32 tensor: the fraction of rows that are hits. For a batch
        of one row this is 0.0 or 1.0, matching the previous hit count.
    """
    # l2_normalize is applied per row, so the matmul yields cosine similarities
    # for any batch size and zero vectors do not divide by zero.
    candidates = tf.math.l2_normalize(tf.cast(gpr_combined_nums, dtype=tf.float32), axis=1)
    pred_unit = tf.math.l2_normalize(tf.cast(y_pred, dtype=tf.float32), axis=1)
    true_unit = tf.math.l2_normalize(tf.cast(y_true, dtype=tf.float32), axis=1)

    # Shape (candidates, batch): column j holds the similarities for sample j.
    cos_sim_pred = tf.matmul(candidates, pred_unit, transpose_b=True)
    cos_sim_true = tf.matmul(candidates, true_unit, transpose_b=True)

    max_cos_sim_pred = tf.math.argmax(cos_sim_pred, axis=0)
    max_cos_sim_true = tf.math.argmax(cos_sim_true, axis=0)

    hits = tf.cast(tf.equal(max_cos_sim_pred, max_cos_sim_true), tf.float32)
    return tf.reduce_mean(hits)

#### IBM EACL

In [11]:
# IBM claim-stance dataset; keep only the topic, stance and claim text columns.
eacl_df = pd.read_csv("../IBM_EACL/claim_stance_dataset.csv")
eacl_df = eacl_df[['topicId', 'topicText', 'claims.stance', 'claims.claimCorrectedText']]

Unnamed: 0,topicId,topicText,claims.stance,claims.claimCorrectedText
0,1,This house believes that the sale of violent v...,PRO,Exposure to violent video games causes at leas...
1,1,This house believes that the sale of violent v...,CON,video game violence is not related to serious ...
2,1,This house believes that the sale of violent v...,CON,some violent video games may actually have a p...
3,1,This house believes that the sale of violent v...,PRO,exposure to violent video games causes both sh...
4,1,This house believes that the sale of violent v...,PRO,Violent video games increase the violent tende...
...,...,...,...,...
2389,1065,This house would promote democratization,CON,democracies have ever been found incompatible ...
2390,1065,This house would promote democratization,CON,democracy cannot subsist long nor be carried f...
2391,1065,This house would promote democratization,CON,Democracy in general is criticized for ignorin...
2392,1065,This house would promote democratization,PRO,democracy and freedom are indispensable ingred...


In [232]:
# Per topic: total number of claims, and how many are PRO / CON.
topic_lens, pro_lens, con_lens = [], [], []
for topic in eacl_df['topicId'].unique():
    topic_rows = eacl_df[eacl_df['topicId'] == topic]
    topic_lens.append(len(topic_rows))
    for stance_label, stance_counts in (("PRO", pro_lens), ("CON", con_lens)):
        stance_counts.append(len(topic_rows[topic_rows['claims.stance'] == stance_label]))

In [59]:
""" Convert an argument into a (1 x 1536) embedding df """

DIM_EMBEDDING = 1536

@retry(wait=wait_random_exponential(min=60, max=500), stop=stop_after_attempt(10))
def eacl_get_embeddings(arguments: list[str]) -> pd.DataFrame:
    """Embed a batch of texts with one request to the embeddings endpoint.

    The call is retried with randomised exponential back-off, for at most
    10 attempts, as configured on the decorator.

    Args:
        arguments: Texts to embed.

    Returns:
        Frame with one row per text and ``DIM_EMBEDDING`` columns named
        "0", "1", ..., indexed from 0.
    """
    embeddings = client.embeddings.create(input=arguments, model="text-embedding-ada-002")
    embeddings_data = [item.embedding for item in embeddings.data]
    return pd.DataFrame(embeddings_data, columns=[str(i) for i in range(DIM_EMBEDDING)])

In [65]:
API_LIMIT = 1000

def eacl_get_embeddings_df(eacl_df: pd.DataFrame) -> pd.DataFrame:
    """Append embedding columns for the claim text of every row.

    The texts in 'claims.claimCorrectedText' are embedded in chunks of at most
    ``API_LIMIT`` items via ``eacl_get_embeddings``.

    Args:
        eacl_df: Frame with a 'claims.claimCorrectedText' column.

    Returns:
        ``eacl_df`` with the embedding columns appended, indexed like ``eacl_df``.

    Raises:
        ValueError: If the number of returned embeddings differs from the
            number of rows.
    """
    arguments_list = list(eacl_df['claims.claimCorrectedText'])

    # Collect the chunks first and concatenate once; concatenating inside the
    # loop re-copies every row gathered so far on each iteration.
    chunks = [
        eacl_get_embeddings(arguments_list[start:start + API_LIMIT])
        for start in range(0, len(arguments_list), API_LIMIT)
    ]
    if chunks:
        embeddings_df = pd.concat(chunks, axis=0, ignore_index=True)
    else:
        # pd.concat rejects an empty list; no rows means no embedding columns.
        embeddings_df = pd.DataFrame()

    # concat(axis=1) aligns on index labels, so give the embeddings the labels
    # of the input; row i then always sits next to claim i.
    embeddings_df.index = eacl_df.index
    return pd.concat([eacl_df, embeddings_df], axis=1)

In [67]:
# Calls the embeddings API for every claim (chunked); metadata columns are kept.
eacl_embeddings_df = eacl_get_embeddings_df(eacl_df)

In [71]:
# Keep numeric columns, then drop 'topicId' (numeric, but not part of the
# embedding) so that only the embedding dimensions remain.
eacl_nums_df = eacl_embeddings_df.select_dtypes(include=[np.number])
eacl_vectors_df = eacl_nums_df.drop('topicId', axis=1)

## GPR predict 

In [264]:
# Run the global model on the GPR claim embeddings (the first 80% of the pairs).
global_autoencoder_gpr_predictions = global_autoencoder_model.predict(gpr_x_train)
global_autoencoder_gpr_predictions_df = pd.DataFrame(global_autoencoder_gpr_predictions)
# Use string column names, like the embedding frames.
global_autoencoder_gpr_predictions_df.columns = [str(i) for i in global_autoencoder_gpr_predictions_df.columns]



In [265]:
# Count the claims whose predicted vector retrieves the same candidate as the
# true rebuttal embedding. The metric is called on one (1, dim) pair at a time.
successes = 0
for i in range(len(gpr_y_train)):
    gpr_y_train_tf = tf.convert_to_tensor(gpr_y_train.loc[i], dtype=tf.float32)
    gpr_pred_tf = tf.convert_to_tensor(global_autoencoder_gpr_predictions_df.loc[i], dtype=tf.float32)
    # Reshape the 1-D rows to (1, dim), the layout the metric's matmul expects.
    gpr_y_train_tf = tf.reshape(gpr_y_train_tf, (1, -1))
    gpr_pred_tf = tf.reshape(gpr_pred_tf, (1, -1))
    if metric_choose_argument_gpr(gpr_y_train_tf, gpr_pred_tf).numpy() == 1:
        successes += 1

In [266]:
successes

7

## EACL Predict

In [116]:
# Run the global model on every EACL claim embedding.
global_autoencoder_eacl_predictions = global_autoencoder_model.predict(eacl_vectors_df)
global_autoencoder_eacl_predictions_df = pd.DataFrame(global_autoencoder_eacl_predictions)
# Use string column names, like the embedding frames.
global_autoencoder_eacl_predictions_df.columns = [str(i) for i in global_autoencoder_eacl_predictions_df.columns]
global_autoencoder_eacl_predictions_df



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,-0.228883,-0.798650,1.206804,-2.379915,-1.636264,0.360619,-0.825054,-0.813067,-0.548199,-1.500611,...,0.342699,0.234322,0.672558,-0.972752,-1.087595,-0.216367,0.320369,-1.215356,-0.443726,-0.299795
1,-0.557883,-0.409383,1.171087,-1.981327,-1.036210,0.220497,-0.838605,-0.819698,-1.100810,-1.012501,...,0.931535,0.818045,1.471216,-1.408638,-1.282735,0.167310,0.973538,-1.621110,-0.358796,-0.720913
2,-0.854444,-1.084031,0.989973,-1.987451,-1.624811,0.425982,-1.417340,-0.267563,-1.422044,-1.560554,...,0.854357,0.700630,0.814746,-1.859064,-1.603771,0.459844,-0.040004,-1.096585,-0.427586,-0.427207
3,-0.282068,-1.068843,1.547549,-2.913010,-2.350163,0.858396,-0.980850,-1.077050,-1.141163,-1.655215,...,0.570350,0.110851,0.990802,-1.204208,-0.894073,-0.035082,-0.212720,-1.485491,-0.691650,-0.365833
4,-0.356761,-0.873439,1.128338,-1.861351,-1.272373,0.496455,-1.351442,-1.098501,-0.496969,-1.620215,...,1.537553,0.581744,0.522188,-1.540636,-1.692833,0.056214,0.536134,-1.494208,-0.454808,-0.317037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2389,-0.051398,-0.138598,-0.450709,-1.271696,-0.704614,0.445081,-1.184983,-0.096439,-0.639818,-0.364644,...,0.433787,0.407175,1.106879,-0.812399,-1.441804,-0.309056,0.693948,-0.294063,0.217393,-0.074500
2390,-0.639797,-0.260313,-0.501766,-0.693032,-0.844574,0.426009,-0.750114,-0.009548,-0.651297,-0.696315,...,0.730158,-0.035551,0.559079,-1.108134,-1.777715,-0.282126,0.921828,-0.093053,-1.006203,-0.633085
2391,-0.262894,-0.201490,-0.776411,-1.628114,-0.824789,0.753134,-1.053622,-0.403603,-0.668763,-0.633884,...,0.080172,0.702641,0.503137,-1.151867,-1.180909,0.107792,-0.481588,-0.123500,-0.210645,0.090385
2392,-0.282158,-0.322060,-0.958403,-1.221992,-1.297327,0.649798,-0.758350,-0.353878,-0.319313,-0.523130,...,0.847846,0.355244,0.043943,-1.042431,-1.625324,-0.508618,0.601720,-0.475245,-0.336199,-0.173208


In [276]:
# For every EACL claim, take the EACL_TOP_K claims most similar (cosine) to the
# model's prediction and record the percentage that share the claim's topic and
# have the opposite stance.
EACL_TOP_K = 200

eacl_embeddings_df_32 = tf.cast(eacl_vectors_df, dtype=tf.float32)
# Loop invariants: candidate norms and the metadata arrays used for matching.
eacl_embedding_norms = tf.norm(eacl_embeddings_df_32, axis=1)
eacl_topic_ids = eacl_embeddings_df['topicId'].to_numpy()
eacl_stances = eacl_embeddings_df['claims.stance'].to_numpy()
pred_topk = []

for i, row in global_autoencoder_eacl_predictions_df.iterrows():
    y_pred = tf.reshape(row.values, [1, -1])
    target_topic = eacl_topic_ids[i]
    # A counter-argument has the opposite stance to the input claim.
    target_type = 'PRO' if eacl_stances[i] == 'CON' else 'CON'

    cos_sim_pred = tf.matmul(eacl_embeddings_df_32, y_pred, transpose_b=True) / tf.reshape(tf.norm(y_pred) * eacl_embedding_norms, [-1, 1])
    top_k_sim_pred = tf.math.top_k(tf.reshape(cos_sim_pred, [-1]), k=EACL_TOP_K).indices
    top_k_sim_pred = top_k_sim_pred.numpy()

    # Vectorised match over the retrieved rows instead of one lookup per row.
    is_match = (eacl_topic_ids[top_k_sim_pred] == target_topic) & (eacl_stances[top_k_sim_pred] == target_type)
    successes = int(np.count_nonzero(is_match))
    pred_topk.append(successes / EACL_TOP_K * 100)

In [277]:
statistics.mean(pred_topk)

12.06641604010025

## Predict

#### Global Predict

In [267]:
# Global model predictions
global_autoencoder_predictions = global_autoencoder_model.predict(global_x_test)
global_autoencoder_predictions_df = pd.DataFrame(global_autoencoder_predictions)
# Use string column names, like the embedding frames.
global_autoencoder_predictions_df.columns = [str(i) for i in global_autoencoder_predictions_df.columns]
output_folder = '../data_dump/autoencoder_predictions_dump/global/'
output_file_path = f'{output_folder}global_predictions.pkl'
# exist_ok avoids the separate existence check before creating the folder.
os.makedirs(output_folder, exist_ok=True)
global_autoencoder_predictions_df.to_pickle(output_file_path)



In [7]:
# Count the test points whose prediction retrieves the same candidate as the
# true counter embedding. The metric is called on one (1, 1536) pair at a time.
# NOTE(review): the reshape to [1, 1536] requires the predictions frame to hold
# only the 1536 embedding columns; a later cell adds 'pair_id', 'type' and
# 'pred_test' to it in place, so this cell must run before that one.
successes = 0
for i, y_pred in global_autoencoder_predictions_df.iterrows():
    global_y_test_tf = tf.reshape(global_y_test.loc[i], [1, 1536])
    global_y_test_tf = tf.cast(global_y_test_tf, dtype=tf.float32)

    y_pred_tf = tf.reshape(y_pred, [1, 1536])
    y_pred_tf = tf.cast(y_pred_tf, dtype=tf.float32)
    res = metric_choose_argument_global_y_train(global_y_test_tf, y_pred_tf).numpy()
    if res == 1:
        successes += 1

NameError: name 'global_autoencoder_predictions_df' is not defined

In [97]:
successes

NameError: name 'successes' is not defined

In [284]:
len(global_autoencoder_predictions_df)

813

#### Global Shuffled Predict

In [35]:
# Global shuffled model predictions
global_shuffled_autoencoder_predictions = global_shuffled_autoencoder_model.predict(global_x_test)
global_shuffled_autoencoder_predictions_df = pd.DataFrame(global_shuffled_autoencoder_predictions)
# Use string column names, like the embedding frames.
global_shuffled_autoencoder_predictions_df.columns = [str(i) for i in global_shuffled_autoencoder_predictions_df.columns]
output_folder = '../data_dump/autoencoder_predictions_dump/global/'
output_file_path = f'{output_folder}global_shuffled_predictions.pkl'
# exist_ok avoids the separate existence check before creating the folder.
os.makedirs(output_folder, exist_ok=True)
global_shuffled_autoencoder_predictions_df.to_pickle(output_file_path)



In [36]:
# Reload the saved shuffled-model predictions so the evaluation can run without
# re-predicting.
# NOTE(review): unpickling can execute arbitrary code; only load dumps from a trusted source.
global_shuffled_autoencoder_predictions_df = pd.read_pickle("../data_dump/autoencoder_predictions_dump/global/global_shuffled_predictions.pkl")

In [37]:
# Same evaluation as for the unshuffled model: count the test points whose
# prediction retrieves the same candidate as the true counter embedding.
# NOTE(review): the reshape to [1, 1536] requires the predictions frame to hold
# only the 1536 embedding columns; a later cell adds 'pair_id', 'type' and
# 'pred_test' to it in place, so this cell must run before that one.
successes_shuffled = 0
for i, y_pred in global_shuffled_autoencoder_predictions_df.iterrows():
    global_y_test_tf = tf.reshape(global_y_test.loc[i], [1, 1536])
    global_y_test_tf = tf.cast(global_y_test_tf, dtype=tf.float32)

    y_pred_tf = tf.reshape(y_pred, [1, 1536])
    y_pred_tf = tf.cast(y_pred_tf, dtype=tf.float32)
    res = metric_choose_argument_global_y_train(global_y_test_tf, y_pred_tf).numpy()
    if res == 1:
        successes_shuffled += 1

In [38]:
successes_shuffled

51

#### Category Predict

In [None]:
# Category model predictions
category_autoencoder_predictions = category_autoencoder_model.predict(economy_x_test)
category_autoencoder_predictions_df = pd.DataFrame(category_autoencoder_predictions)
# Use string column names, like the embedding frames.
category_autoencoder_predictions_df.columns = [str(i) for i in category_autoencoder_predictions_df.columns]

#### Debate Predict

In [None]:
# Debate model predictions
debate_autoencoder_predictions = debate_autoencoder_model.predict(economy_debate_x_test)
debate_autoencoder_predictions_df = pd.DataFrame(debate_autoencoder_predictions)
# Use string column names, like the embedding frames.
debate_autoencoder_predictions_df.columns = [str(i) for i in debate_autoencoder_predictions_df.columns]

## Combine Point-Counter Dataframes

#### Global point-counter df

In [None]:
# Combine global_x_test and global_y_test
# Each frame is tagged before stacking: 'pair_id' is the row position as a string
# (shared by a point and its counter), 'type' is the role, 'pred_test' the source.
global_x_test_df = global_x_test.copy().astype(np.float32)
global_x_test_df['pair_id'] = global_x_test_df.index.astype(str)
global_x_test_df['type'] = 'point'
global_x_test_df['pred_test'] = 'test'

global_y_test_df = global_y_test.copy().astype(np.float32)
global_y_test_df['pair_id'] = global_y_test_df.index.astype(str)
global_y_test_df['type'] = 'counter'
global_y_test_df['pred_test'] = 'test'

global_x_y_test_combined_df = pd.concat([global_x_test_df, global_y_test_df], axis=0, ignore_index=True)

In [None]:
# Combine global_x_test and global autoencoder predictions
global_x_test_df_copy = global_x_test.copy().astype(np.float32)
global_x_test_df_copy['pair_id'] = global_x_test_df_copy.index.astype(str)
global_x_test_df_copy['type'] = 'point'
global_x_test_df_copy['pred_test'] = 'pred'
# NOTE: the tag columns are added to the predictions frame in place (no copy), so
# from here on it holds three non-embedding columns besides the 1536 dimensions.
global_autoencoder_predictions_df['pair_id'] = global_autoencoder_predictions_df.index.astype(str)
global_autoencoder_predictions_df['type'] = 'counter'
global_autoencoder_predictions_df['pred_test'] = 'pred'
global_pred_test_combined_df = pd.concat([global_x_test_df_copy, global_autoencoder_predictions_df], axis=0, ignore_index=True)

In [None]:
# Combine global_x_test and global_y_test and global autoencoder predictions
# Relies on the predictions frame already carrying the 'pair_id', 'type' and
# 'pred_test' columns added in the previous cell.
global_combined_df = pd.concat([global_x_test_df, global_y_test_df, global_autoencoder_predictions_df], axis=0, ignore_index=True)

#### Global shuffled point-counter df

In [None]:
# Combine global_x_test and global_y_test_shuffled
# Each frame is tagged before stacking: 'pair_id' is the row position as a string,
# 'type' is the role, 'pred_test' the source.
global_x_test_df = global_x_test.copy().astype(np.float32)
global_x_test_df['pair_id'] = global_x_test_df.index.astype(str)
global_x_test_df['type'] = 'point'
global_x_test_df['pred_test'] = 'test'

# NOTE(review): global_y_test_shuffled is not defined in any cell visible here
# (only global_y_train_shuffled is) - confirm where it is created.
global_y_test_shuffled_df = global_y_test_shuffled.copy().astype(np.float32)
global_y_test_shuffled_df['pair_id'] = global_y_test_shuffled_df.index.astype(str)
global_y_test_shuffled_df['type'] = 'counter'
global_y_test_shuffled_df['pred_test'] = 'test'

global_x_y_test_combined_shuffled_df = pd.concat([global_x_test_df, global_y_test_shuffled_df], axis=0, ignore_index=True)

In [None]:
# Combine global_x_test and global shuffled autoencoder predictions
global_x_test_df_copy = global_x_test.copy().astype(np.float32)
global_x_test_df_copy['pair_id'] = global_x_test_df_copy.index.astype(str)
global_x_test_df_copy['type'] = 'point'
global_x_test_df_copy['pred_test'] = 'pred'
# NOTE: the tag columns are added to the predictions frame in place (no copy), so
# from here on it holds three non-embedding columns besides the 1536 dimensions.
global_shuffled_autoencoder_predictions_df['pair_id'] = global_shuffled_autoencoder_predictions_df.index.astype(str)
global_shuffled_autoencoder_predictions_df['type'] = 'counter'
global_shuffled_autoencoder_predictions_df['pred_test'] = 'pred'
global_pred_test_combined_shuffled_df = pd.concat([global_x_test_df_copy, global_shuffled_autoencoder_predictions_df], axis=0, ignore_index=True)

In [None]:
# Combine global_x_test, global_y_test_shuffled and the global shuffled autoencoder predictions
# Relies on the predictions frame already carrying the 'pair_id', 'type' and
# 'pred_test' columns added in the previous cell.
global_combined_shuffled_df = pd.concat([global_x_test_df, global_y_test_shuffled_df, global_shuffled_autoencoder_predictions_df], axis=0, ignore_index=True)

#### Category point-counter df

In [None]:
# Combine economy_x_test and economy_y_test
# Each frame is tagged before stacking: 'pair_id' is the row position as a string
# (shared by a point and its counter), 'type' is the role, 'pred_test' the source.
economy_x_test_df = economy_x_test.copy().astype(np.float32)
economy_x_test_df['pair_id'] = economy_x_test_df.index.astype(str)
economy_x_test_df['type'] = 'point'
economy_x_test_df['pred_test'] = 'test'

economy_y_test_df = economy_y_test.copy().astype(np.float32)
economy_y_test_df['pair_id'] = economy_y_test_df.index.astype(str)
economy_y_test_df['type'] = 'counter'
economy_y_test_df['pred_test'] = 'test'

economy_x_y_test_combined_df = pd.concat([economy_x_test_df, economy_y_test_df], axis=0, ignore_index=True)

In [None]:
# Pair economy_x_test (points) with the category autoencoder predictions (counters)
economy_x_test_df_copy = economy_x_test.copy().astype(np.float32).assign(
    pair_id=lambda frame: frame.index.astype(str),
    type='point',
    pred_test='pred',
)

# The predictions frame is tagged in place on purpose: the next cell concatenates it again.
# NOTE(review): pair_id comes from each frame's own index, so points and predictions only
# pair up if both frames carry the same index labels — confirm.
category_autoencoder_predictions_df['pair_id'] = category_autoencoder_predictions_df.index.astype(str)
category_autoencoder_predictions_df['type'] = 'counter'
category_autoencoder_predictions_df['pred_test'] = 'pred'

economy_pred_test_combined_df = pd.concat(
    (economy_x_test_df_copy, category_autoencoder_predictions_df),
    ignore_index=True,
)

In [None]:
# Combine economy_x_test, economy_y_test and the category autoencoder predictions.
# All three frames must already carry the pair_id / type / pred_test columns added in the cells above.
economy_combined_df = pd.concat([economy_x_test_df, economy_y_test_df, category_autoencoder_predictions_df], axis=0, ignore_index=True)

#### Debate point-counter df

In [None]:
# Pair economy_debate_x_test (points) with economy_debate_y_test (counters)
economy_debate_x_test_df = economy_debate_x_test.copy().astype(np.float32).assign(
    pair_id=lambda frame: frame.index.astype(str),
    type='point',
    pred_test='test',
)

economy_debate_y_test_df = economy_debate_y_test.copy().astype(np.float32).assign(
    pair_id=lambda frame: frame.index.astype(str),
    type='counter',
    pred_test='test',
)

# Points first, counters second, under a fresh 0..n-1 index.
economy_debate_x_y_test_combined_df = pd.concat(
    (economy_debate_x_test_df, economy_debate_y_test_df),
    ignore_index=True,
)

In [None]:
# Pair economy_debate_x_test (points) with the debate autoencoder predictions (counters)
economy_debate_x_test_df_copy = economy_debate_x_test.copy().astype(np.float32).assign(
    pair_id=lambda frame: frame.index.astype(str),
    type='point',
    pred_test='pred',
)

# The predictions frame is tagged in place on purpose: the next cell concatenates it again.
# NOTE(review): pair_id comes from each frame's own index, so points and predictions only
# pair up if both frames carry the same index labels — confirm.
debate_autoencoder_predictions_df['pair_id'] = debate_autoencoder_predictions_df.index.astype(str)
debate_autoencoder_predictions_df['type'] = 'counter'
debate_autoencoder_predictions_df['pred_test'] = 'pred'

economy_debate_pred_test_combined_df = pd.concat(
    (economy_debate_x_test_df_copy, debate_autoencoder_predictions_df),
    ignore_index=True,
)

In [None]:
# Combine economy_debate_x_test, economy_debate_y_test and the debate autoencoder predictions.
# All three frames must already carry the pair_id / type / pred_test columns added in the cells above.
economy_debate_combined_df = pd.concat([economy_debate_x_test_df, economy_debate_y_test_df, debate_autoencoder_predictions_df], axis=0, ignore_index=True)

#### Global tsne df

In [None]:
# Run tsne_embeddings on the global test frame (global_x_y_test_combined_df is built outside this section)
global_x_y_test_combined_tsne_df = tsne_embeddings(global_x_y_test_combined_df)

In [None]:
# Run tsne_embeddings on the global prediction frame (global_pred_test_combined_df is built outside this section)
global_pred_test_combined_tsne_df = tsne_embeddings(global_pred_test_combined_df)

In [None]:
# Run tsne_embeddings on global_combined_df in a single call (frame is built outside this section)
global_combined_tsne_df = tsne_embeddings(global_combined_df)

In [None]:
# Stack the test and prediction t-SNE frames; the row index is not reset.
# NOTE(review): the two inputs come from separate tsne_embeddings calls. If each call fits
# its own projection, their x/y coordinates are not directly comparable — confirm.
global_both_tsne_df = pd.concat([global_x_y_test_combined_tsne_df, global_pred_test_combined_tsne_df], axis=0)

#### Global shuffled tsne df

In [None]:
# Run tsne_embeddings on the shuffled global test frame (global_x_test points + shuffled global_y_test counters)
global_x_y_test_combined_shuffled_tsne_df = tsne_embeddings(global_x_y_test_combined_shuffled_df)

In [None]:
# Run tsne_embeddings on the shuffled global prediction frame (global_x_test points + shuffled autoencoder predictions)
global_pred_test_combined_shuffled_tsne_df = tsne_embeddings(global_pred_test_combined_shuffled_df)

In [None]:
# Run tsne_embeddings once over x_test, shuffled y_test and shuffled predictions together
global_combined_shuffled_tsne_df = tsne_embeddings(global_combined_shuffled_df)

In [None]:
# Stack the shuffled test and prediction t-SNE frames; the row index is not reset.
# NOTE(review): the two inputs come from separate tsne_embeddings calls. If each call fits
# its own projection, their x/y coordinates are not directly comparable — confirm.
global_both_shuffled_tsne_df = pd.concat([global_x_y_test_combined_shuffled_tsne_df, global_pred_test_combined_shuffled_tsne_df], axis=0)

#### Category tsne df

In [None]:
# Run tsne_embeddings on the economy test frame (economy_x_test points + economy_y_test counters)
economy_x_y_test_combined_tsne_df = tsne_embeddings(economy_x_y_test_combined_df)

In [None]:
# Run tsne_embeddings on the economy prediction frame (economy_x_test points + category autoencoder predictions)
economy_pred_test_combined_tsne_df = tsne_embeddings(economy_pred_test_combined_df)

In [None]:
# Run tsne_embeddings once over economy x_test, y_test and category predictions together
economy_combined_tsne_df = tsne_embeddings(economy_combined_df)

In [None]:
# Stack the economy test and prediction t-SNE frames; the row index is not reset.
# NOTE(review): the two inputs come from separate tsne_embeddings calls. If each call fits
# its own projection, their x/y coordinates are not directly comparable — confirm.
economy_both_tsne_df = pd.concat([economy_x_y_test_combined_tsne_df, economy_pred_test_combined_tsne_df], axis=0)

#### Debate tsne df

In [None]:
# Run tsne_embeddings on the economy-debate test frame (economy_debate_x_test points + economy_debate_y_test counters)
economy_debate_x_y_test_combined_tsne_df = tsne_embeddings(economy_debate_x_y_test_combined_df)

In [None]:
# Run tsne_embeddings on the economy-debate prediction frame (economy_debate_x_test points + debate autoencoder predictions)
economy_debate_pred_test_combined_tsne_df = tsne_embeddings(economy_debate_pred_test_combined_df)

In [None]:
# Run tsne_embeddings once over economy-debate x_test, y_test and debate predictions together
economy_debate_combined_tsne_df = tsne_embeddings(economy_debate_combined_df)

In [None]:
# Stack the economy-debate test and prediction t-SNE frames; the row index is not reset.
# NOTE(review): the two inputs come from separate tsne_embeddings calls. If each call fits
# its own projection, their x/y coordinates are not directly comparable — confirm.
economy_debate_both_tsne_df = pd.concat([economy_debate_x_y_test_combined_tsne_df, economy_debate_pred_test_combined_tsne_df], axis=0)

## PCA Dataframes

#### Global pca df

In [None]:
# Run pca_embeddings on the global test frame (global_x_y_test_combined_df is built outside this section)
global_x_y_test_combined_pca_df = pca_embeddings(global_x_y_test_combined_df)

In [None]:
# Run pca_embeddings on the global prediction frame (global_pred_test_combined_df is built outside this section)
global_pred_test_combined_pca_df = pca_embeddings(global_pred_test_combined_df)

In [None]:
# Run pca_embeddings on global_combined_df in a single call (frame is built outside this section)
global_combined_pca_df = pca_embeddings(global_combined_df)

In [None]:
# Stack the test and prediction PCA frames; the row index is not reset.
# NOTE(review): the two inputs come from separate pca_embeddings calls. If each call fits
# its own components, their x/y axes are not directly comparable — confirm.
global_both_pca_df = pd.concat([global_x_y_test_combined_pca_df, global_pred_test_combined_pca_df], axis=0)

#### Category pca df

In [None]:
# Run pca_embeddings on the economy test frame (economy_x_test points + economy_y_test counters)
economy_x_y_test_combined_pca_df = pca_embeddings(economy_x_y_test_combined_df)

In [None]:
# Run pca_embeddings on the economy prediction frame (economy_x_test points + category autoencoder predictions)
economy_pred_test_combined_pca_df = pca_embeddings(economy_pred_test_combined_df)

In [None]:
# Run pca_embeddings once over economy x_test, y_test and category predictions together
economy_combined_pca_df = pca_embeddings(economy_combined_df)

In [None]:
# Stack the economy test and prediction PCA frames; the row index is not reset.
# NOTE(review): the two inputs come from separate pca_embeddings calls. If each call fits
# its own components, their x/y axes are not directly comparable — confirm.
economy_both_pca_df = pd.concat([economy_x_y_test_combined_pca_df, economy_pred_test_combined_pca_df], axis=0)

#### Debate pca df

In [None]:
# Run pca_embeddings on the economy-debate test frame (economy_debate_x_test points + economy_debate_y_test counters)
economy_debate_x_y_test_combined_pca_df = pca_embeddings(economy_debate_x_y_test_combined_df)

In [None]:
# Run pca_embeddings on the economy-debate prediction frame (economy_debate_x_test points + debate autoencoder predictions)
economy_debate_pred_test_combined_pca_df = pca_embeddings(economy_debate_pred_test_combined_df)

In [None]:
# Run pca_embeddings once over economy-debate x_test, y_test and debate predictions together
economy_debate_combined_pca_df = pca_embeddings(economy_debate_combined_df)

In [None]:
# Stack the economy-debate test and prediction PCA frames; the row index is not reset.
# NOTE(review): the two inputs come from separate pca_embeddings calls. If each call fits
# its own components, their x/y axes are not directly comparable — confirm.
economy_debate_both_pca_df = pd.concat([economy_debate_x_y_test_combined_pca_df, economy_debate_pred_test_combined_pca_df], axis=0)

## Plot Data to Compare

#### Plot Functions

In [None]:
# Plot for prediction or test
def pred_test_plot(
        analysis_type: AnalysisType,
        pred_test_data: pd.DataFrame,
        processing_unit: ProcessingUnit
    ) -> None:
    """Draw, save and print a pair plot for a single test or prediction frame.

    Every row is drawn as a point coloured by ``type``; rows that share a
    ``pair_id`` are joined by a black line. The figure is written as a PNG to
    ``../data_dump/autoencoder_<analysis_type.value>_plots_dump/`` (created
    when missing) and then printed.

    Args:
        analysis_type: Enum member; its ``value`` is used in the title, the
            axis labels and the output path.
        pred_test_data: Frame with the columns ``x``, ``y``, ``type``,
            ``pair_id`` and ``pred_test``.
        processing_unit: Enum member; its ``value`` prefixes the file name.

    Raises:
        IndexError: If ``pred_test_data`` has no rows, because the plot label
            is read from the first row.
    """
    plot_analysis_type = analysis_type.value.upper()
    # Only the first row's label is used to name the plot and the output file.
    pred_test = pred_test_data['pred_test'].iloc[0]
    # NOTE(review): geom_point is not in the plotnine import list of the
    # notebook's first cell — confirm it is imported elsewhere.
    gg = (
        ggplot(pred_test_data, aes(x='x', y='y', color='type', group='pair_id')) +
        geom_point(size=2) +
        geom_line(color='black', size=0.5) +
        labs(
            title=f'{plot_analysis_type} Plot: {pred_test}',
            x=f'{plot_analysis_type}_x',
            y=f'{plot_analysis_type}_y'
        ) +
        theme(
            axis_title=element_text(margin={'t': 20}),
            figure_size=(8, 8),
        )
    )

    output_folder = f'../data_dump/autoencoder_{analysis_type.value}_plots_dump/'
    output_file_path = f'{output_folder}{processing_unit.value}_autoencoder_{pred_test}_{analysis_type.value}_plot.png'
    # exist_ok replaces the separate existence check, which could race with
    # another process creating the folder between the check and the call.
    os.makedirs(output_folder, exist_ok=True)
    ggsave(gg, output_file_path)
    print(gg)

In [None]:
# Both plot for prediction and test
def both_pred_test_plot(
        analysis_type: AnalysisType,
        both_data_df: pd.DataFrame,
        processing_unit: ProcessingUnit
    ) -> None:
    """Draw, save and print test and prediction pairs in one figure.

    Points are coloured by ``pred_test`` and shaped by ``type``. A black line
    joins rows that share both ``pair_id`` and ``pred_test``, so test pairs
    and prediction pairs are connected separately. The figure is written as a
    PNG to ``../data_dump/autoencoder_<analysis_type.value>_plots_dump/``
    (created when missing) and then printed.

    The caller's frame is left untouched: the helper ``interaction`` column is
    added to a copy.

    Args:
        analysis_type: Enum member; its ``value`` is used in the title, the
            axis labels and the output path.
        both_data_df: Frame with the columns ``x``, ``y``, ``type``,
            ``pair_id`` and ``pred_test``; ``pair_id`` and ``pred_test`` must
            be strings because they are concatenated.
        processing_unit: Enum member; its ``value`` prefixes the file name.
    """
    plot_analysis_type = analysis_type.value.upper()
    # assign() returns a new frame, so the grouping key does not leak into the
    # DataFrame the caller passed in.
    plot_df = both_data_df.assign(
        interaction=both_data_df['pair_id'] + '_' + both_data_df['pred_test']
    )
    gg = (
        ggplot(plot_df, aes(x='x', y='y', color='pred_test', shape='type', group='interaction')) +
        geom_point(size=2) +
        geom_line(color='black', size=0.5) +
        labs(
            title=f'{plot_analysis_type} Combined Plot for Both Prediction and Test',
            x=f'{plot_analysis_type}_x',
            y=f'{plot_analysis_type}_y'
        ) +
        theme(
            axis_title=element_text(margin={'t': 20}),
            figure_size=(8, 8),
        )
    )

    output_folder = f'../data_dump/autoencoder_{analysis_type.value}_plots_dump/'
    output_file_path = f'{output_folder}{processing_unit.value}_autoencoder_combined_pred_test_{analysis_type.value}_plot.png'
    # exist_ok replaces the separate existence check, which could race with
    # another process creating the folder between the check and the call.
    os.makedirs(output_folder, exist_ok=True)
    ggsave(gg, output_file_path)
    print(gg)

In [None]:
# Combined plot for x_test + y_test + prediction
def combined_pred_test_plot(
        analysis_type: AnalysisType,
        combined_df: pd.DataFrame,
        processing_unit: ProcessingUnit
    ) -> None:
    """Draw, save and print x_test, y_test and predictions in one figure.

    Points are coloured by ``pred_test`` and shaped by ``type``. A black line
    joins all rows that share a ``pair_id``, regardless of ``pred_test``. The
    figure is written as a PNG to
    ``../data_dump/autoencoder_<analysis_type.value>_plots_dump/`` (created
    when missing) and then printed.

    Args:
        analysis_type: Enum member; its ``value`` is used in the title, the
            axis labels and the output path.
        combined_df: Frame with the columns ``x``, ``y``, ``type``,
            ``pair_id`` and ``pred_test``.
        processing_unit: Enum member; its ``value`` prefixes the file name.
    """
    plot_analysis_type = analysis_type.value.upper()
    gg = (
        ggplot(combined_df, aes(x='x', y='y', color='pred_test', shape='type', group='pair_id')) +
        geom_point(size=2) +
        geom_line(color='black', size=0.5) +
        labs(
            title=f'{plot_analysis_type} Combined Plot for Prediction vs Test',
            x=f'{plot_analysis_type}_x',
            y=f'{plot_analysis_type}_y'
        ) +
        theme(
            axis_title=element_text(margin={'t': 20}),
            figure_size=(8, 8),
        )
    )

    output_folder = f'../data_dump/autoencoder_{analysis_type.value}_plots_dump/'
    output_file_path = f'{output_folder}{processing_unit.value}_autoencoder_all_{analysis_type.value}_plot.png'
    # exist_ok replaces the separate existence check, which could race with
    # another process creating the folder between the check and the call.
    os.makedirs(output_folder, exist_ok=True)
    ggsave(gg, output_file_path)
    print(gg)

#### Global Prediction PCA Plots

In [None]:
# PCA pair plot of the global test frame; pred_test_plot also saves the figure as a PNG
pred_test_plot(AnalysisType.PCA, global_x_y_test_combined_pca_df, ProcessingUnit.GLOBAL)

In [None]:
# PCA pair plot of the global prediction frame; pred_test_plot also saves the figure as a PNG
pred_test_plot(AnalysisType.PCA, global_pred_test_combined_pca_df, ProcessingUnit.GLOBAL)

In [None]:
# Overlay global test and prediction pairs in one PCA plot; lines join rows sharing pair_id and pred_test
both_pred_test_plot(AnalysisType.PCA, global_both_pca_df, ProcessingUnit.GLOBAL)

In [None]:
# Single PCA plot of global x_test, y_test and predictions; lines join all rows sharing a pair_id
combined_pred_test_plot(AnalysisType.PCA, global_combined_pca_df, ProcessingUnit.GLOBAL)

#### Category Prediction PCA Plots

In [None]:
# PCA pair plot of the economy test frame; pred_test_plot also saves the figure as a PNG
pred_test_plot(AnalysisType.PCA, economy_x_y_test_combined_pca_df, ProcessingUnit.CATEGORY)

In [None]:
# PCA pair plot of the economy prediction frame; pred_test_plot also saves the figure as a PNG
pred_test_plot(AnalysisType.PCA, economy_pred_test_combined_pca_df, ProcessingUnit.CATEGORY)

In [None]:
# Overlay economy test and prediction pairs in one PCA plot; lines join rows sharing pair_id and pred_test
both_pred_test_plot(AnalysisType.PCA, economy_both_pca_df, ProcessingUnit.CATEGORY)

In [None]:
# Single PCA plot of economy x_test, y_test and predictions; lines join all rows sharing a pair_id
combined_pred_test_plot(AnalysisType.PCA, economy_combined_pca_df, ProcessingUnit.CATEGORY)

#### Debate Prediction PCA Plots

In [None]:
# PCA pair plot of the economy-debate test frame; pred_test_plot also saves the figure as a PNG
pred_test_plot(AnalysisType.PCA, economy_debate_x_y_test_combined_pca_df, ProcessingUnit.DEBATE)

In [None]:
# PCA pair plot of the economy-debate prediction frame; pred_test_plot also saves the figure as a PNG
pred_test_plot(AnalysisType.PCA, economy_debate_pred_test_combined_pca_df, ProcessingUnit.DEBATE)

In [None]:
# Overlay economy-debate test and prediction pairs in one PCA plot; lines join rows sharing pair_id and pred_test
both_pred_test_plot(AnalysisType.PCA, economy_debate_both_pca_df, ProcessingUnit.DEBATE)

In [None]:
# Single PCA plot of economy-debate x_test, y_test and predictions; lines join all rows sharing a pair_id
combined_pred_test_plot(AnalysisType.PCA, economy_debate_combined_pca_df, ProcessingUnit.DEBATE)