## load training triples

In [49]:
import json


with open('training_data_pairs_shuffled.json', 'r') as f:
    loaded_data = json.load(f)

In [50]:
label_list = []
anchor_list = []
game_list = []

for entry in loaded_data:
    label_list.append(entry['label'])
    anchor_list.append(f"{entry['anchor']['id']} {entry['anchor']['features']} {entry['anchor']['description']}")
    game_list.append(f"{entry['game']['id']} {entry['game']['features']} {entry['game']['description']}")

In [51]:
# import polars as pl
#
# label_list = []
# anchor_id_list = []
# anchor_features_list = []
# anchor_description_list = []
# game_id_list = []
# game_features_list = []
# game_description_list = []
# 
# for entry in loaded_data:
#     label_list.append(entry['label'])
#     anchor_id_list.append(entry['anchor']['id'])
#     anchor_features_list.append(entry['anchor']['features'])
#     anchor_description_list.append(entry['anchor']['description'])
#     game_id_list.append(entry['game']['id'])
#     game_features_list.append(entry['game']['features'])
#     game_description_list.append(entry['game']['description'])
# 
# data = {
#     'label': label_list,
#     'anchor_id': anchor_id_list,
#     'anchor_features': anchor_features_list,
#     'anchor_description': anchor_description_list,
#     'game_id': game_id_list,
#     'game_features': game_features_list,
#     'game_description': game_description_list
# }
# 
# df = pl.DataFrame(data)
# df.head()

In [52]:
# all_pairs_count = df.count().item(0,0)
# print(all_pairs_count)
# train_df = df.sample(all_pairs_count*0.8, shuffle=True, seed=24)
# print(train_df.describe())

In [53]:
# corpus = train_df.get_column('anchor_description') + train_df.get_column('game_description') + train_df.get_column('anchor_features') + train_df.get_column('game_features')
# print(corpus.shape)

In [54]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# 
# MAX_NUM_WORDS = 10000
# tokenizer = Tokenizer(MAX_NUM_WORDS)
# tokenizer.fit_on_texts(corpus)
# 
# A_train_description = tokenizer.texts_to_sequences(train_df.get_column('anchor_description'))
# B_train_description = tokenizer.texts_to_sequences(train_df.get_column('game_description'))
# 
# A_train_features = tokenizer.texts_to_sequences(train_df.get_column('anchor_features'))
# B_train_features = tokenizer.texts_to_sequences(train_df.get_column('game_features'))

In [55]:
# MAX_DESC_LENGTH = max([len(d) for d in A_train_description + B_train_description])
# MAX_FEAT_LENGTH = max([len(d) for d in A_train_features + B_train_features])
# print(f'MAX_DESC_LENGTH: {MAX_DESC_LENGTH} | MAX_FEAT_LENGTH: {MAX_FEAT_LENGTH}')

In [56]:
# A_train_description = pad_sequences(A_train_description, maxlen=MAX_DESC_LENGTH)
# B_train_description = pad_sequences(B_train_description, maxlen=MAX_DESC_LENGTH)
# 
# A_train_features = pad_sequences(A_train_features, maxlen=MAX_FEAT_LENGTH)
# B_train_features = pad_sequences(B_train_features, maxlen=MAX_FEAT_LENGTH)

In [57]:
# train_df.head()

In [58]:
# new_train_df = train_df.insert_column(1, pl.Series("A_description", A_train_description))
# new_train_df = new_train_df.insert_column(1, pl.Series("B_description", B_train_description))
# new_train_df = new_train_df.insert_column(1, pl.Series("A_features", A_train_features))
# new_train_df = new_train_df.insert_column(1, pl.Series("B_features", B_train_features))

In [59]:
# new_train_df = new_train_df.drop('anchor_features', 'anchor_description', 'game_features', 'game_description')
# new_train_df.head()

In [60]:
from sklearn.model_selection import train_test_split

A_train, A_test, B_train, B_test, y_train, y_test = train_test_split(
    anchor_list,
    game_list,
    label_list,
    test_size=0.9, random_state=42)

In [61]:
MAX_LENGTH = max([len(d) for d in anchor_list + game_list])
NUM_CLASSES = 2
MAX_TOKENS = 10000

In [64]:
import tensorflow as tf
from tensorflow.keras import layers, models

anchor_input = tf.keras.Input(shape=(1,), name='anchor', dtype=tf.string)
game_input = tf.keras.Input(shape=(1,), name='game', dtype=tf.string)


vectorize_layer = layers.TextVectorization(max_tokens=MAX_TOKENS,) 
vectorize_layer.adapt(anchor_list)

A_vectorized = vectorize_layer(anchor_input)
B_vectorized = vectorize_layer(game_input)

embedding_layer = layers.Embedding(input_dim=MAX_LENGTH, output_dim=128)
A_embedded = embedding_layer(A_vectorized)
B_embedded = embedding_layer(B_vectorized)

shared_lstm = layers.LSTM(64)
A_output = shared_lstm(A_embedded)
B_output = shared_lstm(B_embedded)

merged = layers.concatenate(
    [A_output, B_output], 
    axis=-1)

dense = layers.Dense(
    units=NUM_CLASSES, 
    activation='softmax')
predictions = dense(merged)

siamese_model = models.Model(inputs=[anchor_input, game_input],
                                  outputs=predictions, 
                                  name='siamese_model')

siamese_model.compile(optimizer='adam', 
                      loss='categorical_crossentropy', 
                      metrics=['accuracy'])

siamese_model.summary()

Model: "siamese_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 anchor (InputLayer)         [(None, 1)]                  0         []                            
                                                                                                  
 game (InputLayer)           [(None, 1)]                  0         []                            
                                                                                                  
 text_vectorization_5 (Text  (None, None)                 0         ['anchor[0][0]',              
 Vectorization)                                                      'game[0][0]']                
                                                                                                  
 embedding_2 (Embedding)     (None, None, 128)            1782272   ['text_vectorizati

In [65]:
from keras.utils import plot_model

plot_model(siamese_model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [None]:
history = siamese_model.predict(anchor_list[:2])

# history = siamese_model.fit(
#     [train_anchor, train_game],
#     np.array(train_labels),
#     epochs=10,
#     batch_size=64,
#     #validation_split=0.2,
#     use_multiprocessing=True,
#     workers=10,
#     validation_data=([val_anchor, val_game], np.array(val_labels))
# )

In [None]:
from matplotlib import pyplot as plt

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()