## Load training triples (anchor, game, label)

In [None]:
import json


# Load the training entries from disk.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) so that the
# result does not depend on the platform's default locale encoding.
with open('training_data_pairs_shuffled.json', 'r', encoding='utf-8') as f:
    loaded_data = json.load(f)

In [None]:
# Show the first entry to check the structure of the loaded data.
print(loaded_data[0])

In [None]:
# Number of loaded entries (shown as the cell's output).
len(loaded_data)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Feature entries for both sides of every pair, plus the pair's label.
features_anchor = [pair['anchor']['features'] for pair in loaded_data]
features_game = [pair['game']['features'] for pair in loaded_data]
labels = [pair['label'] for pair in loaded_data]
# Description inputs are currently disabled:
# description_anchor = [pair['anchor']['description'] for pair in loaded_data]
# description_game = [pair['game']['description'] for pair in loaded_data]

# A single vocabulary is fitted over the anchor and game features together,
# so the same token maps to the same id on both sides.
tokenizer_feat = Tokenizer()
tokenizer_feat.fit_on_texts([*features_anchor, *features_game])
# tokenizer_desc = Tokenizer()
# tokenizer_desc.fit_on_texts([*description_anchor, *description_game])
# vocab_size = max(len(tokenizer_feat.word_index) + 1, len(tokenizer_desc.word_index) + 1)

In [7]:
# Size of the embedding table: one row per token known to the feature
# tokenizer, plus one for index 0, which the Keras Tokenizer never assigns
# to a word.
# (description variant, currently unused: len(tokenizer_desc.word_index) + 1)
vocab_size = 1 + len(tokenizer_feat.word_index)
print(vocab_size)

67775


In [9]:
# Longest *tokenised* feature sequence across both sides of the pairs.
# The length must be measured after tokenisation: at this point
# features_anchor / features_game still hold the raw feature entries, and if
# those are strings, len() on them counts characters rather than token ids,
# which pads every sequence far beyond what is needed.
# The generator variant avoids materialising all token sequences at once.
max_length_feat = max(
    len(token_ids)
    for texts in (features_anchor, features_game)
    for token_ids in tokenizer_feat.texts_to_sequences_generator(texts)
)
# max_length_desc = max(
#     len(token_ids)
#     for texts in (description_anchor, description_game)
#     for token_ids in tokenizer_desc.texts_to_sequences_generator(texts)
# )
# print(max_length_feat, max_length_desc)

3786 21299


In [None]:
def tokenize_and_pad(sequences, tokenizer, max_length):
    """Convert texts to token-id sequences of one common length.

    Args:
        sequences: Texts to convert; passed to ``tokenizer.texts_to_sequences``.
        tokenizer: Object providing ``texts_to_sequences`` (here a fitted
            Keras ``Tokenizer``).
        max_length: Target length, forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with padding added at the end of each
        sequence (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sequences),
        maxlen=max_length,
        padding='post',
    )


# Replace the raw feature entries on both sides with their padded token-id
# sequences; both sides use the same tokenizer and the same target length.
features_anchor, features_game = (
    tokenize_and_pad(raw_features, tokenizer_feat, max_length_feat)
    for raw_features in (features_anchor, features_game)
)
# Description inputs are currently disabled:
# description_anchor = tokenize_and_pad(description_anchor, tokenizer_desc, max_length_desc)
# description_game = tokenize_and_pad(description_game, tokenizer_desc, max_length_desc)

# Quick visual check of one encoded anchor.
print(features_anchor[0])

In [5]:
from sklearn.model_selection import train_test_split

#X_anchor = np.concatenate([np.array([[i+1] for i in range(len(features_anchor))]), description_anchor, features_anchor], axis=1)
#X_game = np.concatenate([np.array([[i+1] for i in range(len(features_game))]), description_game, features_game], axis=1)

# With the id + description variant above disabled, the model inputs are the
# padded feature sequences alone. X_anchor / X_game must be bound here,
# otherwise the split below fails with a NameError.
X_anchor = features_anchor
X_game = features_game

# First split: 70 % training, 30 % held out. Passing all three arrays to one
# call keeps anchor, game and label rows aligned. The padded inputs are
# already NumPy arrays, so they are passed as-is instead of being copied
# through np.array(); only the label list needs converting.
X_train_anchor, X_temp_anchor, X_train_game, X_temp_game, y_train, y_temp = train_test_split(
    X_anchor,
    X_game,
    np.array(labels),
    test_size=0.3,
    random_state=42
)

# Second split: halve the held-out part into validation and test
# (15 % of the data each).
X_val_anchor, X_test_anchor, X_val_game, X_test_game, y_val, y_test = train_test_split(
    X_temp_anchor,
    X_temp_game,
    y_temp,
    test_size=0.5,
    random_state=42
)

# Print the shapes
print("Training shapes:", X_train_anchor.shape, X_train_game.shape, y_train.shape)
print("Validation shapes:", X_val_anchor.shape, X_val_game.shape, y_val.shape)
print("Test shapes:", X_test_anchor.shape, X_test_game.shape, y_test.shape)

KeyboardInterrupt: 

In [None]:

from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam


# Define the Siamese network architecture
def build_siamese_network(input_shape, vocab_size):
    """Build the encoder that is applied to each side of a pair.

    Args:
        input_shape: Length of the token-id sequences fed to the encoder
            (forwarded to the embedding layer as ``input_length``).
        vocab_size: Number of rows in the embedding table (``input_dim``).

    Returns:
        A ``Sequential`` model: 128-dimensional embedding -> LSTM(64) ->
        Dense(32, relu) -> Dropout(0.5) -> Dense(16, relu).
    """
    encoder_layers = [
        layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=input_shape),
        layers.LSTM(64),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(16, activation='relu'),
    ]
    return models.Sequential(encoder_layers)

# Length of each model input. It has to match the length the feature
# sequences were padded to, which is max_length_feat.
input_shape = max_length_feat

# One input per side of the pair.
anchor_input = tf.keras.Input(shape=(input_shape,), name='anchor')
game_input = tf.keras.Input(shape=(input_shape,), name='game')

# A single encoder instance is applied to both inputs, so anchor and game are
# encoded with the same weights.
siamese_network = build_siamese_network(input_shape, vocab_size)

# Generate the encodings (feature vectors) for the anchor and game
encoded_anchor = siamese_network(anchor_input)
encoded_game = siamese_network(game_input)

# Similarity between the two encodings: with normalize=True the Dot layer
# L2-normalises both vectors first, i.e. it outputs their cosine similarity.
# NOTE(review): the encoder ends in a ReLU layer, so the encodings are
# non-negative and the similarity stays within [0, 1], which is what
# binary cross-entropy below expects.
similarity = layers.Dot(axes=1, normalize=True)([encoded_anchor, encoded_game])

# Create the Siamese model
siamese_model = tf.keras.Model(inputs=[anchor_input, game_input], outputs=similarity)

# Compile the Siamese model with binary crossentropy loss
# custom_adam = Adam(learning_rate=0.001)
siamese_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None".
siamese_model.summary()

In [None]:
# Expose the outputs of the train/validation split under the names used by
# the training cell. The split already returns NumPy arrays, so the arrays
# are bound directly rather than copied.
train_anchor = X_train_anchor
train_game = X_train_game
train_labels = y_train

val_anchor = X_val_anchor
val_game = X_val_game
val_labels = y_val

In [None]:
# Train the Siamese model on the (anchor, game) training pairs and monitor it
# on the separate validation split.
# `workers` / `use_multiprocessing` are intentionally not passed: Keras only
# uses them for generator / Sequence inputs (the inputs here are in-memory
# arrays), and newer Keras versions no longer accept them in fit().
# The returned History object is kept so the per-epoch metrics can be
# inspected afterwards.
history = siamese_model.fit(
    [train_anchor, train_game],
    train_labels,
    epochs=10,
    batch_size=128,
    #validation_split=0.2,
    validation_data=([val_anchor, val_game], val_labels),
)

In [None]:
from tensorflow.keras.models import save_model

# Write the model to 'siamese_model.keras' (path is relative to the working directory).
siamese_model.save('siamese_model.keras')

In [None]:
# Evaluate the model on the test set.
# The test arrays are the ones produced by the second train_test_split call
# (X_test_anchor, X_test_game, y_test). The two inputs are passed as a list,
# in the same form used for training.
test_loss, test_accuracy = siamese_model.evaluate(
    [X_test_anchor, X_test_game],
    y_test,
)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy * 100:.2f}%')

In [None]:
# from tensorflow.keras.models import load_model
# 
# siamese_model = load_model("siamese_model.keras")

In [None]:
# get item by id
# anchor_game = {'id': 26715, 'features': 'Adventure Arcade Indie Platform Racing Action abstract cyberpunk fastpaced precisionplatforming shootemup speedrunmode superhero'}
# 
# item_sequences = tokenizer_feat.texts_to_sequences([anchor_game['features']])
# padded_item_sequences = pad_sequences(item_sequences, maxlen=max_length_feat, padding='post')

In [None]:
# get all items