## load training triples

In [1]:
import json
import random


with open('training_data_pairs.json', 'r') as f:
    loaded_data = json.load(f)

train_data = loaded_data['train_data']
val_data = loaded_data['val_data']
test_data = loaded_data['test_data']

In [2]:
print(train_data[0])

{'anchor': {'id': 556, 'features': 'Shooter Action Stealth achievements aggressivedooropening binkvideo bloody checkpoints classbased controversy corpse darkness death digitaldistribution dolbydigital e eaapp eagunclub famousquotes firstpersonshooter franchisereboot gamersgate gamescom gamesondemand goat guidedbyradio human infiniteammo invisiblewall killfeed killstreakreward lineargameplay maleprotagonist matchmaking mature melee mercenary military militaryjargon modern modernmilitary modernwarfare mouse mp multimonitorsupport newpurchaseincentives norespawn novintfalconsupport onsiteprocurement paxprime playstationtrophies politicalthriller polygonald prereleasepublictesting profanity realism realtimecombat samenamereboot scriptedevents snow soldier stealthcamouflage stealthkill steam tank teamdeathmatch terrorists throwingweapons unitedstatesarmy useablevehicles visionobstruction voiceacting voicechat war wasdmovement whitenoise xboxlive'}, 'game': {'id': 23823, 'features': '  '}, '

In [3]:
total_len = len(train_data) + len(val_data) + len(test_data)
print(f'train: {len(train_data)}, val: {len(val_data)}, test: {len(test_data)}, total: {total_len}')

train: 45767, val: 91535, test: 91536, total: 228838


In [4]:
# train_data = train_data[:int(len(train_data)/2)]
# val_data = val_data[:int(len(val_data)/2)]
# test_data = test_data[:int(len(test_data)/2)]
# 
# total_len = len(train_data) + len(val_data) + len(test_data)
# print(f'train: {len(train_data)}, val: {len(val_data)}, test: {len(test_data)}, total: {total_len}')

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def extract_features_and_labels(data):
    features_anchor = [entry['anchor']['features'] for entry in data]
    features_game = [entry['game']['features'] for entry in data]
    labels = [entry['label'] for entry in data]
    return (features_anchor, features_game, labels)


train_anchor, train_game, train_labels = extract_features_and_labels(train_data)
val_anchor, val_game, val_labels = extract_features_and_labels(val_data)
test_anchor, test_game, test_labels = extract_features_and_labels(test_data)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_anchor + train_game)
vocab_size = len(tokenizer.word_index) + 1
max_sequence_length = max(
        max(len(seq) for seq in train_anchor),
        max(len(seq) for seq in train_game),
        max(len(seq) for seq in val_anchor),
        max(len(seq) for seq in val_game),
        max(len(seq) for seq in test_anchor),
        max(len(seq) for seq in test_game)
    )

def tokenize_and_pad(sequences):
    tokenized_sequences = tokenizer.texts_to_sequences(sequences)
    padded_sequences = pad_sequences(tokenized_sequences, maxlen=max_sequence_length, padding='post')
    return padded_sequences


train_anchor = tokenize_and_pad(train_anchor)
train_game = tokenize_and_pad(train_game)

val_anchor = tokenize_and_pad(val_anchor)
val_game = tokenize_and_pad(val_game)

test_anchor = tokenize_and_pad(test_anchor)
test_game = tokenize_and_pad(test_game)

print(train_anchor[0])

2024-01-19 17:03:33.079553: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-19 17:03:33.501271: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-19 17:03:33.501379: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-19 17:03:33.580773: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-19 17:03:33.747398: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-19 17:03:33.749837: I tensorflow/core/platform/cpu_feature_guard.cc:1

[11  1 67 ...  0  0  0]


In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam


# Define the Siamese network architecture
def build_siamese_network(input_shape, vocab_size):
    model = models.Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=input_shape))
    model.add(layers.LSTM(64))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(16, activation='relu'))
    return model

# Define the input shapes
input_shape = max_sequence_length

# Create the Siamese network
anchor_input = tf.keras.Input(shape=(input_shape,), name='anchor')
game_input = tf.keras.Input(shape=(input_shape,), name='game')

siamese_network = build_siamese_network(input_shape, vocab_size)

# Generate the encodings (feature vectors) for the anchor and game
encoded_anchor = siamese_network(anchor_input)
encoded_game = siamese_network(game_input)

# Calculate the similarity between the anchor and game
similarity = layers.Dot(axes=1, normalize=True)([encoded_anchor, encoded_game])

# Create the Siamese model
siamese_model = tf.keras.Model(inputs=[anchor_input, game_input], outputs=similarity)

# Compile the Siamese model with binary crossentropy loss
# custom_adam = Adam(learning_rate=0.001)
siamese_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
print(siamese_model.summary())

2024-01-19 17:03:56.707298: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-19 17:03:56.767982: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 anchor (InputLayer)         [(None, 3786)]               0         []                            
                                                                                                  
 game (InputLayer)           [(None, 3786)]               0         []                            
                                                                                                  
 sequential (Sequential)     (None, 16)                   470576    ['anchor[0][0]',              
                                                                     'game[0][0]']                
                                                                                                  
 dot (Dot)                   (None, 1)                    0         ['sequential[0][0]',      

In [7]:
train_anchor = np.array(train_anchor)
train_game = np.array(train_game)
train_labels = np.array(train_labels)

val_anchor = np.array(val_anchor)
val_game = np.array(val_game)
val_labels = np.array(val_labels)

In [None]:
# Train the Siamese model with your padded_sequences_anchor, padded_sequences_game, and labels
siamese_model.fit(
    [train_anchor, train_game],
    train_labels,
    epochs=10,
    batch_size=128,
    #validation_split=0.2,
    use_multiprocessing=True,
    workers=10,
    validation_data=([val_anchor, val_game], val_labels)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

In [None]:
from tensorflow.keras.models import save_model

siamese_model.save('siamese_model.keras')

In [None]:
# Evaluate the model on the test set
test_loss, test_accuracy = siamese_model.evaluate(
    (test_anchor, test_game),
    np.array(test_labels),
)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy * 100:.2f}%')

In [None]:
# from tensorflow.keras.models import load_model
# 
# siamese_model = load_model("siamese_model.h5")

In [None]:
# get item by id
# anchor_game = {'id': 26715, 'features': 'Adventure Arcade Indie Platform Racing Action abstract cyberpunk fastpaced precisionplatforming shootemup speedrunmode superhero'}
# 
# item_sequences = tokenizer.texts_to_sequences([anchor_game['features']])
# padded_item_sequences = pad_sequences(item_sequences, maxlen=max_sequence_length, padding='post')

In [None]:
# get all items