In [1]:
%load_ext tensorboard

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional, RepeatVector, Attention, Concatenate, Conv1D, MaxPooling1D, UpSampling1D, MultiHeadAttention, LayerNormalization, Add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import pickle
import os
import datetime
import time

2024-11-12 00:03:29.619562: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# CSV with the card table; passed to get_names_descriptions() later in this cell.
# The path is relative, so it resolves against the kernel's working directory.
filepath = 'data_download/clean_df.csv'


def get_names_descriptions(filepath):
    """Load the card table and build one free-text description per card.

    The description is the concatenation
    ``<text> cost <manaCost> toughness <toughness> power <power> types <type>``.

    Missing values are replaced by the column's own name (for example a card
    without a power value gets the literal word ``power``). The replacement
    happens *before* the cast to ``str``: casting first would turn NaN into
    the string ``'nan'`` and leave ``fillna`` with nothing to replace.

    Args:
        filepath: Path of a CSV file containing at least the columns
            ``card_name``, ``text``, ``manaCost``, ``toughness``, ``power``
            and ``type``.

    Returns:
        A ``(card_names, card_descriptions)`` tuple of NumPy arrays taken
        from the ``card_name`` column and the generated description column.
    """
    df = pd.read_csv(filepath)

    # `manaCost` is the column that ends up in the description, so it is the
    # one that has to be normalised; a single NaN here would otherwise turn
    # the whole concatenated description into NaN.
    for column in ('text', 'manaCost', 'toughness', 'power', 'type'):
        df[column] = df[column].fillna(column).astype(str)

    df['description'] = (
        df['text']
        + ' cost ' + df['manaCost']
        + ' toughness ' + df['toughness']
        + ' power ' + df['power']
        + ' types ' + df['type']
    )
    return df['card_name'].values, df['description'].values

# Load the data once; card_names and card_descriptions are read as globals by
# the lookup helpers (get_card_description / get_card_name) further down.
card_names, card_descriptions = get_names_descriptions(filepath)

# Bare expression: displays both arrays as this cell's output.
card_names, card_descriptions

  df = pd.read_csv(filepath)


(array(['Tukatongue Thallid', 'Moriok Replica', 'Faerie Mechanist', ...,
        'Mogg Squad', 'Roots of Wisdom', 'Raven Guild Master'],
       dtype=object),
 array(['When Tukatongue Thallid dies, create a one by one green Saproling creature token. cost green toughness one power one types Creature — Fungus',
        'one colorless black, Sacrifice Moriok Replica: You draw two cards and you lose two life. cost three colorless toughness two power two types Artifact Creature — Warrior',
        'Flying\nWhen Faerie Mechanist enters the battlefield, look at the top three cards of your library. You may reveal an artifact card from among them and put it into your hand. Put the rest on the bottom of your library in any order. cost three colorless blue toughness two power two types Artifact Creature — Faerie Artificer',
        ...,
        'Mogg Squad gets minus one by minus one for each other creature on the battlefield. cost one colorless red toughness three power three types Creature — Go

In [4]:
# Passed to every model builder, where it sizes the Embedding output, the
# LSTM units and the Dense bottleneck alike.
embedding_dim = 512
# Length every description is padded/truncated to by pad_sequences. The
# tokenizer in this notebook is created with char_level=True, so this is a
# length in characters rather than words.
max_len_description = 100
# Not referenced by any cell in the visible part of the notebook.
max_len_name = 10

In [5]:
# Tokenization and padding
# char_level=True makes each character a token, so sequence lengths below are
# measured in characters.
# NOTE(review): Keras describes `filters` in terms of word splitting; it
# presumably has no effect in char-level mode -- confirm against the Tokenizer
# implementation before relying on punctuation being stripped.
tokenizer = Tokenizer(char_level=True,
                      lower=True,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

tokenizer.fit_on_texts(card_descriptions)
sequences = tokenizer.texts_to_sequences(card_descriptions)
# padding='post' puts the padding at the end of short sequences. `truncating`
# is left at its default.
# NOTE(review): that default is documented as 'pre', which would keep only the
# LAST max_len_description characters of longer descriptions (the
# "cost ... types ..." tail) -- confirm this is intended.
padded_sequences = pad_sequences(sequences, maxlen=max_len_description, padding='post')

In [6]:
def save_tokenizer(tokenizer, path='./models/tokenizer.pkl'):
    """Pickle ``tokenizer`` to ``path``, creating the parent directory if needed.

    Args:
        tokenizer: Any picklable object (here: the fitted Keras tokenizer).
        path: Destination file. Missing parent directories are created, so
            the call also works on a fresh checkout where ``./models`` does
            not exist yet (same behaviour as ``save_model``).
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:  # a bare file name has no directory component to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(tokenizer, f)

def load_tokenizer(path='./models/tokenizer.pkl'):
    """Return the object that was pickled at ``path``.

    Args:
        path: Pickle file to read; defaults to the location used by
            ``save_tokenizer``.

    Returns:
        The unpickled object.
    """
    # Unpickling can execute arbitrary code: only load files you wrote yourself.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Persist the fitted tokenizer at the default path, then read it back.
save_tokenizer(tokenizer)
# NOTE(review): tokenizer_loaded is not used by the visible cells, which keep
# working with `tokenizer`.
tokenizer_loaded = load_tokenizer()

In [7]:
def save_model(model, name, path='./models'):
    """Write ``model`` to ``<path>/<name>.keras``.

    Args:
        model: Object exposing a ``save(filepath)`` method.
        name: File stem; the ``.keras`` suffix is appended here.
        path: Target directory, created when it does not exist.
    """
    os.makedirs(path, exist_ok=True)
    destination = os.path.join(path, f"{name}.keras")
    model.save(destination)

def load_model(name, path='./models'):
    """Load the Keras model stored at ``<path>/<name>.keras``.

    Args:
        name: File stem, without the ``.keras`` suffix.
        path: Directory that contains the file.

    Returns:
        Whatever ``tf.keras.models.load_model`` returns for that file.
    """
    model_file = os.path.join(path, f"{name}.keras")
    return tf.keras.models.load_model(model_file)

In [8]:
def create_lstm_model(vocab_size, input_length, embedding_dim):
    """Build a single-LSTM sequence autoencoder plus its encoder half.

    Args:
        vocab_size: Number of token ids; sizes the embedding table and the
            softmax output.
        input_length: Length of the (padded) input sequences.
        embedding_dim: Width used for the embedding, both LSTMs and the
            dense layers.

    Returns:
        ``(autoencoder, encoder)``. The autoencoder maps token ids to one
        softmax distribution over ``vocab_size`` per position; the encoder
        maps the same input to the bottleneck vector. Both are built on the
        same layer graph.
    """
    token_ids = Input(shape=(input_length,))

    # Encoder: embed the ids and keep only the LSTM's last output.
    embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(token_ids)
    summary = LSTM(embedding_dim, return_sequences=False)(embedded)

    # Bottleneck: this vector is what the encoder model exposes.
    bottleneck = Dense(embedding_dim, activation='relu')(summary)

    # Decoder: repeat the projected bottleneck once per output position and
    # let a second LSTM unroll it into a sequence.
    projected = Dense(embedding_dim, activation='relu')(bottleneck)
    repeated = RepeatVector(input_length)(projected)
    unrolled = LSTM(embedding_dim, return_sequences=True)(repeated)
    token_probs = Dense(vocab_size, activation='softmax')(unrolled)

    return (
        Model(inputs=token_ids, outputs=token_probs),
        Model(inputs=token_ids, outputs=bottleneck),
    )

def create_bilstm_autoencoder(vocab_size, input_length, embedding_dim):
    """Build an autoencoder whose encoder and decoder LSTMs are bidirectional.

    Args:
        vocab_size: Number of token ids; sizes the embedding table and the
            softmax output.
        input_length: Length of the (padded) input sequences.
        embedding_dim: Width of the embedding, of each LSTM direction and of
            the dense layers.

    Returns:
        ``(autoencoder, encoder)`` built on the same layer graph; the encoder
        outputs the dense bottleneck vector.
    """
    token_ids = Input(shape=(input_length,))

    # Encoder: bidirectional LSTM reduced to a single vector per sequence.
    embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(token_ids)
    context = Bidirectional(LSTM(embedding_dim, return_sequences=False))(embedded)

    # Bottleneck exposed by the encoder model.
    bottleneck = Dense(embedding_dim, activation='relu')(context)

    # Decoder: repeat the projected bottleneck and unroll it with a second
    # bidirectional LSTM.
    projected = Dense(embedding_dim, activation='relu')(bottleneck)
    repeated = RepeatVector(input_length)(projected)
    unrolled = Bidirectional(LSTM(embedding_dim, return_sequences=True))(repeated)
    token_probs = Dense(vocab_size, activation='softmax')(unrolled)

    return (
        Model(inputs=token_ids, outputs=token_probs),
        Model(inputs=token_ids, outputs=bottleneck),
    )

def create_bilstm_autoencoder_attention(vocab_size, input_length, embedding_dim):
    """Build a BiLSTM autoencoder with a self-attention step in the encoder.

    Encoder: Embedding -> BiLSTM (full sequence) -> Dropout(0.2) ->
    Attention over its own output -> concatenation of the LSTM output with
    the attention output -> LSTM (last output only) -> Dense bottleneck.
    Decoder: Dense -> RepeatVector -> BiLSTM -> Dropout(0.2) -> softmax.

    Args:
        vocab_size: Number of token ids; sizes the embedding table and the
            softmax output.
        input_length: Length of the (padded) input sequences.
        embedding_dim: Width of the embedding, of each LSTM direction and of
            the dense layers.

    Returns:
        ``(autoencoder, encoder)`` built on the same layer graph; the encoder
        outputs the dense bottleneck vector.
    """
    token_ids = Input(shape=(input_length,))
    embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(token_ids)

    # Per-position context from both directions, regularised with dropout.
    contextual = Bidirectional(LSTM(embedding_dim, return_sequences=True))(embedded)
    contextual = Dropout(0.2)(contextual)

    # Self-attention: the sequence is used as both query and value, and the
    # result is appended to the features it was computed from.
    attended = Attention()([contextual, contextual])
    enriched = Concatenate()([contextual, attended])

    # Collapse the sequence to one vector, then project it to the bottleneck.
    pooled = LSTM(embedding_dim, return_sequences=False)(enriched)
    bottleneck = Dense(embedding_dim, activation='relu')(pooled)

    # Decoder: repeat the projected bottleneck and unroll it.
    projected = Dense(embedding_dim, activation='relu')(bottleneck)
    repeated = RepeatVector(input_length)(projected)
    unrolled = Bidirectional(LSTM(embedding_dim, return_sequences=True))(repeated)
    unrolled = Dropout(0.2)(unrolled)

    # One softmax over the vocabulary per output position.
    token_probs = Dense(vocab_size, activation='softmax')(unrolled)

    return (
        Model(inputs=token_ids, outputs=token_probs),
        Model(inputs=token_ids, outputs=bottleneck),
    )

def create_cnn_lstm_autoencoder(vocab_size, input_length, embedding_dim):
    """Build a Conv1D + LSTM autoencoder and its encoder half.

    The encoder halves the sequence with max pooling before the LSTM; the
    decoder unrolls a half-length sequence and doubles it again with
    ``UpSampling1D``. For an odd ``input_length`` the doubled sequence would
    not match the input length, so the decoder unrolls one extra half-step
    and crops the surplus position. For even lengths the graph is the same
    as before.

    Args:
        vocab_size: Number of token ids; sizes the embedding table and the
            softmax output.
        input_length: Length of the (padded) input sequences.
        embedding_dim: Width of the embedding, the convolution filters, the
            LSTMs and the dense layers.

    Returns:
        ``(autoencoder, encoder)`` built on the same layer graph; the
        autoencoder emits exactly ``input_length`` softmax distributions and
        the encoder outputs the dense bottleneck vector.
    """
    # Encoder
    input_text = Input(shape=(input_length,))
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_text)
    x = Conv1D(embedding_dim, kernel_size=3, activation='relu', padding='same')(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = LSTM(embedding_dim, return_sequences=False)(x)

    # Bottleneck
    encoded = Dense(embedding_dim, activation='relu')(x)

    # Decoder: round the half length UP so that doubling it always covers
    # input_length, then trim whatever exceeds it.
    half_length = (input_length + 1) // 2
    surplus = 2 * half_length - input_length  # 0 for even, 1 for odd lengths

    x = Dense(embedding_dim, activation='relu')(encoded)
    x = RepeatVector(half_length)(x)
    x = LSTM(embedding_dim, return_sequences=True)(x)
    x = UpSampling1D(size=2)(x)
    if surplus:
        # Drop the extra trailing position so output and target lengths agree.
        x = tf.keras.layers.Cropping1D(cropping=(0, surplus))(x)
    decoded = Dense(vocab_size, activation='softmax')(x)

    # Models
    autoencoder = Model(inputs=input_text, outputs=decoded)
    encoder = Model(inputs=input_text, outputs=encoded)

    return autoencoder, encoder

def transformer_encoder_decoder(vocab_size, input_length, embedding_dim, num_heads=4):
    """Build an autoencoder with one multi-head self-attention layer in the encoder.

    Encoder: Embedding -> MultiHeadAttention (sequence attends to itself) ->
    LayerNormalization -> LSTM (last output only) -> Dense bottleneck.
    Decoder: Dense -> RepeatVector -> LSTM -> softmax, as in the plain LSTM
    autoencoder.

    Args:
        vocab_size: Number of token ids; sizes the embedding table and the
            softmax output.
        input_length: Length of the (padded) input sequences.
        embedding_dim: Width of the embedding, the attention ``key_dim``, the
            LSTMs and the dense layers.
        num_heads: Number of attention heads.

    Returns:
        ``(autoencoder, encoder)`` built on the same layer graph; the encoder
        outputs the dense bottleneck vector.
    """
    token_ids = Input(shape=(input_length,))
    embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(token_ids)

    # Self-attention: the embedded sequence is passed as both query and value.
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(embedded, embedded)
    normalised = LayerNormalization()(attended)
    pooled = LSTM(embedding_dim, return_sequences=False)(normalised)

    # Bottleneck exposed by the encoder model.
    bottleneck = Dense(embedding_dim, activation='relu')(pooled)

    # Decoder: repeat the projected bottleneck and unroll it with an LSTM.
    projected = Dense(embedding_dim, activation='relu')(bottleneck)
    repeated = RepeatVector(input_length)(projected)
    unrolled = LSTM(embedding_dim, return_sequences=True)(repeated)
    token_probs = Dense(vocab_size, activation='softmax')(unrolled)

    return (
        Model(inputs=token_ids, outputs=token_probs),
        Model(inputs=token_ids, outputs=bottleneck),
    )

In [9]:
# Size of the embedding table / softmax output handed to the model builders.
# presumably index 0 is the pad value and never appears in word_index -- confirm.
vocab_size = len(tokenizer.word_index) + 1  # Plus 1 for padding
padded_sequences = np.array(padded_sequences)
# The autoencoders are trained to reproduce their own input: the targets are
# the same token ids with one extra trailing axis of size 1.
target_sequences = np.expand_dims(padded_sequences, -1)

In [10]:
# One stopping rule, passed to every fit() call in the training loop.
# It monitors the training loss; the fit() call in this notebook supplies no
# validation data. A loss drop smaller than min_delta does not count as an
# improvement, and training stops after `patience` such epochs (the logged
# lstm run stops after epoch 6 of 10).
early_stopping = EarlyStopping(
    monitor='loss',    
    patience=2,
    min_delta=0.1,
    restore_best_weights=True
)

In [11]:
# Parallel lists: model_names[i] labels the architecture built by
# model_functions[i]. The training loop indexes both with the same i, so the
# two lists must stay in the same order and have the same length.
model_names = ["lstm", "bilstm", "bilstm_attention", "cnn_lstm", "transformer"]
model_functions = [create_lstm_model, create_bilstm_autoencoder, create_bilstm_autoencoder_attention, create_cnn_lstm_autoencoder, transformer_encoder_decoder]

In [12]:
def compute_embeddings(descriptions):
    """Encode ``descriptions`` with the notebook-level tokenizer and encoder.

    Relies on the globals ``tokenizer``, ``max_len_description`` and
    ``encoder``; ``encoder`` is whichever model the training loop assigned
    most recently.

    Args:
        descriptions: Iterable of description strings.

    Returns:
        The result of ``encoder.predict`` on the padded token sequences.
    """
    token_ids = tokenizer.texts_to_sequences(descriptions)
    model_input = pad_sequences(token_ids, maxlen=max_len_description, padding='post')
    embeddings = encoder.predict(model_input)
    return embeddings

def get_card_description(querry):
    """Resolve a card name to its description, or pass free text through.

    Looks ``querry`` up in the global ``card_names`` array.

    Args:
        querry: A card name, or any other text.

    Returns:
        The entry of ``card_descriptions`` at the first position where
        ``card_names`` equals ``querry``; ``querry`` itself when no name
        matches.
    """
    hits = np.where(card_names == querry)[0]
    if hits.size == 0:
        # Not a known card name: treat the input as a description already.
        return querry
    return card_descriptions[hits[0]]

def get_card_name(querry):
    """Return the name of the first card whose description equals ``querry``.

    Reads the global ``card_descriptions`` and ``card_names`` arrays. When
    several cards share the same description, the first one wins.

    Args:
        querry: A description string exactly as stored in
            ``card_descriptions``.

    Returns:
        The matching entry of ``card_names``.

    Raises:
        IndexError: If no description equals ``querry``.
    """
    matching_rows = np.where(card_descriptions == querry)[0]
    first_row = matching_rows[0]  # IndexError here means "no such description"
    return card_names[first_row]

def find_similar_cards(querry, card_descriptions, card_embeddings, top_n=3):
    """Rank cards by cosine similarity to a query card or free-text query.

    ``querry`` is first resolved with ``get_card_description`` (a known card
    name becomes that card's description, anything else is used verbatim),
    embedded with ``compute_embeddings`` and compared with every row of
    ``card_embeddings``.

    Args:
        querry: Card name or free-text description.
        card_descriptions: Indexable of descriptions, aligned row by row with
            ``card_embeddings``.
        card_embeddings: 2-D array with one embedding per card.
        top_n: Maximum number of results. Values of zero or below yield an
            empty list (previously ``top_n=0`` returned every card because
            the slice ``[-0:]`` selects the whole array).

    Returns:
        A list of ``(description, similarity)`` tuples, most similar first,
        with at most ``top_n`` entries. If the query is itself one of the
        cards, it is included in the ranking.
    """
    if top_n <= 0:
        return []

    card_description = get_card_description(querry)
    query_embedding = compute_embeddings([card_description])[0]
    similarities = cosine_similarity([query_embedding], card_embeddings)[0]

    # Reverse first, then cut: same order as the old `[-top_n:][::-1]` for
    # every positive top_n, without the `-0` slicing pitfall.
    ranked = similarities.argsort()[::-1][:top_n]
    return [(card_descriptions[i], similarities[i]) for i in ranked]

# Train, evaluate and save every architecture in turn.
# All names assigned in this loop are notebook globals. In particular
# `encoder` must keep this exact name: compute_embeddings() reads it as a
# global, so the similarity search below always uses the model that was
# trained in the current iteration.
for i in range(len(model_names)):
    # Suffix for the TensorBoard run directory and file stems for saving.
    model_name = "_" + model_names[i]
    autoencoder_name = model_names[i] + "_autoencoder"
    encoder_name = model_names[i] + "_encoder"
    model_function = model_functions[i]
    autoencoder, encoder = model_function(vocab_size, max_len_description, embedding_dim)
    # print(autoencoder.summary(), encoder.summary())
    # Sparse loss: targets are integer token ids (target_sequences), not one-hot vectors.
    autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # One TensorBoard run directory per model: timestamp plus model suffix.
    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + model_name
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    print(f"Model: {model_names[i]}")

    # Wall-clock timing of the fit call only. The same early_stopping
    # callback object is passed to every model's fit().
    start_time = time.time()
    autoencoder.fit(padded_sequences, target_sequences, epochs=10, batch_size=16, callbacks=[early_stopping, tensorboard_callback])
    end_time = time.time()

    training_time = end_time - start_time
    print(f"Training Time: {training_time:.2f} seconds")

    # Qualitative check: embed every card with the freshly trained encoder
    # and print the 10 nearest cards for a few fixed queries.
    card_embeddings = compute_embeddings(card_descriptions)
    # These entries are card names; find_similar_cards() resolves them to
    # descriptions through get_card_description().
    query_descriptions = ['Sol Ring', 'Structural Assault', 'Crossbow Ambush', 'Mephitic Draught', 'Sangromancer']
    for query_description in query_descriptions:
        similar_cards = find_similar_cards(query_description, card_descriptions, card_embeddings, 10)
        print(f"Similar cards to {query_description}:")
        for desc, score in similar_cards:
            # Results carry descriptions; map each back to a card name for display.
            print(f"{get_card_name(desc)}, (similarity: {score:.2f})")

    # NOTE(review): the models are saved only after the evaluation printout,
    # so an exception raised above (e.g. IndexError from get_card_name) would
    # discard the trained weights -- consider saving before evaluating.
    save_model(autoencoder, autoencoder_name)
    save_model(encoder, encoder_name)

Model: lstm
Epoch 1/10
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m639s[0m 390ms/step - loss: 2.8196
Epoch 2/10
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m633s[0m 387ms/step - loss: 2.3512
Epoch 3/10
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m635s[0m 389ms/step - loss: 2.0880
Epoch 4/10
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m637s[0m 390ms/step - loss: 1.9430
Epoch 5/10
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m634s[0m 388ms/step - loss: 1.9065
Epoch 6/10
[1m1634/1634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m634s[0m 388ms/step - loss: 1.8920
Training Time: 3811.90 seconds
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Similar cards to Sol Ring:
Sol Ring, (similarity: 1.00)
Morningtide, (similarity: 1.00)
Glimpse the Unthinkable, (similarity: 1.00)
Concentrate, (similarity: 1.00)
Ac