In [1]:
%load_ext tensorboard

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional, RepeatVector, Attention, Concatenate, Conv1D, MaxPooling1D, Concatenate, UpSampling1D, MultiHeadAttention, LayerNormalization, Add, GRU, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import pickle
import os
import datetime
import time
from datasets import load_dataset
import json


2024-11-30 01:28:08.730533: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset("embedding-data/simple-wiki")

In [4]:
train_data = dataset['train']
sentences = [sentence for pair in train_data['set'] for sentence in pair]
for i in range(10):
    print(sentences[i])

The greatest example has been in his present job ( then , Minister for Foreign Affairs ) , where he has perforce concentrated on Anglo-Irish relations and , in particular Northern Ireland ( .
The greatest example has been in his present job ( then , Minister for Foreign Affairs ) , where he has perforce concentrated on Anglo-Irish relations and , in particular the North ( i.e. , Northern Ireland ) .
President Hillery refused to speak to any opposition party politicians , but when Charles Haughey , who was Leader of the Opposition , had rang the President 's Office he threatened to end the career of the army officer answered and refused on Hillery 's explicit orders to put the call through to the President .
His reputation rose further when opposition leaders under parliamentary privilege alleged that Taoiseach Charles Haughey , who in January 1982 had been Leader of the Opposition , had not merely rung the President 's Office but threatened to end the career of the army officer who too

In [5]:
filepath = 'data_download/MyDataMTGv2.json'
# filepath = 'data_download/clean_df.csv'

# def load_dataset(path):
#     df = pd.read_csv('data_download/clean_df.csv')
#     df['text'] = df['text'].astype(str).fillna('text')
#     df['manaCost'] = df['manaCost'].astype(str).fillna('manaCost')
#     df['toughness'] = df['toughness'].astype(str).fillna('toughness')
#     df['power'] = df['power'].astype(str).fillna('power')
#     df['type'] = df['type'].astype(str).fillna('type')
#     df['description'] = df['text'] + ' manaCost ' + df['manaCost'] + ' toughness ' + df['toughness'] + ' power ' + df['power'] + ' types ' + df['type']
#     return df['card_name'].values, df['description'].values

def load_dataset(path):
    df = pd.read_json(path).T
    df['text'] = df['text'].astype(str).fillna('text')
    df['manaValue'] = df['manaValue'].astype(str).fillna('manaValue')
    df['toughness'] = df['toughness'].astype(str).fillna('toughness')
    df['power'] = df['power'].astype(str).fillna('power')
    df['type'] = df['type'].astype(str).fillna('type')
    df['description'] = df['text'] + '' + df['manaValue'] + '' + df['toughness'] + '' + df['power'] + '' + df['type']
    df_filtered = df[['description']].reset_index()
    df_filtered.rename(columns={'index': 'card_name'}, inplace=True)
    return df_filtered['card_name'].values, df_filtered['description'].values

card_names, card_descriptions = load_dataset(filepath)

card_names, card_descriptions

(array(['Tukatongue Thallid', 'Moriok Replica', 'Faerie Mechanist', ...,
        'Mogg Squad', 'Roots of Wisdom', 'Raven Guild Master'],
       dtype=object),
 array(['When Tukatongue Thallid dies, create a 1/1 green Saproling creature token.1.011Creature — Fungus',
        '{1}{B}, Sacrifice Moriok Replica: You draw two cards and you lose 2 life.3.022Artifact Creature — Warrior',
        'Flying\nWhen Faerie Mechanist enters the battlefield, look at the top three cards of your library. You may reveal an artifact card from among them and put it into your hand. Put the rest on the bottom of your library in any order.4.022Artifact Creature — Faerie Artificer',
        ...,
        'Mogg Squad gets -1/-1 for each other creature on the battlefield.2.033Creature — Goblin',
        "Mill three cards, then return a land card or Elf card from your graveyard to your hand. If you can't, draw a card. (To mill a card, put the top card of your library into your graveyard.)2.0nannanSorcery",
       

In [6]:
embedding_dim = 512
max_len_description = 50 
max_len_name = 10 

In [7]:
# Tokenization and padding
tokenizer_sentences = Tokenizer(char_level=True,
                      lower=True,
                      filters='!"#$%&()*,.:;<=>?@[\\]^_`{|}~\t\n')
                    #   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

tokenizer_card_descriptions= Tokenizer(char_level=True,
                             lower=True,
                             filters='!"#$%&()*,.:;<=>?@[\\]^_`{|}~\t\n')
                    #   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

tokenizer_sentences.fit_on_texts(sentences)
sequences_sentences = tokenizer_sentences.texts_to_sequences(sentences)
padded_sequences_sentences = pad_sequences(sequences_sentences, maxlen=max_len_description, padding='post')

tokenizer_card_descriptions.fit_on_texts(card_descriptions)
sequences_card_descriptions = tokenizer_card_descriptions.texts_to_sequences(card_descriptions)
padded_sequences_card_descriptions = pad_sequences(sequences_card_descriptions, maxlen=max_len_description, padding='post')


In [8]:
vocab_size = len(tokenizer_sentences.word_index) + 1  # Plus 1 for padding

padded_sequences_sentences = np.array(padded_sequences_sentences)
target_padded_sequences_sentences= np.expand_dims(padded_sequences_sentences, -1)

padded_sequences_card_descriptions = np.array(padded_sequences_card_descriptions)
target_padded_sequences_card_descriptions= np.expand_dims(padded_sequences_card_descriptions, -1)

In [9]:
def save_tokenizer(tokenizer, path='./models/tokenizer.pkl'):
    with open(path, 'wb') as f:
        pickle.dump(tokenizer, f)

def load_tokenizer(path='./models/tokenizer.pkl'):
    with open(path, 'rb') as f:
        tokenizer = pickle.load(f)
    return tokenizer

save_tokenizer(tokenizer_card_descriptions)
tokenizer_card_descriptions = load_tokenizer()

In [10]:
def save_model(model, name, path='./models'):
    os.makedirs(path, exist_ok=True)
    model.save(os.path.join(path, f"{name}.keras"))

def load_model(name, path='./models'):
    return tf.keras.models.load_model(os.path.join(path, f"{name}.keras"))



In [11]:
def create_lstm_model(vocab_size, input_length, embedding_dim):
    # Encoder
    input_text = Input(shape=(input_length,))
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_text)
    x = LSTM(embedding_dim, return_sequences=False)(x)
    
    # Dense layer as the "bottleneck" embedding (this is our sentence embedding)
    encoded = Dense(embedding_dim, activation='relu')(x)
    
    # Decoder
    x = Dense(embedding_dim, activation='relu')(encoded)
    x = tf.keras.layers.RepeatVector(input_length)(x)
    x = LSTM(embedding_dim, return_sequences=True)(x)
    decoded = Dense(vocab_size, activation='softmax')(x)
    
    # Autoencoder model
    autoencoder = Model(inputs=input_text, outputs=decoded)
    
    # Encoder model (for extracting embeddings)
    encoder = Model(inputs=input_text, outputs=encoded)
    
    return autoencoder, encoder

def create_lstm_bilstm(vocab_size, input_length, embedding_dim):
    # Input for both encoders
    input_text = Input(shape=(input_length,))
    
    # BiLSTM Encoder
    x1 = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_text)
    x1 = Bidirectional(LSTM(embedding_dim, return_sequences=False))(x1)
    encoded1 = Dense(embedding_dim, activation='relu')(x1)

    # LSTM Encoder
    x2 = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_text)
    x2 = LSTM(embedding_dim, return_sequences=False)(x2)
    encoded2 = Dense(embedding_dim, activation='relu')(x2)

    # Combine Encodings
    combined_encoding = Concatenate()([encoded1, encoded2])

    # Decoder
    x = Dense(embedding_dim * 2, activation='relu')(combined_encoding)  # Adjust for doubled embedding dimension
    x = RepeatVector(input_length)(x)
    x = LSTM(embedding_dim * 2, return_sequences=True)(x)
    decoded = Dense(vocab_size, activation='softmax')(x)

    # Models
    autoencoder = Model(inputs=input_text, outputs=decoded)
    encoder = Model(inputs=input_text, outputs=combined_encoding)

    return autoencoder, encoder

def create_bilstm_autoencoder(vocab_size, input_length, embedding_dim):
    # Encoder
    input_text = Input(shape=(input_length,))
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_text)
    x = Bidirectional(LSTM(embedding_dim, return_sequences=False))(x)
    
    # Bottleneck
    encoded = Dense(embedding_dim, activation='relu')(x)
    
    # Decoder
    x = Dense(embedding_dim, activation='relu')(encoded)
    x = RepeatVector(input_length)(x)
    x = Bidirectional(LSTM(embedding_dim, return_sequences=True))(x)
    decoded = Dense(vocab_size, activation='softmax')(x)
    
    # Models
    autoencoder = Model(inputs=input_text, outputs=decoded)
    encoder = Model(inputs=input_text, outputs=encoded)
    
    return autoencoder, encoder


def create_bilstm_autoencoder_attention(vocab_size, input_length, embedding_dim):
    # Encoder
    input_text = Input(shape=(input_length,))
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_text)
    
    # Bidirectional LSTM for richer context encoding
    x = Bidirectional(LSTM(embedding_dim, return_sequences=True))(x)
    
    # Adding a Dropout layer to prevent overfitting
    x = Dropout(0.2)(x)
    
    # Attention layer to focus on important words for MTG cards
    # We calculate attention on the output of the LSTM
    attention = Attention()([x, x])
    x = Concatenate()([x, attention])  # Concatenate original LSTM output with attention output
    
    # Final dense layer as bottleneck embedding (sentence embedding)
    x = LSTM(embedding_dim, return_sequences=False)(x)  # Flatten output for dense layer
    encoded = Dense(embedding_dim, activation='relu')(x)
    
    # Decoder
    x = Dense(embedding_dim, activation='relu')(encoded)
    x = RepeatVector(input_length)(x)
    
    # Second LSTM layer for decoding
    x = Bidirectional(LSTM(embedding_dim, return_sequences=True))(x)
    x = Dropout(0.2)(x)  # Dropout in decoder for robustness
    
    # Final output layer with softmax activation
    decoded = Dense(vocab_size, activation='softmax')(x)
    
    # Autoencoder model
    autoencoder = Model(inputs=input_text, outputs=decoded)
    
    # Encoder model (for extracting embeddings)
    encoder = Model(inputs=input_text, outputs=encoded)
    
    return autoencoder, encoder


def create_cnn_lstm_autoencoder(vocab_size, input_length, embedding_dim):
    # Encoder
    input_text = Input(shape=(input_length,))
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_text)
    x = Conv1D(embedding_dim, kernel_size=3, activation='relu', padding='same')(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = LSTM(embedding_dim, return_sequences=False)(x)
    
    # Bottleneck
    encoded = Dense(embedding_dim, activation='relu')(x)
    
    # Decoder
    x = Dense(embedding_dim, activation='relu')(encoded)
    x = RepeatVector(input_length // 2)(x)
    x = LSTM(embedding_dim, return_sequences=True)(x)
    x = UpSampling1D(size=2)(x)
    decoded = Dense(vocab_size, activation='softmax')(x)
    
    # Models
    autoencoder = Model(inputs=input_text, outputs=decoded)
    encoder = Model(inputs=input_text, outputs=encoded)
    
    return autoencoder, encoder


def transformer_encoder_decoder(vocab_size, input_length, embedding_dim, num_heads=4, dropout_rate=0.1):
    # Positional Embedding Layer
    class PositionalEmbedding(Layer):
        def __init__(self, vocab_size, embedding_dim, input_length):
            super(PositionalEmbedding, self).__init__()
            self.embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
            self.positional_encoding = self.add_weight(
                shape=(input_length, embedding_dim), initializer="zeros", trainable=True
            )
        
        def call(self, inputs):
            seq_length = tf.shape(inputs)[1]
            return self.embedding(inputs) + self.positional_encoding[:seq_length]

    # Encoder
    input_text = Input(shape=(input_length,))
    x = PositionalEmbedding(vocab_size, embedding_dim, input_length)(input_text)
    for _ in range(2):  # Stacking lightweight attention layers
        attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(x, x)
        x = LayerNormalization()(Add()([x, attn_output]))
        x = Dropout(dropout_rate)(x)
    x = GRU(embedding_dim, return_sequences=False)(x)
    
    # Bottleneck
    encoded = Dense(embedding_dim // 2, activation='relu')(x)
    encoded = Dense(embedding_dim, activation='relu')(encoded)

    # Decoder
    x = Dense(embedding_dim, activation='relu')(encoded)
    x = RepeatVector(input_length)(x)
    for _ in range(2):  # Reuse GRU with shared layers
        x = GRU(embedding_dim, return_sequences=True)(x)
        x = LayerNormalization()(x)
    decoded = Dense(vocab_size, activation='softmax')(x)

    # Models
    autoencoder = Model(inputs=input_text, outputs=decoded)
    encoder = Model(inputs=input_text, outputs=encoded)

    return autoencoder, encoder

In [12]:
early_stopping = EarlyStopping(
    monitor='loss',    
    patience=5,
    min_delta=0.001,
    restore_best_weights=True
)

In [13]:
# model_names = ["lstm", "bilstm", "lstm_bilstm", "bilstm_attention", "cnn_lstm", "transformer"]
model_names = ["lstm", "cnn_lstm"]
# model_names = ["transformer"]
# model_functions = [create_lstm_model, create_bilstm_autoencoder, create_lstm_bilstm, create_bilstm_autoencoder_attention, create_cnn_lstm_autoencoder, transformer_encoder_decoder]
model_functions = [create_lstm_model, create_cnn_lstm_autoencoder]

In [14]:
def compute_embeddings(descriptions):
    sequences = tokenizer_card_descriptions.texts_to_sequences(descriptions)
    padded_seqs = pad_sequences(sequences, maxlen=max_len_description, padding='post')
    return encoder.predict(padded_seqs)

def get_card_description(querry):
    index = np.where(card_names == querry)[0]
    if index.size > 0:
        return card_descriptions[index][0]
    
    return querry

def get_card_name(querry):
    card_index = np.where(card_descriptions == querry)[0][0]
    return card_names[card_index]

def find_similar_cards(querry, card_descriptions, card_embeddings, top_n=3):
    card_description = get_card_description(querry)
    query_embedding = compute_embeddings([card_description])[0]
    similarities = cosine_similarity([query_embedding], card_embeddings)[0]
    similar_indices = similarities.argsort()[-top_n:][::-1]
    return [(card_descriptions[i], similarities[i]) for i in similar_indices]

model_predictions = []

for i in range(len(model_names)):
    model_name = "_" + model_names[i]
    autoencoder_name = model_names[i] + "_autoencoder"
    encoder_name = model_names[i] + "_encoder"
    model_function = model_functions[i]
    autoencoder, encoder = model_function(vocab_size, max_len_description, embedding_dim)
    # autoencoder = load_model(autoencoder_name)
    # encoder = load_model(encoder_name)
    # print(autoencoder.summary(), encoder.summary())
    autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    
    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + model_name
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    print(f"Model: {model_names[i]}")
    
    start_time = time.time()
    autoencoder.fit(padded_sequences_sentences, target_padded_sequences_sentences, epochs=60, batch_size=128, callbacks=[early_stopping, tensorboard_callback])
    # autoencoder.fit(padded_sequences_sentences, target_padded_sequences_sentences, epochs=1, batch_size=128, callbacks=[early_stopping])
    end_time = time.time()
    training_time = end_time - start_time
    print(f"Training Time: {training_time:.2f} seconds")

    print("Fine tunning:")
    start_time = time.time()
    autoencoder.fit(padded_sequences_card_descriptions, target_padded_sequences_card_descriptions, epochs=20, batch_size=32, callbacks=[early_stopping, tensorboard_callback])
    # autoencoder.fit(padded_sequences_card_descriptions, target_padded_sequences_card_descriptions, epochs=1, batch_size=32, callbacks=[early_stopping])
    end_time = time.time()
    training_time = end_time - start_time
    print(f"Fine tunning time: {training_time:.2f} seconds")
    
    card_embeddings = compute_embeddings(card_descriptions)
    query_descriptions = ['Sol Ring', 'Structural Assault', 'Crossbow Ambush', 'Mephitic Draught', 'Sangromancer']
    for query_description in query_descriptions:
        card_predictions = [model_names[i], query_description]
        similar_cards = find_similar_cards(query_description, card_descriptions, card_embeddings, 10)
        for desc, score in similar_cards:        
            card_predictions.append(get_card_name(desc))

        model_predictions.append(card_predictions)


    save_model(autoencoder, autoencoder_name)
    save_model(encoder, encoder_name)

Model: lstm
Epoch 1/60
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m853s[0m 533ms/step - loss: 2.9001
Epoch 2/60
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m856s[0m 536ms/step - loss: 2.3043
Epoch 3/60
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m859s[0m 537ms/step - loss: 1.8891
Epoch 4/60
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m860s[0m 538ms/step - loss: 1.5595
Epoch 5/60
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m861s[0m 539ms/step - loss: 1.2918
Epoch 6/60
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m862s[0m 539ms/step - loss: 1.0825
Epoch 7/60
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m862s[0m 539ms/step - loss: 0.9474
Epoch 8/60
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m863s[0m 540ms/step - loss: 0.8692
Epoch 9/60
[1m1598/1598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m863s[0m 540ms/step - loss: 0.8031
Epoch 10/60
[1m1598/1598

In [15]:
columns = ['Model Name', 'Card Name'] + [f'Similar Card {i+1}' for i in range(10)]
df = pd.DataFrame(model_predictions, columns=columns)
df

Unnamed: 0,Model Name,Card Name,Similar Card 1,Similar Card 2,Similar Card 3,Similar Card 4,Similar Card 5,Similar Card 6,Similar Card 7,Similar Card 8,Similar Card 9,Similar Card 10
0,lstm,Sol Ring,Sol Ring,Ur-Golem's Eye,Ur-Golem's Eye,Thran Dynamo,Wastes,Riverglide Pathway // Lavaglide Pathway,Riverglide Pathway // Lavaglide Pathway,Barkchannel Pathway // Tidechannel Pathway,Barkchannel Pathway // Tidechannel Pathway,Great Furnace
1,lstm,Structural Assault,Structural Assault,Thrilling Encore,Urborg Justice,Fresh Meat,Faith's Reward,Second Sunrise,Let the Galaxy Burn,Fatal Push,Cradle to Grave,Force of Despair
2,lstm,Crossbow Ambush,Silk Net,Vines of the Recluse,Shape the Sands,Crossbow Ambush,Silk Net,Gloomwidow's Feast,Spidery Grasp,Aim High,Treetop Defense,Aerial Volley
3,lstm,Mephitic Draught,Metalspinner's Puzzleknot,Mephitic Draught,Infernal Idol,Skeletal Scrying,Cut of the Profits,Wooden Sphere,Tablet of Epityr,Urza's Chalice,Crystal Rod,Soul Net
4,lstm,Sangromancer,Sangromancer,Bloodrite Invoker,Kalastria Highborn,Sanctum Seeker,Herald of the Pantheon,Acolyte of Aclazotz,High Fae Negotiator,Judge of Currents,Inspiring Cleric,Centaur Healer
5,cnn_lstm,Sol Ring,Sol Ring,Ur-Golem's Eye,Ur-Golem's Eye,Thran Dynamo,Bloodstone Cameo,Seashell Cameo,Troll-Horn Cameo,Drake-Skull Cameo,Great Furnace,Wastes
6,cnn_lstm,Structural Assault,Structural Assault,Let the Galaxy Burn,Mordor on the March,Fiery Encore,"All of History, All at Once",Empty the Warrens,Urban Evolution,Escape to the Wilds,Flesh Allergy,Galvanic Relay
7,cnn_lstm,Crossbow Ambush,Crossbow Ambush,Shape the Sands,Vines of the Recluse,Silk Net,Silk Net,Gloomwidow's Feast,Treetop Defense,Aim High,Spidery Grasp,Angelic Ascension
8,cnn_lstm,Mephitic Draught,Metalspinner's Puzzleknot,Mephitic Draught,Infernal Idol,Profane Memento,Tithing Blade // Consuming Sepulcher,Thopter Foundry,Druidic Satchel,Ivory Crane Netsuke,Guild Globe,Sphere of the Suns
9,cnn_lstm,Sangromancer,Sangromancer,Bloodrite Invoker,Kalastria Highborn,Shattered Angel,Herald of the Pantheon,Deathgreeter,Bog-Strider Ash,Guardian of Cloverdell,Blood Seeker,Bleak Coven Vampires


In [16]:
output_filename = 'similar_cards.csv'
df.to_csv(output_filename, index=False)