<a href="https://colab.research.google.com/github/aetev/Hearth-Stone-Python-Simulator/blob/main/Welcome_To_Colab_(3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, Embedding, Bidirectional, Attention, Concatenate, Masking
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau
import gc
import random
from sklearn.metrics.pairwise import cosine_similarity





In [5]:
df = pd.read_json("/content/AtomicCards.json")

In [6]:
def replace_card_name(index, text):
  name_parts = index.split(',')  # Split by comma
  possible_matches = [index]

  # Add individual name parts if comma exists
  if len(name_parts) > 1:
    possible_matches.extend([part.strip() for part in name_parts])

  # Add permutation for names with multiple words before comma
  first_part = name_parts[0].strip()  # Get the part before comma
  first_part_words = first_part.split()  # Split into words
  if len(first_part_words) > 1:
      possible_matches.append(first_part_words[0]) # Add the first word as a match

  # Replace occurrences of possible matches in the text, using word boundaries
  for name in possible_matches:
    text = re.sub(r'\b' + re.escape(name) + r'\b', 'this', text) # Use re.escape and word boundaries
  return text

In [7]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\/', ' ', text)
    text = re.sub(r'\{|\}', ' ', text)
    text = re.sub(r'[^\w\s\+\-]', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [8]:
text_list = []

for index, row in df.iterrows():
    # Process the indices you're interested in: 0, 1, and 3
    for data_index in [0, 1, 3, 4, 5, 6]:
        try:
            text = row['data'][data_index]['text']
            text = replace_card_name(index, text)
            text = clean_text(text)
            text_list.append("<START> " + text + " <END>")
        except:
            pass  # Silently handle the exception



In [9]:
print(len(text_list))

31634


In [24]:
# Step 2: Configure tokenizer with special tokens
tokenizer = Tokenizer(oov_token="<OOV>")
# Add special tokens to ensure they're in the vocabulary
tokenizer.fit_on_texts(text_list)

# Add special token indices manually if needed
word_index = tokenizer.word_index
# Make sure special tokens have specific indices
# This step is optional as they should already be in the vocabulary
start_token_id = word_index.get("<START>")
end_token_id = word_index.get("<END>")
pad_token_id = 0  # Padding token is usually 0

vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")
print(f"START token ID: {start_token_id}")
print(f"END token ID: {end_token_id}")

# Convert to sequences
sequences = tokenizer.texts_to_sequences(text_list)
max_sequence_length = max([len(seq) for seq in sequences])
print(f"Maximum sequence length: {max_sequence_length}")

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Step 3: Define an improved LSTM Autoencoder model with masking
def create_word_level_lstm_autoencoder(vocab_size, max_length, embedding_dim, latent_dim, pad_token_id=0):
    # Define encoder
    inputs = Input(shape=(max_length,))

    # Add masking layer to ignore padding tokens
    masked_inputs = Masking(mask_value=pad_token_id)(inputs)

    x = Embedding(vocab_size, embedding_dim, input_length=max_length)(masked_inputs)
    x = TimeDistributed(Dense(latent_dim, activation='relu'))(x)
    encoded = LSTM(latent_dim)(x)

    # Define decoder
    decoded = RepeatVector(max_length)(encoded)
    decoded = LSTM(latent_dim, return_sequences=True)(decoded)
    decoded = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoded)

    # Create autoencoder model
    autoencoder = Model(inputs, decoded)
    autoencoder.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Create encoder model for extracting embeddings later if needed
    encoder_model = Model(inputs, encoded)

    return autoencoder, encoder_model

# Step 4: Create and train the model
embedding_dim = 300
latent_dim = 2000
model, encoder = create_word_level_lstm_autoencoder(
    vocab_size,
    max_sequence_length,
    embedding_dim,
    latent_dim,
    pad_token_id=0
)

# Print model summary
model.summary()

# Step 5: Prepare target data with masking for training
# Reshape target data for sparse categorical crossentropy
target_data = np.expand_dims(padded_sequences, -1)

# Create a mask to ignore padding tokens in the loss calculation
# This is handled automatically by the Masking layer in the model

Vocabulary size: 4409
START token ID: None
END token ID: None
Maximum sequence length: 253




In [25]:
# Create a callback that reduces the learning rate when a metric has stopped improving
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',  # Monitor validation loss
    factor=0.2,          # Factor by which the learning rate will be reduced
    patience=3,          # Number of epochs with no improvement after which learning rate will be reduced
    min_lr=0.00001,      # Lower bound on the learning rate
    verbose=1            # Print message when reducing learning rate
)

# Train the model with the learning rate reduction callback
history = model.fit(
    padded_sequences, target_data,
    epochs=45,
    batch_size=16,
    shuffle=True,
    validation_split=0.1,
    verbose=1,
    callbacks=[reduce_lr]  # Add the callback
)

# Save the model in HDF5 format
model.save('my_model.h5')

Epoch 1/45
[1m1780/1780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 201ms/step - accuracy: 0.9030 - loss: 0.6763 - val_accuracy: 0.9095 - val_loss: 0.5735 - learning_rate: 0.0010
Epoch 2/45
[1m1780/1780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 197ms/step - accuracy: 0.9081 - loss: 0.5709 - val_accuracy: 0.9095 - val_loss: 0.5992 - learning_rate: 0.0010
Epoch 3/45
[1m1780/1780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 197ms/step - accuracy: 0.9075 - loss: 0.5738 - val_accuracy: 0.9095 - val_loss: 0.5718 - learning_rate: 0.0010
Epoch 4/45
[1m1780/1780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 197ms/step - accuracy: 0.9076 - loss: 0.5730 - val_accuracy: 0.9095 - val_loss: 0.5735 - learning_rate: 0.0010
Epoch 5/45
[1m1780/1780[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 197ms/step - accuracy: 0.9075 - loss: 0.5732 - val_accuracy: 0.9095 - val_loss: 0.5749 - learning_rate: 0.0010
Epoch 6/45
[1m1780/1780[0m [32m━━━━━━



In [1]:
# Step 4: Evaluate the model on just a few examples
print("Making predictions on a small subset...")

# Select only a few samples to evaluate (to avoid memory issues)
num_samples_to_evaluate = min(3, len(padded_sequences))  # Evaluate at most 3 examples

# For reproducibility, you might want to set a seed
np.random.seed(42)
sample_indices = np.random.choice(len(padded_sequences), num_samples_to_evaluate, replace=False)

# Get the subset of data to evaluate
subset_to_evaluate = padded_sequences[sample_indices]

# Make predictions on the subset
reconstructed_sequences = model.predict(subset_to_evaluate)

# Convert probability distributions to token indices
reconstructed_indices = np.argmax(reconstructed_sequences, axis=-1)

# Function to convert indices back to text
def indices_to_text(sequences, tokenizer):
    index_word = {v: k for k, v in tokenizer.word_index.items()}
    index_word[0] = ''  # Padding token
    texts = []
    for seq in sequences:
        # Filter out padding tokens and join words
        words = [index_word.get(idx, '<UNK>') for idx in seq if idx > 0]
        text = ' '.join(words).strip()
        texts.append(text)
    return texts

# Display results
original_texts = [text_list[i] for i in sample_indices]  # Get original texts for selected indices
reconstructed_texts = indices_to_text(reconstructed_indices, tokenizer)

print("\nOriginal vs Reconstructed:")
for i, (orig, recon) in enumerate(zip(original_texts, reconstructed_texts)):
    print(f"Example {i+1}:")
    print(f"Original: {orig}")
    print(f"Reconstructed: {recon}")

    # Calculate and display word-level accuracy
    orig_tokens = orig.split()
    recon_tokens = recon.split()
    min_len = min(len(orig_tokens), len(recon_tokens))

    if min_len > 0:
        matches = sum(1 for i in range(min_len) if orig_tokens[i] == recon_tokens[i])
        accuracy = matches / len(orig_tokens) if len(orig_tokens) > 0 else 0
        print(f"Word-level accuracy: {accuracy:.2f} ({matches}/{len(orig_tokens)} words matched)")

    print("-" * 50)



Making predictions on a small subset...


NameError: name 'padded_sequences' is not defined