<a href="https://colab.research.google.com/github/aetev/Hearth-Stone-Python-Simulator/blob/main/text_encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import tensorflow as tf
from tensorflow.keras.layers import Embedding, TextVectorization
# Location of AtomicCards.json, relative to the current working directory
# (e.g. a mounted Google Drive folder); adjust the path if necessary.
file_path = 'Data/AtomicCards.json'


# JSON text is UTF-8; pin the encoding so decoding of non-ASCII card text
# does not depend on the platform's locale default.
with open(file_path, 'r', encoding='utf-8') as f:
    # Only the top-level 'data' entry of the file is kept.
    data = json.load(f)['data']

2025-03-10 20:48:08.696512: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741657688.715793   11270 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741657688.722005   11270 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 20:48:08.740795: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
def filter_legal_cards(data):
    """Return the cards whose first record is marked 'Legal' for commander.

    Args:
        data: Mapping of card name to an indexable collection of card
            records. Only the record at index 0 is inspected.

    Returns:
        A new dict mapping each qualifying card name to its first record.
        Cards with no record at index 0, or whose first record has no
        ``['legalities']['commander']`` entry, are silently left out.
    """
    commander_legal = {}
    for name, records in data.items():
        try:
            first_record = records[0]
            status = first_record['legalities']['commander']
        except (KeyError, IndexError):
            # Malformed or incomplete entry: skip it rather than fail the whole run.
            continue
        if status == 'Legal':
            commander_legal[name] = first_record
    return commander_legal

# Keep only the cards whose first record is commander-'Legal'; each kept name maps to that first record.
filtered_data = filter_legal_cards(data)


In [None]:
class TextCleaningLayer(tf.keras.layers.Layer):
    """Keras layer that cleans card strings, tokenizes them, and embeds the tokens.

    On construction the layer adapts a ``TextVectorization`` vocabulary over
    every card name (the keys of ``card_dataset``) plus every non-empty
    ``'text'`` field found in its values. Calling the layer runs the input
    strings through that vectorizer and then through an ``Embedding`` layer.

    Args:
        card_dataset: Mapping of card name to a card-info mapping. A card's
            ``'text'`` entry is used when it is present and non-empty.
        output_sequence_length: ``output_sequence_length`` passed to the
            vectorizer (fixed length of each token sequence).
        embedding_dim: ``output_dim`` of the embedding layer.
        max_tokens: ``max_tokens`` of the vectorizer; also used as the
            embedding layer's ``input_dim``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    # Ordered (pattern, replacement) pairs applied by ``_clean_text``.
    # ``tf.strings.regex_replace`` takes RE2 patterns; they are written as raw
    # strings so Python does not see invalid escape sequences such as '\s' or
    # '\(' (a SyntaxWarning on recent Python versions).
    _CLEANING_STEPS = (
        (r'\n', ' '),            # newlines become spaces
        (r'[/.]', ' '),          # slashes and periods become spaces
        # Double quotes and em dashes become spaces. The em dash (U+2014) was
        # previously stored as its mojibake form, which never matched a real
        # em dash in the input.
        ('["\u2014]', ' '),
        (r'[{}]', ' '),          # curly braces become spaces
        (r'\([^)]*\)', ''),      # drop parenthesized spans
        (r'[^\w\s+/\-]', ''),    # drop anything but word chars, whitespace, '+', '/', '-'
        (r'\s+', ' '),           # collapse whitespace runs to a single space
    )

    def __init__(self, card_dataset, output_sequence_length=150, embedding_dim=100, max_tokens=19756, **kwargs):
        super().__init__(**kwargs)
        # Store configuration parameters
        self.output_sequence_length = output_sequence_length
        self.embedding_dim = embedding_dim
        self.max_tokens = max_tokens

        # The vectorizer applies ``_clean_text`` as its standardization step,
        # so adapting and calling both see cleaned text.
        self.vectorizer = tf.keras.layers.TextVectorization(
            standardize=self._clean_text,
            output_mode='int',
            output_sequence_length=output_sequence_length,
            max_tokens=max_tokens
        )

        # Vocabulary corpus: every card name followed by every non-empty card text.
        card_texts = list(card_dataset)
        card_texts.extend(
            card_info['text'] for card_info in card_dataset.values()
            if card_info.get('text')
        )

        # Feed the corpus to ``adapt`` in batches of 512, prefetching the next batch.
        card_texts_ds = (
            tf.data.Dataset.from_tensor_slices(card_texts)
            .batch(512)
            .prefetch(tf.data.AUTOTUNE)
        )

        # Build the vocabulary from the corpus
        self.vectorizer.adapt(card_texts_ds)

        # Embedding table sized by ``max_tokens``
        self.embedding_layer = tf.keras.layers.Embedding(
            input_dim=max_tokens,
            output_dim=embedding_dim
        )

    def _clean_text(self, text):
        """Lower-case ``text`` and apply each step of ``_CLEANING_STEPS`` in order.

        Args:
            text: A string tensor, or a value convertible to one.

        Returns:
            A string tensor holding the cleaned text.
        """
        # Convert input to a string tensor if it is not one already
        if not isinstance(text, tf.Tensor):
            text = tf.convert_to_tensor(text, dtype=tf.string)

        cleaned_text = tf.strings.lower(text)
        # Order matters: later steps run on the output of earlier ones.
        for pattern, replacement in self._CLEANING_STEPS:
            cleaned_text = tf.strings.regex_replace(cleaned_text, pattern, replacement)
        return cleaned_text

    def call(self, inputs):
        """Vectorize ``inputs`` (cleaning happens inside the vectorizer) and embed the result."""
        vectorized_text = self.vectorizer(inputs)
        return self.embedding_layer(vectorized_text)

    def get_cleaned_text(self, inputs):
        """Utility method returning only the cleaned text, without vectorization or embedding."""
        return self._clean_text(inputs)

  cleaned_text = tf.strings.regex_replace(cleaned_text, '[\/\.]', ' ')
  cleaned_text = tf.strings.regex_replace(cleaned_text, '\{|\}', ' ')
  cleaned_text = tf.strings.regex_replace(cleaned_text, '\([^)]*\)', '')
  cleaned_text = tf.strings.regex_replace(cleaned_text, '[^\w\s+/\-]', '')
  cleaned_text = tf.strings.regex_replace(cleaned_text, '\s+', ' ')


In [None]:
# Look up one card record by name; raises KeyError if that name is not in filtered_data.
card = filtered_data['Aatchik, Emerald Radian']
# Constructing the layer adapts its vectorizer over all names and texts in filtered_data.
cleaned_text_layer = TextCleaningLayer(filtered_data)

In [None]:
# Print the number of entries in the adapted vectorizer's vocabulary.
print(len(cleaned_text_layer.vectorizer.get_vocabulary()))

19756
