<a href="https://colab.research.google.com/github/aetev/Hearth-Stone-Python-Simulator/blob/main/work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, Embedding, Bidirectional, Attention, Concatenate, Masking
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau
import gc
import random
from sklearn.metrics.pairwise import cosine_similarity





In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_json("/content/drive/MyDrive/MTGdata/AtomicCards.json")

In [4]:
def replace_card_name(index, text):
  name_parts = index.split(',')  # Split by comma
  possible_matches = [index]

  # Add individual name parts if comma exists
  if len(name_parts) > 1:
    possible_matches.extend([part.strip() for part in name_parts])

  # Add permutation for names with multiple words before comma
  first_part = name_parts[0].strip()  # Get the part before comma
  first_part_words = first_part.split()  # Split into words
  if len(first_part_words) > 1:
      possible_matches.append(first_part_words[0]) # Add the first word as a match

  # Replace occurrences of possible matches in the text, using word boundaries
  for name in possible_matches:
    text = re.sub(r'\b' + re.escape(name) + r'\b', 'this', text) # Use re.escape and word boundaries
  return text

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\/', ' ', text)
    text = re.sub(r'\{|\}', ' ', text)
    text = re.sub(r'[^\w\s\+\-]', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [55]:
text_list = []

for index, row in df.iterrows():
    # Process the indices you're interested in: 0, 1, and 3
    for data_index in [0, 1, 3, 4, 5, 6]:
        try:
            text = row['data'][data_index]['text']
            text = replace_card_name(index, text)
            text = clean_text(text)
            text_list.append('sos ' + text + ' eos')
        except:
            pass  # Silently handle the exception



In [56]:
print(text_list[0])
print(text_list[1])

sos at the beginning of your upkeep you may say ach hans run its the and the name of a creature card if you do search your library for a card with that name put it onto the battlefield then shuffle that creature gains haste exile it at the beginning of the next end step eos
sos when brims barone this enters put a +1 +1 counter on each other creature you control that has a hat brims barone this has menace as long as youre wearing a hat eos


In [58]:
# Assuming your list of strings is named 'text_list'
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_list)
total_words = len(tokenizer.word_index) + 1
# Convert text to sequences
input_sequences = tokenizer.texts_to_sequences(text_list)

# Pad sequences to have the same length
max_sequence_length = max([len(seq) for seq in input_sequences])
encoder_input_data = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')

# Create decoder input data (shifted by one timestep) and decoder target data
decoder_input_data = np.zeros_like(encoder_input_data)
decoder_target_data = np.zeros_like(encoder_input_data)

for i, seq in enumerate(encoder_input_data):
  decoder_input_data[i, 1:] = seq[:-1]  # Shifted by one timestep
  decoder_target_data[i, :-1] = seq[1:]  # Target is the original sequence without the start token

In [59]:
# Encoder model
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(total_words, 256)(encoder_inputs)

# First LSTM layer
encoder_lstm1 = LSTM(256, return_sequences=True, return_state=False)(encoder_embedding)
# Second LSTM layer, taking output from the first layer
encoder_lstm2 = LSTM(256, return_state=True)(encoder_lstm1)

encoder_outputs, state_h, state_c = encoder_lstm2
encoder_states = [state_h, state_c]  # Hidden and cell states

# Define encoder model
encoder_model = Model(encoder_inputs, encoder_states)

In [60]:
# Decoder model
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(total_words, 256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_states = [decoder_state_h, decoder_state_c]

decoder_dense = Dense(total_words, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define decoder model
decoder_model = Model([decoder_inputs] + encoder_states, [decoder_outputs] + decoder_states)

In [65]:
# Connect the encoder and decoder
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=64, epochs=100, validation_split=0.2)

Epoch 1/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 81ms/step - accuracy: 0.9349 - loss: 0.3277 - val_accuracy: 0.9359 - val_loss: 0.3218
Epoch 2/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 78ms/step - accuracy: 0.9385 - loss: 0.2988 - val_accuracy: 0.9387 - val_loss: 0.3026
Epoch 3/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 78ms/step - accuracy: 0.9412 - loss: 0.2791 - val_accuracy: 0.9407 - val_loss: 0.2899
Epoch 4/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 78ms/step - accuracy: 0.9438 - loss: 0.2622 - val_accuracy: 0.9419 - val_loss: 0.2816
Epoch 5/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 78ms/step - accuracy: 0.9454 - loss: 0.2512 - val_accuracy: 0.9431 - val_loss: 0.2747
Epoch 6/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 78ms/step - accuracy: 0.9466 - loss: 0.2434 - val_accuracy: 0.9437 - val_loss: 0.2707
Epoch 7/10

KeyboardInterrupt: 

In [None]:
def test_encoder_decoder(encoder_model, decoder_model, tokenizer, text_lst, sample_size=5, max_sequence_length=max_sequence_length):
  """Tests the encoder-decoder network on a small sample of text.

  Args:
    encoder_model: The trained encoder model.
    decoder_model: The trained decoder model.
    tokenizer: The tokenizer used to convert text to sequences.
    text_lst: The list of text strings.
    sample_size: The number of samples to test.
    max_sequence_length: The maximum sequence length used during training.

  Returns:
    None. Prints the original and predicted text for each sample.
  """
  # Get a random sample from text_lst
  sample_indices = random.sample(range(len(text_lst)), sample_size)
  sample_texts = [text_lst[i] for i in sample_indices]

  for text in sample_texts:
    # Encode the input text
    input_seq = tokenizer.texts_to_sequences([text])[0]  # Get the sequence
    input_seq = pad_sequences([input_seq], maxlen=max_sequence_length, padding='post')  # Pad
    states_value = encoder_model.predict(input_seq)

    # Generate the output sequence
    target_seq = np.zeros((1, 1))  # Start with a single token (e.g., start token)
    target_seq[0, 0] = tokenizer.word_index['sos']  # Assuming '<SOS> ' is your start token

    decoded_sentence = ''
    stop_condition = False

    while not stop_condition:
      output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
      sampled_token_index = np.argmax(output_tokens[0, -1, :])  # Sample the next token

      sampled_word = tokenizer.index_word[sampled_token_index] if sampled_token_index != 0 else 'eos'  # Get the word
      decoded_sentence += ' ' + sampled_word

      if sampled_word == 'eos' or len(decoded_sentence.split()) > max_sequence_length:
        stop_condition = True

      target_seq = np.zeros((1, 1))
      target_seq[0, 0] = sampled_token_index
      states_value = [h, c]

    print('Original:', text)
    print('Predicted:', decoded_sentence)
    print('---')

test_encoder_decoder(encoder_model, decoder_model, tokenizer, text_list, sample_size=3)  # Test with 3 samples


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39