<a href="https://colab.research.google.com/github/aetev/Hearth-Stone-Python-Simulator/blob/main/new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, Embedding, Bidirectional, Attention, Concatenate, Masking
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau
import gc
import random
from sklearn.metrics.pairwise import cosine_similarity





In [2]:
# Colab-only step: attach Google Drive so files under the mount point are
# readable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the card JSON from the mounted Drive into a DataFrame. Later cells
# read the frame's index and its 'data' column.
ATOMIC_CARDS_PATH = "/content/drive/MyDrive/MTGdata/AtomicCards.json"
df = pd.read_json(ATOMIC_CARDS_PATH)

In [4]:
def replace_card_name(index, text):
    """Replace a card's references to its own name in ``text`` with 'this'.

    Candidate names are tried in this order:
      1. the full name (``index``);
      2. if the name contains a comma, every comma-separated part (stripped);
      3. if the part before the first comma has more than one word, its
         first word.

    Matching is case-sensitive and whole-word: a candidate only matches when
    it is not directly preceded or followed by a word character.

    Args:
        index: The card name (the DataFrame index label for the card).
        text: The card text in which to substitute the name.

    Returns:
        ``text`` with every whole-word occurrence of a candidate replaced by
        the literal string 'this'.
    """
    name_parts = index.split(',')
    candidates = [index]

    # Names such as "A, the B" are often referred to by either half alone.
    if len(name_parts) > 1:
        candidates.extend(part.strip() for part in name_parts)

    # A multi-word leading part may also be shortened to its first word.
    leading_words = name_parts[0].split()
    if len(leading_words) > 1:
        candidates.append(leading_words[0])

    seen = set()
    for name in candidates:
        # A blank candidate (e.g. from a trailing comma) would compile to a
        # pattern that matches at every word boundary; skip it. Duplicates
        # are skipped because a second pass can no longer match anything.
        if not name.strip() or name in seen:
            continue
        seen.add(name)
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so it never matches a name that starts or ends with punctuation
        # (e.g. a name ending in '!'). For names that start and end with a
        # word character the two forms are equivalent.
        pattern = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
        text = re.sub(pattern, 'this', text)
    return text

In [5]:
def clean_text(text):
    """Normalize raw card text into lowercase, space-separated tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete parenthesized spans;
      3. map the Unicode minus sign (U+2212) to an ASCII '-';
      4. turn '/', '{' and '}' into spaces;
      5. delete every remaining character that is not a word character,
         whitespace, '+' or '-';
      6. collapse each whitespace run (newlines included) to one space and
         trim both ends.

    Args:
        text: The raw text string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    # The punctuation filter below keeps only the ASCII hyphen, so a Unicode
    # minus would be deleted and a negative number would lose its sign.
    text = text.replace('\u2212', '-')
    text = re.sub(r'[/{}]', ' ', text)
    text = re.sub(r'[^\w\s+\-]', '', text)
    # \s+ already matches newlines, so no separate newline pass is needed.
    # strip() drops the space left behind when a leading or trailing
    # parenthesized span or symbol was removed.
    return re.sub(r'\s+', ' ', text).strip()

In [6]:
# Positions within each card's `data` list whose 'text' is collected.
# NOTE(review): position 2 is absent from this list -- confirm that skipping
# it is intentional.
ENTRY_INDICES = [0, 1, 3, 4, 5, 6]

text_list = []

# Only the 'data' column is read, so iterate it directly as (label, value).
for card_name, entries in df['data'].items():
    if not isinstance(card_name, str):
        continue
    for entry_index in ENTRY_INDICES:
        # Guard only the lookup: a non-list cell raises TypeError, a short
        # list IndexError, an entry without 'text' KeyError. Anything else
        # (including errors inside the helpers) is allowed to surface
        # instead of being silently swallowed.
        try:
            raw_text = entries[entry_index]['text']
        except (IndexError, KeyError, TypeError):
            continue
        if not isinstance(raw_text, str):
            continue
        cleaned = clean_text(replace_card_name(card_name, raw_text))
        # Sentinel tokens mark the sequence boundaries for the decoder.
        text_list.append("<START> " + cleaned + " <END>")



In [7]:
# Sanity check: how many text samples were collected.
sample_count = len(text_list)
print(sample_count)

31634


In [8]:
# Build the vocabulary. `filters=''` turns off the Tokenizer's default
# punctuation stripping: with the defaults, '<' and '>' are removed, so the
# "<START>"/"<END>" sentinels become the ordinary words "start"/"end" and
# collide with real text, and the '+'/'-' signs kept by clean_text are lost.
# Lower-casing stays on, so the sentinels are indexed as "<start>"/"<end>".
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(text_list)  # Your list of strings

# +1 leaves room for index 0, which is used as the padding value below.
vocab_size = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(text_list)
max_length = max(len(seq) for seq in sequences)

# Right-pad every sequence with zeros up to the longest one.
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Create Input and Output Sequences (Teacher Forcing):
encoder_input_data = padded_sequences
decoder_input_data = padded_sequences[:, :-1]  # Input to the decoder (shifted by one timestep)
decoder_target_data = padded_sequences[:, 1:]   # Target for the decoder

# One-hot encode the decoder target data for categorical cross-entropy loss.
# NOTE(review): this materializes a dense array of shape
# (samples, max_length - 1, vocab_size); watch memory on large vocabularies.
decoder_target_data = tf.keras.utils.to_categorical(decoder_target_data, num_classes=vocab_size)

In [9]:
# Size of the embedding vectors and of the LSTM hidden/cell states.
latent_dim = 256

# --- Encoder graph ---------------------------------------------------------
# Token ids -> embeddings (id 0 is masked) -> LSTM. Only the final hidden
# and cell states are kept; they are what the decoder is initialized with.
encoder_inputs = Input(shape=(max_length,))
enc_emb_layer = Embedding(vocab_size, latent_dim, mask_zero=True)
enc_emb = enc_emb_layer(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Stand-alone model mapping an input sequence to its [h, c] state pair.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

In [10]:
# Decoder (training graph, teacher forcing): consumes the target sequence
# shifted by one step, starts from the encoder's final states, and emits a
# softmax over the vocabulary at every timestep.
decoder_inputs = Input(shape=(max_length-1,))
dec_emb_layer = Embedding(vocab_size, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder Model (inference graph). It reuses the embedding, LSTM and dense
# layers above, so it shares their weights, but it is wired to its own
# state inputs and returns the decoder LSTM's *updated* states. Returning
# the encoder's state tensors here would hand the caller back the states it
# passed in, so step-by-step decoding could never advance.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Variable-length token input: accepts a single token per step as well as a
# full padded sequence.
decoder_inference_inputs = Input(shape=(None,))
dec_inference_emb = dec_emb_layer(decoder_inference_inputs)
decoder_inference_outputs, dec_state_h, dec_state_c = decoder_lstm(
    dec_inference_emb, initial_state=decoder_states_inputs)
decoder_inference_outputs = decoder_dense(decoder_inference_outputs)

# Inputs: [tokens, h, c]; outputs: [token probabilities, new h, new c].
decoder_model = Model(
    [decoder_inference_inputs] + decoder_states_inputs,
    [decoder_inference_outputs, dec_state_h, dec_state_c])

In [None]:
# End-to-end training model: (encoder tokens, shifted decoder tokens) ->
# per-timestep vocabulary distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit with teacher forcing against the one-hot targets.
EPOCHS = 50
BATCH_SIZE = 64
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)