<a href="https://colab.research.google.com/github/aetev/Hearth-Stone-Python-Simulator/blob/main/Welcome_To_Colab_(5).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed, Embedding, Bidirectional, Attention, Concatenate, Masking
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau
import gc
import random
from sklearn.metrics.pairwise import cosine_similarity





In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_json("/content/drive/MyDrive/MTGdata/AtomicCards.json")

In [4]:
def replace_card_name(index, text):
  name_parts = index.split(',')  # Split by comma
  possible_matches = [index]

  # Add individual name parts if comma exists
  if len(name_parts) > 1:
    possible_matches.extend([part.strip() for part in name_parts])

  # Add permutation for names with multiple words before comma
  first_part = name_parts[0].strip()  # Get the part before comma
  first_part_words = first_part.split()  # Split into words
  if len(first_part_words) > 1:
      possible_matches.append(first_part_words[0]) # Add the first word as a match

  # Replace occurrences of possible matches in the text, using word boundaries
  for name in possible_matches:
    text = re.sub(r'\b' + re.escape(name) + r'\b', 'this', text) # Use re.escape and word boundaries
  return text

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\/', ' ', text)
    text = re.sub(r'\{|\}', ' ', text)
    text = re.sub(r'[^\w\s\+\-]', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [6]:
text_list = []

for index, row in df.iterrows():
    # Process the indices you're interested in: 0, 1, and 3
    for data_index in [0, 1, 3, 4, 5, 6]:
        try:
            text = row['data'][data_index]['text']
            text = replace_card_name(index, text)
            text = clean_text(text)
            text_list.append(text)
        except:
            pass  # Silently handle the exception



In [7]:
# Assuming your list of strings is named 'text_list'
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_list)
total_words = len(tokenizer.word_index) + 1

# Convert text to sequences
input_sequences = tokenizer.texts_to_sequences(text_list)

# Pad sequences to have the same length
max_sequence_length = max([len(seq) for seq in input_sequences])
encoder_input_data = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')

# Create decoder input data (shifted by one timestep) and decoder target data
decoder_input_data = np.zeros_like(encoder_input_data)
decoder_target_data = np.zeros_like(encoder_input_data)

for i, seq in enumerate(encoder_input_data):
  decoder_input_data[i, 1:] = seq[:-1]  # Shifted by one timestep
  decoder_target_data[i, :-1] = seq[1:]  # Target is the original sequence without the start token

In [8]:
# Encoder model
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(total_words, 256)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]  # Hidden and cell states

# Define encoder model
encoder_model = Model(encoder_inputs, encoder_states)

In [10]:
# Decoder model
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(total_words, 256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_states = [decoder_state_h, decoder_state_c]

decoder_dense = Dense(total_words, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define decoder model
decoder_model = Model([decoder_inputs] + encoder_states, [decoder_outputs] + decoder_states)

In [None]:
# Connect the encoder and decoder
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=64, epochs=100, validation_split=0.2)

Epoch 1/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 65ms/step - accuracy: 0.8989 - loss: 1.1728 - val_accuracy: 0.9171 - val_loss: 0.4583
Epoch 2/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 65ms/step - accuracy: 0.9227 - loss: 0.4228 - val_accuracy: 0.9295 - val_loss: 0.3798
Epoch 3/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 65ms/step - accuracy: 0.9325 - loss: 0.3541 - val_accuracy: 0.9347 - val_loss: 0.3409
Epoch 4/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 65ms/step - accuracy: 0.9380 - loss: 0.3157 - val_accuracy: 0.9377 - val_loss: 0.3182
Epoch 5/100
[1m396/396[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 65ms/step - accuracy: 0.9404 - loss: 0.2961 - val_accuracy: 0.9400 - val_loss: 0.3013
Epoch 6/100
[1m 55/396[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m19s[0m 57ms/step - accuracy: 0.9431 - loss: 0.2770

In [None]:
def encode_string(input_string, encoder_model, tokenizer, max_sequence_length):
  """Encodes a string into a single vector representation.

  Args:
    input_string: The string to encode.
    encoder_model: The trained encoder model.
    tokenizer: The tokenizer used to convert the string to a sequence of token IDs.
    max_sequence_length: The maximum sequence length used during training.

  Returns:
    A single vector representation of the input string.
  """
  # Preprocess the string
  cleaned_string = clean_text(input_string)
  input_sequence = tokenizer.texts_to_sequences([cleaned_string])  # Wrap in a list
  padded_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length, padding='post')

  # Encode using the encoder model
  states = encoder_model.predict(padded_sequence)  # Get hidden and cell states
  encoded_vector = np.concatenate(states, axis=-1)  # Concatenate states

  return encoded_vector[0]  # Return the first element (since we only encoded one string)

# Example usage
string_to_encode = "This is the string I want to encode."
encoded_vector = encode_string(string_to_encode, encoder_model, tokenizer, max_sequence_length)

print("Encoded vector:", encoded_vector)  # To see the output, run the code.