<a href="https://colab.research.google.com/github/bbenip/tetris-ai/blob/main/model/rl/rl_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [91]:
# Mount the user's Google Drive at /content/drive (only works inside Colab).
# NOTE(review): no other visible cell reads or writes under /content/drive —
# confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [92]:
!git clone https://github.com/bbenip/tetris-ai.git tetris-ai
%cd tetris-ai/model/rl
%ls

Cloning into 'tetris-ai'...
remote: Enumerating objects: 152, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 152 (delta 22), reused 20 (delta 4), pack-reused 93[K
Receiving objects: 100% (152/152), 36.43 MiB | 31.72 MiB/s, done.
Resolving deltas: 100% (54/54), done.
/content/tetris-ai/model/rl/tetris-ai/model/rl
rl_model.ipynb  tetris.py


In [93]:
# Update the checkout in the current working directory to the latest remote
# commit (runs in whatever directory the preceding %cd left the kernel in).
!git pull

Already up to date.


In [245]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, concatenate, InputLayer

In [246]:
def _conv_branch(inputs, kernel_size):
  """Conv2D(32, relu) -> 2x2 max-pool ('same') -> flatten, applied to one input."""
  features = Conv2D(32, kernel_size, activation='relu')(inputs)
  features = MaxPooling2D((2, 2), padding='same')(features)
  return Flatten()(features)


def create_model():
  """Build the Q-network used by the DQN training loop.

  Inputs (all with a leading batch dimension):
    board_input:         (21, 10, 1) board grid.
    current_piece_input: (2, 4, 1) grid for the current piece.
    next_piece_input:    (2, 4, 1) grid for the next piece.
    position_input:      2-vector with the piece position.

  Returns:
    An uncompiled `tf.keras.Model` mapping the four inputs to 6 values, one
    per action index.
  """
  # Define the four model inputs.
  board_input = Input(shape=(21, 10, 1), name='board_input')
  current_piece_input = Input(shape=(2, 4, 1), name='current_piece_input')
  next_piece_input = Input(shape=(2, 4, 1), name='next_piece_input')
  # `(2)` is just the integer 2; the shape must be a one-element tuple.
  position_input = Input(shape=(2,), name="position_input")

  # One small CNN branch per grid-shaped input.
  board_branch = _conv_branch(board_input, (3, 3))
  current_branch = _conv_branch(current_piece_input, (2, 2))
  next_branch = _conv_branch(next_piece_input, (2, 2))

  # Concatenate the 3 branches + position
  concatenated_inputs = concatenate([board_branch,
                                     current_branch,
                                     next_branch,
                                     position_input],
                                    name='concatenated_inputs')

  # Dense trunk on top of the concatenated features.
  dense_branch = concatenated_inputs
  for units in (512, 256, 64, 32):
    dense_branch = Dense(units, activation='relu')(dense_branch)

  # Q-values are unbounded regression targets (rewards here go down to -999),
  # so the head must be linear. A softmax would squash every output into
  # [0, 1] and force them to sum to 1, which an MSE-trained Q-function can
  # never fit.
  output = Dense(6, activation='linear', name='output')(dense_branch)

  # Define the model with 4 inputs and one output
  model = tf.keras.Model(inputs=[board_input,
                                 current_piece_input,
                                 next_piece_input,
                                 position_input],
                         outputs=output)

  return model

In [247]:
# Create one instance of the network and print its layer table as a sanity
# check of the architecture.
# NOTE(review): dqn() builds its own networks through create_model(), so this
# module-level `model` is not the one that gets trained there — confirm it is
# only meant for inspection.
model = create_model()
model.summary()

Model: "model_201"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 board_input (InputLayer)    [(None, 21, 10, 1)]          0         []                            
                                                                                                  
 current_piece_input (Input  [(None, 2, 4, 1)]            0         []                            
 Layer)                                                                                           
                                                                                                  
 next_piece_input (InputLay  [(None, 2, 4, 1)]            0         []                            
 er)                                                                                              
                                                                                          

In [248]:
# @title Training Constants
# Hyperparameters for the training run; dqn() uses BATCH_SIZE, DISCOUNT_FACTOR,
# EXPLORATION_EPSILON and LEARNING_RATE as its parameter defaults.
NUM_EPISODES = 5 # @param {type:"integer"}
BATCH_SIZE = 31 # @param {type:"integer"}
DISCOUNT_FACTOR = .99 # @param {type:"slider", min:0, max:1, step:0.01}
EXPLORATION_EPSILON = .1 # @param {type:"slider", min:0, max:1, step:0.1}
LEARNING_RATE = 0.001 # @param {type:"number"}

# Reward values; they are packed into the REWARDS tuple that is passed to
# TetrisApp when the environment is created.
INVALID_MOVE_REWARD = -999 # @param {type:"integer"}
GAME_OVER_REWARD = -999 # @param {type:"integer"}
VALID_MOVE_REWARD = 1 # @param {type:"integer"}


In [249]:
# Optimizer and loss used to compile the module-level `model`.
# NOTE(review): the training path visible in this notebook (dqn -> train_dqn)
# creates its own networks and its own Adam optimizer and computes the loss
# by hand, so nothing here appears to feed into that loop — confirm.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)  # Adjust the learning rate as needed.
loss_fn = tf.keras.losses.MeanSquaredError()  # Adjust the loss function as needed.

# Compile the model.
model.compile(optimizer=optimizer, loss=loss_fn)

In [250]:
from random import sample
class ReplayMemory:
    """Fixed-capacity ring buffer of transitions with uniform random sampling.

    Attributes:
        buffer:   backing list of length `max_size`; unused slots hold None.
        max_size: capacity given at construction.
        index:    slot that the next `append` will write to.
        size:     number of slots filled so far (never exceeds `max_size`).
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.size = 0
        self.index = 0
        self.buffer = [None] * max_size

    def append(self, obj):
        """Store `obj`, overwriting the oldest entry once the buffer is full."""
        slot = self.index
        self.buffer[slot] = obj
        if self.size < self.max_size:
            self.size += 1
        # Advance the write cursor, wrapping back to slot 0 at the end.
        self.index = (slot + 1) % self.max_size

    def sample(self, batch_size):
        """Return `batch_size` distinct stored entries chosen uniformly at random.

        Raises:
            ValueError: propagated from `random.sample` when `batch_size`
                exceeds the number of stored entries (or is negative).
        """
        # `sample` here is the module-level function from `random`, not this method.
        chosen_slots = sample(range(self.size), batch_size)
        return list(map(self.buffer.__getitem__, chosen_slots))

In [251]:
import numpy as np
from collections import deque

def stateToModelInput(state):
  """Convert one environment state dict into the model's 4-tuple of tensors.

  Each entry of `state` is reshaped to carry a leading batch dimension of 1
  and converted to a tensor, in the order
  (board, current_piece, next_piece, position).
  """
  # (state key, target shape) in the order the model expects its inputs.
  layout = (
      ("board", (1, 21, 10)),
      ("current_piece", (1, 2, 4)),
      ("next_piece", (1, 2, 4)),
      ("position", (1, 2)),
  )
  return tuple(
      tf.convert_to_tensor(np.reshape(state[key], shape))
      for key, shape in layout
  )

def dqn(env,
        num_episodes,
        batch_size=BATCH_SIZE,
        gamma=DISCOUNT_FACTOR,
        epsilon=EXPLORATION_EPSILON,
        learning_rate=LEARNING_RATE,
        target_update_steps=1000):
    """Train a Q-network on `env` with epsilon-greedy DQN.

    Args:
        env: environment exposing `start_game()`, `getState()` and
            `doAction(action) -> (next_state, reward, done)`.
        num_episodes: number of games to play.
        batch_size: mini-batch size sampled from the replay buffer; training
            starts once the buffer holds at least this many transitions.
        gamma: discount factor forwarded to `train_dqn`.
        epsilon: probability of picking a uniformly random action.
        learning_rate: learning rate forwarded to `train_dqn`.
        target_update_steps: number of training steps between copies of the
            online weights into the target network.

    Side effects:
        Saves the online model to "./model" every 10th episode (starting with
        episode 0) and once more after the last episode.
    """
    replay_buffer = ReplayMemory(max_size=10000)
    model = create_model()  # Define the DQN neural network.
    target_model = create_model()  # Target network for stability.
    # Both networks start from independent random weights; make the target an
    # exact copy of the online network before any training happens.
    target_model.set_weights(model.get_weights())

    train_steps = 0
    # Use the caller-supplied episode count rather than the global constant.
    for episode in range(num_episodes):
        env.start_game()
        state = env.getState()
        done = False
        while not done:
            # Epsilon-greedy action selection
            if np.random.rand() < epsilon:
                action = np.random.randint(6)  # Explore
            else:
                action = np.argmax(model(stateToModelInput(state)))  # Exploit

            next_state, reward, done = env.doAction(action)
            replay_buffer.append((state, action, reward, next_state, done))
            state = next_state

            # Sample and train on mini-batch from replay buffer
            if replay_buffer.size >= batch_size:
                mini_batch = replay_buffer.sample(batch_size)
                train_dqn(model, target_model, mini_batch, gamma, learning_rate)

                # Periodically refresh the target network so it lags the
                # online network instead of staying at its initial weights.
                train_steps += 1
                if train_steps % target_update_steps == 0:
                    target_model.set_weights(model.get_weights())

        if episode % 10 == 0:
            model.save("./model")

    # The periodic checkpoint only fires on episodes 0, 10, 20, ...; make sure
    # the weights from the final episode are persisted as well.
    if num_episodes > 0 and (num_episodes - 1) % 10 != 0:
        model.save("./model")


In [252]:
import numpy as np
import tensorflow as tf

def train_dqn(model, target_model, mini_batch, gamma, learning_rate):
    """Run one DQN gradient step on `model` for a mini-batch of transitions.

    Args:
        model: online Q-network to update; maps the 4-tuple produced by
            `stateToModelInput` to 6 action values.
        target_model: network used to bootstrap the targets. The caller is
            responsible for keeping its weights a (possibly lagged) copy of
            `model`.
        mini_batch: sequence of `(state, action, reward, next_state, done)`
            tuples.
        gamma: discount factor applied to the bootstrapped next-state value.
        learning_rate: Adam learning rate.

    Returns:
        The mean squared TD error of the batch as a Python float.
    """
    states, actions, rewards, next_states, dones = zip(*mini_batch)

    def _stack(batch_states):
        # Convert each state and stack per input, giving one array per model
        # input with the batch along axis 0.
        return list(map(np.vstack, zip(*map(stateToModelInput, batch_states))))

    # Step 1: targets. Terminal transitions keep just the immediate reward;
    # the rest add the discounted best next-state value from the target
    # network. Terminal next states are never fed to the network.
    target_q_values = np.array(rewards, dtype=np.float32)
    live = [i for i, done in enumerate(dones) if not done]
    if live:
        next_q = target_model(_stack([next_states[i] for i in live])).numpy()
        target_q_values[live] += gamma * next_q.max(axis=1)
    # Keep targets rank-1, shape (batch,), to match the predictions below.
    # A (batch, 1) target against (batch,) predictions would broadcast to a
    # (batch, batch) matrix and compare every prediction with every target.
    targets = tf.convert_to_tensor(target_q_values, dtype=tf.float32)

    # Step 2: inputs for the current states and a mask selecting, per row,
    # the action that was actually taken.
    model_inputs = _stack(states)
    action_mask = tf.one_hot(actions, 6)

    # Reuse one Adam instance per (model, learning_rate). Building a new
    # optimizer on every call would throw away Adam's moment estimates.
    # The cache keeps a reference to the most recently trained model only.
    cached = getattr(train_dqn, "_optimizer_cache", None)
    if cached is None or cached[0] is not model or cached[1] != learning_rate:
        cached = (model,
                  learning_rate,
                  tf.keras.optimizers.Adam(learning_rate=learning_rate))
        train_dqn._optimizer_cache = cached
    optimizer = cached[2]

    # Step 3: forward pass and loss under the tape.
    with tf.GradientTape() as tape:
        q_values = model(model_inputs)
        predicted_q_values = tf.reduce_sum(q_values * action_mask, axis=1)
        loss = tf.reduce_mean(tf.square(targets - predicted_q_values))

    # Step 4: gradient step. Gradients are taken outside the tape context so
    # the backward pass itself is not recorded.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return float(loss)


In [253]:
from tetris import TetrisApp

# Reward values for the environment, packed in the order
# (invalid move, game over, valid move).
REWARDS = (
    INVALID_MOVE_REWARD,
    GAME_OVER_REWARD,
    VALID_MOVE_REWARD,
)
env = TetrisApp(rewards=REWARDS, ai=True)

# Start training with the values from the "Training Constants" cell.
dqn(
    env,
    NUM_EPISODES,
    batch_size=BATCH_SIZE,
    gamma=DISCOUNT_FACTOR,
    epsilon=EXPLORATION_EPSILON,
    learning_rate=LEARNING_RATE,
)



NameError: ignored