In [10]:
import cv2
import gymnasium as gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
from tetris_gymnasium.envs.tetris import Tetris


In [11]:
env = gym.make("tetris_gymnasium/Tetris", render_mode="human")
env.reset(seed=42)

print("_____OBSERVATION SPACE_____ \n")
#print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample()) # Get a random observation

print("\n _____ACTION SPACE_____ \n")
#print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample()) # Take a random action

_____OBSERVATION SPACE_____ 

Sample observation {'active_tetromino_mask': array([[1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1],
       [1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0],
       [1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1],
       [0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1],
       [0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,

In [41]:
class DQN(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.network = nn.Sequential(
            nn.Conv2d(4, 32, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, env.single_action_space.n),
        )

    def forward(self, x):
        return self.network(x / 255.0)

In [24]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            torch.tensor(np.array(states), dtype=torch.float32),
            torch.tensor(actions, dtype=torch.long),
            torch.tensor(rewards, dtype=torch.float32),
            torch.tensor(np.array(next_states), dtype=torch.float32),
            torch.tensor(dones, dtype=torch.float32),
        )

    def __len__(self):
        return len(self.buffer)


In [None]:
from stable_baselines3.common.buffers import ReplayBuffer
from stable_baselines3.common.utils import linear_schedule
import torch
import torch.nn.functional as F
import numpy as np
import time

def train_dqn_with_stable_baselines(
    num_episodes=50000,
    batch_size=64,
    gamma=0.99,
    start_e=1.0,
    end_e=0.01,
    exploration_fraction=0.1,
    train_frequency=4,
    target_update_frequency=1000,
    replay_capacity=10000,
    learning_starts=1000,
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    q_network = q_network.to(device)
    target_network = target_network.to(device)

    # Initialize replay buffer
    obs_shape = env.observation_space.shape
    replay_buffer = ReplayBuffer(
        replay_capacity, env.observation_space, env.action_space, device, handle_timeout_termination=True
    )

    # Initialize environment and observation
    obs, _ = env.reset(seed=42)
    obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)

    # Initialize training variables
    start_time = time.time()

    for global_step in range(num_timesteps):
        # Compute epsilon based on exploration schedule
        epsilon = linear_schedule(
            start_e, end_e, exploration_fraction * num_timesteps, global_step
        )

        # Select action
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                q_values = q_network(obs)
                action = q_values.argmax().item()

        # Step the environment
        next_obs, reward, done, truncated, info = env.step(action)
        next_obs = torch.tensor(next_obs, dtype=torch.float32).unsqueeze(0).to(device)
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        done = torch.tensor([done], dtype=torch.float32, device=device)

        # Save transition to the replay buffer
        replay_buffer.add(obs, next_obs, action, reward, done)

        # Update observation
        obs = next_obs if not done else env.reset(seed=42)

        # Training logic
        if global_step > learning_starts and global_step % train_frequency == 0:
            # Sample a batch from the replay buffer
            data = replay_buffer.sample(batch_size)

            # Compute target Q-values
            with torch.no_grad():
                target_max, _ = target_network(data.next_observations).max(dim=1)
                td_target = data.rewards.flatten() + gamma * target_max * (1 - data.dones.flatten())

            # Compute current Q-values
            current_q_values = q_network(data.observations).gather(1, data.actions).squeeze()

            # Compute loss
            loss = F.mse_loss(td_target, current_q_values)

            # Optimize the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Update target network
        if global_step % target_update_frequency == 0:
            target_network.load_state_dict(q_network.state_dict())

        # Log training information
        if global_step % 100 == 0:
            elapsed_time = time.time() - start_time
            print(f"Step: {global_step}, Loss: {loss.item()}, SPS: {int(global_step / elapsed_time)}")

    env.close()
    print("Training complete.")
    return q_network


AttributeError: 'numpy.dtypes.BoolDType' object has no attribute 'bits'

In [None]:
from stable_baselines3.common.atari_wrappers import ClipRewardEnv

def make_env(env_id="tetris_gymnasium/Tetris", seed=42, capture_video=False, run_name=""):
    env = gym.make(env_id, render_mode="rgb_array" if capture_video else None)
    
    if capture_video:
        env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
    
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env.action_space.seed(seed)
    
    env = ClipRewardEnv(env)
    env = gym.wrappers.ResizeObservation(env, (84, 84))
    env = gym.wrappers.GrayScaleObservation(env)
    env = gym.wrappers.FrameStack(env, 4)

    return env


In [47]:
from stable_baselines3.common.buffers import ReplayBuffer as RB
from stable_baselines3.common.utils import linear_schedule
import torch
import torch.nn.functional as F
import numpy as np
import time

def train_dqn(num_episodes=500, batch_size=64, gamma=0.99, lr=1e-3, replay_capacity=10000, target_update_freq=10, device='cuda'):

    env = make_env()

    # Networks
    policy_net = DQN(env).to(device)
    target_net = DQN(env).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Replay Buffer and Optimizer
    replay_buffer = RB(replay_capacity, env.single_observation_space, env.single_action_space, device,
                       handle_timeout_termination=False)
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    
    epsilon = 1.0  # Exploration rate
    epsilon_decay = 0.995
    epsilon_min = 0.01

    for episode in range(num_episodes):
        state, _ = env.reset(seed=42)
        total_reward = 0
        terminated = False

        while not terminated:
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                state_tensor = torch.tensor(state).to(device)
                action = policy_net(state_tensor).argmax(dim=1).item()

            # Step the environment
            next_state, reward, terminated, _, _ = env.step(action)
            #next_state = next_state["board"].flatten()
            replay_buffer.push(state, action, reward, next_state, terminated)

            # Train the policy network
            if len(replay_buffer) > batch_size:
                states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

                # Compute Q-values and targets
                states = states.to(device)
                actions = actions.to(device)
                rewards = rewards.to(device)
                next_states = next_states.to(device)
                dones = dones.to(device)

                q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
                with torch.no_grad():
                    next_q_values = target_net(next_states).max(dim=1)[0]
                    targets = rewards + gamma * next_q_values * (1 - dones)

                # Compute loss and update
                loss = nn.MSELoss()(q_values, targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            state = next_state
            total_reward += reward

        # Update epsilon
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        # Update target network
        if episode % target_update_freq == 0:
            target_net.load_state_dict(policy_net.state_dict())

        print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon}")

    env.close()
    return policy_net


AttributeError: 'numpy.dtypes.BoolDType' object has no attribute 'bits'

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
env = gym.make("tetris_gymnasium/Tetris")
policy_network = DQN(env).to(device)
target_network = DQN(env).to(device)
target_network.load_state_dict(policy_network.state_dict())
optimizer = torch.optim.Adam(policy_network.parameters(), lr=1e-3)

# Train the agent
trained_q_network = train_dqn_with_stable_baselines(env, policy_network, target_network, optimizer)

cuda:0
Episode 0, Total Reward: 8, Epsilon: 0.995


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [None]:
env = gym.make("tetris_gymnasium/Tetris", render_mode="human")
env.reset(seed=42)

terminated = False
while not terminated:
    env.render()
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    key = cv2.waitKey(100) # timeout to see the movement
print("Game Over!")
cv2.destroyAllWindows()