# Deep $Q$-learning

In [9]:
import random
from collections import deque

import gymnasium as gym
import torch
from torch import nn

random.seed(75)  # set random seeds for reproducibility 
torch.manual_seed(75)

<torch._C.Generator at 0x7f864bd6cd50>

In [None]:
class DeepQNetwork(nn.Module):
    def __init__(self, input_dim: int, hidden_dims: list[int], output_dim: int, activation_fn: nn.Module):
        super().__init__()
        assert len(hidden_dims) > 0, "Must have at least one hidden layer."
        
        # add input layer
        self.layers = nn.ModuleList([nn.Linear(in_features=input_dim, out_features=hidden_dims[0])])
        self.layers.append(activation_fn())

        # add hidden layers
        for i in range(1, len(hidden_dims)-1):
            self.layers.append( nn.Linear(in_features=hidden_dims[i-1], out_features=hidden_dims[i]) )
            self.layers.append(activation_fn())
        
        # add output layer
        self.layers.append( nn.Linear(in_features=hidden_dims[-1], out_features=output_dim) )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            input = layer(input)
        
        pass


class DQNTrainer:

    def __init__(self, buffer_size: int, batch_size: int, qnet: nn.Module):
        self.replay_buffer = deque([], maxlen=buffer_size)  # initialize replay buffer/memory
        self.batch_size = batch_size
        self.qnet = qnet
        self.target_qnet = target_qnet
    
    def train(self):

        minibatch = random.sample(self.replay_buffer, self.batch_size)  # sample uniformly without replacement

        qnet_state_dict = self.qnet.state_dict()
        
        if t % C == 0:
           self.target_qnet.load_state_dict(qnet_state_dict)  # \hat{Q} = Q

In [17]:
net = DeepQNetwork(200, [100, 50, 25, 10, 5], 2, nn.Tanh)
print(net)

DeepQNetwork(
  (layers): ModuleList(
    (0): Linear(in_features=200, out_features=100, bias=True)
    (1): Tanh()
    (2): Linear(in_features=100, out_features=50, bias=True)
    (3): Tanh()
    (4): Linear(in_features=50, out_features=25, bias=True)
    (5): Tanh()
    (6): Linear(in_features=25, out_features=10, bias=True)
    (7): Tanh()
    (8): Linear(in_features=5, out_features=2, bias=True)
  )
)


In [2]:
env = gym.make("LunarLander-v3", render_mode="rgb_array")

In [5]:
env.reset()
action = env.action_space.sample()

# step (transition) through the environment with the action
# receiving the next observation, reward and if the episode has terminated or truncated
observation, reward, terminated, truncated, info = env.step(action)

print(observation, reward, terminated, truncated, info)

[-0.00915499  1.398767   -0.46301442 -0.2829145   0.01050018  0.10380006
  0.          0.        ] -1.1221082318330957 False False {}
