This is a DQN Network following along [this PyTorch tutorial](https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html).

In [1]:
# IMPORTS

import gymnasium as gym

import math
import random
from collections import namedtuple, deque
from itertools import count

import matplotlib
import matplotlib.pyplot as plt

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

env = gym.make("CartPole-v1")

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display
# In interactive mode (enabled with plt.ion()):
# - newly created figures will be shown immediately;
# - figures will automatically redraw on change;
# - `pyplot.show` will not block by default.
plt.ion()

# check if using DirectML or not
import imp
try:
    imp.find_module('torch_directml')
    # https://learn.microsoft.com/en-us/windows/ai/directml/gpu-pytorch-windows
    device = torch_directml.device()
except ImportError:
    torch.device("cuda" if torch.cuda.is_available() else "cpu")


  import imp


NameError: name 'torch_directml' is not defined

## Replay memory
A replay memory is basically a storage of all previous state transitions and their rewards.
[resource](https://deeplizard.com/learn/video/Bcuj2fTH4_4)

In [None]:

# Experience memory: https://deeplizard.com/learn/video/Bcuj2fTH4_4
# At time t, the agent's experience e_t is defined as this tuple:
# e_t = (s_t, a_t, r_{t+1}, s_{t+1})
# which gives us information about its current state, the action taken from state s_t, the reward at
# t+1, and the next state in the environment (at t+1). The last one, we won't know of course unless
# this experience is in at least 1 timestep in the past. (you can't see the future!)
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)
    
    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_sze):
        return random.sample(self.memory, batch_sze)

    def __len(self):
        return len(self.memory)

## The model
The interesting thing is that we will be outputting the Q-value for the 2 available actions (move right/left), not necessarily the probabilty. Hence, the network is trying to predict the expected return of taking that action.

In [None]:
class DQN(nn.Module):
    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        # nn.Linear(in, uot) is like the "space" between 2 layers. it takes an input and maps it to neurons out.
        # resource https://www.sharetechnote.com/html/Python_PyTorch_nn_Linear_01.html
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)
    
    # x is a singular element: determine next action
    # x is a batch (multiple el.): for optimization (diff. word for training)
    # returns a tensor
    def forward(self, x):
        # basically, drive x through the whole network. Throughout this process, x may change its shape
        # ReLU(x) = max(0, x) (basically), comparable to sigmoid
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

## Training
TODO:

In [None]:
# HYPERPARAMETERS: constants that are set before the machine learning process begins
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
LR = 1e-4

# get size of action space
n_actions = env.action_space.n
# https://gymnasium.farama.org/environments/classic_control/cart_pole/#observation-space
# this will return initial state parameters like pos, vel, etc. will be stochastically initialized
state, info = env.reset()
# size of state space
n_observations = len(state)

# why do we have 2 networks? -> https://stackoverflow.com/a/59869307
policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
# initialize target NN from policy NN - basically clone it -> https://pytorch.org/tutorials/beginner/saving_loading_models.html 
target_net.load_state_dict(policy_net.state_dict())

# AdamW is also called SGD or Stochastic Gradient Descent, you can look into it here:
# https://dev.to/amananandrai/10-famous-machine-learning-optimizers-1e22
# with `amsgrad=True`, we're using its variation:
# https://www.fast.ai/posts/2018-07-02-adam-weight-decay.html
# parameters are the variables we want to change over the training. Conveniently, they all get outputted on
# Module().parameters()
# a layer like nn.Linear() automatically randomly initializes them.
optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(capacity=10000)

steps_done = 0

def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = 

<class 'torch.nn.parameter.Parameter'> torch.Size([128, 4])
<class 'torch.nn.parameter.Parameter'> torch.Size([128])
<class 'torch.nn.parameter.Parameter'> torch.Size([128, 128])
<class 'torch.nn.parameter.Parameter'> torch.Size([128])
<class 'torch.nn.parameter.Parameter'> torch.Size([2, 128])
<class 'torch.nn.parameter.Parameter'> torch.Size([2])
