# deepQ
- Sara Echeverría 21371
- Ricardo Mendez 21289
- Melissa Pérez 21385

Repository link: https://github.com/bl33h/deepQ

In [None]:
import torch
import numpy as np
import torch.nn as nn
import gymnasium as gym
import torch.optim as optim
from collections import deque

In [None]:
# q-network: fully connected model that maps a state vector to one q-value per action
class DQL(nn.Module):
    """Feed-forward network with two 128-unit hidden layers and a linear output."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hiddenWidth = 128  # width shared by both hidden layers
        # layers are created in fc1 -> fc2 -> fc3 order, the same order as before
        self.fc1 = nn.Linear(input_dim, hiddenWidth)
        self.fc2 = nn.Linear(hiddenWidth, hiddenWidth)
        self.fc3 = nn.Linear(hiddenWidth, output_dim)

    def forward(self, x):
        """Return the raw output of fc3 for input x (relu is applied after fc1 and fc2 only)."""
        firstHidden = nn.functional.relu(self.fc1(x))
        secondHidden = nn.functional.relu(self.fc2(firstHidden))
        # no activation on the output layer: q-values are unbounded
        return self.fc3(secondHidden)

In [None]:
# create the CartPole-v1 environment that the rest of the notebook interacts with
env = gym.make(id='CartPole-v1')

In [None]:
# hyperparameters

# -- exploration schedule
epsilon = 1.0          # initial exploration rate
epsilonMin = 0.01      # lower bound for epsilon
epsilonDecay = 0.995   # multiplicative epsilon decay factor (intended per episode)

# -- learning
gamma = 0.99           # discount factor
learningRate = 1e-3    # optimizer learning rate
batchSize = 64         # batch size for experience replay

# -- replay memory and schedule
memorySize = 10_000    # replay memory capacity
targetUpdateFreq = 100 # target network update frequency
numEpisodes = 500      # number of episodes to run

In [None]:
# build the policy and target networks together with the optimizer
inputDim = env.observation_space.shape[0]  # first dimension of the observation shape
outputDim = env.action_space.n             # action count reported by the action space

# the policy network is constructed first, then the target network
policyNet, targetNet = DQL(inputDim, outputDim), DQL(inputDim, outputDim)

# only the policy network's parameters are handed to the optimizer
optimizer = optim.Adam(params=policyNet.parameters(), lr=learningRate)

# the target network starts as an exact copy of the policy network, in evaluation mode
targetNet.load_state_dict(policyNet.state_dict())
targetNet.eval()

In [None]:
# experience replay buffer: a bounded deque that drops its oldest entry once memorySize is reached
memory = deque([], maxlen=memorySize)

# epsilon-greedy action selection
def selectAction(state, epsilon):
    """Return a random action with probability epsilon, otherwise the policy network's best action."""
    shouldExplore = np.random.rand() < epsilon
    if shouldExplore:
        # explore: let the environment's action space pick an action
        return env.action_space.sample()

    # exploit: add a batch dimension and query the policy network without tracking gradients
    batchedState = torch.FloatTensor(state).unsqueeze(0)
    with torch.no_grad():
        qValues = policyNet(batchedState)
    # index of the largest q-value in the single batch row, as a plain python int
    return qValues.argmax(dim=1).item()

In [None]:
# training loop
def optimizeModel():
    """Run one gradient step on policyNet using a random minibatch from replay memory.

    Does nothing until memory holds at least batchSize transitions. Each stored
    transition is (state, action, reward, nextState, terminated).
    """
    if len(memory) < batchSize:
        return

    # sample distinct transitions uniformly at random and regroup them field by field
    indices = np.random.choice(len(memory), size=batchSize, replace=False)
    states, actions, rewards, nextStates, terminals = zip(
        *(memory[int(i)] for i in indices)
    )

    states = torch.as_tensor(np.asarray(states, dtype=np.float32))
    nextStates = torch.as_tensor(np.asarray(nextStates, dtype=np.float32))
    actions = torch.as_tensor(np.asarray(actions, dtype=np.int64)).unsqueeze(1)
    rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
    terminals = torch.as_tensor(np.asarray(terminals, dtype=np.float32))

    # q-value the policy network assigns to the action that was actually taken
    predictedQ = policyNet(states).gather(1, actions).squeeze(1)

    # bootstrap from the target network; terminated transitions get no future value
    with torch.no_grad():
        nextQ = targetNet(nextStates).max(1)[0]
    targetQ = rewards + gamma * nextQ * (1.0 - terminals)

    loss = nn.functional.mse_loss(predictedQ, targetQ)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


stepCount = 0  # environment steps taken across all episodes

for episode in range(numEpisodes):
    # gymnasium's reset returns (observation, info); only the observation is the state
    state, _ = env.reset()
    totalReward = 0
    done = False
    while not done:
        action = selectAction(state, epsilon)
        # gymnasium's step returns separate terminated and truncated flags
        nextState, reward, terminated, truncated, _ = env.step(action)
        # the episode ends on either flag, otherwise the time limit would be ignored
        done = terminated or truncated

        # store only `terminated` so time-limit truncation still bootstraps from nextState
        memory.append((state, action, reward, nextState, terminated))
        state = nextState
        totalReward += reward

        optimizeModel()

        # NOTE(review): targetUpdateFreq is interpreted as environment steps - confirm
        stepCount += 1
        if stepCount % targetUpdateFreq == 0:
            targetNet.load_state_dict(policyNet.state_dict())

    # decay exploration once per episode, never going below epsilonMin
    epsilon = max(epsilonMin, epsilon * epsilonDecay)

    print(f"Episode {episode}: Total reward = {totalReward}")

# cleanup
env.close()