In [21]:
# https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html 
# Install required libraries
# Import required libraries
import random
import math
import gymnasium as gym
from gymnasium import spaces
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import namedtuple, deque
from itertools import count

device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "cpu"
)

print(device)
CUDA_LAUNCH_BLOCKING=1

cuda


In [22]:
# imports are always needed
import torch
# get index of currently selected device
torch.cuda.current_device() # returns 0 in my case
# get number of GPUs available
torch.cuda.device_count() # returns 1 in my case
# get the name of the device
torch.cuda.get_device_name(0) # good old Tesla K80


'NVIDIA GeForce RTX 4070 Laptop GPU'

In [23]:

# modify this to fit current environment 
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))
class Memory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def batch(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
class DQN(nn.Module):

    def __init__(self, obs, action):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(obs, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, action)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)
    
class agent():
    def __init__(self,obs,actions):
        self.actions = actions
        self.batch = 128
        self.discount_factor = 0.99
        self.eps = 1
        self.eps_decay = 0.9954
        # self.tau = 0.005
        self.learning_rate = 0.1
        self.memory = Memory(5000) # replay memory 
        # self.policy_net = DQN(obs, actions).to(device) # action value function
        # self.target_net = DQN(obs, actions).to(device) # target action value function 
        self.policy_net = DQN(obs, actions)
        self.target_net = DQN(obs, actions)
        self.optimizer = optim.SGD(self.policy_net.parameters(), lr=0.01)
        self.Q_table = self.print_table() # not working rn
        
    
    def select_action(self,state):
        p = random.random()
        if p < self.eps:
            #random action
            return torch.tensor([[env.action_space.sample()]], dtype=torch.long)
        else:
            #best action
            with torch.no_grad():
                # t.max(1) will return the largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                return self.policy_net(state).max(1).indices.view(1, 1)

    def update(self,timestep,episode,terminated):
        #update weight every 5 steps
        t = timestep +1
        if t%5 == 0:
            #sample and train policy_net
            self.replay()
        #update target_net every 5 eps, If solved, set as target_net
        ep = episode+1
        if ep%5 == 0 or terminated == True:
            self.target_net.load_state_dict(self.policy_net.state_dict())
        return 
    
    def replay(self):
        if len(agent.memory) < self.batch:
            return
        else:
            sample = self.memory.batch(self.batch)
            batch = Transition(*zip(*sample))
            non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
            non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
            print(batch.reward)
            print(batch.state)
            state_batch = torch.cat(batch.state)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)
            state_action_values = self.policy_net(state_batch).gather(1, action_batch)
            next_state_values = torch.zeros(self.batch)
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1).values
                expected_state_action_values = (next_state_values * self.learning_rate) + reward_batch
                criterion =nn.MSELoss
                print(state_action_values)
                print(expected_state_action_values)
                loss = criterion(state_action_values,expected_state_action_values.unsqueeze(1))

            
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return 
        #replay algorithim here
    def print_table(self): # not working rn
        
        return 



env = gym.make("CartPole-v1")
state, info = env.reset()
obs = len(state)
actions = env.action_space.n
agent = agent(obs,actions)
        
    

In [24]:
#create environment to run DQN
env = gym.make("CartPole-v1")

max_episodes = 600
max_timestep = 500
state, info = env.reset()
obs = len(state)
actions = env.action_space.n
# agent = agent(obs,actions)
reward_per_episode = []
for episode in range(max_episodes):
    # Initialize the environment and get its state
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    terminated,truncated = False, False
    for timestep in range(500):
        action = agent.select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        # reward +=timestep
        if terminated:
            next_state = None #used to check if object has fallen during replay
        else:
            next_state = torch.tensor(observation, dtype=torch.float32).unsqueeze(0)
        # reward = torch.tensor(reward, dtype=torch.float32).unsqueeze(0)
        agent.memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state
        agent.update(timestep=timestep,episode=episode,terminated=truncated) #update network weights
        if terminated or truncated: # close loop if it ends
            print('loop end at:', timestep)
            reward_per_episode.append(timestep)
            break
    agent.eps = agent.eps * agent.eps_decay #eps decay

print('solved',truncated)
plt.figure()
fig ,  ax = plt.subplots()
plt.xlabel('Episode', fontsize=20)
plt.ylabel('Timestep', fontsize=20)
plt.title('Cumulative Reward Per Episode', fontsize=24)
ax.plot(reward_per_episode,linestyle='solid',label = 'Q learning')
plt.grid()
plt.show()

loop end at: 13
loop end at: 10
loop end at: 12
loop end at: 14
loop end at: 29
loop end at: 17
loop end at: 13
(1.0, 18.0, 7.0, 9.0, 1.0, 10.0, 3.0, 8.0, 16.0, 4.0, 8.0, 7.0, 15.0, 3.0, 29.0, 13.0, 4.0, 13.0, 2.0, 8.0, 1.0, 12.0, 10.0, 3.0, 3.0, 24.0, 8.0, 5.0, 12.0, 12.0, 11.0, 6.0, 8.0, 20.0, 2.0, 12.0, 27.0, 12.0, 5.0, 1.0, 8.0, 9.0, 3.0, 16.0, 26.0, 10.0, 13.0, 1.0, 2.0, 6.0, 5.0, 9.0, 22.0, 12.0, 5.0, 10.0, 14.0, 14.0, 9.0, 15.0, 18.0, 1.0, 11.0, 11.0, 14.0, 6.0, 10.0, 10.0, 5.0, 11.0, 9.0, 7.0, 11.0, 1.0, 14.0, 3.0, 4.0, 2.0, 17.0, 5.0, 7.0, 6.0, 6.0, 1.0, 9.0, 13.0, 6.0, 14.0, 2.0, 2.0, 2.0, 13.0, 6.0, 21.0, 9.0, 28.0, 4.0, 11.0, 4.0, 7.0, 4.0, 14.0, 3.0, 15.0, 15.0, 8.0, 4.0, 11.0, 13.0, 7.0, 4.0, 11.0, 9.0, 7.0, 10.0, 3.0, 30.0, 8.0, 25.0, 17.0, 2.0, 23.0, 7.0, 5.0, 19.0, 5.0, 6.0, 12.0)
(tensor([[-0.0435, -0.0072,  0.0360, -0.0374]]), tensor([[ 0.0657,  0.9927, -0.0333, -1.3914]]), tensor([[ 0.0188, -0.3697,  0.0217,  0.5817]]), tensor([[-0.0419, -0.7661,  0.0684,  1.2466]])

TypeError: expected Tensor as element 0 in argument 0, but got float

TypeError: 'DQN' object is not subscriptable