In [4]:
import random
# Install required libraries
# Import required libraries
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import namedtuple, deque
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "cpu"
)

print(device)
CUDA_LAUNCH_BLOCKING=1

cuda


In [5]:
class Net(nn.Module):

    def __init__(self, obs, action):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(obs, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, action)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

In [6]:
Transition = namedtuple('Transition',('state', 'action', 'next_state', 'reward'))
class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [18]:


class DQN:
  # initialize values
  def __init__(self, N, C, env):
    # initialize environment
    self.env = env
    # initialize replay memory to capacity N
    self.replay = ReplayMemory(N)
    self.pointer = 0
    self.policy_net = Net(3, 2)
    self.target_net = Net(3, 2)
    # initialize action-value function Q with random weights 0
    self.action_value = None

    self.optimizer = optim.SGD(self.policy_net.parameters(), lr=0.01)

    self.C = C
    pass

  
  # Main training function
  def train(self, episodes, epsilon, discount, action_function):
    total_reward = [0] * episodes  
    for i in range(episodes):
      # initialize sequence S and preprocessed sequence o
      seq  = [None , None]
      seq[0] = torch.tensor([0,0], dtype=torch.float32).unsqueeze(0)
      terminated = False
      t = rewards = 0
      state, info = self.env.reset()
      while not terminated:
        # Select action
        action = action_function(state, epsilon)
        observation, reward, terminated, truncated, _ = env.step(action)
        state = torch.tensor(observation, dtype=torch.float32).unsqueeze(0)
        # Set sequence
        seq[1] = state       
        rewards += reward
        # store transition in replay buffer
        self.replay.push(seq[0], action,  seq[1], reward)
        seq[0] = state
        # Every C steps reset Q' = Q
        t += 1
        if t % self.C == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.replay_function(discount)
        
      # Decay epsilon after every episode
      epsilon *= epsilon
      total_reward[i] = rewards  
    return total_reward
  # Determine the action for the warehouse environment
      
  def mountain_car_action(self, state, epsilon):
    if np.random.rand() < epsilon:
        action_type = np.random.randint(3)
    else:
        # select max(Q)
        with torch.no_grad():
            action_type = self.policy_net(state).max(1).indices.view(1, 1).item()    
    return action_type
  
  def replay_function(self, discount):
    if len(self.replay) < 128:
        return
    else:
            #print("sample")
            sample = self.replay.sample(128)
            q_values_list = []
            target_list = []
            for state, action, next_state, reward in sample:
                if next_state == None:
                    target = reward
                else:
                    with torch.no_grad():
                        #predict target Q-values
                        q = self.target_net(next_state).max(1)[0]
                        next_q = q.item() #predict next state with target network 
                        target = (reward + discount * np.amax(next_q))
                q_values = self.policy_net(state)[0][action].item()#predict with policy network
                q_values_list.append(q_values)
                target_list.append(target)
            # Set loss function 
            loss = nn.MSELoss()
            # Set action lists require grad
            state_action_list = torch.FloatTensor(q_values_list)
            state_action_list.requires_grad = True
            target_list = torch.FloatTensor(target_list)
            target_list.requires_grad = True
            # back prop
            self.optimizer.zero_grad()
            output = loss(state_action_list, target_list.unsqueeze(1))
            output.backward()
            self.optimizer.step()

  # Save the current weights
  def save(self, filename):
    with open("pickles/" + filename, 'wb') as file:
      pickle.dump(self.policy_net, file)


In [8]:
#Prints the reward per epsisode graph
def reward_print(reward_per_episode, episodes, info): 
    mins = int(min(reward_per_episode)) + int(min(reward_per_episode)) * (.2)
    maxs = int(max(reward_per_episode)) - int(max(reward_per_episode)) * (.2) 
    plt.figure()
    plt.plot(reward_per_episode)
    plt.xlabel('Episode', fontsize=20)
    plt.ylabel('Cumulative Reward', fontsize=20)
    plt.title(f'Cumulative Reward Per Episode ({info})', fontsize=24)
    plt.xticks([0, episodes * .2, episodes * .4, episodes * .6, episodes * .8, episodes], fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(ymin= mins, ymax=maxs)
    plt.xlim(xmin=0, xmax=episodes)
    plt.grid()
    plt.show()

#prints the epsilon decay graph
def ep_decay(eps, episodes):
    epsilon_values = [(eps ** i) * 1 for i in range(episodes)]
    plt.figure()
    plt.plot(epsilon_values, linewidth=4)
    plt.xlabel('Episode', fontsize=20)
    plt.ylabel('Epsilon Value', fontsize=20)
    plt.title(f"Epsilon Decay for {eps}", fontsize=24)
    plt.xticks([0, episodes * .2, episodes * .4, episodes * .6, episodes * .8, episodes], fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(ymin=0, ymax=1)
    plt.xlim(xmin=0, xmax=episodes)
    plt.grid()
    plt.show()


In [19]:
N = 5000
C = 3
env = gym.make("MountainCar-v0")
env.reset()
agent = DQN(N, C, env)

episodes = 2
epsilon = .8
discount = .4
action = agent.mountain_car_action
total_rewards = agent.train(episodes, epsilon, discount, action)
agent.save("mountain_car")
reward_print(total_rewards, episodes, "MountainCar")
ep_decay(epsilon, episodes)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x2 and 3x128)