In [None]:
 pip install gymnasium

In [None]:
from time import sleep
import numpy as np
from IPython.display import clear_output
import gymnasium as gym
from gymnasium.envs.registration import register
import torch
from torch import nn


In [None]:
# Mount Google Drive at /gdrive so files stored there are reachable from the notebook.
# This cell only works on a Colab runtime (it imports google.colab).
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# Change the working directory to the Drive folder containing the MiniPacMan code,
# so that it can be imported by module name. The path is specific to the author's
# Drive layout and must be adapted when the notebook is run elsewhere.
%cd /gdrive/MyDrive/SP 25/Reinforcement Learning/A2C

In [None]:
# Import the MiniPacMan environment class from MiniPacManGymV2.py, which must be
# importable from the current working directory.
from MiniPacManGymV2 import MiniPacManEnv

In [None]:
# Register MiniPacMan under the id "MiniPacMan-v0" so it can be created by name.
# max_episode_steps=20 caps the episode length; presumably Gymnasium enforces this
# by reporting `truncated=True` once the cap is reached — TODO confirm.
register(
    id="MiniPacMan-v0",
    entry_point=MiniPacManEnv,
    max_episode_steps=20
)

In [None]:
# Create a vectorized batch of 16 MiniPacMan sub-environments that are stepped together.
# Each sub-environment is wrapped with Autoreset so that it restarts after an episode ends.
# frozen_ghost=False is presumably forwarded to MiniPacManEnv's constructor — TODO confirm.
envs = gym.make_vec("MiniPacMan-v0", render_mode="human", frozen_ghost=False, num_envs=16, wrappers=[gym.wrappers.Autoreset])

In [None]:
class PolicyNetwork(nn.Module):
    """Three-layer MLP that maps a batch of observations to action probabilities.

    Args:
        obs_dim: Number of features per observation after flattening
            (default 36; presumably a flattened 6x6 grid — TODO confirm).
        hidden_dims: Sizes of the two hidden layers.
        num_actions: Number of discrete actions, i.e. the width of the output.

    The layer attribute names (linear1..linear3) and the default sizes are the
    same as before, so `PolicyNetwork()` builds the identical architecture.
    """

    def __init__(self, obs_dim=36, hidden_dims=(32, 16), num_actions=4):
        super().__init__()
        first_hidden, second_hidden = hidden_dims
        self.linear1 = nn.Linear(obs_dim, first_hidden)
        self.linear2 = nn.Linear(first_hidden, second_hidden)
        self.linear3 = nn.Linear(second_hidden, num_actions)

    def forward(self, x):
        """Return a (batch, num_actions) tensor whose rows are softmax probabilities.

        `x` must have a leading batch dimension; all remaining dimensions are
        flattened into a single feature axis before the first linear layer.
        """
        # Functional ops avoid constructing throw-away nn.Module objects per call.
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.linear1(x))
        x = torch.relu(self.linear2(x))
        return torch.softmax(self.linear3(x), dim=1)

class ValueNetwork(nn.Module):
    """Three-layer MLP that maps a batch of observations to scalar value estimates.

    Args:
        obs_dim: Number of features per observation after flattening
            (default 36; presumably a flattened 6x6 grid — TODO confirm).
        hidden_dims: Sizes of the two hidden layers.

    The layer attribute names (linear1..linear3) and the default sizes are the
    same as before, so `ValueNetwork()` builds the identical architecture.
    """

    def __init__(self, obs_dim=36, hidden_dims=(32, 16)):
        super().__init__()
        first_hidden, second_hidden = hidden_dims
        self.linear1 = nn.Linear(obs_dim, first_hidden)
        self.linear2 = nn.Linear(first_hidden, second_hidden)
        self.linear3 = nn.Linear(second_hidden, 1)

    def forward(self, x):
        """Return a (batch, 1) tensor of unbounded value estimates.

        `x` must have a leading batch dimension; all remaining dimensions are
        flattened into a single feature axis before the first linear layer.
        """
        # Functional ops avoid constructing throw-away nn.Module objects per call.
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.linear1(x))
        x = torch.relu(self.linear2(x))
        return self.linear3(x)


In [None]:
# Actor: pi(obs) returns one probability per action for every observation in the batch.
pi = PolicyNetwork()
# Critic: V(obs) returns one scalar value estimate for every observation in the batch.
V = ValueNetwork()

In [None]:
# Separate Adam optimizers so the actor and the critic are updated independently;
# both use the same learning rate of 0.01.
pi_optimizer = torch.optim.Adam(pi.parameters(), lr=0.01)
V_optimizer = torch.optim.Adam(V.parameters(), lr=0.01)

In [None]:
from math import log

# ---------------------------------------------------------------------------
# Synchronous advantage actor-critic training with n-step bootstrapped targets.
# Each update collects `num_steps` transitions from every parallel environment,
# fits V to the n-step targets, then takes one policy-gradient step on pi.
# ---------------------------------------------------------------------------

# Hyperparameters
gamma = 0.99                                # discount factor
num_updates = 1000                          # number of gradient updates
num_trajectories_parallel = envs.num_envs   # one trajectory per sub-environment
num_steps = 7                               # rollout length per update
# Reward on a terminal step that is counted as a win.
# NOTE(review): presumably defined by MiniPacManEnv — confirm against the env.
win_reward = 20

# Flat per-transition logs (terminated flag and reward) across the whole run.
done_log = []
reward_log = []

# win_pct[e] is the cumulative win percentage after update e: the share of all
# terminated episodes so far whose final reward equals win_reward. Entries stay
# NaN until at least one episode has terminated.
win_pct = np.full(num_updates, np.nan)
episode_rewards = [[] for _ in range(num_trajectories_parallel)]  # finished-episode returns per env
current_rewards = np.zeros(num_trajectories_parallel)             # running return of the ongoing episode
episodes_terminated = 0
episodes_won = 0

new_obs, info = envs.reset()

for e in range(num_updates):

    states_list = []
    log_probs_list = []
    rewards_list = []
    dones_list = []

    # Continue from wherever the previous rollout stopped.
    obs = torch.as_tensor(new_obs, dtype=torch.float32)

    for _ in range(num_steps):
        # Sample one action per environment. Categorical provides a numerically
        # safe log-probability instead of taking log(softmax) by hand.
        action_dist = torch.distributions.Categorical(probs=pi(obs))
        actions = action_dist.sample()
        log_probs = action_dist.log_prob(actions)

        # Step all parallel trajectories.
        new_obs, rewards, dones, truncated, infos = envs.step(actions.cpu().numpy())
        new_obs = torch.as_tensor(new_obs, dtype=torch.float32)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones, dtype=bool)
        truncated = np.asarray(truncated, dtype=bool)

        # Episode-return bookkeeping: an episode is over when it is terminated
        # OR truncated; otherwise returns of time-limited episodes would be
        # merged into the following episode.
        current_rewards += rewards
        for i in np.flatnonzero(dones | truncated):
            episode_rewards[i].append(current_rewards[i])
            current_rewards[i] = 0.0

        # Per-transition logs plus running counters for the win statistic, so
        # the logs do not have to be rescanned on every update.
        done_log.extend(dones)
        reward_log.extend(rewards)
        episodes_terminated += int(dones.sum())
        episodes_won += int(np.sum(dones & (rewards == win_reward)))

        # Store (s, log pi(a|s), r, terminated) for this step.
        states_list.append(obs)
        log_probs_list.append(log_probs)
        rewards_list.append(torch.as_tensor(rewards, dtype=torch.float32))
        dones_list.append(torch.as_tensor(dones, dtype=torch.float32))

        obs = new_obs

    # Stack along a leading time axis: one row per rollout step.
    rewards_t = torch.stack(rewards_list)
    states_t = torch.stack(states_list)
    log_probs_t = torch.stack(log_probs_list)
    dones_t = torch.stack(dones_list)

    # Bootstrap from the critic's estimate of the observation reached after the
    # last step (obs now holds that observation; V flattens its input itself).
    with torch.no_grad():
        v_final = V(obs).squeeze(-1)

    # n-step targets via the backward recursion
    #   target_t = r_t + (1 - terminated_t) * gamma * target_{t+1},
    # seeded with target_{num_steps} = V(s_final).
    # NOTE(review): only `terminated` cuts the recursion. When an episode is
    # truncated, the return keeps accumulating into whatever the vector env
    # reports next; the correct handling depends on its autoreset mode — confirm.
    targets = torch.zeros_like(rewards_t)
    next_return = v_final
    for t in reversed(range(num_steps)):
        next_return = rewards_t[t] + (1 - dones_t[t]) * gamma * next_return
        targets[t] = next_return

    # Merge the time and environment axes into a single batch axis.
    num_transitions = num_steps * num_trajectories_parallel
    targets_flat = targets.view(num_transitions)
    states_flat = states_t.view(num_transitions, -1)
    log_probs_flat = log_probs_t.view(num_transitions)

    # Critic update: mean squared error against the n-step targets.
    v_s = V(states_flat).squeeze(-1)
    loss_V = ((targets_flat - v_s) ** 2).mean()

    V_optimizer.zero_grad()
    loss_V.backward()
    V_optimizer.step()

    # Advantages are re-evaluated with the freshly updated critic and carry no
    # gradient, so the policy loss below only updates pi.
    with torch.no_grad():
        advantages = targets_flat - V(states_flat).squeeze(-1)

    # Actor update: policy-gradient loss weighted by the advantages.
    loss_pi = -(log_probs_flat * advantages).mean()

    pi_optimizer.zero_grad()
    loss_pi.backward()
    pi_optimizer.step()

    # Record the cumulative win percentage for this update (kept per update so
    # the history can be plotted afterwards).
    if episodes_terminated > 0:
        win_pct[e] = 100.0 * episodes_won / episodes_terminated

    # Periodic reporting.
    if (e + 1) % 10 == 0:
      print(f"Update {e+1}/{num_updates} | Policy Loss: {loss_pi.item():.2f} | Value Loss: {loss_V.item():.2f} | Win %: {win_pct[e]:.2f}%")

Update 10/1000 | Policy Loss: 0.00 | Value Loss: 75.52 | Win %: 55.42%
Update 20/1000 | Policy Loss: 0.00 | Value Loss: 43.59 | Win %: 61.54%
Update 30/1000 | Policy Loss: 0.00 | Value Loss: 70.78 | Win %: 62.07%
Update 40/1000 | Policy Loss: 0.00 | Value Loss: 44.58 | Win %: 60.40%
Update 50/1000 | Policy Loss: -0.00 | Value Loss: 60.57 | Win %: 61.75%
Update 60/1000 | Policy Loss: -0.00 | Value Loss: 79.21 | Win %: 59.27%
Update 70/1000 | Policy Loss: 0.00 | Value Loss: 86.37 | Win %: 60.33%
Update 80/1000 | Policy Loss: 0.00 | Value Loss: 47.39 | Win %: 59.80%
Update 90/1000 | Policy Loss: 0.00 | Value Loss: 62.41 | Win %: 58.70%
Update 100/1000 | Policy Loss: 0.00 | Value Loss: 40.76 | Win %: 59.86%
Update 110/1000 | Policy Loss: -0.00 | Value Loss: 57.77 | Win %: 59.58%
Update 120/1000 | Policy Loss: -0.00 | Value Loss: 113.68 | Win %: 60.27%
Update 130/1000 | Policy Loss: -0.00 | Value Loss: 65.94 | Win %: 60.37%
Update 140/1000 | Policy Loss: 0.00 | Value Loss: 68.68 | Win %: 59

In [None]:
# Roll out the current policy and render sub-environment 0 until its episode ends.
obs, info = envs.reset()
num_envs = obs.shape[0]
dones = np.zeros(num_envs, dtype=bool)
truncateds = np.zeros(num_envs, dtype=bool)

# Only env 0 is rendered, so the loop stops as soon as env 0 is terminated or
# truncated. Waiting for every env to report done on the same step is not a
# usable stop condition: the sub-environments restart on their own, so that
# condition is practically never met.
while not (dones[0] or truncateds[0]):
    envs.envs[0].render()
    obs = torch.as_tensor(obs, dtype=torch.float32)
    with torch.no_grad():  # inference only, no gradients needed
        actions = pi(obs).multinomial(num_samples=1).squeeze(-1)
    # Bind the truncation flags to the same name the loop condition reads.
    obs, rewards, dones, truncateds, infos = envs.step(actions.cpu().numpy())

    sleep(1)
    clear_output(wait=True)

# Show the final frame, then release the environments.
envs.envs[0].render()
envs.close()

xxxxxx
x····x
xᗧ··ᗣx
x····x
x···◯x
xxxxxx



KeyboardInterrupt: 

In [None]:
from matplotlib.pyplot import plot, subplots

# Plot the win-percentage statistic produced by the training cell.
# np.atleast_1d lets this work whether win_pct is a single number or a sequence.
fig, ax = subplots()
ax.plot(np.atleast_1d(win_pct))
ax.set(xlabel="Update", ylabel="Win %", title="MiniPacMan A2C win percentage");

NameError: name 'win_pct' is not defined