In [1]:
pip install gymnasium



In [2]:
from time import sleep
import numpy as np
from IPython.display import clear_output
import gymnasium as gym
from gymnasium.envs.registration import register
import torch
from torch import nn
import torch.optim as optim

In [3]:
# Give Colab access to your Google Drive by mounting it at /gdrive,
# so files stored on the Drive can be read from this session.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [4]:
# Change the working directory to the Drive folder that contains the
# MiniPacManGym module, so that it can be imported by name.
# The path reflects one particular Drive layout -- adjust it to your own.
%cd /gdrive/MyDrive/SP 25/Reinforcement Learning/Qnetwork

/gdrive/MyDrive/SP 25/Reinforcement Learning/Qnetwork


In [5]:
#Import MiniPacMan environment class definition
from MiniPacManGym import MiniPacManEnv

In [6]:
# Register MiniPacMan with gymnasium under the id "MiniPacMan-v0" so that it
# can be created by id through gym.make.
register(
    id="MiniPacMan-v0",
    entry_point=MiniPacManEnv,  # the environment class object itself (imported from MiniPacManGym), not a "module:Class" string
    max_episode_steps=20          # per-episode step limit handed to gymnasium; presumably what sets `truncated` -- confirm in the gymnasium register docs
)

In [7]:
# Create a MiniPacMan gymnasium environment from the registered id.
# render_mode and frozen_ghost are passed as keyword options to gym.make.
# NOTE(review): frozen_ghost=False presumably lets the ghost move -- confirm in MiniPacManEnv.
env = gym.make("MiniPacMan-v0", render_mode="human", frozen_ghost=False)

In [15]:
class QNetwork(nn.Module):
    """Two-layer fully connected network mapping an observation to Q-values.

    The input is flattened (every dimension except the leading batch one),
    passed through a hidden linear layer with ReLU, and projected to one
    output per action.

    Args:
        obs_size: Number of features in one flattened observation.
            Defaults to 6 * 6, the size this notebook was written for.
        hidden_size: Width of the hidden layer. Defaults to 64.
        num_actions: Number of outputs (one Q-value per action). Defaults to 4.
    """

    def __init__(self, obs_size=6 * 6, hidden_size=64, num_actions=4):
        super().__init__()
        # Attribute names are kept stable so existing state_dicts still load.
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(obs_size, hidden_size)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, num_actions)

    def forward(self, x):
        """Return Q-values of shape (batch, num_actions) for a batch of observations.

        Args:
            x: Tensor whose first dimension is the batch; the remaining
                dimensions must multiply out to ``obs_size``.
        """
        x = self.flatten(x)
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        return x

In [18]:
class ReplayBuffer:
    """Fixed-capacity store of (state, action, reward, next_state, done) transitions.

    Once the buffer is full, each new transition overwrites the oldest one.
    The overwrite happens in place, so ``self.buffer`` is not kept in
    insertion order after the first wrap-around; sampling is uniform, so the
    order does not matter.

    Args:
        capacity: Maximum number of transitions to keep. Must be positive.

    Raises:
        ValueError: If ``capacity`` is not positive.
    """

    def __init__(self, capacity):
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity}")
        self.capacity = capacity
        self.buffer = []
        # Index of the slot that holds the oldest transition once the buffer is full.
        self._next = 0

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def push(self, state, action, reward, next_state, done):
        """Store one transition, evicting the oldest one if the buffer is full."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            # Overwrite in place instead of list.pop(0), which shifts every element.
            self.buffer[self._next] = transition
        self._next = (self._next + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple ``(states, actions, rewards, next_states, dones)`` where
            ``states`` and ``next_states`` are the stored tensors stacked along
            a new leading dimension, ``actions`` is a tuple of the stored
            actions, ``rewards`` is a float32 tensor and ``dones`` is a tensor
            built from the stored done flags.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored transitions.
        """
        if batch_size > len(self.buffer):
            raise ValueError(
                f"cannot sample {batch_size} transitions from a buffer holding {len(self.buffer)}"
            )
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, next_states, dones = zip(*[self.buffer[i] for i in indices])
        return (
            torch.stack(states),
            actions,
            # Force float32 so the rewards combine with network outputs
            # regardless of whether ints or numpy floats were pushed.
            torch.tensor(rewards, dtype=torch.float32),
            torch.stack(next_states),
            torch.tensor(dones),
        )

In [19]:
# Instantiate the Q-network together with the optimizer and loss used to train it.
q_network = QNetwork()
# Adam over all of the network's parameters, learning rate 0.001.
optimizer = optim.Adam(q_network.parameters(), lr=0.001)
# Mean-squared error; the training loop applies it to predicted vs. target Q-values.
loss_fn = nn.MSELoss()

In [24]:
# Q-learning with experience replay.
#
# Every environment step is stored in the replay buffer. As soon as the buffer
# holds at least `batch_size` transitions, each step is followed by one
# gradient update on a random mini-batch drawn from the buffer. (Previously the
# mini-batch was sampled but ignored, and the update used only the latest
# transition.)

#set hyperparams -- play with any of these!
gamma=0.95          # discount factor applied to the bootstrapped next-state value
buffer_size=1000    # maximum number of transitions kept in the replay buffer
batch_size=32       # transitions per gradient update
num_episodes=10000  # number of training episodes

RB=ReplayBuffer(buffer_size) #initialize Replay Buffer
epsilon=1 #initialize epsilon (probability of taking a random action)

for e in range(num_episodes):
  new_obs,info=env.reset()
  # Add a leading batch dimension so a single observation can go through the network.
  new_obs=torch.tensor(new_obs,dtype=torch.float32).unsqueeze(0)

  done=False
  truncated=False
  steps=0

  while not done and not truncated: #Loop for one episode
    obs=new_obs

    #choose action (epsilon-greedy)
    t=np.random.random()
    if t>epsilon:
      with torch.no_grad():
        action=torch.argmax(q_network(obs)).item() #exploitation
    else:
      action= env.action_space.sample() #exploration

    #take a step:
    new_obs,reward, done, truncated, info=env.step(action)
    new_obs=torch.tensor(new_obs,dtype=torch.float32).unsqueeze(0)
    # Store `done` only (not `truncated`): a time-limit cut-off is not a terminal
    # state, so its target should still bootstrap from the next state.
    RB.push(obs,action,reward,new_obs,done)
    steps+=1

    # Wait until the buffer can supply a full mini-batch before learning.
    if len(RB.buffer)<batch_size:
      continue

    states, actions, rewards, next_states, dones=RB.sample(batch_size)
    # as_tensor accepts both the tuples and the tensors the buffer may return.
    actions=torch.as_tensor(actions,dtype=torch.long)
    rewards=torch.as_tensor(rewards,dtype=torch.float32)
    not_done=1.0-torch.as_tensor(dones,dtype=torch.float32)

    #compute target q-values: r + gamma * max_a' Q(s', a'), with no bootstrap on terminal states
    with torch.no_grad():
      next_q=q_network(next_states).max(dim=1).values
      target_q=rewards+gamma*not_done*next_q

    #current q-values of the actions that were actually taken
    current_q=q_network(states).gather(1,actions.unsqueeze(1)).squeeze(1)

    #compute loss
    loss = loss_fn(current_q, target_q)

    #Q-network update rule:
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  #reduce epsilon linearly, but never below 0.01:
  epsilon=max(0.01, epsilon - 1.0/num_episodes)

  #periodic reporting:
  # NOTE(review): "win" assumes a final reward of 10 marks a win -- confirm against MiniPacManEnv.
  if e>0 and e%100==0:
    print(f'episode: {e}, steps: {steps}, epsilon: {epsilon},win: {reward==10}')


episode: 100, steps: 1, epsilon: 0.9899000000000011,win: False
episode: 200, steps: 3, epsilon: 0.9799000000000022,win: False
episode: 300, steps: 1, epsilon: 0.9699000000000033,win: False
episode: 400, steps: 1, epsilon: 0.9599000000000044,win: False
episode: 500, steps: 13, epsilon: 0.9499000000000055,win: False
episode: 600, steps: 1, epsilon: 0.9399000000000066,win: False
episode: 700, steps: 1, epsilon: 0.9299000000000077,win: False
episode: 800, steps: 1, epsilon: 0.9199000000000088,win: False
episode: 900, steps: 2, epsilon: 0.9099000000000099,win: False
episode: 1000, steps: 1, epsilon: 0.899900000000011,win: False
episode: 1100, steps: 3, epsilon: 0.8899000000000121,win: False
episode: 1200, steps: 1, epsilon: 0.8799000000000132,win: False
episode: 1300, steps: 1, epsilon: 0.8699000000000143,win: False
episode: 1400, steps: 3, epsilon: 0.8599000000000154,win: False
episode: 1500, steps: 1, epsilon: 0.8499000000000165,win: False
episode: 1600, steps: 1, epsilon: 0.8399000000000

In [1]:
# Play one episode with the trained Q-network, always taking the greedy action,
# and redraw the environment once per second.
obs, info = env.reset()
done, truncated = False, False

while not (done or truncated):
    env.render()
    # Add a leading batch dimension before feeding the observation to the network.
    obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        action = q_network(obs).argmax().item()
    obs, reward, done, truncated, info = env.step(action)
    sleep(1)
    clear_output(wait=True)

# Show the final state, then release the environment.
env.render()
env.close()

NameError: name 'env' is not defined