In [None]:
import torch
from tqdm import tqdm
from torch import nn
from torchvision import transforms as T
from PIL import Image
import numpy as np
from pathlib import Path
from collections import deque
import random, datetime, os, copy
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros

In [None]:
# Report whether PyTorch can see a CUDA device (the agent moves its network to the GPU when this is True).
torch.cuda.is_available()

True

In [None]:
# !pip install gym-super-mario-bros==7.3.0

# Training the RL Agent

In [None]:
# Alternative constructor that opens a render window:
# env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", render_mode='human')
env = gym.make("SuperMarioBros-1-1-v0")
# Restrict the action space to two button combinations: "walk right" (index 0)
# and "jump right" (index 1). A small action space keeps training affordable.

env = JoypadSpace(env, [['right'], ['right','A']])

# Reset the environment and take a single "walk right" step to inspect what step() returns.

env.reset()
next_state, reward, done, info = env.step(action=0)
print(f'{reward},\n{done},\n{info},\n{next_state.shape}')

0.0,
False,
{'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79},
(240, 256, 3)


In [None]:
# Shape of the raw frame returned by the step above, before any preprocessing wrapper is applied.
next_state.shape

(240, 256, 3)

In [None]:
# Sanity check: let a random policy play for 5000 steps while rendering.
# A dedicated handle (`demo_env`) is used on purpose. Rebinding `env` here would
# discard the JoypadSpace-restricted environment built earlier, and closing it
# would leave later cells wrapping an environment that is already closed.
demo_env = gym.make("SuperMarioBros-1-1-v0")
try:
    done = True  # forces a reset on the first iteration
    for step in range(5000):
        if done:
            state = demo_env.reset()
        state, reward, done, info = demo_env.step(demo_env.action_space.sample())
        demo_env.render()
finally:
    # Always release the emulator, even if stepping or rendering raises.
    demo_env.close()

**SkipFrame** is a custom wrapper that inherits from `gym.Wrapper` and implements the `step()` method. Because consecutive frames differ very little, we can skip n intermediate frames without losing much information. The rewards accrued over the skipped frames are summed and returned together with the n-th frame.

In [None]:
class SkipFrame(gym.Wrapper):
  """Repeat every action for `skipframes` consecutive frames.

  The rewards of the repeated frames are summed; the observation, done flag
  and info dict returned are those of the last frame that was actually stepped.
  """

  def __init__(self, env, skipframes):
    """
      Input:
            env        - The environment to wrap
            skipframes - Number of frames each action is repeated for (must be >= 1)
      Raises:
            ValueError - If skipframes is smaller than 1
    """
    super().__init__(env)
    # With zero repetitions step() would have no observation to return,
    # so reject that configuration up front instead of failing later.
    if skipframes < 1:
      raise ValueError(f'skipframes must be >= 1, got {skipframes}')
    self._frames = skipframes

  def step(self, action):
    """
      Input:
            action - The action to repeat
      Output:
            (obs, total_reward, finish, info) - Last observation, summed reward,
            done flag and info dict of the last frame stepped
    """
    total_reward = 0.0
    finish = False
    for _ in range(self._frames):
      # Accumulate the reward and repeat the same action
      obs, reward, finish, info = self.env.step(action)
      total_reward += reward
      # Stop repeating as soon as the episode ends
      if finish:
        break

    return obs, total_reward, finish, info

**GrayScaleObs** is a popular wrapper to convert an RGB image to grayscale, which shrinks the state representation without sacrificing important information. The size of each state is now: [1, 240, 256]

In [None]:
class GrayScaleObs(gym.ObservationWrapper):
  """Convert RGB frames of layout (H, W, C) into single-channel float tensors."""

  def __init__(self, env):
    super().__init__(env)
    observation_shape = self.observation_space.shape[:2]
    # NOTE(review): the declared space is (H, W) uint8, while observation()
    # returns a float tensor that still carries a leading channel axis.
    # ResizeObservation relies on this 2-D declaration (it reads shape[2:]),
    # so the declaration is left unchanged here.
    self.observation_space = Box(low = 0, high = 255, shape = observation_shape, dtype = np.uint8)
    # Build the transform once instead of on every observation.
    self._grayscale = T.Grayscale()

  def observation(self, obs):
    """
      Input:
            obs - A frame as an array with layout (H, W, C)
      Output:
            The grayscale frame as a float tensor with layout (1, H, W)
    """
    obs = self.permute_proentation(obs)
    return self._grayscale(obs)

  def permute_proentation(self, obs):
    """Reorder an (H, W, C) array to (C, H, W) and return it as a float tensor.

    The misspelled method name is kept so that existing callers continue to work.
    """
    obs = np.transpose(obs, (2, 0, 1))
    # Copy first so torch receives an independent array rather than a transposed view.
    obs = torch.tensor(obs.copy(), dtype=torch.float)
    return obs

**ResizeObservation** - Each observation is downsampled into a square picture. New size: [1, 84, 84]

In [None]:
class ResizeObservation(gym.ObservationWrapper):
  """Resize each observation to `shape` and divide pixel values by 255."""

  def __init__(self, env, shape):
    """
      Input:
            env   - The environment to wrap
            shape - Target size; an int gives a square (shape, shape),
                    any other value is converted with tuple()
    """
    super().__init__(env)
    if isinstance(shape, int):
      self.shape = (shape, shape)
    else:
      self.shape = tuple(shape)

    observation_shape = self.shape + self.observation_space.shape[2:]
    # NOTE(review): the space is declared as uint8 in [0, 255], but
    # observation() returns values divided by 255 - confirm whether any
    # consumer depends on the declared bounds/dtype.
    self.observation_space = Box(low = 0, high = 255, shape = observation_shape, dtype = np.uint8)

    # Build the pipeline once instead of on every observation.
    # Normalize(0, 255) computes (x - 0) / 255.
    self._transforms = T.Compose([T.Resize(self.shape), T.Normalize(0, 255)])

  def observation(self, obs):
    """
      Input:
            obs - A frame tensor whose leading axis is the channel axis
      Output:
            The resized, rescaled frame with the leading axis squeezed away
            when it has size 1
    """
    return self._transforms(obs).squeeze(0)

**FrameStack** is a wrapper that enables us to condense multiple environmental frames into one observation point for our learning model. By comparing the direction of Mario's movement in the preceding few frames, we can determine whether he was landing or jumping.

In [None]:
# Apply the preprocessing wrappers to env, innermost first:
# repeat each action for 4 frames -> grayscale -> resize to 84x84 -> stack the last 4 frames.
env = SkipFrame(env, skipframes = 4)
env = GrayScaleObs(env)
env = ResizeObservation(env, shape = 84)
env = FrameStack(env, num_stack=4)

## Mario Agent



In [None]:
class Mario:
  """Skeleton of the agent's interface; every method is a placeholder that returns None."""

  def __init__(self):
    # `self` was missing from the signature, so instantiating the skeleton raised TypeError.
    pass

  def act(self, state):
    #Given a state, return an action
    pass

  def cache(self, experience):
    #Add experience, i.e. actions performed on states, into cache
    pass

  def recall(self):
    #Sample experiences from cache
    pass

  def learn(self):
    #Update action value (Q) function with a batch of experiences
    pass

In [None]:
class Mario:
  """Double-DQN agent: epsilon-greedy acting, replay memory and Q-learning updates.

  The online network picks the best next action and the frozen target network
  evaluates it (see td_target).
  """

  def __init__(self, state_dim, action_dim, save_dir):
    """
      Input:
            state_dim  - Shape of one observation, passed on to MarioNN
            action_dim - Number of discrete actions
            save_dir   - Directory in which checkpoints are written
    """
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.save_dir = save_dir
    self.use_cuda = torch.cuda.is_available()

    # Mario's DNN to predict the optimal action given a state
    self.NN = MarioNN(self.state_dim, self.action_dim).float()
    if self.use_cuda:
      self.NN = self.NN.to(device='cuda')

    self.memory = deque(maxlen=10000)           # replay memory; oldest experiences are dropped first
    self.batch_size = 32                        # experiences sampled per update
    self.exploration_rate = 1                   # probability of taking a random action
    self.exploration_rate_decay = 0.9999998     # multiplicative decay applied on every act()
    self.exploration_rate_min = 0.1             # lower bound of the exploration rate
    self.curr_step = 0                          # number of act() calls so far
    self.save_every = 500000                    # steps between automatic checkpoints
    self.gamma  = .9                            # discount factor
    self.optimizer = torch.optim.Adam(self.NN.parameters(), lr = .0003)
    self.smooth_L1_loss = torch.nn.SmoothL1Loss()
    self.exp_before_train = 10000               # steps to collect before the first update
    self.learn_every = 3                        # steps between updates of the online network
    self.sync_every = 10000                     # steps between target-network syncs

  def _to_tensor(self, value):
    """Convert `value` to a tensor, placed on the GPU when CUDA is in use."""
    tensor = torch.tensor(value)
    return tensor.cuda() if self.use_cuda else tensor

  def act(self, state):
    """
      Input:
            state - A single observation of the current state (any object exposing __array__)
      Output:
            action_index - An index value of the action that Mario will perform
    """
    #EXPLORE
    if np.random.rand() < self.exploration_rate:
      action_index = np.random.randint(self.action_dim)

    #EXPLOIT
    else:
      state = self._to_tensor(state.__array__()).unsqueeze(0)
      # Pure inference: no autograd graph is needed to pick the greedy action.
      with torch.no_grad():
        action_values = self.NN(state, model = 'online')
      action_index = torch.argmax(action_values, axis = 1).item()

    self.exploration_rate = max(self.exploration_rate*self.exploration_rate_decay, self.exploration_rate_min)
    self.curr_step += 1

    return action_index

  def cache(self, state, next_state, action, reward, finish):
    """Store one transition in the replay memory as a tuple of tensors.

      Input:
            state, next_state - Observations exposing __array__
            action            - Index of the action taken
            reward            - Reward received
            finish            - Whether the episode ended
    """
    self.memory.append((
        self._to_tensor(state.__array__()),
        self._to_tensor(next_state.__array__()),
        self._to_tensor([action]),
        self._to_tensor([reward]),
        self._to_tensor([finish]),
    ))

  def recall(self):
    """Sample `batch_size` transitions and return them as batched tensors.

      Output:
            (state, next_state, action, reward, finish) - action, reward and
            finish have their trailing singleton axis removed
    """
    batch = random.sample(self.memory, self.batch_size)
    state, next_state, action, reward, finish = map(torch.stack, zip(*batch))

    # squeeze(-1) drops only the trailing axis, so a batch of size 1 keeps its batch axis.
    return state, next_state, action.squeeze(-1), reward.squeeze(-1), finish.squeeze(-1)

  # Q determines optimal state-action value
  def td_estimate(self, state, action):
    """Return the online network's Q-value of `action` for every state in the batch."""
    # Index by the actual batch length rather than self.batch_size.
    batch_index = torch.arange(state.shape[0], device = state.device)
    current_Q = self.NN(state, model = 'online')[batch_index, action]
    return current_Q

  @torch.no_grad()
  def td_target(self, reward, next_state, finish):
    """Return reward + gamma * Q_target(next_state, argmax_a Q_online(next_state, a)).

    The bootstrap term is zeroed for transitions where `finish` is True.
    """
    next_state_Q = self.NN(next_state, model = 'online')
    best_action = torch.argmax(next_state_Q, axis = 1)
    batch_index = torch.arange(next_state.shape[0], device = next_state.device)
    next_Q = self.NN(next_state, model = 'target')[batch_index, best_action]

    return (reward + (1 - finish.float()) * self.gamma * next_Q).float()

  def update_Q(self, td_estimate, td_target):
    """Run one optimizer step on the smooth-L1 loss and return the loss as a float."""
    loss = self.smooth_L1_loss(td_estimate, td_target)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()

  def sync_Q(self):
    """Copy the online network's weights into the target network."""
    self.NN.target_model.load_state_dict(self.NN.model.state_dict())

  def save_checkpoint(self):
    """Write the network weights and the exploration rate into `save_dir`."""
    save_dir = Path(self.save_dir)
    # Create the directory on demand so a checkpoint is never lost to a missing folder.
    save_dir.mkdir(parents = True, exist_ok = True)
    path = save_dir / f'mario_NN_{int(self.curr_step // self.save_every)}.chkpt'

    torch.save(dict(model = self.NN.state_dict(), exploration_rate = self.exploration_rate), path)

    print(f'Mario NN saved to {path} at step {self.curr_step}')

  def learn(self):
    """Sync/save on schedule and, when due, update the online network once.

      Output:
            (mean TD estimate, loss) after an update, or (None, None) when no
            update was performed on this step
    """
    if self.curr_step % self.sync_every == 0:
      self.sync_Q()

    if self.curr_step % self.save_every == 0:
      self.save_checkpoint()

    if self.curr_step < self.exp_before_train:
      return None, None

    if self.curr_step % self.learn_every != 0:
      return None, None

    # Sampling a full batch is impossible until enough experiences are cached.
    if len(self.memory) < self.batch_size:
      return None, None

    # Get Sample from Cache
    state, next_state, action, reward, finish = self.recall()

    td_est = self.td_estimate(state, action)

    td_tgt = self.td_target(reward, next_state, finish)

    # calculate loss, backpropagate
    loss = self.update_Q(td_est, td_tgt)

    return (td_est.mean().item(), loss)

In [None]:
class MarioNN(nn.Module):
  """
      CNN Structure:
  input -> (conv2d + ReLU) x 3 -> Flatten -> fully connected + ReLU -> fully connected -> output

  Holds two copies of the same network: `model` (online, trainable) and
  `target_model` (a deep copy whose parameters are frozen).
  """
  def __init__(self, input_dim, output_dim):
    """
      Input:
            input_dim  - (channel, height, width); height and width must both be 84
            output_dim - Number of output values (one per action)
      Raises:
            ValueError - If height or width is not 84
    """
    super().__init__()

    channel, height, width = input_dim
    if height != 84 or width != 84:
      raise ValueError(f'Expected input height and width: 84, got: height {height}, width {width}')

    # Spatial size after each convolution: floor((size - kernel_size) / stride) + 1
    self.model = nn.Sequential(
        nn.Conv2d(in_channels = channel, out_channels = 32, kernel_size = 8, stride = 4), # (20 X 20 X 32)
        nn.ReLU(),
        nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 4, stride = 2), # (9 X 9 X 64)
        nn.ReLU(),
        nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = 3, stride = 1), # (7 X 7 X 64)
        nn.ReLU(),
        nn.Flatten(), # (7 X 7 X 64 = 3136)

        nn.Linear(in_features = 3136, out_features = 512),
        nn.ReLU(),
        nn.Linear(in_features = 512, out_features = output_dim)
    )

    self.target_model = copy.deepcopy(self.model)

    # Q_target model parameters need to be frozen by disabling the gradients
    for w in self.target_model.parameters():
      w.requires_grad = False

  def forward(self, input, model):
    """
      Input:
            input - Batch of observations
            model - 'online' or 'target', selecting which network to run
      Output:
            The selected network's output for `input`
      Raises:
            ValueError - If `model` is neither 'online' nor 'target'
    """
    if model == 'online':
      return self.model(input)
    elif model == 'target':
      return self.target_model(input)
    # Fail loudly instead of silently returning None for a mistyped selector.
    raise ValueError(f"model must be 'online' or 'target', got {model!r}")

In [None]:
use_cuda = torch.cuda.is_available()
if use_cuda:
  print(f'Using CUDA: {use_cuda}\n')
else:
  print('Using CPU\n')

# Create the agent here so this cell runs on a fresh kernel: the loop below
# needs `mario_agent`, which no earlier cell defines.
save_dir = Path("checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
save_dir.mkdir(parents=True, exist_ok=True)
mario_agent = Mario(state_dim = (4, 84, 84), action_dim = env.action_space.n, save_dir = save_dir)

episodes = 10000

for e in tqdm(range(episodes), desc='Episodes'):
  state = env.reset() # return initial state

  while True:

    # env.render()

    action = mario_agent.act(state) # return action (index) on given state

    next_state, reward, finish, info = env.step(action) # perform the action on the environment

    mario_agent.cache(state, next_state, action, reward, finish) # store the transition in the replay memory

    Q, loss = mario_agent.learn() # Learn

    state = next_state # update the state

    # The episode ends when the environment reports done or Mario reaches the flag
    if finish or info['flag_get']:
      break

  # Every 500 episodes, write a checkpoint into a fresh timestamped directory
  if e % 500 == 0 and e != 0:
    mario_agent.save_dir= Path("checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    # exist_ok guards against two saves landing on the same timestamp
    mario_agent.save_dir.mkdir(parents=True, exist_ok=True)
    mario_agent.save_checkpoint()
  # env.close()

Using CUDA: True



  logger.deprecation(
  return (self.ram[0x86] - self.ram[0x071c]) % 256
Episodes:   5%|███▎                                                              | 501/10000 [30:31<9:57:30,  3.77s/it]

Mario NN saved to checkpoints\2022-10-05T01-07-29\mario_NN_0.chkpt at step 97922


Episodes:  10%|██████▌                                                          | 1001/10000 [57:46<6:55:56,  2.77s/it]

Mario NN saved to checkpoints\2022-10-05T01-34-44\mario_NN_0.chkpt at step 194440


Episodes:  15%|█████████▍                                                     | 1501/10000 [1:24:38<9:22:21,  3.97s/it]

Mario NN saved to checkpoints\2022-10-05T02-01-36\mario_NN_0.chkpt at step 291404


Episodes:  20%|████████████▌                                                  | 2001/10000 [1:52:43<8:21:20,  3.76s/it]

Mario NN saved to checkpoints\2022-10-05T02-29-40\mario_NN_0.chkpt at step 386346


Episodes:  25%|███████████████▊                                               | 2501/10000 [2:20:05<5:40:35,  2.73s/it]

Mario NN saved to checkpoints\2022-10-05T02-57-03\mario_NN_0.chkpt at step 482539


Episodes:  26%|████████████████▎                                              | 2596/10000 [2:25:02<7:30:33,  3.65s/it]

Mario NN saved to checkpoints\2022-10-05T02-57-03\mario_NN_1.chkpt at step 500000


Episodes:  30%|██████████████████▉                                            | 3001/10000 [2:47:49<7:07:16,  3.66s/it]

Mario NN saved to checkpoints\2022-10-05T03-24-47\mario_NN_1.chkpt at step 579861


Episodes:  35%|██████████████████████                                         | 3501/10000 [3:15:49<7:14:12,  4.01s/it]

Mario NN saved to checkpoints\2022-10-05T03-52-46\mario_NN_1.chkpt at step 678064


Episodes:  40%|█████████████████████████▏                                     | 4001/10000 [3:45:18<5:08:10,  3.08s/it]

Mario NN saved to checkpoints\2022-10-05T04-22-16\mario_NN_1.chkpt at step 781354


Episodes:  45%|████████████████████████████▎                                  | 4501/10000 [4:12:48<5:10:32,  3.39s/it]

Mario NN saved to checkpoints\2022-10-05T04-49-46\mario_NN_1.chkpt at step 877344


Episodes:  50%|███████████████████████████████▌                               | 5001/10000 [4:41:56<5:05:57,  3.67s/it]

Mario NN saved to checkpoints\2022-10-05T05-18-54\mario_NN_1.chkpt at step 978495


Episodes:  51%|████████████████████████████████                               | 5091/10000 [4:47:58<4:41:31,  3.44s/it]

Mario NN saved to checkpoints\2022-10-05T05-18-54\mario_NN_2.chkpt at step 1000000


Episodes:  55%|██████████████████████████████████▋                            | 5501/10000 [5:12:04<3:09:23,  2.53s/it]

Mario NN saved to checkpoints\2022-10-05T05-49-02\mario_NN_2.chkpt at step 1084018


Episodes:  60%|█████████████████████████████████████▊                         | 6001/10000 [5:39:50<2:46:55,  2.50s/it]

Mario NN saved to checkpoints\2022-10-05T06-16-48\mario_NN_2.chkpt at step 1181233


Episodes:  65%|████████████████████████████████████████▉                      | 6501/10000 [6:06:55<3:49:26,  3.93s/it]

Mario NN saved to checkpoints\2022-10-05T06-43-52\mario_NN_2.chkpt at step 1276718


Episodes:  70%|████████████████████████████████████████████                   | 7001/10000 [6:34:20<4:31:32,  5.43s/it]

Mario NN saved to checkpoints\2022-10-05T07-11-18\mario_NN_2.chkpt at step 1373018


Episodes:  75%|███████████████████████████████████████████████▎               | 7501/10000 [7:04:44<2:18:05,  3.32s/it]

Mario NN saved to checkpoints\2022-10-05T07-41-42\mario_NN_2.chkpt at step 1478684


Episodes:  76%|███████████████████████████████████████████████▊               | 7589/10000 [7:13:00<2:25:23,  3.62s/it]

Mario NN saved to checkpoints\2022-10-05T07-41-42\mario_NN_3.chkpt at step 1500000


Episodes:  80%|██████████████████████████████████████████████████▍            | 8001/10000 [7:37:23<1:46:08,  3.19s/it]

Mario NN saved to checkpoints\2022-10-05T08-14-21\mario_NN_3.chkpt at step 1581883


Episodes:  85%|█████████████████████████████████████████████████████▌         | 8501/10000 [8:05:57<1:36:26,  3.86s/it]

Mario NN saved to checkpoints\2022-10-05T08-42-53\mario_NN_3.chkpt at step 1681002


Episodes:  90%|████████████████████████████████████████████████████████▋      | 9001/10000 [8:35:47<1:37:50,  5.88s/it]

Mario NN saved to checkpoints\2022-10-05T09-12-45\mario_NN_3.chkpt at step 1784245


Episodes:  95%|█████████████████████████████████████████████████████████████▊   | 9501/10000 [9:06:55<28:39,  3.45s/it]

Mario NN saved to checkpoints\2022-10-05T09-43-53\mario_NN_3.chkpt at step 1891878


Episodes: 100%|████████████████████████████████████████████████████████████████▉| 9984/10000 [9:37:53<02:00,  7.53s/it]

Mario NN saved to checkpoints\2022-10-05T09-43-53\mario_NN_4.chkpt at step 2000000


Episodes: 100%|████████████████████████████████████████████████████████████████| 10000/10000 [9:39:18<00:00,  3.48s/it]


In [None]:
# Save a final checkpoint into a new timestamped directory.
mario_agent.save_dir= Path("checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
mario_agent.save_dir.mkdir(parents=True)
mario_agent.save_checkpoint()

Mario NN saved to checkpoints\2022-10-05T11-06-38\mario_NN_4.chkpt at step 2006293


In [None]:
# Training is finished; close the training environment.
env.close()

# Loading the model

In [None]:
# Rebuild the environment for evaluation.
env = gym.make("SuperMarioBros-1-1-v0")

# Limit the action space to "walk right" and "jump right",
# matching the two outputs of the network being loaded.
env = JoypadSpace(env, [['right'], ['right','A']])

# Apply the same preprocessing wrappers that were used for training.
env = SkipFrame(env, skipframes = 4)
env = GrayScaleObs(env)
env = ResizeObservation(env, shape = 84)
env = FrameStack(env, num_stack=4)

# Directory for any checkpoint this agent may write later.
save_dir = Path("checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
save_dir.mkdir(parents=True)

# Fresh agent; its weights are replaced by the checkpoint loaded in the next cell.
mario_agent = Mario(state_dim = (4, 84, 84), action_dim = env.action_space.n, save_dir = save_dir)

  logger.warn(
  deprecation(
  deprecation(


In [None]:
# NOTE(review): absolute, machine-specific path - adjust it to where the checkpoint actually lives.
load_model_path = "F:/checkpoints/2022-10-05T11-06-38/mario_NN_4.chkpt"

# Read the file once and reuse the resulting dict. map_location='cpu' lets a
# checkpoint saved from the GPU load on a machine without CUDA;
# load_state_dict then copies the weights into the agent's existing parameters.
checkpoint = torch.load(load_model_path, map_location='cpu')
mario_agent.NN.load_state_dict(checkpoint['model'])
mario_agent.exploration_rate = checkpoint['exploration_rate']
# mario_agent.exploration_rate = 0    # Set the exploration rate to 0 so the agent always takes the greedy action

In [None]:
# Switch the network to evaluation mode; the cell output is the module's repr.
mario_agent.NN.eval()

MarioNN(
  (model): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=3136, out_features=512, bias=True)
    (8): ReLU()
    (9): Linear(in_features=512, out_features=2, bias=True)
  )
  (target_model): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=3136, out_features=512, bias=True)
    (8): ReLU()
    (9): Linear(in_features=512, out_features=2, bias=True)
  )
)

In [None]:
# Let the loaded agent play a few rendered episodes.
epi = 10
for i in range(epi):
  state = env.reset() # initial state of the episode
  episode_over = False

  while not episode_over:
    env.render()

    # Ask the agent for an action index and apply it to the environment
    action = mario_agent.act(state)
    next_state, reward, finish, info = env.step(action)

    # Move on to the next state
    state = next_state

    # Stop once the environment reports done or Mario reaches the flag
    episode_over = finish or info['flag_get']

In [None]:
# Close the evaluation environment.
env.close()

In [None]:
# Override the exploration rate: act() will now pick a random action with probability 0.1.
mario_agent.exploration_rate = 0.1