# RL Final Project: Using Double Deep Q-Learning to Play Super Mario Bros

### This project draws on the official PyTorch tutorial, GitHub repositories, and YouTube videos.
Their links are listed here:  
  
https://github.com/Kautenja/playing-mario-with-deep-reinforcement-learning  
https://github.com/giorgioskij/SuperMario-RL  
https://pytorch.org/tutorials/intermediate/mario_rl_tutorial.html  
https://www.youtube.com/watch?v=oskrcfjVFC0  
https://www.youtube.com/watch?v=O2QaSh4tNVw  
https://www.youtube.com/watch?v=_gmQZToTMac  

### We mainly accomplish the following three things:

1. We reproduce the Double Deep Q-Learning algorithm from the paper.
2. We apply this algorithm to a specific task: playing Super Mario Bros.
3. We extend the experiments by changing the exploration strategy (from greedy to softmax and Thompson sampling) and by varying the hyperparameters to test the stability of the networks.

In [14]:
import torch
from torch import nn
from torchvision import transforms as T
from PIL import Image
import numpy as np
from pathlib import Path
import random, datetime, os
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from tensordict import TensorDict
from torchrl.data import TensorDictReplayBuffer, LazyMemmapStorage
from DealingStates import SkipFrame, GrayScaleObservation, ResizeObservation

## Environment

### Initialize Environment

The environment consists of tubes, mushrooms, blocks, and other components.

In [15]:
# Build the Super Mario Bros environment, choosing the keyword arguments that
# match the installed gym API.
# The version is compared numerically: a plain string comparison misorders
# releases such as "0.9" or "0.100" relative to "0.26".
_gym_version = tuple(int(part) for part in gym.__version__.split(".")[:2])
if _gym_version < (0, 26):
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", new_step_api=True)
else:
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", render_mode='rgb', apply_api_compatibility=True)

# Restrict the action space to the two button combinations listed here.
env = JoypadSpace(env, [["right"], ["right", "A"]])

# Smoke test: reset and take a single step to confirm that the environment
# returns the five-element step tuple used throughout this notebook.
env.reset()
next_state, reward, done, trunc, info = env.step(action=0)

  logger.warn(
  logger.warn(


### Dealing with states

In order to understand the dynamic environment, we need to use a stack of the most recent frames to give a sense of motion.  
In the Super Mario game, each state is a frame of shape [3, 240, 256]: 3 is the number of RGB color channels, and 240 and 256 are the frame's height and width.  

However, the color does not matter. Hence, we can convert the frame from color to grayscale, giving [1, 240, 256]. We then resize the image to reduce resource consumption, giving [1, 84, 84].

Finally, stacking the wrapped states gives [4, 84, 84].



In [16]:

# Apply Wrappers to environment
# Apply the preprocessing wrappers in order: frame skipping, grayscale
# conversion, resizing to 84, then stacking the 4 most recent observations.
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
# Compare the gym version numerically: a plain string comparison misorders
# releases such as "0.9" or "0.100" relative to "0.26".
_gym_version = tuple(int(part) for part in gym.__version__.split(".")[:2])
if _gym_version < (0, 26):
    env = FrameStack(env, num_stack=4, new_step_api=True)
else:
    env = FrameStack(env, num_stack=4)

### Define the neural networks

In [17]:
from MarioNet import MarioNet

In [18]:
import torch.nn.functional as F

class Mario:
    """Double DQN agent that explores by adding Gaussian noise to its Q-values.

    The agent owns a ``MarioNet`` (queried with ``model="online"`` or
    ``model="target"``), a replay buffer, an Adam optimizer and a smooth-L1
    loss. Call ``act`` to choose an action, ``cache`` to store a transition
    and ``learn`` to (possibly) run one optimisation step.
    """

    # Number of environment steps between two saved checkpoints.
    CHECKPOINT_INTERVAL = 50000

    def __init__(self, state_dim, action_dim, save_dir, exploration_rate, decay_rate, gamma, target_sync_frequency):
        """Set up the network, exploration schedule, replay buffer and optimizer.

        Args:
            state_dim: Shape of one observation, forwarded to ``MarioNet``.
            action_dim: Number of discrete actions, forwarded to ``MarioNet``.
            save_dir: Directory (``pathlib.Path``) where checkpoints are written.
            exploration_rate: Initial scale of the noise added to the Q-values.
            decay_rate: Factor applied to ``exploration_rate`` after every action.
            gamma: Discount factor used in the TD target.
            target_sync_frequency: Number of steps between target-network syncs.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.save_dir = save_dir
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.net = MarioNet(self.state_dim, self.action_dim).float().to(self.device)

        # Exploration settings
        self.exploration_rate = exploration_rate
        self.decay_rate = decay_rate
        self.exploration_rate_min = 0.1
        self.steps_taken = 0

        # Replay buffer backed by memory-mapped storage kept on the CPU.
        self.memory = TensorDictReplayBuffer(
            storage=LazyMemmapStorage(
                50000,
                device=torch.device("cpu"),
                scratch_dir="/root/autodl-tmp/thompson_sampling2/mem/")
        )

        self.batch_size = 32

        self.gamma = gamma

        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025)
        self.loss_fn = torch.nn.SmoothL1Loss()

        self.burn_in_period = 1e4  # minimum number of steps before training starts
        self.learning_frequency = 3  # train once every this many steps
        self.target_sync_frequency = target_sync_frequency

    def act(self, state):
        """Choose an action for ``state`` and advance the exploration schedule.

        The online network's Q-values are perturbed with zero-mean Gaussian
        noise scaled by the current exploration rate, and the arg-max of the
        noisy values is returned. The exploration rate is then decayed (never
        below ``exploration_rate_min``) and the step counter is incremented.

        Args:
            state: Observation, or a tuple whose first element is the
                observation; it must expose ``__array__``.

        Returns:
            int: Index of the selected action.
        """
        state = state[0].__array__() if isinstance(state, tuple) else state.__array__()
        state = torch.tensor(state, device=self.device).unsqueeze(0)

        # Action selection needs no gradients; skipping autograd avoids
        # building a computation graph on every environment step.
        with torch.no_grad():
            action_values = self.net(state, model="online")
        noise = torch.randn_like(action_values) * self.exploration_rate
        noisy_action_values = action_values + noise

        action_idx = torch.argmax(noisy_action_values, axis=1).item()

        self.exploration_rate *= self.decay_rate
        self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)

        self.steps_taken += 1
        return action_idx

    def cache(self, env_state, subsequent_state, action_taken, reward_earned, episode_done):
        """Store one transition in the replay buffer.

        Args:
            env_state: Observation before the action (optionally wrapped in a tuple).
            subsequent_state: Observation after the action (optionally wrapped in a tuple).
            action_taken: Index of the action that was executed.
            reward_earned: Reward received for the action.
            episode_done: Whether the episode ended with this transition.
        """

        def process_state(state):
            # Unwrap an ``(observation, ...)`` tuple, then materialise the array.
            return state[0].__array__() if isinstance(state, tuple) else state.__array__()

        processed_env_state = torch.tensor(process_state(env_state))
        processed_subsequent_state = torch.tensor(process_state(subsequent_state))
        tensor_action = torch.tensor([action_taken])
        tensor_reward = torch.tensor([reward_earned])
        tensor_done = torch.tensor([episode_done])

        experience_data = {
            "state": processed_env_state,
            "next_state": processed_subsequent_state,
            "action": tensor_action,
            "reward": tensor_reward,
            "done": tensor_done
        }

        self.memory.add(TensorDict(experience_data, batch_size=[]))

    def recall(self):
        """Sample a batch of transitions from the replay buffer.

        Returns:
            tuple: ``(states, next_states, actions, rewards, dones)`` moved to
            ``self.device``; the last three are squeezed.
        """
        batch = self.memory.sample(self.batch_size).to(self.device)
        extracted_states = batch.get("state")
        extracted_next_states = batch.get("next_state")
        extracted_actions = batch.get("action").squeeze()
        extracted_rewards = batch.get("reward").squeeze()
        extracted_dones = batch.get("done").squeeze()

        return extracted_states, extracted_next_states, extracted_actions, extracted_rewards, extracted_dones

    def td_values(self, state, action, reward, next_state, done):
        """Compute the TD estimate and the Double-DQN TD target for a batch.

        The online network selects the best next action and the target network
        evaluates it; the target is computed without gradients.

        Returns:
            tuple: ``(td_estimate, td_target)``, one value per batch element.
        """
        # Q_online(s, a) for the actions that were actually taken.
        td_estimate = self.net(state, model="online")[
            np.arange(0, self.batch_size), action
        ]
        with torch.no_grad():
            next_state_Q = self.net(next_state, model="online")
            best_action = torch.argmax(next_state_Q, axis=1)
            next_Q = self.net(next_state, model="target")[
                np.arange(0, self.batch_size), best_action
            ]
            # Terminal transitions contribute no bootstrapped value.
            td_target = (reward + (1 - done.float()) * self.gamma * next_Q).float()
        return td_estimate, td_target

    def update_Q_online(self, td_estimate, td_target):
        """Run one optimizer step on the loss and return the loss as a float."""
        loss = self.loss_fn(td_estimate, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def sync_Q_target(self):
        """Copy the online network's weights into the target network."""
        self.net.target.load_state_dict(self.net.online.state_dict())

    def learn(self):
        """Perform periodic maintenance and, when due, one training step.

        Syncs the target network every ``target_sync_frequency`` steps and
        saves a checkpoint every ``CHECKPOINT_INTERVAL`` steps. Training is
        skipped during the burn-in period and on steps that are not a multiple
        of ``learning_frequency``.

        Returns:
            tuple: ``(mean_td_estimate, loss)`` as floats, or ``(None, None)``
            when no training step was performed.
        """
        if self.steps_taken % self.target_sync_frequency == 0:
            self.sync_Q_target()

        if self.steps_taken % self.CHECKPOINT_INTERVAL == 0:
            self.save_model_checkpoint()

        if self.steps_taken < self.burn_in_period or self.steps_taken % self.learning_frequency != 0:
            return None, None

        fetched_states, fetched_next_states, fetched_actions, fetched_rewards, fetched_dones = self.recall()
        td_estimate, td_target = self.td_values(fetched_states, fetched_actions, fetched_rewards, fetched_next_states, fetched_dones)

        training_loss = self.update_Q_online(td_estimate, td_target)
        return td_estimate.mean().item(), training_loss

    def save_model_checkpoint(self):
        """Save the network weights and current exploration rate to ``save_dir``."""
        checkpoint_path = self.save_dir / f"mario_net_{int(self.steps_taken // self.CHECKPOINT_INTERVAL)}.chkpt"
        torch.save(
            dict(model=self.net.state_dict(), exploration_rate=self.exploration_rate),
            checkpoint_path
        )

    

### Logging




In [21]:
import numpy as np
import time, datetime
import matplotlib.pyplot as plt



class MetricLogger:
    """Accumulate per-episode training metrics and write moving averages to a log file."""

    def __init__(self, save_dir):
        """Create the log file ``save_dir / "log"`` and reset all counters.

        Args:
            save_dir: Directory (``pathlib.Path``) in which the log file is created.
        """
        self.save_log = save_dir / "log"
        self._initialize_log_file()

        # Dictionary to hold history metrics
        self.metrics = {
            'rewards': [],
            'lengths': [],
            'losses': [],
            'q_values': [],
            'avg_rewards': [],
            'avg_lengths': [],
            'avg_losses': [],
            'avg_q_values': []
        }

        # Current episode metrics
        self.init_episode()

        # Timing
        self.record_time = time.time()

    def _initialize_log_file(self):
        """Create (or truncate) the log file and write the column header row."""
        with open(self.save_log, "w") as f:
            headers = (
                "Episode", "Step", "Epsilon", "MeanReward",
                "MeanLength", "MeanLoss", "MeanQValue", "TimeDelta", "Time"
            )
            f.write(f"{headers[0]:>8}{headers[1]:>8}{headers[2]:>10}{headers[3]:>15}"
                    f"{headers[4]:>15}{headers[5]:>15}{headers[6]:>15}{headers[7]:>15}{headers[8]:>20}\n")

    def log_step(self, reward, loss, q):
        """Accumulate the metrics of one environment step.

        Args:
            reward: Reward received on this step.
            loss: Training loss for this step, or ``None`` if no training happened.
            q: Mean Q-value estimate for this step, or ``None`` if no training happened.
        """
        self.curr_ep_reward += reward
        self.curr_ep_length += 1
        # Test against None explicitly: a loss of exactly 0.0 is a valid
        # training step and must still be counted.
        if loss is not None:
            self.curr_ep_loss += loss
            self.curr_ep_q += q
            self.curr_ep_loss_length += 1

    def log_episode(self):
        """Mark the end of an episode: store its totals and reset the counters."""
        self.metrics['rewards'].append(self.curr_ep_reward)
        self.metrics['lengths'].append(self.curr_ep_length)
        avg_loss = avg_q = 0
        # Averages are taken over the steps on which training actually ran.
        if self.curr_ep_loss_length > 0:
            avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)
            avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)
        self.metrics['losses'].append(avg_loss)
        self.metrics['q_values'].append(avg_q)

        self.init_episode()

    def init_episode(self):
        """Reset the running totals for the current episode."""
        self.curr_ep_reward = 0.0
        self.curr_ep_length = 0
        self.curr_ep_loss = 0.0
        self.curr_ep_q = 0.0
        self.curr_ep_loss_length = 0

    def record(self, episode, epsilon, step):
        """Print and append to the log the averages over the last 100 episodes.

        Args:
            episode: Index of the current episode.
            epsilon: Current exploration rate.
            step: Total number of steps taken so far.
        """
        mean_reward = np.round(np.mean(self.metrics['rewards'][-100:]), 3)
        mean_length = np.round(np.mean(self.metrics['lengths'][-100:]), 3)
        mean_loss = np.round(np.mean(self.metrics['losses'][-100:]), 3)
        mean_q = np.round(np.mean(self.metrics['q_values'][-100:]), 3)

        self.metrics['avg_rewards'].append(mean_reward)
        self.metrics['avg_lengths'].append(mean_length)
        self.metrics['avg_losses'].append(mean_loss)
        self.metrics['avg_q_values'].append(mean_q)

        # Wall-clock time elapsed since the previous call to record().
        time_delta = np.round(time.time() - self.record_time, 3)
        self.record_time = time.time()
        print(
            f"Episode {episode} - "
            f"Step {step} - "
            f"Epsilon {epsilon} - "
            f"Mean Reward {mean_reward} - "
            f"Mean Length {mean_length} - "
            f"Mean Loss {mean_loss} - "
            f"Mean Q Value {mean_q} - "
        )

        with open(self.save_log, "a") as f:
            f.write(
                f"{episode:8d}{step:8d}{epsilon:10.3f}"
                f"{mean_reward:15.3f}{mean_length:15.3f}{mean_loss:15.3f}{mean_q:15.3f}"
                f"{time_delta:15.3f}{self.record_time:20.3f}\n"
            )



In [22]:

# Each run writes its checkpoints and log into a fresh timestamped directory.
save_dir = Path("/root/autodl-tmp/thompson_sampling2/checkpoints/") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
save_dir.mkdir(parents=True)

# Hyperparameters for this run, defined once and passed to the agent below.
exploration_rate = 1
decay_rate = 0.999999975
gamma = 0.1
target_sync_frequency = 1e4

mario = Mario(
    state_dim=(4, 84, 84),
    action_dim=env.action_space.n,
    save_dir=save_dir,
    exploration_rate=exploration_rate,
    decay_rate=decay_rate,
    gamma=gamma,
    target_sync_frequency=target_sync_frequency,
)

logger = MetricLogger(save_dir)

episodes = 100000
for e in range(episodes):

    state = env.reset()

    # Play one episode.
    while True:

        # Choose an action for the current state.
        action = mario.act(state)

        # Apply the action to the environment.
        next_state, reward, done, trunc, info = env.step(action)

        # Store the transition in the replay buffer.
        mario.cache(state, next_state, action, reward, done)

        # Possibly run one training step; both values are None when skipped.
        q, loss = mario.learn()

        logger.log_step(reward, loss, q)

        state = next_state

        # Stop when the environment reports done or info["flag_get"] is set.
        if done or info["flag_get"]:
            break

    logger.log_episode()

    # Report moving averages every 20 episodes and after the final one.
    if (e % 20 == 0) or (e == episodes - 1):
        logger.record(episode=e, epsilon=mario.exploration_rate, step=mario.steps_taken)

Episode 0 - Step 107 - Epsilon 0.9999973250035489 - Mean Reward 628.0 - Mean Length 107.0 - Mean Loss 0.0 - Mean Q Value 0.0 - 
Episode 20 - Step 4822 - Epsilon 0.99987945726455 - Mean Reward 607.333 - Mean Length 229.619 - Mean Loss 0.0 - Mean Q Value 0.0 - 
Episode 40 - Step 10468 - Epsilon 0.9997383342376162 - Mean Reward 652.024 - Mean Length 255.317 - Mean Loss 0.124 - Mean Q Value 0.034 - 
Episode 60 - Step 13474 - Epsilon 0.9996632067239334 - Mean Reward 621.541 - Mean Length 220.885 - Mean Loss 0.318 - Mean Q Value 0.86 - 
Episode 80 - Step 15842 - Epsilon 0.9996040284131473 - Mean Reward 599.852 - Mean Length 195.58 - Mean Loss 0.347 - Mean Q Value 1.384 - 
Episode 100 - Step 20461 - Epsilon 0.9994886058010536 - Mean Reward 601.61 - Mean Length 203.54 - Mean Loss 0.364 - Mean Q Value 1.751 - 


KeyboardInterrupt: 