# Kung Fu Master

Traning A2C AI to solve [kung fu master](https://ale.farama.org/environments/kung_fu_master/) from [Gymnasium](https://gymnasium.farama.org/)

## Intalling packages and importing libraries

### Installing NumPy and PyTorch

In [None]:
%pip install numpy
%pip install torch
%pip install torchvision

### Installing Gymnasium

In [None]:
%pip install gymnasium
%pip install ale-py
%pip install swig # Necessary to build the wheel for box2d-py
%pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.0
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379496 sha256=b1258d12315d7d16c97a8b984ec746c8282943058c07e1b5b038bb7e9908a30d
  Stored in directory: /root/.cache/pip/wheels/

### Importing Libraries

In [24]:
import cv2
import math
import random
import numpy as np

# Pytorch stuff

import torch
import torch.nn as nn # Neural network library
import torch.optim as optim # Optimizer to train AI
import torch.nn.functional as F # Activation & loss function
import torch.multiprocessing as mp # Multiprocessing for parallel training
import torch.distributions as distributions # Distributions for action selection
from torch.distributions import Categorical

# Gymnasium stuff
import gymnasium as gym
import ale_py # For Atari games
from gymnasium import ObservationWrapper
from gymnasium.spaces import Box # For environment
from gymnasium.vector import SyncVectorEnv # Execute multiple environments in parallel


  and should_run_async(code)


## Building AI

### Neural Net Architecture

In [4]:
class Network(nn.Module):
    """
    A2C Network with dynamically computed feature size

    paramters:
    action_size: number of actions
    input_shape: shape of input frames (default to 4 stacked frames that are 42 x 42)
    """
    def __init__(self, action_size, input_shape=(4, 42, 42)):
        super(Network, self).__init__()
        self.action_size = action_size

        # Going to have stack of 4 frames
        # 32 output channels to be cheap
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=(3,3), stride=2)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=2)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=2)

        self.flatten = nn.Flatten()

        # Dynamically compute feature size
        self.feature_size = self._compute_feature_size(input_shape)

        # Fully connected layers
        self.fc1 = nn.Linear(in_features=self.feature_size, out_features=128)
        self.fc2_action_values = nn.Linear(in_features=128, out_features=action_size) # Q values for each action
        self.fc2_state_values = nn.Linear(in_features=128, out_features=1) # Estimate of value of current state

    def _compute_feature_size(self, input_shape):
        """
        Computes feature size
        """
        with torch.no_grad():
          x = torch.zeros(1, *input_shape)  # Batch size of 1 with input shape
          x = self.conv1(x)
          x = self.conv2(x)
          x = self.conv3(x)

          return x.numel()  # Total number of elements

    def forward(self, state):
        # State here is input frames
        x = F.relu(self.conv1(state))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))

        x = self.flatten(x) # No activation needed before flattening

        x = F.relu(self.fc1(x))

        action_values = self.fc2_action_values(x)
        state_value = self.fc2_state_values(x).squeeze(-1) # Remove last dimension

        return action_values, state_value

## Training AI

### Pre-process frames

In [29]:
class PreProcessing(ObservationWrapper):
    """
    Combines frames received to be stacks of 4 grayscale images

    Paramters:
    env: environment to wrap
    height: height of image
    width: width of image
    crop: function to crop images (default is none)
    dim_order: order of dimensions (default is (C, H, W) by PyTorch) -> tensorflow does (H, C, W)
    color: use RGB or grasycale (default is grayscale)
    n_frames: number of frames to combine
    """
    def __init__(self, env, height = 42, width = 42, crop = lambda img: img, dim_order = 'pytorch', color = False, n_frames = 4):
      super(PreProcessing, self).__init__(env)
      self.img_size = (height, width)
      self.crop = crop
      self.dim_order = dim_order
      self.color = color
      self.frame_stack = n_frames
      n_channels = 3 * n_frames if color else n_frames
      obs_shape = {'tensorflow': (height, width, n_channels), 'pytorch': (n_channels, height, width)}[dim_order]
      self.observation_space = Box(0.0, 1.0, obs_shape)
      self.frames = np.zeros(obs_shape, dtype = np.float32) # Store frames here

    def reset(self, **kwargs):
      """
      Reset environment to initial state and return stacked frames + environment information
      """
      self.frames = np.zeros_like(self.frames) # Clear previous frames
      obs, info = self.env.reset(**kwargs) # Reset environment
      self.update_buffer(obs) # Pre-process first frame
      return self.frames, info

    def observation(self, img):
      img = self.crop(img) # Apply cropping
      img = cv2.resize(img, self.img_size) # Resize to proper dimensions

      if not self.color: # Convert to grayscale
        if len(img.shape) == 3 and img.shape[2] == 3:
          img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

      img = img.astype('float32') / 255 # Map pixels to [0, 1] to improve neural net performance

      # Frame stacking -> remove oldest frame + shift every frame forward when new frame received
      # New frame is added at the end
      if self.color:
        self.frames = np.roll(self.frames, shift = -3, axis = 0)
      else:
        self.frames = np.roll(self.frames, shift = -1, axis = 0)

      # Add frames to buffer
      if self.color:
        self.frames[-3:] = img # Replace last 3 channels for RGB
      else:
        self.frames[-1] = img # Replace last channel for grayscale

      return self.frames

    def update_buffer(self, obs):
      """
      Pre-process and store frames
      """
      self.frames = self.observation(obs)

  and should_run_async(code)


### Set up Kung Fu Master environment

In [21]:
def make_env():
  def _init():
    env = gym.make("KungFuMasterDeterministic-v4", render_mode='rgb_array')
    env = PreProcessing(env, height=42, width=42)  # Apply preprocessing correctly
    return env
  return _init  # Return the function, not an instance

env = make_env().__call__()

kung_fu_state_shape = env.observation_space.shape
kung_fu_num_actions = env.action_space.n
print("State shape:", kung_fu_state_shape)
print("Number actions:", kung_fu_num_actions)

State shape: (4, 42, 42)
Number actions: 14


### Initialize hyperparameters

In [8]:
learning_rate = 1e-4
discount_factor = 0.99 # Optimal discount factor
number_environments = 10
num_episodes = 3000

### A2C

In [40]:
class Agent():
  """
  Use A2C to train an agent to play Kung Fu Master

  Use reward normalization via moving average to better adapt to latest rewards
  """

  def __init__(self, action_size, alpha = 0.01) -> None:
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # GPU acceleration if possible

    self.action_size = action_size

    self.network = Network(action_size).to(self.device) # Neural network
    self.optimizer = optim.Adam(self.network.parameters(), lr=learning_rate) # Optimizer

    # Moving average parameters for reward normalization
    self.running_mean = 0
    self.running_std = 1
    self.alpha = alpha # Controls the rate of updates -> higher = more recent rewards influence mean & std more

  def normalize_reward(self, reward):
    """
    Normalize reward using an exponential moving average
    """

    self.running_mean = self.alpha * reward + (1 - self.alpha) * self.running_mean
    self.running_std = self.alpha * (reward - self.running_mean) ** 2 + (1 - self.alpha) * self.running_std

    return (reward - self.running_mean) / (np.sqrt(self.running_std) + 1e-8)  # Avoid divide-by-zero

  def act(self, state, epsilon=0):
    """
    Agent takes actions based on given states in environment using softmax action selection policy.

    Returns actions for each state in the batch

    A2C is faster than deep Q learning, so can use softmax
    """

    if state.ndim == 3: # Make sure state is in a batch
      state = [state]

    # Convert state to float32 before moving to device
    state = torch.tensor(state, dtype=torch.float32).to(self.device)  # Convert state to tensor and add batch

    with torch.no_grad(): # Disable gradients during inference
      action_values, _ = self.network(state) # Get action values (automatically calls forward method)
      softmax_policy = F.softmax(action_values, dim = -1)

    return torch.multinomial(softmax_policy, 1).squeeze(dim=-1).cpu().numpy() # Use softmax to select action (1 sample per batch entry)


  def step(self, state, action, reward, next_state, done):
    """
    Implement A2C formulas to make a move with normalized rewards
    """
    batch_size = state.shape[0] # First dimension of state tensor is batch size

    # Convert to float32 before moving to device
    state = torch.tensor(state, dtype=torch.float32).to(self.device)
    next_state = torch.tensor(next_state, dtype=torch.float32).to(self.device)
    reward = torch.tensor([self.normalize_reward(r) for r in reward], dtype=torch.float32, device=self.device)
    done = torch.tensor(done, dtype=torch.bool, device=self.device).to(torch.float32)

    action_values, state_values = self.network(state)
    _, next_state_values = self.network(next_state)

    target_state_value = reward + (1 - done) * discount_factor * next_state_values # Bellman equation

    # Add A2C algorithm parts
    advantage = target_state_value - state_values

    action_distribution = F.softmax(action_values, dim = -1)
    log_action_distribution = F.log_softmax(action_values, dim = -1)

    entropy = -torch.sum(action_distribution * log_action_distribution, dim = -1) # Sum over last dimension

    selected_action_log_probs = log_action_distribution[torch.arange(batch_size), action]


    # Detach b/c don't need advantage gradients going into critic network
    # Including entropy allows for some exploration
    actor_loss = -(selected_action_log_probs * advantage.detach()).mean() - 0.01 * entropy.mean()


    critic_loss = F.mse_loss(state_values, target_state_value.detach()) # Prevent target gradients from impacting calculation

    total_loss = actor_loss + critic_loss

    # Backpropagate results
    self.optimizer.zero_grad()
    total_loss.backward()
    self.optimizer.step()

  and should_run_async(code)


### Train an Agent

In [41]:
# Initialze an agent
agent = Agent(action_size=kung_fu_num_actions)

In [11]:
# Single episode evaluation

def evaluate_agent(agent, env, num_episodes = 1):
  episode_rewards: list = [] # Rewards for each episode this is done in

  for _ in range(num_episodes):
    state, _ = env.reset()
    total_reward = 0

    while True:
      action = agent.act(state)
      next_state, reward, done, env_info, _ = env.step(action[0])
      total_reward += reward

      if done:
        break

    episode_rewards.append(total_reward)

  return episode_rewards

In [None]:
# Sychronous multiple environment evaluation

class MultipleEnv:
  def __init__(self, num_envs = 10):
    self.envs = SyncVectorEnv([make_env() for _ in range(num_envs)])  # So environments can be executed in parallel

  def reset(self):
    """
    Reset all environments simultaneously and return batched states
    """
    return self.envs.reset()

  def step(self, actions):
    """"
    Do action in all environments in parallel and return batched results
    """
    next_states, rewards, dones, envs_info, _ = self.envs.step(actions)

    # Reset environment if it has finished
    if dones.any():
      reset_states = self.envs.reset()
      reset_indices = np.where(dones)[0].astype(int)
      next_states[reset_indices] = reset_states[0][reset_indices]

    return next_states, rewards, dones, envs_info


In [None]:
# Final training loop

import tqdm # Adds progress bar

envs = MultipleEnv(number_environments)
batch, _ = envs.reset()

# Use tqdm to iterate
with tqdm.trange(0, num_episodes + 1) as progress_bar:
  for episode in progress_bar:
    actions = agent.act(batch)
    next_batch_state, rewards, dones, _ = envs.step(actions)

    agent.step(batch, actions, rewards, next_batch_state, dones)

    batch = next_batch_state

    if (episode) % 1000 == 0: # Print average score every 1000 iterations
      print(f"Average agent reward: {np.mean(evaluate_agent(agent, env))}") # Rewards are normalized, so this is not the actual reward

  0%|          | 4/3001 [00:02<24:46,  2.02it/s]  

Average agent reward: 400.0


 34%|███▎      | 1006/3001 [00:32<07:44,  4.29it/s]

Average agent reward: 700.0


 67%|██████▋   | 2006/3001 [01:02<02:49,  5.88it/s]

Average agent reward: 1100.0


100%|██████████| 3001/3001 [01:34<00:00, 31.76it/s]

Average agent reward: 0.0





## Visualization

### Imports

In [52]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display
from gym.wrappers import RecordVideo

### Video

In [68]:

def show_video_of_model(agent, env_name):
    state, _ = env.reset()
    done = False
    frames = []
    while not done:
      frame = env.render()
      frames.append(frame)
      action = agent.act(state)
      state, reward, done, _, _ = env.step(action[0])

    env.close()
    imageio.mimsave('video.mp4', frames, fps=30)

show_video_of_model(agent, 'KungFuMasterDeterministic-v4')

def show_video():
    # Show video in notebook
    mp4list = glob.glob('*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

show_video()

