# Atari Behavior Cloning
In this example we use a special form of imitation learning (IL) called behavior cloning for learning a policy to play the Atari game "Breakout". The agent witll be an NCP using a CfC as recurrent NN.

## Setup and Requirement

In [1]:
!pip3 install ncps torch "ale-py==0.7.4" "ray[rllib]==2.1.0" "gym[atari,accept-rom-license]==0.23.1"

Collecting ale-py==0.7.4
  Downloading ale_py-0.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m:01[0m:01[0m
[?25hCollecting ray[rllib]==2.1.0
  Downloading ray-2.1.0-cp310-cp310-manylinux2014_x86_64.whl (58.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0mm
[?25hCollecting gym[accept-rom-license,atari]==0.23.1
  Downloading gym-0.23.1.tar.gz (626 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.2/626.2 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting i

## Defining the model
The model consists of a convolutional block (for feature extraction from the image), followed by a CfC recurrent neural network (keeping track of the state), and a final linear layer (deriving actions from the state).

In [2]:
import torch.nn as nn
import torch.nn.functional as F

class ConvBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=64, kernel_size=5, padding=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, padding=2, stride=2)
        self.bn2 = nn.BatchNorm2d(num_features=128)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=5, padding=2, stride=2)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=5, padding=2, stride=2)
        self.bn4 = nn.BatchNorm2d(num_features=256)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.conv3(x))
        x = F.relu(self.bn4(self.conv4(x)))
        x = x.mean((-1, -2))
        return x

In [3]:
from ncps.torch import CfC

class ConvCfC(nn.Module):
    def __init__(self, n_actions):
        super().__init__()
        self.conv_block = ConvBlock()
        self.rnn = CfC(input_size=256, units=64, batch_first=True, proj_size=n_actions)

    def forward(self, x, hx=None):
        batch_size = x.size(0)
        seq_len = x.size(1)
        # Merge batch and time dimension into a single one (because the Conv layers require this)
        x = x.view(batch_size * seq_len, *x.shape[2:])
        x = self.conv_block(x)
        # Seperate time and batch dimension again
        x = x.view(batch_size, seq_len, *x.shape[1:])
        x, hx = self.rnn(x, hx) # hx is the hidden state of the RNN
        return x, hx

## Dataloader
Now we define the Atari environment and the dataset. For this we have to wrap the environemnt with some helper functions, which apply the following transformations:
- Downscales the Atari frames to 84-by-84 pixels
- Converts the frames to grayscale
- Stacks 4 consecutive frames into a single observation

In [4]:
import gym
import ale_py
from ray.rllib.env.wrappers.atari_wrappers import wrap_deepmind
import numpy as np

env = gym.make("ALE/Breakout-v5")
env = wrap_deepmind(env)

  from pkg_resources import packaging
  VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]
  deprecation(
  deprecation(


In [5]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from ncps.datasets.torch import AtariCloningDataset
from tqdm import tqdm

train_ds = AtariCloningDataset("breakout", split="train")
val_ds = AtariCloningDataset("breakout", split="val")
trainloader = DataLoader(train_ds, batch_size=32, num_workers=4, shuffle=True)
valloader = DataLoader(val_ds, batch_size=32, num_workers=4)

## Running the model in a closed-loop
Next, we have to define the code for applying the model in a continuous control loop with the environment. Important things to take care of are:
1. Reset the RNN hidden states when a new episode starts in the Atari game
2. Reshape the input frames to have an extra batch and time dimension of size 1 as the network accepts only batches of sequences instead of single frames
3. Pass the current hidden state together with the observation as input, and unpack the prediciton and next hidden state from the output

In [6]:
def run_closed_loop(model, env, num_episodes=None):
    obs = env.reset()
    device = next(model.parameters()).device
    hx = None # Initialize hidden state of the RNN
    returns = []
    total_reward = 0
    with torch.no_grad():
        while True:
            # PyTorch requires channel first image -> tranpose data
            obs = np.tranpose(obs, [2, 0, 1]).astype(np.float32) / 255.0
            obs = torch.from_numpy(obs).unsqueeze(0).unsqueeze(0).to(device) # create tensor, add time and batch dim, move to device
            pred, hx = model(obs, hx)
            # remove time and batch dimension -> then argmax
            action = pred.squeeze(0).squeeze(0).argmax().item()
            obs, r, done, _ = env.step(action)
            total_reward += r
            if done:
                obs = env.reset() # reset the environment
                hx = None # reset the hidden state
                returns.append(total_reward)
                total_reward = 0 # reset reward
                if num_episodes is not None:
                    num_episodes -= 1 # count down the number of episodes
                    if num_episodes == 0:
                        return returns

## Training loop
Let's write our training loop

In [7]:
def train_one_epoch(model, criterion, optimizer, trainloader):
    running_loss = 0.0
    pbar = tqdm(total=len(trainloader))
    model.train()
    device = next(model.parameters()).device
    for i, (inputs, labels) in enumerate(trainloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()
        outputs, hx = model(inputs)
        labels = labels.view(-1, *labels.shape[2:])  # flatten
        outputs = outputs.reshape(-1, *outputs.shape[2:])  # flatten
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        pbar.set_description(f"loss={running_loss / (i + 1):0.4g}")
        pbar.update(1)
    pbar.close()

def validate(model, valloader):
    losses, accs = [], []
    model.eval()
    device = next(model.parameters()).device
    with torch.no_grad():
        for inputs, labels in valloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, *outputs.shape[2:])
            labels = labels.view(-1, *labels.shape[2:])
            loss = criterion(outputs, labels)
            acc = (outputs.argmax(-1) == labels).float().mean()
            losses.append(loss.item())
            accs.append(acc.item())
    return np.mean(losses), np.mean(accs)

## Training the model

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ConvCfC(n_actions=env.action_space.n).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

for epoch in range(50):
    train_one_epoch(model, criterion, optimizer, trainloader)

    # Evaluate model no the validation set
    val_loss, val_acc = validate(model, valloader)
    print(f"Epoch {epoch+1}, val_loss={val_loss:0.4g}, val_acc={100*val_acc:0.2f}%")

    # Apply model in closed-loop environment
    returns = run_closed_loop(model, env, num_episodes=10)
    print(f"Mean return {np.mean(returns)} (n={len(returns)})")

# Display how our model plays the game
env = gym.make("ALE/Breakout-v5", render_mode="human")
env = wrap_deepmind(env)
run_closed_loop(model, env)


[A

  0%|          | 0/938 [00:42<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 222.00 MiB. GPU 0 has a total capacity of 1.95 GiB of which 201.19 MiB is free. Including non-PyTorch memory, this process has 1.04 GiB memory in use. Of the allocated memory 1005.83 MiB is allocated by PyTorch, and 14.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [8]:
# Display how our model plays the game
env = gym.make("ALE/Breakout-v5", render_mode="human")
env = wrap_deepmind(env)
run_closed_loop(model, env)

  deprecation(
  deprecation(


AttributeError: module 'numpy' has no attribute 'tranpose'