In [1]:
import gym
from ale_py import ALEInterface
ale = ALEInterface()

A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


In [2]:
import stable_baselines3.common.atari_wrappers as atari_wrappers

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [4]:
# initial environment
env = gym.make('PongNoFrameskip-v4')

A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]


In [5]:
# Atari preprocessing wrapper
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, 
                                      frame_skip=4, screen_size=84, 
                                      terminal_on_life_loss=False, 
                                      grayscale_obs=True, grayscale_newaxis=False, 
                                      scale_obs=False)

# Frame stacking
env = gym.wrappers.FrameStack(env, 4)

# using atari_wrappers
env = atari_wrappers.ClipRewardEnv(env)

In [6]:
#actions in this environment
env.unwrapped.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [7]:
# just testing a state to make sure it works
s0 = env.reset()
a0 = env.action_space.sample()

# execute the action
s1, r1, is_final_state, info = env.step(a0)

In [8]:
s1_tensor = torch.tensor(np.expand_dims(s1, axis=0))

In [9]:
s1_tensor.shape

torch.Size([1, 4, 84, 84])

# Defining important building blocks

In [10]:
def epsilon_greedy(eps, model, env, state):
    if np.random.random() < eps:
        # exploration
        action = np.random.randint(0, env.action_space.n)
        return action
    else:
        # exploitation
        q_vals = model.predict(state)
        action = np.argmax(q_vals)
        return action

In [11]:
def tensor_transpose(tensor):
    """Converting a tensor from NCWH to NWHC
    Parameter:
    - tensor (4D Array): NCWH tensor
    Returns:
    - (4D Array): tensor in NWHC format"""
    return torch.transpose(tensor, 1, 3)

In [12]:
# example of transposing a tensor with pytorch
dummy = torch.ones((1, 4, 84, 84))
tensor_transpose(dummy).shape

torch.Size([1, 84, 84, 4])

In [13]:
from collections import deque

def memory_initialization(env, MAX_MEM, INIT_MEM):
    """Initializes the memory for experience replay"""
    
    memory = deque(maxlen=MAX_MEM)
    
    while len(memory) < INIT_MEM:
        
        state = env.reset()
        
        is_final_state = False
        
        while not is_final_state:
            
            # generate a random action
            action = env.action_space.sample()
            
            # execute the action
            new_state, reward, is_final_state, info = env.step(action)
            
            # add transition quadruple to memory
            memory.append([np.expand_dims(state, axis=0), action, reward, 
                           np.expand_dims(new_state, axis=0), is_final_state])
            
            # update state
            state = new_state
            
    return memory

In [14]:
# testing memory initialization
test_memory = memory_initialization(env, 100, 50)

In [15]:
test_memory

deque([[array([[[[ 52,  52,  52, ...,  87,  87,  87],
                 [ 87,  87,  87, ...,  87,  87,  87],
                 [ 87,  87,  87, ...,  87,  87,  87],
                 ...,
                 [236, 236, 236, ..., 236, 236, 236],
                 [236, 236, 236, ..., 236, 236, 236],
                 [236, 236, 236, ..., 236, 236, 236]],
        
                [[ 52,  52,  52, ...,  87,  87,  87],
                 [ 87,  87,  87, ...,  87,  87,  87],
                 [ 87,  87,  87, ...,  87,  87,  87],
                 ...,
                 [236, 236, 236, ..., 236, 236, 236],
                 [236, 236, 236, ..., 236, 236, 236],
                 [236, 236, 236, ..., 236, 236, 236]],
        
                [[ 52,  52,  52, ...,  87,  87,  87],
                 [ 87,  87,  87, ...,  87,  87,  87],
                 [ 87,  87,  87, ...,  87,  87,  87],
                 ...,
                 [236, 236, 236, ..., 236, 236, 236],
                 [236, 236, 236, ..., 236, 236, 23

# Implementing the neural network with PyTorch

The network developped below

In [16]:
class ConvNet(nn.Module):
    
    def __init__(self, n_actions):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU()
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        self.hidden = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512, bias=True),
            nn.ReLU()
        )
        self.out = nn.Sequential(
            nn.Linear(512, n_actions, bias=True)
        )
        
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = x.view(x.size(0), -1)
        x = self.hidden(x)
        x = self.out(x)
        return x

In [17]:
input_test = torch.randn(1, 4, 84, 84)
net = ConvNet(env.action_space.n)
out_test = net(input_test)
print(out_test)

tensor([[-0.0573,  0.0385,  0.0543,  0.0039,  0.0386, -0.0131]],
       grad_fn=<AddmmBackward0>)


In [18]:
s1_tensor.shape

torch.Size([1, 4, 84, 84])

In [20]:
s1_tensor/255

tensor([[[[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
          ...,
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],

         [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
          ...,
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],

         [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0

In [21]:
# Testing the network on an environment state
# s1_tensor_T = tensor_transpose(s1_tensor)
# Dividing by 255 in order to have 
s1_out_test = net(s1_tensor/255)
print(s1_out_test)

tensor([[-0.0443,  0.0313,  0.0408, -0.0187,  0.0498, -0.0047]],
       grad_fn=<AddmmBackward0>)


In [22]:
int(torch.argmax(out_test))

2

In [23]:
net = ConvNet(env.action_space.n)
optimizer = optim.RMSprop(net.parameters() ,lr=0.01)
criterion = nn.MSELoss()

# Experience replay

In [24]:
import copy

In [None]:
# def experience_replay(memory, model, target_model, discount_factor, batch_size):
#     ''' Fits the model with minibatch of states from memory
#     Args:
#     - memory (Array): array of environment transitions
#     - model (Model): Keras model to be fit
#     - target_model (Model): Keras model to get target Q val
#     - discount_factor (float): discount factor for future utility
#     - batch_size (int): size of minibatch
    
#     Returns: None
#     '''
    
#     #if memory is less than batch size, return nothing
#     if len(memory) < batch_size:
#         return
#     else:
#         states = []
#         targets = []
        
#         #sample a batch
#         minibatch = random.sample(memory, batch_size)
        
#         #iterate through bastch
#         for state, action, reward, new_state, done in minibatch:
#             #scale states to be [0,1]. We only scale before fitting cuz storing uint8 is cheaper
            
#             state = state/255
#             new_state = new_state/255

#             target = reward
            
#             #if game not over, target q val includes discounted future utility
#             #we use a cloned model to predict here for stability. Model is changed every C frames
#             #we use the online model to choose best action to deal with overestimation error (Double-Q learning)
#             if not done:
#                 best_future_action = np.argmax(model.predict(new_state))
#                 target = reward + discount_factor * target_model.predict(new_state)[0][best_future_action]
            
#             #get current actions vector
#             target_vector = model.predict(state)[0]
            
#             #update current action q val with target q val
#             target_vector[action] = target
            
#             #add to states
#             states.append(state)
            
#             #add to targets
#             targets.append(target_vector)
            
#         #fit model
#         model.fit(np.array(states), np.array(targets), epochs=1, verbose=0)

In [25]:
net

ConvNet(
  (conv1): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
  )
  (conv2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (1): ReLU()
  )
  (conv3): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
  )
  (hidden): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
  )
  (out): Sequential(
    (0): Linear(in_features=512, out_features=6, bias=True)
  )
)

In [26]:
# Running the Deep Q-network
# Generate a copy of the model as a target model
target_model = copy.deepcopy(net)

In [27]:
target_model

ConvNet(
  (conv1): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
  )
  (conv2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (1): ReLU()
  )
  (conv3): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
  )
  (hidden): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
  )
  (out): Sequential(
    (0): Linear(in_features=512, out_features=6, bias=True)
  )
)

In [38]:
def experience_replay(memory, model, target_model, gamma, batch_size):
    
    if len(memory) < batch_size:
        
        return
    
    else:
        states = []
        targets = []
        losses = []
    
    # sample minibatches of size batch_size from memory
    minibatch = random.sample(memory, batch_size)
    
    model = model.double()
    target_model = target_model.double()
    
    # iterate through the batch
    for state, action, reward, new_state, is_final in minibatch:
        # scaling of images
        state = state/255
        state = torch.tensor(state)
        
        new_state = new_state/255
        new_state = torch.tensor(new_state)
        new_state = new_state.double()

        
        #print(new_state)
        
        if not is_final:
            
            best_future_action = int(torch.argmax(model(new_state)))
            target = reward + gamma * target_model(new_state)[0][best_future_action]
        
        else:
            
            target = reward
            
        target_vector = model(state)[0]
        
        target_vector[action] = target
        
        states.append(state)
        
        targets.append(target_vector)
        
    for epoch in range(1):
        
        running_loss = 0.0
        
        for state, target in zip(states, targets):
        
            # Updating the network parameters
            # update parameters with a gradient descent step
            
            optimizer.zero_grad()   # zero the gradient buffers
            output = model(state)
            loss = criterion(output, target)
            losses.append(loss)
            running_loss += loss
            loss.backward()
            print(model.parameters())
            optimizer.step(model.parameters())    # Does the update
    
    return losses

In [39]:
import random
test_losses = experience_replay(test_memory, net, target_model, 0.05, 32)

<generator object Module.parameters at 0x7efde4649660>


TypeError: 'generator' object is not callable

In [None]:
#run frames
while total_frames < NUM_FRAMES:
        
    state = env.reset()
    done = False
    score = 0
    frames = 0
            
    #playing through this round
    for frame in range(MAX_ITERATIONS):
        env.render()
        
        frames += 1
        
        #epsilon greedy choose action
        action = epsilon_greedy(eps, model, env, np.expand_dims(state, axis=0))
        
        
        #execute action
        new_state, reward, done, info = env.step(action)
        
        #track score
        score += reward
        
        #memorize
        memory.append([np.expand_dims(state, axis=0), action, reward, np.expand_dims(new_state, axis=0), done])
        
        #exp replay
        losses = experience_replay(memory, model, target_model, discount_factor, batch_size)
        
        #clone target network every C frames
        num_updates += batch_size
        
        if num_updates > TARGET_C:
            
            num_updates = 0
            
            with torch.no_grad:
                target_model.fc.weight = model.fc.weight
            
            
            #save memory and model
            # np.save('memory', memory)
            # model.save('tmp_model')
            
        
        #update state
        state = new_state
        
        #decay epsilon
        eps -= eps_linear_decay
        eps = max(eps, eps_min)
        
        if done:
            break
    
    scores.append(score)
    total_frames += frames

# Using pytorch tutorial for DQN to see that it works

The tutorial can be useful for other models, it uses interesting concepts in pytorch


In [40]:
def get_screen():
    # Returned screen requested by gym is 400x600x3, but is sometimes larger
    # such as 800x1200x3. Transpose it into torch order (CHW).
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    # Cart is in the lower half, so strip off the top and bottom of the screen
    _, screen_height, screen_width = screen.shape
    screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)]
    view_width = int(screen_width * 0.6)
    cart_location = get_cart_location(screen_width)
    if cart_location < view_width // 2:
        slice_range = slice(view_width)
    elif cart_location > (screen_width - view_width // 2):
        slice_range = slice(-view_width, None)
    else:
        slice_range = slice(cart_location - view_width // 2,
                            cart_location + view_width // 2)
    # Strip off the edges, so that we have a square image centered on a cart
    screen = screen[:, :, slice_range]
    # Convert to float, rescale, convert to torch tensor
    # (this doesn't require a copy)
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # Resize, and add a batch dimension (BCHW)
    return resize(screen).unsqueeze(0)


env.reset()
plt.figure()
plt.imshow(get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(),
           interpolation='none')
plt.title('Example extracted screen')
plt.show()

NameError: name 'plt' is not defined

In [None]:
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n

policy_net = DQN(n_actions).to(device)
target_net = DQN(n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)


steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)


episode_durations = []


def plot_durations():
    plt.figure(2)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        display.clear_output(wait=True)
        display.display(plt.gcf())