# Reinforcement Learning

In [1]:
# When True, the policy network is initialised from the checkpoint stored at
# TARGET_NETWORK_PATH instead of starting from freshly initialised weights.
LOAD_NETWORK = False

In [2]:
import gym
import math
import random
import numpy as np
import matplotlib.pyplot as plt
#from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from tensorboardX import SummaryWriter

%matplotlib inline

# Gym environment the agent is trained on.
env = gym.make('BreakoutDeterministic-v4')

# tensorboardX writer used to log the loss and the recent mean reward during training.
tensorboard = SummaryWriter()

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Parameters

In [3]:
MAX_FRAMES = 30000000            # Total number of frames the agent sees 

MAX_EPISODE_LENGTH = 18000       # Equivalent of 5 minutes of gameplay at 60 frames per second

# Maximum number of transitions kept in the replay memory.
#MEMORY_CAPACITY = 50000
MEMORY_CAPACITY = 25000

#REPLAY_MEMORY_START_SIZE = 50000 # Number of completely random actions, before the agent starts learning
REPLAY_MEMORY_START_SIZE = 15000 # Number of completely random actions, before the agent starts learning

# Number of transitions sampled from the replay memory per optimisation step.
BATCH_SIZE = 32

# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.99

TARGET_UPDATE = 10000           # Number of chosen actions between updating the target network. 
                                # According to Mnih et al. 2015 this is measured in the number of 
                                # parameter updates (every four actions), however, in the 
                                # DeepMind code, it is clearly measured in the number
                                # of actions the agent chooses
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.00001

# (height, width, channels) every rendered frame is resized to in get_screen().
IMG_TARGET_SIZE = (84, 84, 1)

# File the network weights are periodically saved to (and optionally loaded from).
TARGET_NETWORK_PATH = 'model/target_net_state_dict.pt'

N_ACTIONS = env.action_space.n # Get number of actions from gym action space

# Replay Memory

In [4]:
from replaymemory import ReplayMemory

# Frame Buffer

In [5]:
class FrameBuffer(object):
    """Rolling stack of the most recent ``n_frames`` frames.

    Frames are concatenated along the channel axis, oldest first, so the
    buffer has shape ``(b, c * n_frames, h, w)``. The very first frame pushed
    is replicated into every slot so that a full stack is available
    immediately.

    Args:
        shape: ``(b, c, h, w)`` shape of a single frame.
        n_frames: Number of frames kept in the stack.
    """

    def __init__(self, shape, n_frames=4):
        b, c, h, w = shape
        self.capacity = n_frames
        self.channels = c
        self.framebuffer = np.zeros((b, c * n_frames, h, w), 'float32')
        # Explicit fill flag: inferring emptiness from pixel sums would treat
        # an all-black frame as "missing" and wipe the frame history.
        self._filled = False

    def push(self, state):
        """Insert ``state`` as the newest frame, dropping the oldest one.

        Args:
            state: Array broadcastable to the single-frame shape
                ``(b, c, h, w)`` given at construction.
        """
        c = self.channels

        if not self._filled:
            # First frame: copy it into every slot of the stack.
            for slot in range(self.capacity):
                self.framebuffer[:, slot * c:(slot + 1) * c] = state
            self._filled = True
            return

        # Shift every frame one slot towards the front (the oldest wraps to
        # the back) and overwrite the last slot with the new frame.
        self.framebuffer = np.roll(self.framebuffer, -c, axis=1)
        newest = (self.capacity - 1) * c
        self.framebuffer[:, newest:newest + c] = state

    def pull(self):
        """Return the current frame stack as a tensor on ``device``.

        Raises:
            Exception: If no frame has been pushed yet.
        """
        if self.__len__() == 0:
            raise Exception('Framebuffer empty.')

        return torch.tensor(self.framebuffer, device=device)

    def __len__(self):
        """Number of frames held: 0 before the first push, ``capacity`` afterwards."""
        return self.capacity if self._filled else 0

# Dueling Network

In [6]:
class DuelingDQN(nn.Module):
    """Dueling Q-network over a stack of four 84x84 frames.

    Three bias-free convolutions feed two linear heads: a scalar state value
    V(s) and one advantage A(s, a) per action. The heads are combined as
    ``Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)``.

    Args:
        outputs: Number of actions (width of the advantage head).
    """

    def __init__(self, outputs):

        super(DuelingDQN, self).__init__()

        self.relu_gain = nn.init.calculate_gain('relu')

        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4, bias=False)
        torch.nn.init.xavier_normal_(self.conv1.weight, gain=self.relu_gain)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2, bias=False)
        torch.nn.init.xavier_normal_(self.conv2.weight, gain=self.relu_gain)

        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, bias=False)
        torch.nn.init.xavier_normal_(self.conv3.weight, gain=self.relu_gain)

        # 84x84 input -> 20x20 -> 9x9 -> 7x7 feature maps; 64 * 7 * 7 = 3136.
        self.value = nn.Linear(3136, 1)
        self.advantage = nn.Linear(3136, outputs)

    def forward(self, x):
        """Compute Q-values for a batch of frame stacks.

        Args:
            x: Tensor of shape ``(batch, 4, 84, 84)``.

        Returns:
            Tensor of shape ``(batch, outputs)`` holding Q(s, a).
        """
        assert x.size()[2] == 84 and x.size()[3] == 84, "Wrong h ou w size"
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        features = x.view(x.size(0), -1)
        advantage = self.advantage(features)
        value = self.value(features)
        # Centre the advantages per sample (over the action axis only).
        # A mean over the whole batch would make each sample's Q-values
        # depend on whatever else happens to be in the batch.
        return value + advantage - advantage.mean(dim=1, keepdim=True)

# Online network (trained) and target network (periodically synced copy
# that provides the bootstrap values).
policy_net = DuelingDQN(N_ACTIONS).to(device)
target_net = DuelingDQN(N_ACTIONS).to(device)

if LOAD_NETWORK:
    # map_location lets a checkpoint saved on a GPU be restored on a
    # CPU-only machine (and vice versa).
    policy_net.load_state_dict(torch.load(TARGET_NETWORK_PATH, map_location=device))

# Start both networks from identical weights; the target network is only
# ever used for inference.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)
    

# Input extraction

In [7]:
from skimage.transform import resize

def get_screen():
    """Render the current frame and preprocess it for the network.

    The RGB frame from ``env.render`` is cropped to the rows between 28% and
    92% of its height, has 5% of its width trimmed from each side, is averaged
    over the colour channels, resized to ``IMG_TARGET_SIZE`` and divided by 255.

    Returns:
        Array in (batch, channel, height, width) order with a leading batch
        axis of size 1.
    """
    frame = env.render(mode='rgb_array')
    frame_height, frame_width, _ = frame.shape

    # Vertical crop.
    top = int(frame_height * 0.28)
    bottom = int(frame_height * 0.92)

    # Horizontal crop: strip the same margin from the left and right edges.
    kept_width = int(frame_width * 0.95)
    left = frame_width - kept_width
    right = kept_width

    cropped = frame[top:bottom, left:right, :]

    # Collapse the three colour channels into one, keeping the channel axis.
    gray = cropped.mean(2, keepdims=True)

    scaled = resize(gray, IMG_TARGET_SIZE) / 255.

    # HWC -> CHW, then prepend the batch axis.
    channel_first = scaled.transpose((2, 0, 1))
    return np.expand_dims(channel_first, axis=0)

#env.reset()
#plt.figure()
#plt.imshow(get_screen()[0,0,:,:], cmap='gray')
#plt.title('Example extracted screen')
#plt.show()

# Select Action

In [8]:
class ExplorationExploitationScheduler(object):
    """Determines an action according to an epsilon greedy strategy with annealing epsilon"""
    def __init__(self, net, n_actions, eps_initial=1, eps_final=0.1, eps_final_frame=0.01, 
                 eps_evaluation=0.0, eps_annealing_frames=1000000, 
                 replay_memory_start_size=50000, max_frames=25000000):
        """
        Args:
            net: Callable (e.g. the policy network) mapping a state to a
                (1, n_actions) tensor of Q-values
            n_actions: Integer, number of possible actions
            eps_initial: Float, Exploration probability for the first 
                replay_memory_start_size frames
            eps_final: Float, Exploration probability after 
                replay_memory_start_size + eps_annealing_frames frames
            eps_final_frame: Float, Exploration probability after max_frames frames
            eps_evaluation: Float, Exploration probability during evaluation
            eps_annealing_frames: Int, Number of frames over which the 
                exploration probabilty is annealed from eps_initial to eps_final
            replay_memory_start_size: Integer, Number of frames during 
                which the agent only explores
            max_frames: Integer, Total number of frames shown to the agent
        """
        self.n_actions = n_actions
        self.eps_initial = eps_initial
        self.eps_final = eps_final
        self.eps_final_frame = eps_final_frame
        self.eps_evaluation = eps_evaluation
        self.eps_annealing_frames = eps_annealing_frames
        self.replay_memory_start_size = replay_memory_start_size
        self.max_frames = max_frames
        
        # Slopes and intercepts for exploration decrease
        self.slope = -(self.eps_initial - self.eps_final)/self.eps_annealing_frames
        self.intercept = self.eps_initial - self.slope*self.replay_memory_start_size
        self.slope_2 = -(self.eps_final - self.eps_final_frame)/(self.max_frames - self.eps_annealing_frames - self.replay_memory_start_size)
        self.intercept_2 = self.eps_final_frame - self.slope_2*self.max_frames
        self.steps_done = 0
        self.net = net

    def current_epsilon(self, evaluation=False):
        """Return the exploration probability for the current ``steps_done``.

        Args:
            evaluation: A boolean saying whether the agent is being evaluated
        Returns:
            eps_evaluation during evaluation; otherwise eps_initial during the
            warm-up, then two consecutive linear segments, held constant at
            eps_final_frame once max_frames has been reached.
        """
        if evaluation:
            return self.eps_evaluation

        anneal_start = self.replay_memory_start_size
        anneal_end = anneal_start + self.eps_annealing_frames

        if self.steps_done < anneal_start:
            return self.eps_initial
        if self.steps_done < anneal_end:
            return self.slope * self.steps_done + self.intercept

        # Clamp the step so the second segment is not extrapolated past
        # max_frames (which would push epsilon below eps_final_frame and
        # eventually below zero).
        step = min(self.steps_done, self.max_frames)
        return self.slope_2 * step + self.intercept_2

    def select_action(self, state, evaluation=False):
        """
        Increments ``steps_done`` on every call (including evaluation calls).

        Args:
            state: A (4, 84, 84) sequence of frames of an Atari game in grayscale
            evaluation: A boolean saying whether the agent is being evaluated
        Returns:
            A (1, 1) long tensor holding an integer between 0 and 
            n_actions - 1 determining the action the agent perfoms next
        """
        self.steps_done += 1

        eps = self.current_epsilon(evaluation)

        if np.random.rand() < eps:
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

        # Action selection never needs gradients; skip building the graph.
        with torch.no_grad():
            return self.net(state).max(1)[1].view(1, 1)

# Plot Results

In [9]:
# Per-episode histories read by plot_rewards(). They are defined here so the
# function can be called even before any training data has been recorded.
episode_rewards = []
episode_loss = []
episode_threshold = []

def plot_rewards(window=100):
    """Plot rewards, average loss, epsilon and a moving average of the rewards.

    Reads the module-level lists ``episode_rewards``, ``episode_loss`` and
    ``episode_threshold``.

    Args:
        window: Number of episodes in the moving average of the rewards. The
            fourth panel is only drawn once at least ``window`` rewards exist.
    """
    rewards_t = torch.tensor(episode_rewards, dtype=torch.float)
    loss_t = torch.tensor(episode_loss, dtype=torch.float)

    fig, axes = plt.subplots(4, 1, figsize=(15, 20))

    axes[0].set_title('Training...')
    axes[0].set_ylabel('Rewards')
    axes[0].plot(rewards_t.numpy())

    axes[1].set_xlabel('Episode')
    axes[1].set_ylabel('Avg Loss')
    axes[1].plot(loss_t.numpy())

    axes[2].set_xlabel('Steps')
    axes[2].set_ylabel('Threshold - Epsilon')
    axes[2].plot(episode_threshold)

    # Moving average over `window` episodes.
    axes[3].set_title('Average Rewards')
    axes[3].set_xlabel('# Rewards')
    axes[3].set_ylabel('Avg')
    if len(rewards_t) >= window:
        means = rewards_t.unfold(0, window, 1).mean(1)
        # Align each average with the last episode of its window.
        axes[3].plot(range(window - 1, len(rewards_t)), means.numpy())

    # Show only after every panel has been drawn.
    plt.show()

    #plt.pause(1)  # pause a bit so that plots are updated
    
    #if is_ipython:
    #    display.clear_output(wait=True)
    #    display.display(plt.gcf())
    



# Optimizer

In [10]:
def optimize_model():
    """Run one Double-DQN optimisation step on ``policy_net``.

    Samples ``BATCH_SIZE`` transitions from the replay ``memory``, regresses
    Q(s, a) towards ``r + GAMMA * Q_target(s', argmax_a Q_policy(s', a))`` with
    a Huber loss, clips the gradients to [-1, 1], updates the sampled
    priorities and steps the optimizer.

    Returns:
        The loss tensor, or ``None`` while the replay memory holds fewer than
        ``REPLAY_MEMORY_START_SIZE`` transitions.
    """
    if len(memory) < REPLAY_MEMORY_START_SIZE:
        return

    # NOTE(review): the importance-sampling weights returned by the replay
    # memory are not applied to the loss - confirm whether that is intended.
    tree_idx, batch, ISWeights_mb = memory.sample(BATCH_SIZE)

    transitions = [each[0] for each in batch]
    batch_len = len(transitions)

    next_states = [tr[3] for tr in transitions]
    dones = [tr[4] for tr in transitions]

    state_batch = torch.cat([tr[0] for tr in transitions])
    action_batch = torch.cat([tr[1] for tr in transitions])
    reward_batch = torch.cat([tr[2] for tr in transitions]).unsqueeze(1)
    is_not_done = 1 - torch.tensor(dones, dtype=torch.float, device=device).unsqueeze(1)

    # Q(s_t, a_t): evaluate the policy network and pick the column of the
    # action that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) stays 0 for transitions without a next state.
    next_state_values = torch.zeros((batch_len, 1), device=device)
    non_final_idx = [i for i, s in enumerate(next_states) if s is not None]

    if non_final_idx:
        non_final_next_states = torch.cat([next_states[i] for i in non_final_idx])
        non_final_idx = torch.tensor(non_final_idx, device=device, dtype=torch.long)

        # The target is a constant with respect to the optimisation.
        with torch.no_grad():
            # Double DQN: the policy network chooses the action, the target
            # network evaluates it.
            best_actions = policy_net(non_final_next_states).argmax(1, keepdim=True)
            next_state_values[non_final_idx] = target_net(non_final_next_states).gather(1, best_actions)

    # Terminal transitions must not bootstrap: mask the next-state value
    # (not the prediction) with the done flag.
    expected_state_action_values = reward_batch + GAMMA * next_state_values * is_not_done

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    # Write loss into Tensorboard
    tensorboard.add_scalar('Loss', loss.item(), explore_exploit_sched.steps_done)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()

    for param in policy_net.parameters():
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)

    # Priorities are the magnitudes of the TD errors; reshape(-1) keeps a 1-D
    # array even for a batch of one.
    td_errors = (state_action_values - expected_state_action_values).detach().cpu().numpy()
    prios = np.abs(td_errors).reshape(-1)
    memory.batch_update(tree_idx, prios)

    optimizer.step()

    return loss


# Train

In [11]:
from IPython.display import clear_output

import os

# The environment has to be reset before the first frame can be rendered.
# The preprocessed frame from get_screen() provides the
# (batch, channel, height, width) shape used to size the frame buffer.
env.reset()
init_screen = get_screen()
screen_batch, screen_channel, screen_height, screen_width = init_screen.shape
frame_shape = (screen_batch, screen_channel, screen_height, screen_width)

memory = ReplayMemory(MEMORY_CAPACITY)

explore_exploit_sched = ExplorationExploitationScheduler(policy_net, N_ACTIONS, 
        replay_memory_start_size = REPLAY_MEMORY_START_SIZE, max_frames = MAX_FRAMES)

frame_buffer = FrameBuffer(shape=frame_shape)

# Rewards of the 10 most recent steps (logged as a running mean).
rewards_list = np.zeros(10)

# Make sure the checkpoint directory exists before the first save.
checkpoint_dir = os.path.dirname(TARGET_NETWORK_PATH)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

try:
    # MAX_FRAMES bounds the number of agent steps, not the number of episodes.
    while explore_exploit_sched.steps_done < MAX_FRAMES:

        # Initialize the environment and state. A fresh frame buffer keeps
        # frames of the previous episode out of the new episode's first states.
        env.reset()
        frame_buffer = FrameBuffer(shape=frame_shape)
        frame_buffer.push(get_screen())
        state = frame_buffer.pull()

        for t in range(MAX_EPISODE_LENGTH):

            # Select and perform an action
            action = explore_exploit_sched.select_action(state)

            _, reward, done, _ = env.step(action.item())

            # Slide the reward window by one step.
            rewards_list = np.roll(rewards_list, -1)
            rewards_list[-1] = reward

            # Get next state with frame buffer
            frame_buffer.push(get_screen())
            next_state = frame_buffer.pull()

            # Store the transition in memory (for Replay)
            reward = torch.tensor([reward], device=device, dtype=torch.float)

            experience = state, action, reward, next_state, done
            memory.store(experience)

            # Perform one step of the optimization (on the policy network)
            loss = optimize_model()

            state = next_state

            steps_done = explore_exploit_sched.steps_done

            if steps_done % 10 == 0:
                tensorboard.add_scalar('Rewards', rewards_list.mean(), steps_done)

            # Update the target network, copying all weights and biases in DQN
            # Save model
            if steps_done % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())
                torch.save(target_net.state_dict(), TARGET_NETWORK_PATH)

            # Leave the episode only after logging and the target sync, so a
            # terminal step never skips them.
            if done or steps_done >= MAX_FRAMES:
                break

    print('Complete')
finally:
    # Release the environment even when training is interrupted.
    #env.render()
    env.close()

  max_weight = (p_min * n) ** (-self.PER_b)


KeyboardInterrupt: 