# Deep Reinforcement Learning with PyTorch to Solve OpenAI Gym Mountain Car

In [2]:
#import required packages
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [None]:
# Create the Mountain Car environment. 'MountainCar-v0' is the id registered
# by gym; `.unwrapped` exposes the underlying environment object.
env = gym.make('MountainCar-v0').unwrapped

# Set up matplotlib: when running under an inline (notebook) backend, import
# IPython's display helpers so plots can be refreshed in place.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Replay Memory:

Replay memory is used to train our DQN. It stores the transitions the agent observes so that this data can be reused later: training batches are sampled from it at random.

In [None]:
# One recorded step of interaction: the state, the action taken in it, the
# state that followed, and the reward received.
Transition = namedtuple(
    'transition',
    ('state', 'action', 'next_state', 'reward'),
)


class ReplayMem:
    """Fixed-capacity ring buffer of ``Transition`` records.

    Once ``capacity`` entries have been stored, each new entry overwrites
    the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer able to hold ``capacity`` transitions."""
        self.capacity = capacity
        self.memory = []
        # Index of the slot the next pushed transition will be written to.
        self.position = 0

    def push(self, *args):
        """Store a transition built from ``args`` (the ``Transition`` fields)."""
        buffer = self.memory
        slot = self.position
        if len(buffer) < self.capacity:
            # Still filling up: grow the list so that ``slot`` is a valid index.
            buffer.append(None)
        buffer[slot] = Transition(*args)
        # Advance the write cursor, wrapping around at capacity.
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions chosen at random without replacement."""
        chosen = random.sample(self.memory, batch_size)
        return chosen

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)
        

### DQN Neural Network Architecture:

In [2]:


class Network(nn.Module):
    """Convolutional Q-network: three conv + batch-norm stages and a linear head.

    Args:
        h: Height of the input image in pixels.
        w: Width of the input image in pixels.
        out: Number of outputs produced by the linear head.
    """

    def __init__(self, h, w, out):
        super().__init__()

        # Hyperparameters stored on the module; nothing in this class reads them.
        self.total_actions = 3
        self.gamma = 0.99
        self.final_epsilon = 0.0001
        self.initial_epsilon = 0.1
        self.num_iterations = 2000000
        self.replay_mem_size = 10000
        self.batch_size = 32

        # Feature extractor: 3 -> 16 -> 32 -> 32 channels, each stage a 5x5
        # convolution with stride 2 followed by batch normalisation.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        def conv2d_size_out(size, kernel_size=5, stride=2):
            """Spatial extent after one unpadded convolution."""
            return (size - kernel_size) // stride + 1

        # Push both image dimensions through the three conv stages to find
        # how many features the linear head receives.
        convw, convh = w, h
        for _ in range(3):
            convw = conv2d_size_out(convw)
            convh = conv2d_size_out(convh)
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, out)

    def forward(self, x):
        """Return the head's output for a batch of images, one row per sample."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        features = x
        for conv, norm in stages:
            features = F.relu(norm(conv(features)))

        # Flatten everything except the batch dimension before the linear head.
        flat = features.view(features.size(0), -1)
        return self.head(flat)

### Extracting inputs

In [None]:
# Preprocessing pipeline applied to each captured frame: convert the tensor to
# a PIL image, resize it with bicubic interpolation (size argument 40), and
# convert it back to a tensor.
resize = T.Compose([T.ToPILImage(),
                    T.Resize(40, interpolation=Image.BICUBIC),
                    T.ToTensor()])

def capt_screen():
    """Render the current environment frame and return it as a batched tensor.

    The frame is rendered as an RGB array, scaled by 1/255 as float32, passed
    through ``resize``, given a leading batch dimension and moved to ``device``.
    """
    # Move the last axis to the front (presumably H x W x C -> C x H x W,
    # the channel-first layout torch expects).
    frame = env.render(mode='rgb_array')
    frame = frame.transpose((2, 0, 1))

    # Make the array contiguous float32 and rescale it before wrapping it in
    # a torch tensor.
    frame = np.ascontiguousarray(frame, dtype=np.float32)
    frame = frame / 255
    frame_tensor = torch.from_numpy(frame)

    # Resize, then add the batch dimension.
    batched = resize(frame_tensor).unsqueeze(0)
    return batched.to(device)

# Reset the environment and display one captured frame as a sanity check.
env.reset()
plt.figure()
# capt_screen() returns a (1, C, H, W) tensor; drop the batch dimension and
# move channels last, which is the layout imshow expects.
plt.imshow(capt_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(), interpolation='none')
plt.title('Example extracted screen')
plt.show()

### Training:

In [None]:
# Training hyperparameters.
BATCH_SIZE = 128      # transitions sampled per optimization step
GAMMA = 0.999         # discount factor applied to next-state values
EPS_START = 0.9       # initial exploration probability
EPS_END = 0.05        # exploration probability approached as steps grow
EPS_DECAY = 200       # decay constant (in steps) of the exponential epsilon schedule
TARGET_UPDATE = 10    # period (in episodes) for syncing the target network

# Get the screen size so that the layers can be initialized correctly from
# what gym actually renders.
ini_screen = capt_screen()
_, _, screen_height, screen_width = ini_screen.shape

# Get the total number of possible actions from the gym action space.
tot_actions = env.action_space.n

# The policy network is trained; the target network starts as a copy of it
# and is kept in eval mode.
policy_net = Network(screen_height, screen_width, tot_actions).to(device)
target_net = Network(screen_height, screen_width, tot_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
# Replay buffer holding up to 10000 transitions.
memory = ReplayMem(10000)

# Number of actions selected so far; drives the epsilon decay schedule.
steps_done = 0

def select_action(state):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    The exploration threshold decays exponentially from ``EPS_START`` towards
    ``EPS_END`` as the global ``steps_done`` counter grows; the counter is
    incremented on every call.

    Args:
        state: Input passed to ``policy_net`` when acting greedily.

    Returns:
        A ``(1, 1)`` long tensor holding the chosen action index.
    """
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        # Exploit: pick the action with the largest predicted value. No
        # gradient tracking is needed for action selection.
        with torch.no_grad():
            return policy_net(state).max(1)[1].view(1, 1)
    # Explore: pick uniformly among the environment's actions.
    return torch.tensor([[random.randrange(tot_actions)]], device=device, dtype=torch.long)
    
# Length (in steps) of every finished episode, in order.
episode_durations = []


def plot_durations():
    """Plot episode durations and, once available, their 100-episode running mean.

    Draws into matplotlib figure 2, clearing it first. When running under an
    inline IPython backend the notebook output is refreshed in place.
    """
    plt.figure(2)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())

    # Take 100-episode averages and plot them too. The first 99 positions are
    # padded with zeros so the curve lines up with the episode axis.
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    # Short pause so that the figure gets redrawn.
    plt.pause(0.001)
    if is_ipython:
        # Imported here so the function works regardless of how the setup
        # cell imported the IPython display helpers.
        from IPython import display
        display.clear_output(wait=True)
        display.display(plt.gcf())
    

### Training Loop:

In [None]:
def optimize_model():
    """Run one optimization step of ``policy_net`` on a batch from replay memory.

    Does nothing until ``memory`` holds at least ``BATCH_SIZE`` transitions.
    Otherwise it samples a batch, builds targets from the reward plus the
    discounted value ``target_net`` assigns to the next state, and minimises
    the Huber loss against the values ``policy_net`` predicts for the actions
    that were taken.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    # Transpose the batch: a list of Transitions becomes one Transition whose
    # fields are tuples of per-sample values.
    batch = Transition(*zip(*transitions))

    # Boolean mask of the samples whose next state is not terminal (None),
    # plus the list of those next states.
    non_final_mask = torch.tensor(
        tuple(s is not None for s in batch.next_state),
        device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a): evaluate the policy network on every state and pick the
    # column of the action that was actually taken.
    state_action_val = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the target network for non-final next states; terminal
    # states keep a value of zero. torch.cat rejects an empty list, so skip
    # the lookup when every sampled transition is terminal.
    next_state_val = torch.zeros(BATCH_SIZE, device=device)
    if non_final_next_states:
        next_states = torch.cat(non_final_next_states)
        next_state_val[non_final_mask] = target_net(next_states).max(1)[0].detach()

    # Expected Q values.
    expected_state_action_val = (next_state_val * GAMMA) + reward_batch

    # Huber loss between predicted and expected values.
    loss = F.smooth_l1_loss(state_action_val, expected_state_action_val.unsqueeze(1))

    # Optimize the model, clamping each gradient element to [-1, 1].
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)
    optimizer.step()

In [None]:
# Number of training episodes.
num_eps = 55
for i_episode in range(num_eps):
    # Initialize the environment and state. The state is the difference
    # between two consecutive captured screens.
    env.reset()
    last_screen = capt_screen()
    current_screen = capt_screen()
    state = current_screen - last_screen
    for t in count():
        # Select and perform an action.
        action = select_action(state)
        _, reward, done, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device)

        # Observe the new state; a finished episode has no next state.
        last_screen = current_screen
        current_screen = capt_screen()
        if not done:
            next_state = current_screen - last_screen
        else:
            next_state = None

        # Store the transition in memory.
        memory.push(state, action, next_state, reward)

        # Move to the next state.
        state = next_state

        # Perform one step of optimization on the policy network.
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            plot_durations()
            break

    # Sync the target network with the policy network once every
    # TARGET_UPDATE episodes (at episode level, not on every step).
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print('Complete')
env.render()
env.close()
plt.ioff()
plt.show()