In [1]:
import gym
from gym.wrappers import Monitor
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
import copy
import concurrent.futures
import time
import json
import random

import matplotlib.pyplot as plt

In [2]:
class DQNAgent():
    """Deep Q-learning agent for the ``LunarLander-v2`` gym environment.

    The agent owns its environment, a replay memory and a Q-network that maps
    an 8-dimensional observation to 4 action values. Training is plain DQN:
    epsilon-greedy exploration, uniform replay sampling and a one-step
    bootstrapped target computed with the same network (no target network).

    NOTE(review): every forward pass issued by this class uses the network's
    default ``train=False``, so any BatchNorm layers inside the network stay
    in evaluation mode during optimisation as well. That is the behaviour the
    saved models were trained with and it is kept unchanged here; confirm it
    is intended before switching the prediction pass to ``train=True``.
    """

    # Observation width and number of discrete actions of LunarLander-v2.
    STATE_DIM = 8
    N_ACTIONS = 4

    def __init__(
        self,
        epsilon = 1,
        epsilon_decay = 0.999,
        epsilon_min = 0.2,
        replay_size = 1e6,
        batch_size = 2048,
        gamma = 0.99,
        alpha = 0.001,
        layers = [256, 256]
        ):
        """Create the environment, replay memory and Q-network.

        Args:
            epsilon: Initial probability of taking a random action.
            epsilon_decay: Multiplicative decay applied to epsilon per episode.
            epsilon_min: Lower bound for epsilon.
            replay_size: Capacity of the replay memory (number of transitions).
            batch_size: Number of transitions sampled per gradient step.
            gamma: Discount factor.
            alpha: Learning rate of the Adam optimizer.
            layers: Hidden layer widths of the Q-network.
        """
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.env = gym.make("LunarLander-v2")
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        # Copy so the shared default list can never be mutated through the agent.
        self.layers = list(layers)
        self.train_initialized = False

        self.memory = ReplayMemory(replay_size)

        self.initialize_q()

    def initialize_q(self):
        """(Re)create the Q-network, its loss function and its optimizer."""
        # The network constructor takes exactly (input_dim, output_dim,
        # layer_dims); the learning rate belongs to the optimizer only.
        # Fall back to the historical layout if `layers` was never set.
        hidden = list(getattr(self, "layers", [256, 256]))
        self.q = NeuralNetwork(self.STATE_DIM, self.N_ACTIONS, hidden)
        self.loss_fn = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.q.parameters(), lr=self.alpha)

    def train(self, episodes, eval_interval=100):
        """Run `episodes` training episodes.

        One gradient step is attempted after every environment step. Epsilon
        is decayed once per episode. Every `eval_interval` episodes the greedy
        policy is evaluated over 100 episodes; the mean score is appended to
        ``self.values_list`` and printed. Per-episode training scores are
        appended to ``self.score_list``.

        Args:
            episodes: Number of training episodes to run.
            eval_interval: Evaluate every this many episodes; a falsy value
                disables periodic evaluation.
        """
        if not self.train_initialized:
            self.score_list = []
            self.values_list = []
            self.collect_performance_states()
            self.train_initialized = True
        for e in range(episodes):
            s = self.env.reset()
            score = 0
            done = False
            while not done:
                a = self.get_action(s, training=True)
                s1, r, done, _ = self.env.step(a)
                score += r
                self.memory.add_transition((s, a, r, s1, int(done)))
                self.update()

                s = s1
            self.epsilon = max(self.epsilon_decay*self.epsilon, self.epsilon_min)
            self.score_list.append(score)
            # Guard against eval_interval == 0, which would divide by zero.
            if eval_interval and (e+1)%eval_interval == 0:
                value = self.evaluate(100)
                self.values_list.append(value)
                print(f'{e+1}: {value}')

    def _greedy_action(self, state):
        """Return the index of the highest-valued action for one observation."""
        # Cast explicitly: the network weights are float32 and an observation
        # of another dtype would otherwise fail inside the linear layers.
        batch = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            q_values = self.q.forward(batch, train=False)
        return torch.argmax(q_values[0, :]).item()

    def get_action(self, state, training=True):
        """Choose an action for `state`.

        Args:
            state: A single observation.
            training: When True act epsilon-greedily, otherwise act greedily.

        Returns:
            Integer action index.
        """
        # The random draw only happens in training mode, so evaluation does
        # not consume numbers from NumPy's global random stream.
        if training and np.random.random() < self.epsilon:
            return np.random.randint(0, high=self.N_ACTIONS)
        return self._greedy_action(state)

    def compute_performance_values(self):
        """Mean of the maximum Q-value over the stored performance states."""
        states = torch.as_tensor(self.performance_states, dtype=torch.float32)
        with torch.no_grad():
            values = self.q.forward(states)
        mean_value = torch.mean(torch.max(values, dim=1)[0]).item()
        return mean_value

    def collect_performance_states(self, episodes=100):
        """Record every state visited by a uniformly random policy.

        The visited states are stored as an array in
        ``self.performance_states`` and are used by
        `compute_performance_values`.
        """
        state_list = []
        for _ in range(episodes):
            s = self.env.reset()
            done = False
            while not done:
                state_list.append(s)
                a = np.random.randint(0, high=self.N_ACTIONS)
                s, _, done, _ = self.env.step(a)
        self.performance_states = np.array(state_list)

    def update(self):
        """Perform one gradient step on a batch sampled from replay memory.

        Nothing happens until the memory holds more than `batch_size`
        transitions (or has wrapped around).

        Returns:
            The batch loss as a float, or None when no step was taken.
        """
        ready = self.memory.replay_index > self.batch_size or self.memory.replay_full
        if not ready:
            return None

        data = self.memory.get_replay_sample(self.batch_size)
        batch = torch.as_tensor(data)

        # Row layout: [state | action | reward | next_state | done]
        d = self.STATE_DIM
        s = batch[:, 0:d].float()
        a = batch[:, d:d + 1].to(torch.int64)
        r = batch[:, d + 1:d + 2]
        s1 = batch[:, d + 2:2 * d + 2].float()
        non_term = 1 - batch[:, 2 * d + 2:2 * d + 3]

        # One-step target; terminal transitions do not bootstrap.
        with torch.no_grad():
            v1 = torch.max(self.q.forward(s1), 1, keepdim=True)[0]
        target = (r + (self.gamma * v1 * non_term)).float()

        # Q-value of the action that was actually taken.
        self.optimizer.zero_grad()
        pred = self.q.forward(s).gather(dim=1, index=a).float()

        loss = self.loss_fn(pred, target)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def evaluate(self, episodes):
        """Return the mean undiscounted score of the greedy policy.

        Args:
            episodes: Number of evaluation episodes to run.
        """
        score_arr = np.empty(episodes)
        for e in range(episodes):
            s = self.env.reset()
            score = 0
            done = False
            while not done:
                a = self.get_action(s, training=False)
                s, r, done, _ = self.env.step(a)
                score += r
            score_arr[e] = score
        return np.mean(score_arr)

In [3]:
class NeuralNetwork(nn.Module):
    """Fully connected network: (Linear -> ReLU -> BatchNorm1d) per hidden layer,
    followed by a final Linear output layer.

    The layers live in ``self.model`` (an ``nn.ModuleList``) in that exact
    order, so state-dict keys are ``model.<index>.<param>``.
    """

    def __init__(self, input_dim, output_dim, layer_dims):
        """
        Args:
            input_dim: Number of input features.
            output_dim: Number of outputs.
            layer_dims: Iterable with the width of each hidden layer.
        """
        super(NeuralNetwork, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.layer_dims = layer_dims

        self.create_model()

    def create_model(self):
        """Build ``self.model`` from the stored dimensions."""
        self.model = nn.ModuleList()
        input_dim = self.input_dim
        for layer_dim in self.layer_dims:
            self.model.append(nn.Linear(input_dim, layer_dim))
            self.model.append(nn.ReLU())
            self.model.append(nn.BatchNorm1d(layer_dim))
            input_dim = layer_dim
        self.model.append(nn.Linear(input_dim, self.output_dim))

    def forward(self, x, train=False):
        """Run `x` through the network.

        Args:
            x: A tensor, or anything `torch.tensor` accepts (NumPy array,
                nested list). Non-tensor input is converted to float32 so it
                matches the dtype of the layer weights.
            train: If truthy the layers are switched to training mode before
                the pass, otherwise to evaluation mode. The mode persists
                after the call.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        if not isinstance(x, torch.Tensor):
            # A float64 array would otherwise become a double tensor and
            # fail against the float32 weights.
            x = torch.tensor(x, dtype=torch.float32)
        self.model.train(bool(train))
        out = x
        for module in self.model:
            out = module(out)
        return out

In [4]:
class ReplayMemory():
    """Fixed-size ring buffer of transitions stored as rows of a 2-D array.

    Each row has 19 columns:
    ``[state (8) | action (1) | reward (1) | next_state (8) | done (1)]``.
    """

    # Width of one stored transition row.
    ROW_WIDTH = 19

    def __init__(self, size):
        """
        Args:
            size: Capacity in transitions; converted with ``int()``.
        """
        self.replay_size = int(size)
        self.initialize_replay_memory()

    def initialize_replay_memory(self):
        """Allocate the storage array and reset all bookkeeping."""
        self.memory = np.empty((self.replay_size, self.ROW_WIDTH))
        self.replay_index = 0       # next row to be written
        self.replay_full = False    # True once the buffer has wrapped around
        # Number of rows that hold real data. It must start at 0: the rows
        # come from np.empty, so counting one row too many would let the
        # sampler return uninitialised memory.
        self.replay_max = 0

    def convert_transition_to_array(self, episode):
        """Flatten a ``(state, action, reward, next_state, done)`` tuple into one row."""
        transition_array = np.empty(self.ROW_WIDTH)
        transition_array[0:8] = episode[0]
        transition_array[8] = episode[1]
        transition_array[9] = episode[2]
        transition_array[10:18] = episode[3]
        transition_array[18] = episode[4]
        return transition_array

    def add_transition(self, episode):
        """Store one transition, overwriting the oldest row once the buffer is full."""
        transition_array = self.convert_transition_to_array(episode)
        self.memory[self.replay_index, :] = transition_array
        self.replay_index += 1
        self.replay_max = min(self.replay_max + 1, self.replay_size)
        if self.replay_index == self.replay_size:
            self.replay_index = 0
            self.replay_full = True

    def get_replay_sample(self, samples, index=None):
        """Return `samples` stored rows as a ``(samples, 19)`` array.

        Args:
            samples: Number of rows to return.
            index: Optional explicit row indices (length `samples`). When
                omitted, rows are drawn uniformly with replacement from the
                rows written so far.

        Raises:
            ValueError: If `index` is omitted and nothing has been stored yet.
        """
        if index is None:
            if self.replay_max == 0:
                raise ValueError("cannot sample from an empty replay memory")
            index = np.random.randint(0, high=self.replay_max, size=samples)
        rows = np.asarray(index).reshape(samples)
        # Integer-array indexing returns a copy, so callers may modify it freely.
        return self.memory[rows]

In [None]:
# Training configuration: small learning rate, fast epsilon decay with a 0.3 floor.
agent = DQNAgent(
    epsilon=1,
    epsilon_decay=0.995,
    epsilon_min=0.3,
    gamma=0.99,
    alpha=1e-5,
    layers=[256, 256],
)

In [None]:
# 1000 training episodes, with a greedy evaluation every 50 episodes.
agent.train(episodes=1000, eval_interval=50)

In [None]:
# Persist only the network weights (state dict), not the whole agent.
torch.save(
    agent.q.state_dict(),
    'q_model',
)

In [5]:
# Rebuild the architecture used for training, then restore the saved weights.
model = NeuralNetwork(input_dim=8, output_dim=4, layer_dims=[256, 256])
model.load_state_dict(torch.load('q_model'))

<All keys matched successfully>

In [6]:
def evaluate(episodes, env, model):
    """Play `episodes` greedy episodes in `env` and return the mean total reward.

    Args:
        episodes: Number of episodes to run.
        env: Environment exposing ``reset()`` and a ``step(action)`` that
            returns ``(next_state, reward, done, info)``.
        model: Network handed to `get_action` to pick each action.

    Returns:
        Mean of the per-episode reward sums.
    """
    returns = np.empty(episodes)
    for idx in range(episodes):
        obs = env.reset()
        total = 0
        finished = False
        while not finished:
            obs, reward, finished, _ = env.step(get_action(obs, model))
            total += reward
        returns[idx] = total
    return np.mean(returns)

def get_action(state, model):
    """Return the greedy action index for a single observation.

    The observation gets a leading batch axis, is passed through
    ``model.forward`` with ``train=False`` under ``torch.no_grad()``, and the
    position of the largest output in the first row is returned.
    """
    batched = np.expand_dims(state, 0)
    with torch.no_grad():
        q_values = model.forward(batched, train=False)
        best = torch.argmax(q_values[0, :])
        return best.item()

In [7]:
def show_video(env, model, episodes, frame_delay=0.0001):
    """Render `episodes` greedy episodes of `model` acting in `env`.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``, ``render()``
            and ``close()``.
        model: Network handed to `get_action` to pick each action.
        episodes: Number of episodes to play.
        frame_delay: Seconds to sleep after each rendered frame.

    The environment is always closed on exit, including when stepping or
    rendering raises, so a failed run does not leave a window or recorder open.
    """
    try:
        for _ in range(episodes):
            s = env.reset()
            done = False
            while not done:
                a = get_action(s, model)
                s, _, done, _ = env.step(a)
                env.render()
                time.sleep(frame_delay)
    finally:
        env.close()

In [8]:
# Wrap the environment so episodes are recorded under ./video.
# Recording needs an ffmpeg or avconv executable on the PATH.
env = Monitor(
    gym.make("LunarLander-v2"),
    directory='./video',
    force=True,
)
show_video(env, model, episodes=10)



DependencyNotInstalled: Found neither the ffmpeg nor avconv executables. On OS X, you can install ffmpeg via `brew install ffmpeg`. On most Ubuntu variants, `sudo apt-get install ffmpeg` should do it. On Ubuntu 14.04, however, you'll need to install avconv with `sudo apt-get install libav-tools`.