In [None]:
!pip install box2d-py
!pip install gym[Box_2D]

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |████████████████████████████████| 450kB 4.9MB/s 
[?25hInstalling collected packages: box2d-py
Successfully installed box2d-py-2.3.8


In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
import gym
from collections import deque
import random
import numpy as np
from sklearn.utils import shuffle
import os

In [2]:
class DQN(nn.Module):
    def __init__(self, n_actions):
        super().__init__()
        self.fc1 = nn.Linear(8, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.output_layer = nn.Linear(256, n_actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.output_layer(x)
        return x

In [3]:
class Agent():
    def __init__(self):
        self.MODEL_PATH = 'deep_q_model.pth'
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay_rate = 1e-2
        self.min_epsilon = 0.01
        self.MEMORY_SIZE = 5000
        self.memory = {
            'current_state' : deque(maxlen=self.MEMORY_SIZE), 
            'next_state' : deque(maxlen=self.MEMORY_SIZE), 
            'action' : deque(maxlen=self.MEMORY_SIZE), 
            'reward' : deque(maxlen=self.MEMORY_SIZE),
            'done' : deque(maxlen=self.MEMORY_SIZE)
        }
        self.env = gym.make('LunarLander-v2')
        self.n_actions = self.env.action_space.n
        self.Q_network = DQN(n_actions=self.n_actions).cuda()
        if os.path.exists(self.MODEL_PATH):
            print('Existing model found!')
            self.Q_network.load_state_dict(torch.load(self.MODEL_PATH))
            self.Q_network.eval()
        else:
            print('No existing model.')
        
        self.NUM_EPISODES = 2000
        self.MAX_FRAMES = 10000
        self.batch_size = 64
        self.optimizer = optim.Adam(self.Q_network.parameters(), lr=1e-3)
        self.criterion = torch.nn.MSELoss()

    def remember(self, current_state, next_state, action, reward, done):
        self.memory['current_state'].append(current_state)
        self.memory['next_state'].append(next_state)
        self.memory['action'].append(action)
        self.memory['reward'].append(reward)
        self.memory['done'].append(done)

    def choose_action(self, curr_state):
        if self.epsilon > random.uniform(0, 1):
            return self.env.action_space.sample()
        else:
            return torch.argmax(self.Q_network(torch.tensor(curr_state).cuda())).item()

    def replay(self):
        bs = 0
        if len(self.memory['reward']) < self.batch_size:
            bs = len(self.memory['reward'])
        else:
            bs = self.batch_size
            
            
        a = [x for x in range(len(self.memory['reward']))]
        a = shuffle(a)
        idx = a[:bs]

        self.optimizer.zero_grad()
        done = torch.tensor(self.memory['done']).cuda()[idx]
        rewards = torch.tensor(self.memory['reward']).cuda()[idx]
        curr_states = torch.tensor(self.memory['current_state']).cuda()[idx]
        actions = np.array(self.memory['action'])[idx]
        next_states = torch.tensor(self.memory['next_state']).cuda()[idx]
        
        target = rewards + self.gamma * torch.max(self.Q_network(next_states), dim=1)[0] * (1 - done).to(dtype=torch.float)
        target_values = self.Q_network(curr_states.cuda())
        for i in range(0, len(actions)):
            target_values[i][actions[i]] = target[i]
            
            
        output = self.Q_network(curr_states.cuda())
        loss = self.criterion(output, target_values)
        loss.backward()
        self.optimizer.step()
        torch.cuda.empty_cache()
        if self.epsilon - self.epsilon_decay_rate > self.min_epsilon:
            self.epsilon -= self.epsilon_decay_rate
        else:
            self.epsilon = self.min_epsilon

    def play(self):
        for i_episode in range(1, self.NUM_EPISODES):
            episode_rewards = 0
            observation = self.env.reset()
            curr_state = observation
            done = False

            while not done:
                # self.env.render()
                action = self.choose_action(curr_state)
                observation, reward, done, info = self.env.step(action)
                episode_rewards += reward
                self.remember(curr_state, observation, action, reward, done)
                curr_state = observation
                self.replay()
                
                if done:
                    print(f'Episode # {i_episode}, reward: {episode_rewards: .2f}, epsilon: {self.epsilon: .2f}')
                    
                    break
            if i_episode % 200 == 0:
                torch.save(self.Q_network.state_dict(), self.MODEL_PATH)
        self.env.close()

In [4]:
agent = Agent()
agent.play()

No existing model.
Episode # 1, reward: -79.31, epsilon:  0.29
Episode # 2, reward: -522.39, epsilon:  0.01
Episode # 3, reward: -635.33, epsilon:  0.01
Episode # 4, reward: -309.68, epsilon:  0.01
Episode # 5, reward: -81.27, epsilon:  0.01
Episode # 6, reward: -13.20, epsilon:  0.01
Episode # 7, reward: -152.19, epsilon:  0.01
Episode # 8, reward: -134.08, epsilon:  0.01
Episode # 9, reward: -131.51, epsilon:  0.01
Episode # 10, reward: -18.27, epsilon:  0.01
Episode # 11, reward: -143.04, epsilon:  0.01
Episode # 12, reward: -322.22, epsilon:  0.01
Episode # 13, reward: -120.86, epsilon:  0.01
Episode # 14, reward: -124.87, epsilon:  0.01
Episode # 15, reward: -128.10, epsilon:  0.01
Episode # 16, reward: -95.13, epsilon:  0.01
Episode # 17, reward: -133.06, epsilon:  0.01
Episode # 18, reward: -101.50, epsilon:  0.01
Episode # 19, reward: -157.63, epsilon:  0.01
Episode # 20, reward: -103.45, epsilon:  0.01
Episode # 21, reward: -106.30, epsilon:  0.01
Episode # 22, reward: -25.73,

In [None]:
torch.save(agent.Q_network.state_dict(), agent.MODEL_PATH)