In [1]:
import numpy as np
import gym
import tqdm
from collections import deque
import random
import matplotlib.pyplot as plt   

import torch 
from torch import nn, optim
import torch.nn.functional as F

In [2]:
debug = True

In [3]:
env = gym.make('CartPole-v0')

In [4]:
config = {
    'agent_params' : {
        'obs_space' : env.observation_space.shape[0],
        'action_space' : env.action_space.n,
        'hidden_dims' : [12, 12],
        'discount_factor' : 1.0,
        'min_exp' : 128,
        'max_exp' : 1024,
        'batch_size' : 128,
        'lr' : 0.02,
        'lr_decay' : 1.0,
        'lr_decay_step' : 1
    },
    'train_params' : {
        'episodes' : 10000,
        'epsilon_decay' : 0.999,
        'epsilon' : 1.0,
        'min_epsilon' : 0.01,
    }
}

In [5]:
class DeepModel(nn.Module):
    def __init__(self, obs_space, hidden_dims, action_space):
        super(DeepModel, self).__init__()
        self.input_layer = nn.Linear(obs_space, hidden_dims[0])
        hidden_layers = []
        for idx, h_dim in enumerate(hidden_dims[:-1]):
            hidden_layers.append(nn.Linear(h_dim, hidden_dims[idx + 1]))
            hidden_layers.append(nn.ReLU())
        self.hidden_layers = nn.Sequential(*hidden_layers)
        self.output_layer = nn.Linear(hidden_dims[-1], action_space)
    
    def forward(self, xb):
        h = self.input_layer(xb)
        h = self.hidden_layers(h)
        out = self.output_layer(h)
        return out

if debug:
    model = DeepModel(env.observation_space.shape[0], [32,32], env.action_space.n)
    preds = model(torch.atleast_2d(torch.tensor(env.observation_space.sample())))
    print(preds)


tensor([[-1.2698e+37,  9.1889e+36]], grad_fn=<AddmmBackward>)


In [6]:
class DQN:
    def __init__(self, obs_space, action_space, hidden_dims, discount_factor, min_exp, max_exp, batch_size, lr, lr_decay, lr_decay_step):
        self.action_space = action_space
        self.discount_factor = discount_factor
        self.min_exp = min_exp
        self.batch_size = batch_size
        self.lr = lr
        self.lr_decay = lr_decay
        self.lr_decay_step = lr_decay_step
        self.model = DeepModel(obs_space, hidden_dims, action_space)
        self.optimizer = optim.Adam(model.parameters(), lr)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, lr_decay_step, gamma=lr_decay)
        self.experience = {
            's' : deque(maxlen=max_exp),
            'a' : deque(maxlen=max_exp),
            'r' : deque(maxlen=max_exp),
            's2' : deque(maxlen=max_exp),
            'done' : deque(maxlen=max_exp)
        }
    
    def predict(self, inputs):
        with torch.no_grad():
            return self.model(torch.tensor(np.atleast_2d(inputs.astype('float32'))))

    def train(self):
        if len(self.experience['s']) < max(self.min_exp, self.batch_size):
            return

        ids = np.random.randint(0, len(self.experience['s']), self.batch_size)

        states = np.asarray([self.experience['s'][i] for i in ids])
        actions = np.asarray([self.experience['a'][i] for i in ids])
        rewards = np.asarray([self.experience['r'][i] for i in ids])
        states_next = np.asarray([self.experience['s2'][i] for i in ids])
        dones = np.asarray([self.experience['done'][i] for i in ids])

        actual_values = self.predict(states).detach().cpu().numpy()
        next_values = self.predict(states_next).detach().cpu().numpy()
        for idx, d in enumerate(dones):
            actual_values[idx][actions[idx]] = rewards[idx]
            if d:
                actual_values[idx][actions[idx]] += self.discount_factor * np.max(next_values[idx])
        
        model.train()
        rewards = self.model(torch.FloatTensor(states))
        loss = F.mse_loss(rewards, torch.FloatTensor(actual_values))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()            

    def get_action(self, state, epsilon):
        if np.random.normal() < epsilon:
            return np.random.randint(self.action_space)
        else:
            predictions = self.predict(state).detach().cpu().numpy()
            return np.argmax(predictions[0])
    
    def add_experience(self, exp):
        for key in self.experience.keys():
            self.experience[key].extend(exp[key])
    
if debug:
    agent = DQN(**config['agent_params'])

In [7]:
def play_game(env, agent : DQN, epsilon, render=False):
    states = []
    actions = []
    rewards = []
    dones = []
    states_next = []

    iter = 0
    done = False
    running_reward = 0
    observation = env.reset()

    while not done:
        # if render:
        #     env.render()
        iter += 1

        action = agent.get_action(observation, epsilon)
        prev_observation = observation
        observation, reward, done, _ = env.step(action)

        if iter == 195:
            done = True
            reward = 1
        elif done:
            reward = -1
        else:
            reward = 0
            
        running_reward += reward

        states.append(prev_observation)
        actions.append(action)
        states_next.append(observation)
        rewards.append(reward)
        dones.append(done)

    exp = {
        's' : states,
        'a' : actions,
        's2' : states_next,
        'r' : rewards,
        'done' : dones
    }

    agent.add_experience(exp)
        
    return running_reward

In [8]:
locals().update(config['train_params'])

In [9]:
env = gym.make('CartPole-v0')
agent = DQN(**config['agent_params'])

running_reward = np.empty(episodes)

In [10]:
pbar = tqdm.tqdm(range(episodes))
for n in pbar:
    if n > 500 and n%10 == 0:
        running_reward[n] = play_game(env, agent, epsilon, render=True)
    else:
        running_reward[n] = play_game(env, agent, epsilon, render=False)

    # if n > 10:
    #     break
    pbar.set_postfix({
        'episode reward' : running_reward[n],
        'avg (100 last) reward' : running_reward[max(0, n - 100) : n].mean(),
        'epsilon' : epsilon
    })

    agent.train()


    epsilon *= epsilon_decay
    epsilon = max(epsilon, min_epsilon)

  'avg (100 last) reward' : running_reward[max(0, n - 100) : n].mean(),
  ret = ret.dtype.type(ret / rcount)
 18%|█▊        | 1787/10000 [00:11<00:53, 154.41it/s, episode reward=-1, avg (100 last) reward=-1, epsilon=0.167]


KeyboardInterrupt: 