In [0]:
# Install the notebook's dependencies: the box2d-py bindings, the Xvfb virtual
# X server together with python-opengl, and gym plus pyvirtualdisplay.
# The last two commands discard all of their output via `> /dev/null 2>&1`.
!pip install box2d-py
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!pip install gym pyvirtualdisplay > /dev/null 2>&1

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K    100% |████████████████████████████████| 450kB 10.7MB/s 
[?25hInstalling collected packages: box2d-py
Successfully installed box2d-py-2.3.8


In [0]:
import gym
from gym import wrappers
import random
import torch
import numpy as np
from collections import deque, namedtuple
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
%matplotlib inline
from scipy.misc import imresize
def show_video(folder):
    """Embed the first ``.mp4`` file found in ``folder`` into the cell output.

    The video bytes are base64-encoded into an inline HTML ``<video>`` element
    and handed to IPython's display machinery. Nothing is displayed when the
    folder contains no ``.mp4`` file.

    Params
    ======
        folder (str): directory that is searched (non-recursively) for ``*.mp4``

    Returns
    =======
        None
    """
    mp4list = glob.glob('%s/*.mp4' % folder)
    if len(mp4list) > 0:
        # Open read-only and close the handle deterministically: the file is
        # only ever read, and a read-only recording must still be viewable.
        with io.open(mp4list[0], 'rb') as video_file:
            encoded = base64.b64encode(video_file.read())
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay loop controls style="height: 400px;"> 
        <source src="data:video/mp4;base64,{0}" type="video/mp4" /> </video>'''.format(encoded.decode('ascii'))))
        
# Start a non-visible 400x300 virtual display through pyvirtualdisplay --
# presumably so that the env.render() calls later in the notebook have an
# X display to draw to on a headless machine.
# Note that this binds the name `display`; IPython's display module was
# imported under the separate name `ipythondisplay`.
display = Display(visible=0, size=(400, 300))
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '400x300x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '400x300x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

### 2. Try it

The following code plays the game with randomly sampled actions (for up to 2000 steps) and then displays the recorded video.

In [0]:

# Environment id; train_dqn() further down reads this same global.
atari_game = "Pong-v0"
# Wrap the env in a Monitor that writes into the 'sample' folder, which is the
# folder show_video() is pointed at once the run is over.
env = gym.wrappers.Monitor(gym.make(atari_game), 'sample', force=True)
env.seed(0)
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)

state = env.reset()

# Play with uniformly sampled actions for at most 2000 steps, or until the
# environment reports `done`, accumulating the reward in `cr`.
cr = 0
for j in range(2000):
    action = env.action_space.sample()
    env.render()
    state, reward, done, _ = env.step(action)
    
    cr += reward
    # '\r' with end="" rewrites the same output line with the running total.
    print('\r %.5f' % cr, end="")
    if done:
        break 
# The env is closed before the video is read back from the 'sample' folder.
env.close()
show_video('sample')

State shape:  (210, 160, 3)
Number of actions:  6
 -20.00000

### 3. Define QNetwork, agent and replay buffer

In [0]:
# Hyperparameters. NOTE: the same names are assigned again, with different
# values, right before the agent is constructed in the training cell, so the
# values below are not the ones the trained agent ends up using.
BUFFER_SIZE = int(1e5)  # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # for soft update of target parameters
LR = 1e-3               # learning rate 
UPDATE_EVERY = 5        # how often to update the network

# Select the first CUDA device when available, otherwise the CPU. The Agent and
# ReplayBuffer code in this notebook has its `.to(device)` calls commented out,
# so within the visible code `device` is only printed.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print (device)
def prepro(img):
    """Turn a raw game frame into the (1, 80, 80) float32 array fed to the network.

    Rows 35..194 of the frame are kept, the last axis (axis 2) is averaged
    away, the result is resized to 80x80 with ``imresize`` and the values are
    divided by 255.

    NOTE(review): ``scipy.misc.imresize`` emits a deprecation warning in this
    notebook ("deprecated in SciPy 1.0.0, and will be removed in 1.2.0").
    Any replacement must be checked for identical value scaling before the
    checkpoint or results are compared -- TODO confirm.

    Params
    ======
        img (numpy.ndarray): frame indexed as (row, column, channel)

    Returns
    =======
        numpy.ndarray: float32 array of shape (1, 80, 80)
    """
    cropped_gray = img[35:195].mean(2)
    resized = imresize(cropped_gray, (80, 80))
    return resized.astype(np.float32).reshape(1, 80, 80) / 255.
# Shape check of the preprocessing output. It relies on `state` left over from
# the sample-run cell, and because it is not the last expression of this cell
# its value is not displayed.
prepro(state).shape

class QNetwork(nn.Module):
    """Convolutional Q-network: maps 80x80 frames to one Q-value per action."""

    # Height and width of the frames the network expects.
    _FRAME_SIZE = 80
    # Four stride-2 convolutions shrink 80 -> 40 -> 20 -> 10 -> 5 with 32 maps.
    _CONV_FEATURES = 32 * 5 * 5

    def __init__(self, state_size, action_size, seed, channels=1):
        """Initialize parameters and build model.
        Params
        ======
            state_size: description of the state dimensions; stored on the
                instance but not used to size any layer
            action_size (int): number of actions, i.e. width of the output layer
            seed (int): random seed passed to ``torch.manual_seed``
            channels (int): number of input channels of each frame
        """
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.state_size = state_size
        self.action_size = action_size
        self.channels = channels

        # Layers are created in a fixed order right after seeding, so a given
        # seed always produces the same initial weights.
        self.conv1 = nn.Conv2d(self.channels, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.gru = nn.GRUCell(self._CONV_FEATURES, 100)
        self.layer = nn.Linear(100, self.action_size)

    def forward(self, state):
        """Build a network that maps state -> action values.

        Params
        ======
            state (torch.Tensor): one frame or a batch of frames; it is viewed
                as (-1, channels, 80, 80)

        Returns
        =======
            torch.Tensor: Q-values of shape (batch, action_size)
        """
        # Use the configured channel count so that networks built with
        # channels != 1 receive correctly shaped input.
        state = state.view(-1, self.channels, self._FRAME_SIZE, self._FRAME_SIZE)
        x = F.elu(self.conv1(state))
        x = F.elu(self.conv2(x))
        x = F.elu(self.conv3(x))
        x = F.elu(self.conv4(x))
        # No hidden state is passed, so the GRU cell starts from its default
        # hidden state on every call; nothing is carried over between frames.
        x = self.gru(x.view(-1, self._CONV_FEATURES))
        return self.layer(x)
        
# nn1= QNetwork(4, 2, 1)
# state = torch.zeros((5,4))
# # state = torch.tensor([1,1,1,1])
# print (state.type())

cuda:0


`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
  # Remove the CWD from sys.path while we load stuff.


In [0]:
class Agent():
    """Interacts with and learns from the environment.

    Holds an online Q-network, a target Q-network (``Q_network_val``), a replay
    buffer and an Adam optimizer for the online network.
    """

    def __init__(self, state_size, action_size, seed, buffer_size, batch_size, learning_rate, update_every, gamma, tau):
        """Initialize an Agent object.

        Params
        ======
            state_size: dimension of each state (forwarded to QNetwork)
            action_size (int): number of actions the agent can choose from
            seed (int): random seed for ``random`` and for both networks
            buffer_size (int): maximum number of transitions kept for replay
            batch_size (int): number of transitions per learning step
            learning_rate (float): learning rate of the Adam optimizer
            update_every (int): learn once every this many calls to ``step``
            gamma (float): discount factor
            tau (float): interpolation factor of the target-network soft update
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        # The replay buffer receives this value as its own seed, so it has to
        # be the real seed rather than None.
        random.seed(seed)
        self.seed = seed
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau

        # Online and target Q-networks; both are built from the same seed.
        self.Q_network = QNetwork(self.state_size, self.action_size, seed)
        self.Q_network_val = QNetwork(self.state_size, self.action_size, seed)

        # Replay memory and optimizer (only the online network is optimized).
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed)
        self.optimizer = optim.Adam(self.Q_network.parameters(), lr=self.learning_rate)

        self.steps_until_update = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every ``update_every`` calls, learn from a batch.

        Learning only happens once the buffer holds more than ``batch_size``
        transitions.
        """
        self.memory.push(state, action, reward, next_state, done)
        self.steps_until_update = (self.steps_until_update + 1) % self.update_every

        if self.steps_until_update == 0 and len(self.memory) > self.batch_size:
            self.learn(self.memory.sample())

    def act(self, state, eps=0.1):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (torch.Tensor): current state, passed straight to the network
            eps (float): epsilon, for epsilon-greedy action selection

        Returns
        =======
            int: index of the chosen action
        """
        self.Q_network.eval()
        with torch.no_grad():
            action_values = self.Q_network(state)
        self.Q_network.train()

        if random.random() > eps:
            return int(np.argmax(action_values.cpu().data.numpy()))
        # Draw the exploratory action from the `random` module, the generator
        # that __init__ seeds, so that exploration is reproducible.
        return random.randrange(self.action_size)

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of
                (states, actions, rewards, next_states, dones) batches
        """
        states, actions, rewards, next_states, dones = experiences

        # TD target from the target network; (1 - dones) removes the bootstrap
        # term for transitions that ended an episode.
        self.Q_network_val.eval()
        with torch.no_grad():
            next_q = torch.max(self.Q_network_val(next_states), dim=1, keepdim=True)[0]
            target_rewards = rewards + self.gamma * next_q * (1 - dones)

        self.Q_network.train()
        expected_rewards = self.Q_network(states).gather(1, actions)
        loss = F.mse_loss(expected_rewards, target_rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self._soft_update()

    def _soft_update(self):
        """Move the target network towards the online one: t <- tau*o + (1-tau)*t."""
        with torch.no_grad():
            for target_param, online_param in zip(self.Q_network_val.parameters(), self.Q_network.parameters()):
                target_param.copy_(self.tau * online_param + (1.0 - self.tau) * target_param)
           

class ReplayBuffer:
    """Fixed-size buffer to store experience tuples.

    Transitions are kept in a list used as a ring: once ``buffer_size`` entries
    exist, each new transition overwrites the oldest one.
    """

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.
        Params
        ======
            action_size (int): dimension of each action (stored only)
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int or None): seed for the ``random`` module; None leaves the
                generator's current state untouched
        """
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        # random.seed() returns None, so keep the seed value itself. A None
        # seed is skipped: random.seed(None) would re-seed from system entropy
        # and discard any seeding the caller has already done.
        if seed is not None:
            random.seed(seed)
        self.seed = seed
        self.position = 0
        self.memory = []
        self.transition = namedtuple("Transition", field_names=["state", "action", "reward", "next_state", "done"])

    def push(self, state, action, reward, next_state, done):
        """Add a new experience to memory, overwriting the oldest when full."""
        transition = self.transition(state, action, reward, next_state, done)
        if len(self.memory) < self.buffer_size:
            # While filling up, `position` always equals len(memory).
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.buffer_size

    @staticmethod
    def _as_array(value):
        """Return ``value`` as a numpy array; tensors are detached and moved to CPU."""
        if torch.is_tensor(value):
            return value.detach().cpu().numpy()
        return np.asarray(value)

    def sample(self):
        """Randomly sample a batch of experiences from memory.

        Returns
        =======
            tuple of torch.Tensor: (states, actions, rewards, next_states, dones).
            States are stacked along their first axis; actions are int64 and
            rewards / dones are float32, each of shape (batch_size, 1).

        Raises
        ======
            ValueError: from ``random.sample`` when fewer than ``batch_size``
                transitions are stored
        """
        experiences = random.sample(self.memory, k=self.batch_size)
        batch = self.transition(*zip(*experiences))

        states = torch.from_numpy(np.vstack([self._as_array(s) for s in batch.state])).float()
        actions = torch.from_numpy(np.vstack(batch.action)).long()
        rewards = torch.from_numpy(np.vstack(batch.reward)).float()
        next_states = torch.from_numpy(np.vstack([self._as_array(s) for s in batch.next_state])).float()
        dones = torch.from_numpy(np.vstack(batch.done).astype(np.uint8)).float()
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

### 4. Train the Agent with DQN



In [0]:
def train_dqn(n_episodes=1000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Relies on the notebook globals ``agent``, ``atari_game``, ``prepro`` and
    ``show_video``. Every 100th episode (starting with episode 0) is recorded
    through a gym Monitor into ``output_<episode>`` and shown with
    ``show_video``. Training stops early, saving the online network to
    ``checkpoint.pth``, once the mean score over the last (up to) 100 episodes
    is >= 0.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon

    Returns
    =======
        list: total reward of each episode that was played
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon

    env = gym.wrappers.Monitor(gym.make(atari_game), 'output', force=True)

    render = True
    try:
        for i_episode in range(0, n_episodes):
            record = render and i_episode % 100 == 0
            if record:
                # Release the environment in use before replacing it with a
                # recording one, so no emulator/recorder handles pile up.
                env.close()
                env = gym.wrappers.Monitor(gym.make(atari_game), 'output_%d' % i_episode, force=True)
            state = env.reset()
            state = torch.tensor(prepro(state))
            score = 0
            for t in range(max_t):
                action = agent.act(state, eps)
                if record:
                    env.render()
                next_state, reward, done, _ = env.step(action)
                next_state = torch.tensor(prepro(next_state))
                agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)       # save most recent score
            scores.append(score)              # save most recent score
            eps = max(eps_end, eps_decay*eps) # decrease epsilon
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                if render:
                    # The recording env is closed before its video is read
                    # back; training then continues on a plain env.
                    env.close()
                    show_video('output_%d' % i_episode)
                    env = gym.make(atari_game)
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            if np.mean(scores_window)>=0.0: # You can change for different game
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
                torch.save(agent.Q_network.state_dict(), 'checkpoint.pth')
                break
    finally:
        # Always release the last environment, including on early exit or error.
        env.close()
    return scores
# Hyperparameters used to build the agent below. These rebind the names that
# were first assigned in the cell defining QNetwork, with different values.
BUFFER_SIZE = int(1e6)  # replay buffer size
BATCH_SIZE = 64         # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-2              # for soft update of target parameters
LR = 5e-4               # learning rate 
UPDATE_EVERY = 4       # how often to update the network
# `agent` is a global that train_dqn() reads.
# NOTE(review): the sample-run cell reports 6 actions for this environment,
# while the agent is built with action_size=4 and therefore only ever picks
# actions 0-3 -- confirm this restriction is intended.
agent = Agent(state_size=(1,80,80), action_size=4, seed=0, buffer_size= BUFFER_SIZE, batch_size= BATCH_SIZE, learning_rate= LR, update_every = UPDATE_EVERY, gamma= GAMMA, tau=TAU)

# Uses the default n_episodes / max_t / eps_decay; only the epsilon range is overridden.
scores = train_dqn(eps_start=0.5, eps_end=0.05)

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
  


Episode 0	Average Score: -20.00

Episode 0	Average Score: -20.00
Episode 99	Average Score: -9.92

`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
  


Episode 100	Average Score: -9.76

Episode 100	Average Score: -9.76
Episode 199	Average Score: -6.40

`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
  


Episode 200	Average Score: -6.47

Episode 200	Average Score: -6.47
Episode 299	Average Score: -4.95

`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
  


Episode 300	Average Score: -4.91

Episode 300	Average Score: -4.91
Episode 399	Average Score: -4.27

`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
  


Episode 400	Average Score: -4.23

Episode 400	Average Score: -4.23
Episode 434	Average Score: -4.49