In [0]:
import torch
from collections import deque
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import cv2

In [0]:
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
  print('GPU not available. traininig on CPU!')
else:
  print('GPU Available, Training on GPU!')

GPU Available, Training on GPU!


In [0]:
!pip3 install gym-retro

import retro




In [0]:
! wget http://www.atarimania.com/roms/Roms.rar && unrar x Roms.rar && unzip Roms/ROMS.zip
! pip3 install gym-retro
! python3 -m retro.import ROMS/

--2019-04-17 06:06:02--  http://www.atarimania.com/roms/Roms.rar
Resolving www.atarimania.com (www.atarimania.com)... 195.154.81.199
Connecting to www.atarimania.com (www.atarimania.com)|195.154.81.199|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10823448 (10M) [application/x-rar-compressed]
Saving to: ‘Roms.rar’


2019-04-17 06:06:26 (446 KB/s) - ‘Roms.rar’ saved [10823448/10823448]


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from Roms.rar

Creating    Roms                                                      OK
Extracting  Roms/HC ROMS.zip                                              37%  OK 
Extracting  Roms/ROMS.zip                                                 76% 99%  OK 
All OK
Archive:  Roms/ROMS.zip
   creating: ROMS/
  inflating: ROMS/128 in 1 Game Select ROM (128 in 1) (Unknown) ~.bin  
  inflating: ROMS/2 Pak Special - Cavern Blaster, City War (1992) (HES) (773-867) (PAL).bin  
  inflat

In [0]:
env = retro.make(game='SpaceInvaders-Atari2600')


In [0]:
print('The size of our frame is- ', env.observation_space)
print('Total possible actions are -',env.action_space.n)


The size of our frame is-  Box(210, 160, 3)
Total possible actions are - 8


In [0]:
possible_actions = np.identity(env.action_space.n,dtype = int)

In [0]:
""" Preprocessing Steps
1. Grayscale it
2. Crop it
3. Normalize it
4. Reduce the size

"""

def preprocess_frame(frame):
  gray = cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY)
  cropped_frame = gray[8:-12,4:-12]
  normalized_frame = cropped_frame/225.0
  resized = cv2.resize(normalized_frame,(110,84))
  
  return resized



In [0]:
stack_size = 4

stacked_frames = deque([np.zeros((110,84),dtype = np.int) for i in range(stack_size)],maxlen = 4)


## Stack the frames together and return the stacked frames as np array and as a deque

def stack_frame(stacked_frames, state, is_new_episode):
  state = preprocess_frame(state)
  
  if is_new_episode:
    #Clear stacked frames:    
    stacked_frames = deque([np.zeros((110,84),dtype = np.int) for i in range(stack_size)],maxlen = 4)
    
    stacked_frames.append(state)
    stacked_frames.append(state)
    stacked_frames.append(state)
    stacked_frames.append(state)
    
  else:
    stacked_frames.append(state)
      
  stacked_state = np.stack(stacked_frames, axis = 2)
  #Returns numpy array of 4 states -> stacked_state and a deque of 4 states -> stacked_frames
  return stacked_state, stacked_frames


In [0]:
## Hyperparameters
state_size = [110,84,4]
action_size = env.action_space.n
learning_rate = 0.00025

total_episodes = 50
max_steps  = 50000
batch_size = 64

explore_start = 1.0
explore_end = 0.01
decay_rate = 0.00001

gamma = 0.9

pretrain_len = batch_size
memory_size = 1000000

training = True

episode_render = False


In [0]:
class DQNetwork(nn.Module):
  
  def __init__(self):
    super(DQNetwork, self).__init__()

    self.conv1 = nn.Conv2d(4,16,4,padding = 1)
    # 16*55*42
    
    self.conv2 = nn.Conv2d(16,32,3,padding = 1)
    # 32*27*21
    
    self.conv3 = nn.Conv2d(32,64,3, padding = 1)
    # 64*13*10
    
    self.maxpool = nn.MaxPool2d(2,2)
    
    self.bn1 = nn.BatchNorm2d(16)
    
    self.bn2 = nn.BatchNorm2d(32)
    
    self.bn3 = nn.BatchNorm2d(64)
    
    self.bn4 = nn.BatchNorm1d(256)
    
    self.bn5 = nn.BatchNorm1d(64)

    
    self.fc1 = nn.Linear(64*13*10,256)
    
    self.fc2 = nn.Linear(256,64)
    
    self.fc3 = nn.Linear(64,action_size)
    
    self.dropout = nn.Dropout(0.5)
    
  def forward(self,x):
    x = self.maxpool(F.relu(self.bn1(self.conv1(x))))
    x = self.maxpool(F.relu(self.bn2(self.conv2(x))))
    x = self.maxpool(F.relu(self.bn3(self.conv3(x))))
    x = x.view(-1,64*13*10)
    x = F.relu(self.fc1(x))
    x = self.dropout(x)
    x = F.relu(self.fc2(x))
    x = self.dropout(x)
    x = self.fc3(x)
    
    return x
  
  
model = DQNetwork()
if(train_on_gpu):
  model.cuda()

    
    
    
    
    
    

In [0]:
## Memory class which stores the experiences in a deque
class Memory():
  def __init__(self,max_len):
    
    self.max_len = max_len
    self.buffer = deque(maxlen = max_len)
    
  def add(self,experience):
    self.buffer.append(experience)
    
  def sample(self,batch_size):
    buffer_size = len(self.buffer)
    index = np.random.choice(buffer_size,size = batch_size,replace = False)
    return [self.buffer[i] for i in index]
  
    
    

In [0]:
memory = Memory(max_len = memory_size)

## Adding some random experiences 
for i in range(pretrain_len):
  ## If nothing is in the stack
  if i == 0:
    state = env.reset()
    state,stacked_frames = stack_frame(stacked_frames,state,is_new_episode = True)
  choice = random.randint(1,action_size)-1
  action = possible_actions[choice]
  next_state, reward, done, _ = env.step(action)
  
    ## Add the new state in the stack
  next_state,stacked_frames = stack_frame(stacked_frames, next_state, is_new_episode = False)
    
    ## If this was the last state in that episode
    ## Then we need to clear the stack
    ## Next state would be 0s
  if done:
    next_state = np.zeros(next_state.shape)
    memory.add((state,action,reward,next_state,done))
      
    state = env.reset()
    state, stacked_frames = stack_frame(stacked_frames, state, True)
  ## Otherwise add the current experience to the memory
  else:
    memory.add((state, action, reward, next_state, done))
    state = next_state

    

In [0]:
import torch.optim as optim

# specify loss function
criterion = nn.MSELoss()

# specify optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [0]:
def predict_action(explore_start, explore_end, decay_rate, decay_step, state, actions):
    ## EPSILON GREEDY STRATEGY
    # Choose action a from state s using epsilon greedy.
    ## First we randomize a number
    exp_exp_tradeoff = np.random.rand()
    state = np.expand_dims(state,0)
    state = np.transpose(state,(0,3,2,1))
    
    # Get action from Q-network (exploitation)
        # Estimate the Qs values state
    
 

    # Here we'll use an improved version of our epsilon greedy strategy used in Q-learning notebook
    explore_probability = explore_end + (explore_start - explore_end) * np.exp(-decay_rate * decay_step)
    if (explore_probability > exp_exp_tradeoff):
        # Make a random action (exploration)
        choice = random.randint(1,len(possible_actions))-1
        action = possible_actions[choice]
        
    else:
      state = torch.from_numpy(state)
      if train_on_gpu:
        state = state.cuda()
      Qs = model(state)

          # Take the biggest Q value (= the best action)
      _,choice = torch.max(Qs,1)

      choice = choice.cpu().item()

      action = possible_actions[choice]
      
    return action, choice

                
                


In [0]:
#Convert Model Parameters to Double
model = model.double()
explore_start = 0.8
model.load_state_dict(torch.load('atari_space.pt'))

if training == True:

    # Initialize the decay rate (that will use to reduce epsilon) 
        decay_step = 0
        rewards_list = []
        average_Q = []
        for episode in range(total_episodes):
            # Set step to 0
            step = 0
            
            Q_value = 0
            
            # Initialize the rewards of the episode
            episode_rewards = []
            
            # Make a new episode and observe the first state
            state = env.reset()
            
            # Remember that stack frame function also call our preprocess function.
            state, stacked_frames = stack_frame(stacked_frames, state, True)
            
            while step < max_steps:
                step += 1
                
                #Increase decay_step
                decay_step +=1
                
                # Predict the action to take and take it
                action, choice = predict_action(explore_start, explore_end, decay_rate, decay_step, state, possible_actions)
                
                #Perform the action and get the next_state, reward, and done information
                next_state, reward, done, _ = env.step(action)
                
                # Add the reward to total reward
                episode_rewards.append(reward)
                
                # If the game is finished
                if done:
                    # The episode ends so no next state
                    average_Q.append(Q_value/step)
                    next_state = np.zeros((210, 160, 3), dtype=np.uint8)
                    next_state, stacked_frames = stack_frame(stacked_frames, next_state, False)


                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)
                    
                    

                    print('Episode: {}'.format(episode),
                                'Training Loss {:.4f}'.format(loss))
                    print("Total Reward:", total_reward)
                    print("Steps:",step)
                    
                    # Set step = max_steps to end the episode
                    step = max_steps


                    rewards_list.append((episode, total_reward))

                    # Store transition <st,at,rt+1,st+1> in memory D
                    memory.add((state, action, reward, next_state, done))

                else:
                    # Stack the frame of the next_state
                    next_state, stacked_frames = stack_frame(stacked_frames, next_state, False)
                
                    # Add experience to memory
                    memory.add((state, action, reward, next_state, done))

                    # st+1 is now our current state
                    state = next_state
                    

                ### LEARNING PART            
                # Obtain random mini-batch from memory
                batch = memory.sample(batch_size)
                
                
                # Unpack the batch into respective arrays
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch]) 
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                target_Qs_batch = []
                
                # Transpose the axis to match the pytorch input standard (C*H*W)
                states_mb = torch.from_numpy(np.transpose(states_mb,(0,3,2,1)))
                next_states_mb = torch.from_numpy(np.transpose(next_states_mb,(0,3,2,1)))

                # Get Q values for next_state 
                if train_on_gpu:
                  next_states_mb = next_states_mb.cuda()
                  states_mb = states_mb.cuda()
                  
                # Get Q values for the next state
                Qs_next_state = model(next_states_mb)
                
 
               
                # Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]

                    # If we are in a terminal state, only equals reward
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                    # If not the terminal state, target = reward for that state + gamma*max Q value of next state
                    else:
                        target = rewards_mb[i] + gamma * torch.max(Qs_next_state,1)[0][i]
                        print("target",target)
                        print("reward",reward)
                        target_Qs_batch.append(target.cpu().detach().numpy())
                
                #Convert List to np array
                targets_mb = np.array([each for each in target_Qs_batch])
                optimizer.zero_grad()
                
                max_Q_predicted = torch.sum(model(states_mb)*torch.from_numpy(actions_mb).cuda().double(),dim = 1)
                Q_value += torch.sum(max_Q_predicted,dim = 0)
                loss = criterion(max_Q_predicted, torch.from_numpy(targets_mb).cuda())
                
                if( step%100 == 0):
                  print("Loss: ",loss)
                  print("Predicted Qs:",max_Q_predicted[0].cpu().item())
                  print("Target Qs: ",targets_mb[0])
                  print("Reward Till Now:",np.sum(episode_rewards))
                  print("Step: ",step)
                  print("Q_value:" , Q_value)
                  print("\n")

                loss.backward()
                # perform a single optimization step (parameter update)
                optimizer.step()
               

            # Save model every 5 episodes
            if episode % 1 == 0:
                torch.save(model.state_dict(), 'atari_space.pt')
                print("Model Saved")

target tensor(1.9633, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
reward 0.0
target tensor(1.8955, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
reward 0.0
target tensor(1.8890, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
reward 0.0
target tensor(1.7761, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
reward 0.0
target tensor(2.0638, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
reward 0.0
target tensor(2.0662, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
reward 0.0
target tensor(1.9406, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
reward 0.0
target tensor(69.4932, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
reward 0.0
target tensor(1.7401, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
reward 0.0
target tensor(1.8455, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
reward 0.0
target tensor(2.1899, device='cuda:0', dtype=torc

KeyboardInterrupt: ignored

In [0]:
model.load_state_dict(torch.load('atari_space.pt'))
total_reward = 0
i = 0
while(i in range(10)):
  # Set step to 0
  step = 0

  # Make a new episode and observe the first state
  state = env.reset()

  episode_rewards = []

  # Remember that stack frame function also call our preprocess function.
  state, stacked_frames = stack_frame(stacked_frames, state, True)

  while step < max_steps:
      step += 1

      #Perform the action and get the next_state, reward, and done information

      action, choice = predict_action(-1, explore_end, decay_rate, decay_step, state, possible_actions)

      next_state, reward, done, _ = env.step(action)

      # Add the reward to total reward
      episode_rewards.append(reward)

      # If the game is finished
      if done:

          total_reward += np.sum(episode_rewards)

          print("Total Reward:", np.sum(episode_rewards))
          print("Steps:",step)
          step = max_steps


      else:
          # Stack the frame of the next_state
          next_state, stacked_frames = stack_frame(stacked_frames, next_state, False)

          # Add experience to memory
          #memory.add((state, action, reward, next_state, done))

          # st+1 is now our current state
          state = next_state
  i+=1

print("Average Reward for Learned Model for 10 Episodes:",total_reward/10)

Total Reward: 295.0
Steps: 3786
Total Reward: 405.0
Steps: 3812
Total Reward: 260.0
Steps: 2636
Total Reward: 245.0
Steps: 2768
Total Reward: 80.0
Steps: 1506
Total Reward: 135.0
Steps: 2212
Total Reward: 325.0
Steps: 2888
Total Reward: 80.0
Steps: 2056
Total Reward: 60.0
Steps: 2832
Total Reward: 520.0
Steps: 3086
Average Reward for Learned Model for 10 Episodes: 240.5


In [0]:
## RANDOM ACTIONS
i = 0
total_reward = 0

while i in range(10):
  
  

  # Set step to 0
  step = 0

  # Make a new episode and observe the first state
  state = env.reset()

  episode_rewards = []

  # Remember that stack frame function also call our preprocess function.
  state, stacked_frames = stack_frame(stacked_frames, state, True)

  while step < max_steps:
      step += 1

      #Perform the action and get the next_state, reward, and done information

      action, choice = predict_action(explore_start, explore_start, decay_rate, decay_step, state, possible_actions)

      next_state, reward, done, _ = env.step(action)

      # Add the reward to total reward
      episode_rewards.append(reward)

      # If the game is finished
      if done:

          total_reward += np.sum(episode_rewards)

          print("Total Reward:", np.sum(episode_rewards))
          print("Steps:",step)
          step = max_steps

      else:
          # Stack the frame of the next_state
          next_state, stacked_frames = stack_frame(stacked_frames, next_state, False)

          # Add experience to memory
          #memory.add((state, action, reward, next_state, done))

          # st+1 is now our current state
          state = next_state
  i= i +1
print("\nAverage Reward for Random 10 Episodes:", total_reward/10)



Total Reward: 105.0
Steps: 2040
Total Reward: 105.0
Steps: 2082
Total Reward: 210.0
Steps: 2394
Total Reward: 410.0
Steps: 2712
Total Reward: 260.0
Steps: 2850
Total Reward: 80.0
Steps: 1758
Total Reward: 515.0
Steps: 2978
Total Reward: 410.0
Steps: 2442
Total Reward: 440.0
Steps: 2874
Total Reward: 215.0
Steps: 2700

Average Reward for Random 10 Episodes: 275.0
