In [1]:
import os
import pygame
DISPLAY = True
if not DISPLAY:
    os.environ["SDL_VIDEODRIVER"] = "dummy"

pygame 1.9.5
Hello from the pygame community. https://www.pygame.org/contribute.html


## Double Dueling Deep Q Network Learning with Priortized Experienced Reply

In [2]:
import os,sys
sys.path.append('game/')
import flappy_wrapped as game
import cv2
import numpy as np
import time
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
KERNEL = np.array([[-1,-1,-1], [-1, 9,-1],[-1,-1,-1]])
def processFrame(frame):
    frame = frame[55:288,0:400] #crop image
    frame = cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY) #convert image to black and white
    frame = cv2.resize(frame,(84,84),interpolation=cv2.INTER_AREA)
    _ , frame = cv2.threshold(frame,50,255,cv2.THRESH_BINARY)
    #frame = cv2.blur(frame,(5,5))
    frame = cv2.filter2D(frame,-1,KERNEL)
    #frame = cv2.Canny(frame,100,200)
    frame = frame.astype(np.float64)/255.0
    return frame

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

#Dueling DQN
class DDQN(nn.Module):
    def __init__(self,input_shape,nactions):
        super(DDQN,self).__init__()
        self.nactions = nactions
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0],32,kernel_size=4,stride=2),
            nn.ReLU(),
            nn.Conv2d(32,64,kernel_size=3,stride=2),
            nn.ReLU(),
            nn.Conv2d(64,64,kernel_size=2,stride=1),
            nn.ReLU()
        )
        
        conv_out_size = self._get_conv_out(input_shape)
        
        self.fca = nn.Sequential(
            nn.Linear( conv_out_size, 512),
            nn.ReLU(),
            nn.Linear( 512, nactions )
        )
        
        self.fcv = nn.Sequential(
            nn.Linear(conv_out_size,512),
            nn.ReLU(),
            nn.Linear(512,1)
        )
        
    def _get_conv_out(self,shape):
        o = self.conv( torch.zeros(1,*shape) )
        return int(np.prod(o.size()))
    
    def forward(self,x):
        conv_out = self.conv(x).view(x.size()[0], -1)
        action_v = self.fca(conv_out)
        value_v = self.fcv(conv_out).expand(x.size(0), self.nactions)
        return value_v + action_v - action_v.mean(1).unsqueeze(1).expand(x.size(0), self.nactions)

In [5]:
ACTIONS = [0,1]
EXPERIENCE_BUFFER_SIZE = 2000
STATE_DIM = 4
GAMMA = 0.99
EPSILON_START = 1
EPSILON_FINAL = 0.001
EPSILON_DECAY_FRAMES = (10**4)/3
MEAN_GOAL_REWARD = 10
BATCH_SIZE = 32
MIN_EXP_BUFFER_SIZE = 500
SYNC_TARGET_FRAMES = 30
LEARNING_RATE = 1e-4
SKIP_FRAME = 2
INITIAL_SKIP = [0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1]

In [6]:
import collections
class ExperienceBuffer():
    def __init__(self,capacity):
        self.buffer = collections.deque(maxlen=capacity)
        self.priority = collections.deque(maxlen=capacity)
    
    def clear(self):
        self.buffer.clear()
        self.priority.clear()
        
    def __len__(self):
        return len(self.buffer)
    
    def append(self,exp,p):
        self.buffer.append(exp)
        self.priority.append(p)
        
    def sample(self,batch_size):
        probs = np.array(self.priority)/sum(np.array(self.priority))
        indices = np.random.choice( range(len(self.buffer)), batch_size, p = probs)
        states,actions,rewards,dones,next_states = zip(*[ self.buffer[idx] for idx in indices ])
        return np.array(states), np.array(actions), np.array(rewards, dtype=np.float32),\
    np.array(dones,dtype=np.uint8), np.array(next_states)

In [7]:
class Agent():
    def __init__(self,env,buffer,state_buffer_size = STATE_DIM):
        self.env = env
        self.exp_buffer = buffer
        self.state = collections.deque(maxlen = STATE_DIM)
        self.next_state= collections.deque(maxlen = STATE_DIM)
        self._reset()
        
    def _reset(self):
        self.total_rewards = 0
        self.state.clear()
        self.next_state.clear()
        
        for i in INITIAL_SKIP[:-7]:
            frame,reward,done = self.env.frame_step(i)
            self.total_rewards+=reward
            if done:
                self._reset()
        frame = processFrame(frame)
        self.state.append(frame)
        self.next_state.append(frame)

        for i in INITIAL_SKIP[-7:-5]:
            frame,reward,done = self.env.frame_step(i)
            self.total_rewards+=reward
            if done:
                self._reset()
        frame = processFrame(frame)
        self.state.append(frame)
        self.next_state.append(frame)
        
        for i in INITIAL_SKIP[-5:-3]:
            frame,reward,done = self.env.frame_step(i)
            self.total_rewards+=reward
            if done:
                self._reset()
        frame = processFrame(frame)
        self.state.append(frame)
        self.next_state.append(frame)
        
        for i in INITIAL_SKIP[-3:-1]:
            frame,reward,done = self.env.frame_step(i)
            self.total_rewards+=reward
            if done:
                self._reset()
        frame = processFrame(frame)
        self.state.append(frame)
        self.next_state.append(frame)
    
    def step(self,net,tgt_net,epsilon=0.9,device='cpu'):
        self.total_rewards = 0
        if np.random.random() < epsilon:
            action = np.random.choice(ACTIONS)
        else:
            state_v = torch.tensor(np.array([self.state],copy=False),dtype=torch.float32).to(device)
            action = int(torch.argmax(net(state_v)))
       
        frame,reward,done = self.env.frame_step(action)
        self.total_rewards += reward
        for _ in range(SKIP_FRAME):
                frame,reward,done =  self.env.frame_step(action)
                self.total_rewards += reward
                if done:
                    break
                    
        frame = processFrame(frame)
        self.next_state.append(frame)
        
        if len(self.next_state)==STATE_DIM and len(self.state)==STATE_DIM:
            #PER - Prioritized Experience Replay
            o = net( torch.tensor( np.array([self.state]),dtype=torch.float32).to(device)).to('cpu').detach().numpy()[0][action]
            e = float(torch.max(tgt_net( torch.tensor( np.array([self.next_state]),dtype=torch.float32).to(device))))
            p = abs(o-e)+0.0001
            self.exp_buffer.append((self.state.copy(),action,int(self.total_rewards),done,self.next_state.copy()),p)
        
        self.state.append(frame)
        
        end_reward = int(self.total_rewards)
        if done:
            self._reset()
        
        return end_reward

In [None]:
def calc_loss(batch,net,tgt_net,device='cpu'):
    states,actions,rewards,dones,next_states = batch
    
    states_v = torch.tensor(states,dtype=torch.float32).to(device)
    actions_v = torch.tensor(actions,dtype=torch.long).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    dones_v = torch.ByteTensor(dones).to(device)
    next_states_v = torch.tensor(next_states,dtype=torch.float32).to(device)
    
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_action_values = tgt_net(next_states_v).max(1)[0]
    next_state_action_values[dones_v] = 0.0
    next_state_action_values = next_state_action_values.detach() 
    
    expected_values = rewards_v +  next_state_action_values * GAMMA
    return nn.MSELoss()(state_action_values,expected_values)

In [None]:
all_losses = []
device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" )

#Double Dueling DQN
net = DDQN( (STATE_DIM,84,84), len(ACTIONS) ).to(device)
tgt_net = DDQN( (STATE_DIM,84,84), len(ACTIONS) ).to(device)

env = game.GameState()
buffer = ExperienceBuffer(EXPERIENCE_BUFFER_SIZE)
agent = Agent(env,buffer)
epsilon = EPSILON_START
optimizer = optim.Adam(net.parameters(),lr=LEARNING_RATE)

total_rewards = []
best_mean_reward = float('-inf')
last_mean = float('-inf')
game_id = 0
while True:
    epsilon = max( EPSILON_FINAL , EPSILON_START - game_id/EPSILON_DECAY_FRAMES )
    
    reward = agent.step(net,tgt_net,epsilon,device=device)
    if reward != 0:
        game_id += 1
        total_rewards.append(reward)
        mean_reward = np.mean(total_rewards[-100:])
        if game_id%5 == 0:
            print("GAME : {} | EPSILON : {:.4f} | MEAN REWARD : {}".format( game_id, epsilon, mean_reward ))
        if best_mean_reward < mean_reward:
            best_mean_reward = mean_reward
            
            if best_mean_reward - last_mean >= 0.1:
                torch.save(net.state_dict(),'checkpoints/flappy_best_model.dat')
                print("REWARD {} -> {}. Model Saved".format(last_mean,mean_reward))
                last_mean = best_mean_reward

        if game_id % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())
            
        if mean_reward >= MEAN_GOAL_REWARD:
            print("Learned in {} Games.".format(game_id))
            break
    
    if len(buffer) < MIN_EXP_BUFFER_SIZE:
        continue
    
    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch,net,tgt_net,device=device)
    all_losses.append(float(loss_t))
    loss_t.backward()
    optimizer.step()

REWARD -inf -> -1.0. Model Saved
GAME : 5 | EPSILON : 0.9988 | MEAN REWARD : -1.0
REWARD -1.0 -> -0.6666666666666666. Model Saved
GAME : 10 | EPSILON : 0.9973 | MEAN REWARD : -0.8
GAME : 15 | EPSILON : 0.9958 | MEAN REWARD : -0.6
GAME : 20 | EPSILON : 0.9943 | MEAN REWARD : -0.6
REWARD -0.6666666666666666 -> -0.5454545454545454. Model Saved
GAME : 25 | EPSILON : 0.9928 | MEAN REWARD : -0.6
GAME : 30 | EPSILON : 0.9913 | MEAN REWARD : -0.6666666666666666
GAME : 35 | EPSILON : 0.9898 | MEAN REWARD : -0.6571428571428571
GAME : 40 | EPSILON : 0.9883 | MEAN REWARD : -0.7
GAME : 45 | EPSILON : 0.9868 | MEAN REWARD : -0.7333333333333333
GAME : 50 | EPSILON : 0.9853 | MEAN REWARD : -0.76
GAME : 55 | EPSILON : 0.9838 | MEAN REWARD : -0.7454545454545455
GAME : 60 | EPSILON : 0.9823 | MEAN REWARD : -0.7666666666666667
GAME : 65 | EPSILON : 0.9808 | MEAN REWARD : -0.7846153846153846
GAME : 70 | EPSILON : 0.9793 | MEAN REWARD : -0.8
GAME : 75 | EPSILON : 0.9778 | MEAN REWARD : -0.7866666666666666
G

GAME : 775 | EPSILON : 0.7678 | MEAN REWARD : -0.74
GAME : 780 | EPSILON : 0.7663 | MEAN REWARD : -0.78
GAME : 785 | EPSILON : 0.7648 | MEAN REWARD : -0.76
GAME : 790 | EPSILON : 0.7633 | MEAN REWARD : -0.78
GAME : 795 | EPSILON : 0.7618 | MEAN REWARD : -0.8
GAME : 800 | EPSILON : 0.7603 | MEAN REWARD : -0.84
GAME : 805 | EPSILON : 0.7588 | MEAN REWARD : -0.82
GAME : 810 | EPSILON : 0.7573 | MEAN REWARD : -0.8
GAME : 815 | EPSILON : 0.7558 | MEAN REWARD : -0.8
GAME : 820 | EPSILON : 0.7543 | MEAN REWARD : -0.82
GAME : 825 | EPSILON : 0.7528 | MEAN REWARD : -0.8
GAME : 830 | EPSILON : 0.7513 | MEAN REWARD : -0.78
GAME : 835 | EPSILON : 0.7498 | MEAN REWARD : -0.76
GAME : 840 | EPSILON : 0.7483 | MEAN REWARD : -0.78
GAME : 845 | EPSILON : 0.7468 | MEAN REWARD : -0.78
GAME : 850 | EPSILON : 0.7453 | MEAN REWARD : -0.78
GAME : 855 | EPSILON : 0.7438 | MEAN REWARD : -0.82
GAME : 860 | EPSILON : 0.7423 | MEAN REWARD : -0.82
GAME : 865 | EPSILON : 0.7408 | MEAN REWARD : -0.8
GAME : 870 | EPSI

GAME : 1555 | EPSILON : 0.5338 | MEAN REWARD : -0.38
GAME : 1560 | EPSILON : 0.5323 | MEAN REWARD : -0.42
GAME : 1565 | EPSILON : 0.5308 | MEAN REWARD : -0.4
GAME : 1570 | EPSILON : 0.5293 | MEAN REWARD : -0.36
GAME : 1575 | EPSILON : 0.5278 | MEAN REWARD : -0.34
REWARD -0.44 -> -0.32. Model Saved
GAME : 1580 | EPSILON : 0.5263 | MEAN REWARD : -0.3
GAME : 1585 | EPSILON : 0.5248 | MEAN REWARD : -0.26
GAME : 1590 | EPSILON : 0.5233 | MEAN REWARD : -0.3
GAME : 1595 | EPSILON : 0.5218 | MEAN REWARD : -0.28
GAME : 1600 | EPSILON : 0.5203 | MEAN REWARD : -0.28
GAME : 1605 | EPSILON : 0.5188 | MEAN REWARD : -0.3
GAME : 1610 | EPSILON : 0.5173 | MEAN REWARD : -0.26
GAME : 1615 | EPSILON : 0.5158 | MEAN REWARD : -0.28
GAME : 1620 | EPSILON : 0.5143 | MEAN REWARD : -0.24
REWARD -0.32 -> -0.22. Model Saved
GAME : 1625 | EPSILON : 0.5128 | MEAN REWARD : -0.24
GAME : 1630 | EPSILON : 0.5113 | MEAN REWARD : -0.26
GAME : 1635 | EPSILON : 0.5098 | MEAN REWARD : -0.3
GAME : 1640 | EPSILON : 0.5083 | M

GAME : 2320 | EPSILON : 0.3043 | MEAN REWARD : 0.08
GAME : 2325 | EPSILON : 0.3028 | MEAN REWARD : 0.1
GAME : 2330 | EPSILON : 0.3013 | MEAN REWARD : 0.12
GAME : 2335 | EPSILON : 0.2998 | MEAN REWARD : 0.06
GAME : 2340 | EPSILON : 0.2983 | MEAN REWARD : 0.02
GAME : 2345 | EPSILON : 0.2968 | MEAN REWARD : 0.02
GAME : 2350 | EPSILON : 0.2953 | MEAN REWARD : 0.0
GAME : 2355 | EPSILON : 0.2938 | MEAN REWARD : -0.06
GAME : 2360 | EPSILON : 0.2923 | MEAN REWARD : -0.02
GAME : 2365 | EPSILON : 0.2908 | MEAN REWARD : -0.06
GAME : 2370 | EPSILON : 0.2893 | MEAN REWARD : -0.04
GAME : 2375 | EPSILON : 0.2878 | MEAN REWARD : -0.06
GAME : 2380 | EPSILON : 0.2863 | MEAN REWARD : -0.1
GAME : 2385 | EPSILON : 0.2848 | MEAN REWARD : -0.12
GAME : 2390 | EPSILON : 0.2833 | MEAN REWARD : -0.1
GAME : 2395 | EPSILON : 0.2818 | MEAN REWARD : -0.06
GAME : 2400 | EPSILON : 0.2803 | MEAN REWARD : -0.1
GAME : 2405 | EPSILON : 0.2788 | MEAN REWARD : -0.04
GAME : 2410 | EPSILON : 0.2773 | MEAN REWARD : -0.06
GAME 

GAME : 3095 | EPSILON : 0.0718 | MEAN REWARD : 0.64
GAME : 3100 | EPSILON : 0.0703 | MEAN REWARD : 0.66
GAME : 3105 | EPSILON : 0.0688 | MEAN REWARD : 0.68
GAME : 3110 | EPSILON : 0.0673 | MEAN REWARD : 0.66
GAME : 3115 | EPSILON : 0.0658 | MEAN REWARD : 0.7
GAME : 3120 | EPSILON : 0.0643 | MEAN REWARD : 0.72
GAME : 3125 | EPSILON : 0.0628 | MEAN REWARD : 0.72
GAME : 3130 | EPSILON : 0.0613 | MEAN REWARD : 0.72
GAME : 3135 | EPSILON : 0.0598 | MEAN REWARD : 0.74
GAME : 3140 | EPSILON : 0.0583 | MEAN REWARD : 0.74
GAME : 3145 | EPSILON : 0.0568 | MEAN REWARD : 0.76
GAME : 3150 | EPSILON : 0.0553 | MEAN REWARD : 0.76
GAME : 3155 | EPSILON : 0.0538 | MEAN REWARD : 0.76
REWARD 0.66 -> 0.78. Model Saved
GAME : 3160 | EPSILON : 0.0523 | MEAN REWARD : 0.78
GAME : 3165 | EPSILON : 0.0508 | MEAN REWARD : 0.8
GAME : 3170 | EPSILON : 0.0493 | MEAN REWARD : 0.78
GAME : 3175 | EPSILON : 0.0478 | MEAN REWARD : 0.78
GAME : 3180 | EPSILON : 0.0463 | MEAN REWARD : 0.78
GAME : 3185 | EPSILON : 0.0448 | 