In [1]:
import os
import pygame
# Flip to False to run the notebook without rendering to a real display.
DISPLAY = True

if not DISPLAY:
    # Ask SDL for its "dummy" video driver (presumably so pygame can run
    # headless -- confirm against the pygame/SDL docs for the installed version).
    os.environ.update({"SDL_VIDEODRIVER": "dummy"})

pygame 1.9.5
Hello from the pygame community. https://www.pygame.org/contribute.html


## Double Dueling Deep Q Network Learning with Prioritized Experience Replay

In [2]:
import os,sys
sys.path.append('game/')
import flappy_wrapped as game
import cv2
import numpy as np
import time
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
# 3x3 convolution kernel applied by processFrame: centre weight 9, every
# neighbour -1 (the weights sum to 1).
KERNEL = np.array([
    [-1, -1, -1],
    [-1,  9, -1],
    [-1, -1, -1],
])


def processFrame(frame):
    """Convert a raw game frame into the 84x84 array fed to the network.

    Pipeline, in order:
      1. crop to ``frame[55:288, 0:400]``;
      2. collapse colour with ``cv2.COLOR_BGR2GRAY``;
      3. resize to 84x84 using ``cv2.INTER_AREA``;
      4. binarise with ``cv2.THRESH_BINARY`` (threshold 50, max value 255);
      5. convolve with ``KERNEL`` via ``cv2.filter2D`` (same depth as input);
      6. cast to float64 and divide by 255.0.

    Args:
        frame: image array as returned by the environment's ``frame_step``.

    Returns:
        A float64 array of shape (84, 84).
    """
    cropped = frame[55:288, 0:400]
    gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    small = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    _, binary = cv2.threshold(small, 50, 255, cv2.THRESH_BINARY)
    filtered = cv2.filter2D(binary, -1, KERNEL)
    return filtered.astype(np.float64) / 255.0

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

#Dueling DQN
class DDQN(nn.Module):
    """Dueling Q-network.

    A shared convolutional trunk feeds two fully connected heads: ``fca``
    produces one advantage per action and ``fcv`` produces a single state
    value. ``forward`` combines them as ``V + A - mean(A)``.

    Args:
        input_shape: ``(channels, height, width)`` of a single observation.
        nactions: number of discrete actions (width of the output).
    """

    def __init__(self,input_shape,nactions):
        super().__init__()
        self.nactions = nactions

        in_channels = input_shape[0]
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=2, stride=1),
            nn.ReLU(),
        )

        # Width of the flattened trunk output for this input shape.
        n_features = self._get_conv_out(input_shape)

        # Advantage head: one output per action.
        self.fca = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, nactions),
        )

        # Value head: a single scalar per sample.
        self.fcv = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def _get_conv_out(self,shape):
        """Return the element count of the trunk output for one zero input of ``shape``."""
        probe = torch.zeros(1, *shape)
        return self.conv(probe).numel()

    def forward(self,x):
        """Return Q-values of shape ``(batch, nactions)`` for a batch ``x``."""
        batch = x.size(0)
        features = self.conv(x).view(batch, -1)
        advantage = self.fca(features)
        value = self.fcv(features)
        # (batch, 1) terms broadcast across the action axis.
        return value + advantage - advantage.mean(dim=1, keepdim=True)

In [5]:
# --- Environment interaction -------------------------------------------------
# Action ids handed to env.frame_step().
ACTIONS = [0, 1]
# Number of processed frames stacked into one state (network input channels).
STATE_DIM = 4
# Extra env steps that repeat the chosen action after the first one.
SKIP_FRAME = 2
# Scripted actions played at the start of every episode before the agent takes over.
INITIAL_SKIP = [0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1]

# --- Replay memory -----------------------------------------------------------
# Capacity of the experience buffer.
EXPERIENCE_BUFFER_SIZE = 2000
# Optimisation only starts once the buffer holds this many transitions.
MIN_EXP_BUFFER_SIZE = 500
# Transitions drawn per optimisation step.
BATCH_SIZE = 32

# --- Exploration -------------------------------------------------------------
EPSILON_START = 1
EPSILON_FINAL = 0.01
# Epsilon decreases linearly by 1/EPSILON_DECAY_FRAMES per finished game
# (the training loop divides the game counter by this, despite the name).
EPSILON_DECAY_FRAMES = 10**4 / 3

# --- Optimisation ------------------------------------------------------------
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.99
# Step size passed to the Adam optimiser.
LEARNING_RATE = 1e-4
# The target network is synchronised every this many finished games.
SYNC_TARGET_FRAMES = 30
# Training stops when the mean reward of the last 100 games reaches this.
MEAN_GOAL_REWARD = 20

In [6]:
import collections
class ExperienceBuffer():
    """Fixed-capacity replay memory with priority-proportional sampling.

    Experiences and their priorities live in two parallel deques created with
    the same ``maxlen``, so once the capacity is reached the oldest experience
    and its priority are dropped together.
    """

    def __init__(self,capacity):
        """Create an empty buffer holding at most ``capacity`` experiences."""
        self.buffer = collections.deque(maxlen=capacity)
        self.priority = collections.deque(maxlen=capacity)

    def clear(self):
        """Drop every stored experience and priority."""
        self.buffer.clear()
        self.priority.clear()

    def __len__(self):
        """Return the number of stored experiences."""
        return len(self.buffer)

    def append(self,exp,p):
        """Store one experience with its sampling priority.

        Args:
            exp: tuple ``(state, action, reward, done, next_state)``.
            p: non-negative sampling weight for this experience.
        """
        self.buffer.append(exp)
        self.priority.append(p)

    def sample(self,batch_size):
        """Draw ``batch_size`` experiences with probability proportional to priority.

        Sampling is with replacement (``np.random.choice`` default).

        Returns:
            Tuple of arrays ``(states, actions, rewards, dones, next_states)``;
            ``rewards`` is float32 and ``dones`` is uint8.

        Raises:
            ValueError: if the buffer is empty, or if the priorities cannot be
                normalised into a valid distribution (raised by NumPy).
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty ExperienceBuffer")

        # Build the priority array once and normalise it with a vectorised sum
        # in float64 instead of Python's element-by-element builtin sum.
        priorities = np.asarray(self.priority, dtype=np.float64)
        probs = priorities / priorities.sum()

        # An int population is sampled as if it were np.arange(n).
        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        states, actions, rewards, dones, next_states = zip(
            *(self.buffer[idx] for idx in indices)
        )
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )

In [7]:
class Agent():
    """Plays the environment epsilon-greedily and records transitions.

    The agent keeps two rolling frame stacks (``state`` and ``next_state``),
    advances the game through ``env.frame_step(action)`` and appends
    ``(state, action, reward, done, next_state)`` tuples, together with a
    sampling priority, to the replay buffer.
    """

    # Upper bound on consecutive restarts when the scripted opening keeps
    # ending the game, so a broken environment fails loudly instead of hanging.
    MAX_RESET_ATTEMPTS = 100

    def __init__(self,env,buffer,state_buffer_size = STATE_DIM):
        """
        Args:
            env: object exposing ``frame_step(action) -> (frame, reward, done)``.
            buffer: replay memory exposing ``append(experience, priority)``.
            state_buffer_size: number of processed frames kept in each stack.
        """
        self.env = env
        self.exp_buffer = buffer
        # Honour the constructor argument (it was previously ignored in favour
        # of the global STATE_DIM; the default keeps the old behaviour).
        self.state_buffer_size = state_buffer_size
        self.state = collections.deque(maxlen = state_buffer_size)
        self.next_state= collections.deque(maxlen = state_buffer_size)
        self._reset()

    @staticmethod
    def _as_batch(frames, device):
        """Stack ``frames`` into a float32 tensor with a leading batch axis of 1."""
        return torch.tensor(np.array(frames), dtype=torch.float32).unsqueeze(0).to(device)

    def _reset(self):
        """Start an episode by replaying the scripted INITIAL_SKIP actions.

        If the game ends while the script is still running, the attempt is
        discarded and the script is replayed from its first action. The frame
        produced by the last scripted action of a surviving run is appended to
        both frame stacks. The stacks are not cleared, so they may still hold
        frames from the previous episode until new frames push them out.

        Raises:
            RuntimeError: if MAX_RESET_ATTEMPTS consecutive attempts all end
                the game during the scripted opening.
        """
        for _ in range(self.MAX_RESET_ATTEMPTS):
            self.total_rewards = 0
            done = False
            for action in INITIAL_SKIP:
                frame, reward, done = self.env.frame_step(action)
                self.total_rewards += reward
                if done:
                    break
            if not done:
                break
        else:
            raise RuntimeError("game ended during INITIAL_SKIP on every reset attempt")

        frame = processFrame(frame)
        self.state.append(frame)
        self.next_state.append(frame)

    def step(self,net,tgt_net,epsilon=0.9,device='cpu'):
        """Choose an action, advance the game and store the transition.

        Args:
            net: online Q-network, used for greedy actions and the priority.
            tgt_net: target Q-network, used for the priority only.
            epsilon: probability of picking a uniformly random action.
            device: torch device on which the networks live.

        Returns:
            The episode's total reward if the episode ended on this step,
            otherwise ``None``.
        """
        done_reward  = None
        size = self.state_buffer_size

        # The greedy branch needs a full frame stack; until then act randomly.
        explore = np.random.random() < epsilon
        if explore or len(self.state) < size:
            action = int(np.random.choice(ACTIONS))
        else:
            with torch.no_grad():
                action = int(torch.argmax(net(self._as_batch(self.state, device))))

        # Apply the action for 1 + SKIP_FRAME frames. Every frame's reward is
        # counted and a terminal flag on any frame (including the first) stops
        # the repeat, so a crash is never stepped past.
        step_reward = 0
        for _ in range(1 + SKIP_FRAME):
            frame, reward, done = self.env.frame_step(action)
            step_reward += reward
            if done:
                break
        self.total_rewards += step_reward

        frame = processFrame(frame)
        self.next_state.append(frame)

        if len(self.next_state)==size and len(self.state)==size:
            # PER - Prioritized Experience Replay: the priority is the gap
            # between Q(s, a) from the online net and max_a' Q(s', a') from the
            # target net, plus a small constant so it is never zero.
            with torch.no_grad():
                q_sa = float(net(self._as_batch(self.state, device))[0][action])
                q_next = float(torch.max(tgt_net(self._as_batch(self.next_state, device))))
            p = abs(q_sa - q_next) + 0.0001
            # Store the reward earned by this transition (not the running
            # episode total), which is what the Bellman target in calc_loss
            # treats it as.
            self.exp_buffer.append(
                (self.state.copy(), action, step_reward, done, self.next_state.copy()), p
            )

        self.state.append(frame)
        if done:
            done_reward = self.total_rewards
            self._reset()
        return done_reward

In [None]:
def calc_loss(batch,net,tgt_net,device='cpu',gamma=None):
    """Mean-squared TD loss for one sampled batch.

    The prediction is ``Q_net(s, a)`` for the actions actually taken. The
    target is ``r + gamma * max_a' Q_tgt(s', a')``, with the bootstrap term
    forced to zero for terminal transitions. No gradient flows through the
    target.

    NOTE(review): the bootstrap uses the target network's own max, i.e. the
    standard DQN target. The notebook heading mentions Double DQN, which would
    pick the arg-max action with ``net`` instead -- confirm which is intended.

    Args:
        batch: tuple ``(states, actions, rewards, dones, next_states)`` of
            array-likes sharing their first dimension.
        net: online Q-network being optimised.
        tgt_net: target Q-network providing bootstrap values.
        device: torch device on which to run the computation.
        gamma: discount factor; ``None`` (default) uses the module-level GAMMA.

    Returns:
        Scalar tensor holding the MSE between predicted and target values.
    """
    if gamma is None:
        gamma = GAMMA
    states,actions,rewards,dones,next_states = batch

    states_v = torch.tensor(states,dtype=torch.float32).to(device)
    actions_v = torch.tensor(actions,dtype=torch.long).to(device)
    # Pin float32 so the target matches the dtype of the network output.
    rewards_v = torch.tensor(rewards,dtype=torch.float32).to(device)
    # Boolean mask: indexing/masking with uint8 tensors is deprecated.
    done_mask = torch.tensor(dones,dtype=torch.bool).to(device)
    next_states_v = torch.tensor(next_states,dtype=torch.float32).to(device)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The target is a constant w.r.t. the optimised parameters, so build it
    # without recording an autograd graph.
    with torch.no_grad():
        next_state_action_values = tgt_net(next_states_v).max(1)[0]
        next_state_action_values = next_state_action_values.masked_fill(done_mask, 0.0)

    expected_values = rewards_v + next_state_action_values * gamma
    return nn.MSELoss()(state_action_values,expected_values)

In [None]:
# Loss of every optimisation step, kept for later inspection.
all_losses = []
device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" )

#Double Dueling DQN
# `net` is the network being optimised; `tgt_net` supplies bootstrap values.
net = DDQN( (STATE_DIM,84,84), len(ACTIONS) ).to(device)
tgt_net = DDQN( (STATE_DIM,84,84), len(ACTIONS) ).to(device)
# Start the target network from the online weights; otherwise it holds an
# unrelated random initialisation until the first periodic sync.
tgt_net.load_state_dict(net.state_dict())

# torch.save does not create missing directories, so make sure it exists.
CHECKPOINT_PATH = 'checkpoints/flappy_best_model.dat'
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

env = game.GameState()
buffer = ExperienceBuffer(EXPERIENCE_BUFFER_SIZE)
agent = Agent(env,buffer)
epsilon = EPSILON_START
optimizer = optim.Adam(net.parameters(),lr=LEARNING_RATE)

total_rewards = []                  # total reward of every finished game
best_mean_reward = float('-inf')    # best 100-game mean seen so far
last_mean = float('-inf')           # mean at the time of the last checkpoint
game_id = 0                         # number of finished games
while True:
    # Linear decay driven by the number of finished games, floored at EPSILON_FINAL.
    epsilon = max( EPSILON_FINAL , EPSILON_START - game_id/EPSILON_DECAY_FRAMES )

    # One agent step; returns the episode total only when a game just ended.
    reward = agent.step(net,tgt_net,epsilon,device=device)
    if reward is not None:
        game_id += 1
        total_rewards.append(reward)
        mean_reward = np.mean(total_rewards[-100:])
        print("GAME : {} | EPSILON : {:.4f} | MEAN REWARD : {}".format( game_id, epsilon, mean_reward ))
        if best_mean_reward < mean_reward:
            best_mean_reward = mean_reward

            # Only checkpoint when the mean improved by at least 0.2 since the
            # previous checkpoint.
            if best_mean_reward-last_mean >= 0.2:
                torch.save(net.state_dict(),CHECKPOINT_PATH)
                print("REWARD {} -> {}. Model Saved".format(last_mean,mean_reward))
                last_mean = best_mean_reward

        # Periodic target-network sync, counted in finished games.
        if game_id % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        if mean_reward >= MEAN_GOAL_REWARD:
            print("Learned in {} Games.".format(game_id))
            break

    # Keep collecting experience until the buffer is warm enough to sample.
    if len(buffer) < MIN_EXP_BUFFER_SIZE:
        continue

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch,net,tgt_net,device=device)
    all_losses.append(float(loss_t))
    loss_t.backward()
    optimizer.step()

GAME : 1 | EPSILON : 1.0000 | MEAN REWARD : -1.0
REWARD -inf -> -1.0. Model Saved
GAME : 2 | EPSILON : 0.9997 | MEAN REWARD : -1.0
GAME : 3 | EPSILON : 0.9994 | MEAN REWARD : -1.0
GAME : 4 | EPSILON : 0.9991 | MEAN REWARD : -1.0
GAME : 5 | EPSILON : 0.9988 | MEAN REWARD : -1.0
GAME : 6 | EPSILON : 0.9985 | MEAN REWARD : -0.8333333333333334
GAME : 7 | EPSILON : 0.9982 | MEAN REWARD : -0.8571428571428571
GAME : 8 | EPSILON : 0.9979 | MEAN REWARD : -0.875
GAME : 9 | EPSILON : 0.9976 | MEAN REWARD : -0.7777777777777778
REWARD -1.0 -> -0.7777777777777778. Model Saved
GAME : 10 | EPSILON : 0.9973 | MEAN REWARD : -0.8
GAME : 11 | EPSILON : 0.9970 | MEAN REWARD : -0.8181818181818182
GAME : 12 | EPSILON : 0.9967 | MEAN REWARD : -0.8333333333333334
GAME : 13 | EPSILON : 0.9964 | MEAN REWARD : -0.8461538461538461
GAME : 14 | EPSILON : 0.9961 | MEAN REWARD : -0.8571428571428571
GAME : 15 | EPSILON : 0.9958 | MEAN REWARD : -0.8666666666666667
GAME : 16 | EPSILON : 0.9955 | MEAN REWARD : -0.875
GAME

GAME : 137 | EPSILON : 0.9592 | MEAN REWARD : -0.91
GAME : 138 | EPSILON : 0.9589 | MEAN REWARD : -0.91
GAME : 139 | EPSILON : 0.9586 | MEAN REWARD : -0.92
GAME : 140 | EPSILON : 0.9583 | MEAN REWARD : -0.92
GAME : 141 | EPSILON : 0.9580 | MEAN REWARD : -0.93
GAME : 142 | EPSILON : 0.9577 | MEAN REWARD : -0.94
GAME : 143 | EPSILON : 0.9574 | MEAN REWARD : -0.94
GAME : 144 | EPSILON : 0.9571 | MEAN REWARD : -0.94
GAME : 145 | EPSILON : 0.9568 | MEAN REWARD : -0.94
GAME : 146 | EPSILON : 0.9565 | MEAN REWARD : -0.94
GAME : 147 | EPSILON : 0.9562 | MEAN REWARD : -0.94
GAME : 148 | EPSILON : 0.9559 | MEAN REWARD : -0.94
GAME : 149 | EPSILON : 0.9556 | MEAN REWARD : -0.94
GAME : 150 | EPSILON : 0.9553 | MEAN REWARD : -0.94
GAME : 151 | EPSILON : 0.9550 | MEAN REWARD : -0.94
GAME : 152 | EPSILON : 0.9547 | MEAN REWARD : -0.94
GAME : 153 | EPSILON : 0.9544 | MEAN REWARD : -0.94
GAME : 154 | EPSILON : 0.9541 | MEAN REWARD : -0.94
GAME : 155 | EPSILON : 0.9538 | MEAN REWARD : -0.94
GAME : 156 |

GAME : 296 | EPSILON : 0.9115 | MEAN REWARD : -0.89
GAME : 297 | EPSILON : 0.9112 | MEAN REWARD : -0.88
GAME : 298 | EPSILON : 0.9109 | MEAN REWARD : -0.89
GAME : 299 | EPSILON : 0.9106 | MEAN REWARD : -0.88
GAME : 300 | EPSILON : 0.9103 | MEAN REWARD : -0.88
GAME : 301 | EPSILON : 0.9100 | MEAN REWARD : -0.88
GAME : 302 | EPSILON : 0.9097 | MEAN REWARD : -0.88
GAME : 303 | EPSILON : 0.9094 | MEAN REWARD : -0.88
GAME : 304 | EPSILON : 0.9091 | MEAN REWARD : -0.88
GAME : 305 | EPSILON : 0.9088 | MEAN REWARD : -0.88
GAME : 306 | EPSILON : 0.9085 | MEAN REWARD : -0.88
GAME : 307 | EPSILON : 0.9082 | MEAN REWARD : -0.88
GAME : 308 | EPSILON : 0.9079 | MEAN REWARD : -0.88
GAME : 309 | EPSILON : 0.9076 | MEAN REWARD : -0.88
GAME : 310 | EPSILON : 0.9073 | MEAN REWARD : -0.88
GAME : 311 | EPSILON : 0.9070 | MEAN REWARD : -0.88
GAME : 312 | EPSILON : 0.9067 | MEAN REWARD : -0.88
GAME : 313 | EPSILON : 0.9064 | MEAN REWARD : -0.88
GAME : 314 | EPSILON : 0.9061 | MEAN REWARD : -0.88
GAME : 315 |

GAME : 454 | EPSILON : 0.8641 | MEAN REWARD : -0.83
GAME : 455 | EPSILON : 0.8638 | MEAN REWARD : -0.84
GAME : 456 | EPSILON : 0.8635 | MEAN REWARD : -0.84
GAME : 457 | EPSILON : 0.8632 | MEAN REWARD : -0.84
GAME : 458 | EPSILON : 0.8629 | MEAN REWARD : -0.84
GAME : 459 | EPSILON : 0.8626 | MEAN REWARD : -0.84
GAME : 460 | EPSILON : 0.8623 | MEAN REWARD : -0.85
GAME : 461 | EPSILON : 0.8620 | MEAN REWARD : -0.86
GAME : 462 | EPSILON : 0.8617 | MEAN REWARD : -0.85
GAME : 463 | EPSILON : 0.8614 | MEAN REWARD : -0.84
GAME : 464 | EPSILON : 0.8611 | MEAN REWARD : -0.84
GAME : 465 | EPSILON : 0.8608 | MEAN REWARD : -0.83
GAME : 466 | EPSILON : 0.8605 | MEAN REWARD : -0.83
GAME : 467 | EPSILON : 0.8602 | MEAN REWARD : -0.83
GAME : 468 | EPSILON : 0.8599 | MEAN REWARD : -0.83
GAME : 469 | EPSILON : 0.8596 | MEAN REWARD : -0.83
GAME : 470 | EPSILON : 0.8593 | MEAN REWARD : -0.83
GAME : 471 | EPSILON : 0.8590 | MEAN REWARD : -0.83
GAME : 472 | EPSILON : 0.8587 | MEAN REWARD : -0.83
GAME : 473 |

GAME : 612 | EPSILON : 0.8167 | MEAN REWARD : -0.71
GAME : 613 | EPSILON : 0.8164 | MEAN REWARD : -0.71
GAME : 614 | EPSILON : 0.8161 | MEAN REWARD : -0.71
GAME : 615 | EPSILON : 0.8158 | MEAN REWARD : -0.7
GAME : 616 | EPSILON : 0.8155 | MEAN REWARD : -0.7
GAME : 617 | EPSILON : 0.8152 | MEAN REWARD : -0.7
GAME : 618 | EPSILON : 0.8149 | MEAN REWARD : -0.7
GAME : 619 | EPSILON : 0.8146 | MEAN REWARD : -0.7
GAME : 620 | EPSILON : 0.8143 | MEAN REWARD : -0.71
GAME : 621 | EPSILON : 0.8140 | MEAN REWARD : -0.7
GAME : 622 | EPSILON : 0.8137 | MEAN REWARD : -0.7
GAME : 623 | EPSILON : 0.8134 | MEAN REWARD : -0.71
GAME : 624 | EPSILON : 0.8131 | MEAN REWARD : -0.71
GAME : 625 | EPSILON : 0.8128 | MEAN REWARD : -0.71
GAME : 626 | EPSILON : 0.8125 | MEAN REWARD : -0.71
GAME : 627 | EPSILON : 0.8122 | MEAN REWARD : -0.72
GAME : 628 | EPSILON : 0.8119 | MEAN REWARD : -0.73
GAME : 629 | EPSILON : 0.8116 | MEAN REWARD : -0.74
GAME : 630 | EPSILON : 0.8113 | MEAN REWARD : -0.74
GAME : 631 | EPSILO

GAME : 770 | EPSILON : 0.7693 | MEAN REWARD : -0.84
GAME : 771 | EPSILON : 0.7690 | MEAN REWARD : -0.84
GAME : 772 | EPSILON : 0.7687 | MEAN REWARD : -0.85
GAME : 773 | EPSILON : 0.7684 | MEAN REWARD : -0.85
GAME : 774 | EPSILON : 0.7681 | MEAN REWARD : -0.85
GAME : 775 | EPSILON : 0.7678 | MEAN REWARD : -0.85
GAME : 776 | EPSILON : 0.7675 | MEAN REWARD : -0.85
GAME : 777 | EPSILON : 0.7672 | MEAN REWARD : -0.85
GAME : 778 | EPSILON : 0.7669 | MEAN REWARD : -0.83
GAME : 779 | EPSILON : 0.7666 | MEAN REWARD : -0.83
GAME : 780 | EPSILON : 0.7663 | MEAN REWARD : -0.84
GAME : 781 | EPSILON : 0.7660 | MEAN REWARD : -0.84
GAME : 782 | EPSILON : 0.7657 | MEAN REWARD : -0.84
GAME : 783 | EPSILON : 0.7654 | MEAN REWARD : -0.84
GAME : 784 | EPSILON : 0.7651 | MEAN REWARD : -0.83
GAME : 785 | EPSILON : 0.7648 | MEAN REWARD : -0.83
GAME : 786 | EPSILON : 0.7645 | MEAN REWARD : -0.83
GAME : 787 | EPSILON : 0.7642 | MEAN REWARD : -0.83
GAME : 788 | EPSILON : 0.7639 | MEAN REWARD : -0.83
GAME : 789 |

GAME : 928 | EPSILON : 0.7219 | MEAN REWARD : -0.76
GAME : 929 | EPSILON : 0.7216 | MEAN REWARD : -0.76
GAME : 930 | EPSILON : 0.7213 | MEAN REWARD : -0.76
GAME : 931 | EPSILON : 0.7210 | MEAN REWARD : -0.77
GAME : 932 | EPSILON : 0.7207 | MEAN REWARD : -0.77
GAME : 933 | EPSILON : 0.7204 | MEAN REWARD : -0.77
GAME : 934 | EPSILON : 0.7201 | MEAN REWARD : -0.76
GAME : 935 | EPSILON : 0.7198 | MEAN REWARD : -0.77
GAME : 936 | EPSILON : 0.7195 | MEAN REWARD : -0.76
GAME : 937 | EPSILON : 0.7192 | MEAN REWARD : -0.76
GAME : 938 | EPSILON : 0.7189 | MEAN REWARD : -0.76
GAME : 939 | EPSILON : 0.7186 | MEAN REWARD : -0.76
GAME : 940 | EPSILON : 0.7183 | MEAN REWARD : -0.76
GAME : 941 | EPSILON : 0.7180 | MEAN REWARD : -0.77
GAME : 942 | EPSILON : 0.7177 | MEAN REWARD : -0.77
GAME : 943 | EPSILON : 0.7174 | MEAN REWARD : -0.77
GAME : 944 | EPSILON : 0.7171 | MEAN REWARD : -0.77
GAME : 945 | EPSILON : 0.7168 | MEAN REWARD : -0.77
GAME : 946 | EPSILON : 0.7165 | MEAN REWARD : -0.77
GAME : 947 |

GAME : 1085 | EPSILON : 0.6748 | MEAN REWARD : -0.65
GAME : 1086 | EPSILON : 0.6745 | MEAN REWARD : -0.65
GAME : 1087 | EPSILON : 0.6742 | MEAN REWARD : -0.65
GAME : 1088 | EPSILON : 0.6739 | MEAN REWARD : -0.65
GAME : 1089 | EPSILON : 0.6736 | MEAN REWARD : -0.65
GAME : 1090 | EPSILON : 0.6733 | MEAN REWARD : -0.65
GAME : 1091 | EPSILON : 0.6730 | MEAN REWARD : -0.64
GAME : 1092 | EPSILON : 0.6727 | MEAN REWARD : -0.64
GAME : 1093 | EPSILON : 0.6724 | MEAN REWARD : -0.63
GAME : 1094 | EPSILON : 0.6721 | MEAN REWARD : -0.63
GAME : 1095 | EPSILON : 0.6718 | MEAN REWARD : -0.63
GAME : 1096 | EPSILON : 0.6715 | MEAN REWARD : -0.63
GAME : 1097 | EPSILON : 0.6712 | MEAN REWARD : -0.62
GAME : 1098 | EPSILON : 0.6709 | MEAN REWARD : -0.63
GAME : 1099 | EPSILON : 0.6706 | MEAN REWARD : -0.62
GAME : 1100 | EPSILON : 0.6703 | MEAN REWARD : -0.62
GAME : 1101 | EPSILON : 0.6700 | MEAN REWARD : -0.62
GAME : 1102 | EPSILON : 0.6697 | MEAN REWARD : -0.62
GAME : 1103 | EPSILON : 0.6694 | MEAN REWARD :

GAME : 1239 | EPSILON : 0.6286 | MEAN REWARD : -0.59
GAME : 1240 | EPSILON : 0.6283 | MEAN REWARD : -0.59
GAME : 1241 | EPSILON : 0.6280 | MEAN REWARD : -0.59
GAME : 1242 | EPSILON : 0.6277 | MEAN REWARD : -0.59
GAME : 1243 | EPSILON : 0.6274 | MEAN REWARD : -0.59
GAME : 1244 | EPSILON : 0.6271 | MEAN REWARD : -0.58
GAME : 1245 | EPSILON : 0.6268 | MEAN REWARD : -0.58
GAME : 1246 | EPSILON : 0.6265 | MEAN REWARD : -0.58
GAME : 1247 | EPSILON : 0.6262 | MEAN REWARD : -0.57
GAME : 1248 | EPSILON : 0.6259 | MEAN REWARD : -0.58
GAME : 1249 | EPSILON : 0.6256 | MEAN REWARD : -0.58
GAME : 1250 | EPSILON : 0.6253 | MEAN REWARD : -0.58
