In [1]:
import os,sys
sys.path.append('game/')
import flappy_wrapped as game
import cv2
import numpy as np
from tqdm.autonotebook import tqdm
import time
%matplotlib inline
import matplotlib.pyplot as plt

pygame 1.9.5
Hello from the pygame community. https://www.pygame.org/contribute.html




In [2]:
# 3x3 convolution kernel (centre weight 9, all eight neighbours -1) that
# processFrame applies with cv2.filter2D.
KERNEL = np.array([[-1,-1,-1], [-1, 9,-1],[-1,-1,-1]])
def processFrame(frame):
    """Turn a raw game frame into the 80x80 network input.

    Steps, in order: crop to the first 288 entries of axis 0 and the first
    400 entries of axis 1, convert to a single grey channel, resize to 80x80,
    binarise (pixels above 50 become 255, the rest 0), convolve with KERNEL
    and finally divide by 255.0.

    Args:
        frame: colour image array as returned by the game's ``frame_step``
            (assumed to be a 3-channel uint8 image -- TODO confirm).

    Returns:
        The processed 80x80 array, divided by 255.0 (floating point).
    """
    cropped = frame[:288, :400]
    gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    small = cv2.resize(gray, (80, 80))
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    binary = cv2.threshold(small, 50, 255, cv2.THRESH_BINARY)[1]
    filtered = cv2.filter2D(binary, -1, KERNEL)
    return filtered / 255.0

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by three linear layers.

    Args:
        input_shape: shape of one input state without the batch axis,
            ``(channels, height, width)``; ``input_shape[0]`` is used as the
            number of input channels of the first convolution.
        nactions: number of outputs of the final linear layer (one Q-value
            per action).
    """

    def __init__(self,input_shape,nactions):
        super().__init__()
        in_channels = input_shape[0]
        conv_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # The size of the flattened conv output depends on the input
        # resolution, so it is measured with a dummy forward pass.
        flat_features = self._get_conv_out(input_shape)

        fc_layers = [
            nn.Linear(flat_features, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, nactions),
        ]
        self.fc = nn.Sequential(*fc_layers)

    def _get_conv_out(self,shape):
        """Return the number of elements the conv stack produces for one input of ``shape``."""
        dummy = torch.zeros(1, *shape)
        return int(self.conv(dummy).numel())

    def forward(self,x):
        """Map a batch ``x`` of shape ``(batch, *input_shape)`` to ``(batch, nactions)`` values."""
        batch_size = x.size(0)
        features = self.conv(x)
        flattened = features.view(batch_size, -1)
        return self.fc(flattened)

In [4]:
# Hyperparameters shared by the agent, the loss and the training loop.

# Action ids the agent picks from and passes to env.frame_step
# (what 0 and 1 mean in the game is not visible here -- presumably
# "do nothing" / "flap"; confirm against flappy_wrapped).
ACTIONS = [0,1]
# Capacity handed to ExperienceBuffer (maxlen of its deque).
EXPERIENCE_BUFFER_SIZE = 10000
# Number of processed frames that make up one state; also the channel
# count of the DQN input.
STATE_DIM = 4
# Discount factor applied to the next-state value in calc_loss.
GAMMA = 0.99
# Exploration rate schedule: epsilon starts at EPSILON_START and is lowered
# linearly, never below EPSILON_FINAL. The training loop divides its
# finished-game counter by EPSILON_DECAY_FRAMES to get the decrement.
EPSILON_START = 1
EPSILON_FINAL = 0.1
EPSILON_DECAY_FRAMES = 10**3
# Training stops once the mean reward of the last 100 games exceeds this.
MEAN_GOAL_REWARD = 20
# Number of transitions sampled from the buffer per optimisation step.
BATCH_SIZE = 32
# Presumably the minimum number of stored transitions before optimisation
# is meant to start -- confirm against the training loop.
MIN_EXP_BUFFER_SIZE = 100
# Period of the target-network sync (net weights copied into tgt_net).
SYNC_TARGET_FRAMES = 500
# Learning rate of the Adam optimizer.
LEARNING_RATE = 1e-4

In [5]:
import collections
class ExperienceBuffer():
    """Fixed-capacity replay buffer of ``(state, action, reward, done, next_state)`` tuples.

    Backed by a ``collections.deque`` with ``maxlen=capacity``, so appending
    to a full buffer silently drops the oldest entry.
    """

    def __init__(self,capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def clear(self):
        """Remove every stored transition."""
        # Must go through self: a bare `buffer` is either undefined or an
        # unrelated module-level object.
        self.buffer.clear()

    def __len__(self):
        return len(self.buffer)

    def append(self,exp):
        """Store one transition tuple ``(state, action, reward, done, next_state)``."""
        self.buffer.append(exp)

    def sample(self,batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple of NumPy arrays ``(states, actions, rewards, dones, next_states)``;
            ``rewards`` is float32 and ``dones`` is uint8.

        Raises:
            ValueError: if the buffer is empty and ``batch_size`` is non-zero.
        """
        size = len(self.buffer)
        if size == 0 and batch_size:
            raise ValueError("cannot sample from an empty ExperienceBuffer")
        indices = np.random.choice(size, batch_size)
        states, actions, rewards, dones, next_states = zip(
            *(self.buffer[idx] for idx in indices)
        )
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )

In [6]:
class Agent():
    """Plays the game with an epsilon-greedy policy and records transitions.

    Args:
        env: game object exposing ``frame_step(action)`` which returns
            ``(frame, reward, done)``. It is assumed to restart itself after
            a terminal frame (no explicit reset is called here) -- TODO confirm.
        buffer: replay buffer with an ``append(transition)`` method.
        state_buffer_size: number of consecutive processed frames that form
            one state.

    Attributes:
        state / next_state: deques holding the most recent processed frames.
        total_rewards: sum of rewards collected in the current game.
    """

    def __init__(self,env,buffer,state_buffer_size = STATE_DIM):
        self.env = env
        self.exp_buffer = buffer
        self.state_buffer_size = state_buffer_size
        self.state = collections.deque(maxlen = state_buffer_size)
        self.next_state= collections.deque(maxlen = state_buffer_size)
        self._reset()

    def _reset(self):
        """Start a new episode and prime the frame stack with random actions.

        If the game ends while priming, the partial stack is discarded and
        priming starts over, so the stack never mixes frames of two games.
        """
        while True:
            self.total_rewards = 0
            self.state.clear()
            self.next_state.clear()
            for _ in range(self.state_buffer_size):
                frame, reward, done = self.env.frame_step(np.random.choice(ACTIONS))
                frame = processFrame(frame)
                self.total_rewards += reward
                self.state.append(frame)
                self.next_state.append(frame)
                if done:
                    break
            else:
                # Stack filled without hitting a terminal frame.
                return

    def step(self,net,epsilon=0.9,device='cpu'):
        """Play one frame and append the resulting transition to the buffer.

        Args:
            net: Q-network used for the greedy action.
            epsilon: probability of taking a uniformly random action instead.
            device: device the state tensor is moved to before calling ``net``.

        Returns:
            The total reward of the game if this frame ended it, else ``None``.
        """
        done_reward = None
        if np.random.random() < epsilon:
            action = int(np.random.choice(ACTIONS))
        else:
            state_a = np.array(self.state, dtype=np.float32)[np.newaxis]
            state_v = torch.tensor(state_a, dtype=torch.float32).to(device)
            # Action selection needs no gradients.
            with torch.no_grad():
                action = int(torch.argmax(net(state_v)))

        frame, reward, done = self.env.frame_step(action)
        frame = processFrame(frame)
        self.total_rewards += reward

        # Snapshot the stacks as arrays: the deques are mutated on every
        # step, so storing them directly would make all buffered transitions
        # alias the same (latest) frames. float32 halves the memory of the
        # copies (two 4x80x80 stacks per transition with the notebook's
        # settings).
        state_snapshot = np.array(self.state, dtype=np.float32)
        self.next_state.append(frame)
        next_state_snapshot = np.array(self.next_state, dtype=np.float32)
        # Store the reward of this single step, not the running episode total.
        self.exp_buffer.append((state_snapshot, action, reward, done,
                                next_state_snapshot))
        self.state.append(frame)

        if done:
            done_reward = self.total_rewards
            self._reset()
        return done_reward

In [None]:
def calc_loss(batch,net,tgt_net,device='cpu',gamma=None):
    """Mean-squared TD error of ``net`` on a sampled batch.

    The target for each transition is ``reward + gamma * max_a Q_tgt(next_state, a)``,
    with the bootstrap term zeroed for terminal transitions.

    Args:
        batch: tuple ``(states, actions, rewards, dones, next_states)`` of
            array-likes with a shared leading batch axis.
        net: network being trained; gradients flow through it only.
        tgt_net: target network used for the bootstrap value.
        device: device the tensors are moved to.
        gamma: discount factor; defaults to the module-level ``GAMMA``.

    Returns:
        Scalar loss tensor.
    """
    if gamma is None:
        gamma = GAMMA
    states,actions,rewards,dones,next_states = batch

    states_v = torch.tensor(states,dtype=torch.float32).to(device)
    # gather() requires int64 indices; NumPy's default int is 32-bit on some platforms.
    actions_v = torch.tensor(actions,dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards,dtype=torch.float32).to(device)
    # 1.0 for non-terminal transitions, 0.0 for terminal ones. An arithmetic
    # mask avoids the deprecated uint8 mask indexing.
    not_done_v = 1.0 - torch.tensor(dones,dtype=torch.float32).to(device)
    next_states_v = torch.tensor(next_states,dtype=torch.float32).to(device)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    # The target is a constant w.r.t. the optimisation: build no graph for it.
    with torch.no_grad():
        next_state_action_values = tgt_net(next_states_v).max(1)[0] * not_done_v

    expected_values = rewards_v + next_state_action_values * gamma
    return nn.MSELoss()(state_action_values,expected_values)

In [None]:
# --- Training setup -------------------------------------------------------
device = torch.device( "cuda" if torch.cuda.is_available() else "cpu" )
net = DQN( (STATE_DIM,80,80), len(ACTIONS) ).to(device)
tgt_net = DQN( (STATE_DIM,80,80), len(ACTIONS) ).to(device)
# Both networks are initialised independently; start the target as an exact
# copy of the online network.
tgt_net.load_state_dict(net.state_dict())

env = game.GameState()
buffer = ExperienceBuffer(EXPERIENCE_BUFFER_SIZE)
agent = Agent(env,buffer,state_buffer_size=STATE_DIM)
epsilon = EPSILON_START
optimizer = optim.Adam(net.parameters(),lr=LEARNING_RATE)

CHECKPOINT_PATH = 'checkpoints/flappy_best_model.dat'

total_rewards = []        # total reward of every finished game
best_mean_reward = None   # best 100-game mean seen so far
last_mean = None          # mean reward at the time of the last checkpoint
game_id = 0               # number of finished games
frame_id = 0              # number of agent steps taken

# --- Training loop --------------------------------------------------------
while True:
    # Epsilon is annealed per finished game (not per frame).
    epsilon = max( EPSILON_FINAL , EPSILON_START - game_id/EPSILON_DECAY_FRAMES )

    reward = agent.step(net,epsilon,device=device)
    frame_id += 1
    if reward is not None:
        # A game just ended: update statistics, maybe checkpoint, maybe stop.
        game_id += 1
        total_rewards.append(reward)
        mean_reward = np.mean(total_rewards[-100:])
        print("GAME : {} | EPSILON : {} | MEAN REWARD : {}".format( game_id, epsilon, mean_reward ))
        if best_mean_reward is None:
            best_mean_reward = mean_reward
            last_mean = mean_reward
        elif mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            # Only write a checkpoint when the mean improved by more than 1
            # since the previous checkpoint.
            if best_mean_reward - last_mean > 1:
                os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
                torch.save(net.state_dict(), CHECKPOINT_PATH)
                print("REWARD {} -> {}. Model Saved".format(last_mean,mean_reward))
                last_mean = best_mean_reward

        if mean_reward > MEAN_GOAL_REWARD:
            print("Learned in {} Games.".format(game_id))
            break

    # Warm-up: collect a minimum number of transitions before optimising.
    if len(buffer) < MIN_EXP_BUFFER_SIZE:
        continue

    # Sync on a frame counter so the copy happens once per period instead of
    # on every frame of selected games.
    if frame_id % SYNC_TARGET_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch,net,tgt_net,device=device)
    loss_t.backward()
    optimizer.step()

GAME : 1 | EPSILON : 1.0 | MEAN REWARD : -1.0
GAME : 2 | EPSILON : 0.999 | MEAN REWARD : -1.0
GAME : 3 | EPSILON : 0.998 | MEAN REWARD : -1.0
GAME : 4 | EPSILON : 0.997 | MEAN REWARD : -1.0
GAME : 5 | EPSILON : 0.996 | MEAN REWARD : -0.8
GAME : 6 | EPSILON : 0.995 | MEAN REWARD : -0.8333333333333334
GAME : 7 | EPSILON : 0.994 | MEAN REWARD : -0.8571428571428571
GAME : 8 | EPSILON : 0.993 | MEAN REWARD : -0.875
GAME : 9 | EPSILON : 0.992 | MEAN REWARD : -0.8888888888888888
GAME : 10 | EPSILON : 0.991 | MEAN REWARD : -0.9
GAME : 11 | EPSILON : 0.99 | MEAN REWARD : -0.9090909090909091
GAME : 12 | EPSILON : 0.989 | MEAN REWARD : -0.9166666666666666
GAME : 13 | EPSILON : 0.988 | MEAN REWARD : -0.9230769230769231
GAME : 14 | EPSILON : 0.987 | MEAN REWARD : -0.9285714285714286
GAME : 15 | EPSILON : 0.986 | MEAN REWARD : -0.9333333333333333
GAME : 16 | EPSILON : 0.985 | MEAN REWARD : -0.9375
GAME : 17 | EPSILON : 0.984 | MEAN REWARD : -0.9411764705882353
GAME : 18 | EPSILON : 0.983 | MEAN REWA

GAME : 141 | EPSILON : 0.86 | MEAN REWARD : -1.0
GAME : 142 | EPSILON : 0.859 | MEAN REWARD : -1.0
GAME : 143 | EPSILON : 0.858 | MEAN REWARD : -1.0
GAME : 144 | EPSILON : 0.857 | MEAN REWARD : -1.0
GAME : 145 | EPSILON : 0.856 | MEAN REWARD : -1.0
GAME : 146 | EPSILON : 0.855 | MEAN REWARD : -1.0
GAME : 147 | EPSILON : 0.854 | MEAN REWARD : -1.0
GAME : 148 | EPSILON : 0.853 | MEAN REWARD : -1.0
GAME : 149 | EPSILON : 0.852 | MEAN REWARD : -1.0
GAME : 150 | EPSILON : 0.851 | MEAN REWARD : -1.0
GAME : 151 | EPSILON : 0.85 | MEAN REWARD : -1.0
GAME : 152 | EPSILON : 0.849 | MEAN REWARD : -1.0
GAME : 153 | EPSILON : 0.848 | MEAN REWARD : -1.0
GAME : 154 | EPSILON : 0.847 | MEAN REWARD : -1.0
GAME : 155 | EPSILON : 0.846 | MEAN REWARD : -1.0
GAME : 156 | EPSILON : 0.845 | MEAN REWARD : -1.0
GAME : 157 | EPSILON : 0.844 | MEAN REWARD : -1.0
GAME : 158 | EPSILON : 0.843 | MEAN REWARD : -1.0
GAME : 159 | EPSILON : 0.842 | MEAN REWARD : -1.0
GAME : 160 | EPSILON : 0.841 | MEAN REWARD : -1.0
GA

GAME : 299 | EPSILON : 0.702 | MEAN REWARD : -0.98
GAME : 300 | EPSILON : 0.7010000000000001 | MEAN REWARD : -0.98
GAME : 301 | EPSILON : 0.7 | MEAN REWARD : -0.98
GAME : 302 | EPSILON : 0.6990000000000001 | MEAN REWARD : -0.98
GAME : 303 | EPSILON : 0.698 | MEAN REWARD : -0.98
GAME : 304 | EPSILON : 0.6970000000000001 | MEAN REWARD : -0.98
GAME : 305 | EPSILON : 0.696 | MEAN REWARD : -0.98
GAME : 306 | EPSILON : 0.6950000000000001 | MEAN REWARD : -0.98
GAME : 307 | EPSILON : 0.694 | MEAN REWARD : -0.98
GAME : 308 | EPSILON : 0.6930000000000001 | MEAN REWARD : -0.98
GAME : 309 | EPSILON : 0.692 | MEAN REWARD : -0.98
GAME : 310 | EPSILON : 0.6910000000000001 | MEAN REWARD : -0.98
GAME : 311 | EPSILON : 0.69 | MEAN REWARD : -0.98
GAME : 312 | EPSILON : 0.6890000000000001 | MEAN REWARD : -0.98
GAME : 313 | EPSILON : 0.688 | MEAN REWARD : -0.98
GAME : 314 | EPSILON : 0.687 | MEAN REWARD : -0.98
GAME : 315 | EPSILON : 0.6859999999999999 | MEAN REWARD : -0.98
GAME : 316 | EPSILON : 0.685 | M

GAME : 450 | EPSILON : 0.5509999999999999 | MEAN REWARD : -0.95
GAME : 451 | EPSILON : 0.55 | MEAN REWARD : -0.95
GAME : 452 | EPSILON : 0.5489999999999999 | MEAN REWARD : -0.95
GAME : 453 | EPSILON : 0.548 | MEAN REWARD : -0.95
GAME : 454 | EPSILON : 0.5469999999999999 | MEAN REWARD : -0.95
GAME : 455 | EPSILON : 0.546 | MEAN REWARD : -0.95
GAME : 456 | EPSILON : 0.5449999999999999 | MEAN REWARD : -0.96
GAME : 457 | EPSILON : 0.544 | MEAN REWARD : -0.96
GAME : 458 | EPSILON : 0.5429999999999999 | MEAN REWARD : -0.96
GAME : 459 | EPSILON : 0.542 | MEAN REWARD : -0.96
GAME : 460 | EPSILON : 0.5409999999999999 | MEAN REWARD : -0.96
GAME : 461 | EPSILON : 0.54 | MEAN REWARD : -0.96
GAME : 462 | EPSILON : 0.5389999999999999 | MEAN REWARD : -0.96
GAME : 463 | EPSILON : 0.538 | MEAN REWARD : -0.96
GAME : 464 | EPSILON : 0.5369999999999999 | MEAN REWARD : -0.96
GAME : 465 | EPSILON : 0.536 | MEAN REWARD : -0.96
GAME : 466 | EPSILON : 0.5349999999999999 | MEAN REWARD : -0.96
GAME : 467 | EPSIL