In [9]:
import os
import time

import gym
import numpy as np

# Build the LunarLander-v2 environment once; later cells read this module-level `env`
# (observation/action space sizes for the networks, and the agent's interaction loop).
env = gym.make('LunarLander-v2')

  result = entry_point.load(False)


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    """Fully connected Q-network.

    Maps an input of size ``in_features`` to ``n_actions`` outputs through
    three hidden layers of 256, 128 and 64 units, each followed by a ReLU.
    The final layer is linear (no activation).
    """

    def __init__(self, in_features, n_actions):
        super().__init__()
        widths = (in_features, 256, 128, 64)
        stack = []
        # Hidden part: Linear -> ReLU for each consecutive pair of widths.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # Output head: one unit per action.
        stack.append(nn.Linear(widths[-1], n_actions))
        # The attribute name `neuralnet` determines the state_dict key prefix.
        self.neuralnet = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the network output."""
        q_values = self.neuralnet(x)
        return q_values

In [11]:
import collections
class ExperienceBuffer():
    """Bounded FIFO store of experience tuples with uniform random sampling.

    Each stored item is expected to unpack into five fields, in the order
    (state, action, reward, done, next_state).
    """

    def __init__(self, capacity):
        # A deque with maxlen silently drops its oldest entry when full.
        self.exp_buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.exp_buffer)

    def append(self, exp):
        """Push one experience tuple onto the end of the buffer."""
        self.exp_buffer.append(exp)

    def clear(self):
        """Remove every stored experience."""
        self.exp_buffer.clear()

    def sample(self, batch_size):
        """Draw ``batch_size`` experiences at random and return them column-wise.

        Indices are drawn with numpy's default settings for ``np.random.choice``
        (uniform, with replacement), so the same experience may appear twice.

        Returns:
            Tuple of five numpy arrays: states, actions, rewards (float32),
            dones (uint8) and next_states.
        """
        picked = np.random.choice(len(self.exp_buffer), batch_size)
        rows = [self.exp_buffer[idx] for idx in picked]
        # Transpose the list of 5-tuples into five parallel columns.
        states, actions, rewards, dones, next_states = zip(*rows)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )

In [12]:
class Agent():
    """Epsilon-greedy actor that steps an environment and records transitions.

    The agent keeps the current observation in ``self.state`` and the running
    episode return in ``self.total_rewards``. Every call to :meth:`step`
    appends one (state, action, reward, done, next_state) tuple to ``buffer``.
    """

    def __init__(self, env, buffer):
        self.env = env
        self.buffer = buffer
        self._reset()

    def _reset(self):
        """Start a new episode on the agent's own environment."""
        # Use self.env (not a module-level global) so the agent works with
        # whichever environment it was constructed with.
        self.state = self.env.reset()
        self.total_rewards = 0.0

    def step(self, net, eps, device="cpu", frame_skip=4):
        """Choose one action, repeat it for up to ``frame_skip`` env steps, store the transition.

        Args:
            net: Callable mapping a state tensor to per-action values; only
                called when the greedy branch is taken.
            eps: Probability of picking a random action instead of the greedy one.
            device: Device the state tensor is moved to before calling ``net``.
            frame_skip: Number of consecutive environment steps the chosen
                action is repeated for (stops early when the episode ends).

        Returns:
            The total episode reward if the episode finished during this call,
            otherwise ``None``.
        """
        if np.random.random() < eps:
            action = self.env.action_space.sample()
        else:
            state_v = torch.tensor(self.state).to(device)
            # Action selection needs no gradients; skip building the graph.
            with torch.no_grad():
                action = int(torch.argmax(net(state_v)))

        state_prev = self.state
        step_reward = 0.0
        done = False
        for _ in range(max(1, frame_skip)):
            self.state, reward, done, info = self.env.step(action)
            # Accumulate over the repeated frames so the stored transition
            # carries the reward for the whole skip, not just its last frame.
            step_reward += reward
            if done:
                break
        self.total_rewards += step_reward

        self.buffer.append((state_prev, action, step_reward, done, self.state))

        done_reward = None
        if done:
            done_reward = self.total_rewards
            self._reset()
        return done_reward

In [13]:
# ---- Hyperparameters -------------------------------------------------------

# Training target
MEAN_GOAL_REWARD = 250        # training stops once the mean of recent episode rewards exceeds this
GAMMA = 0.99                  # discount applied to the target network's next-state value

# Exploration schedule: epsilon drops by 1/EPSILON_DECAY_OBS per agent step,
# starting at EPSILON_START and floored at EPSILON_FINAL.
EPSILON_START = 1
EPSILON_FINAL = 0.01
EPSILON_DECAY_OBS = 100_000

# Replay buffer
REPLAY_BUFFER_SIZE = 10_000   # maximum number of stored transitions
REPLAY_MIN_SIZE = 10_000      # no optimisation step is taken until the buffer holds this many

# Optimisation
BATCH_SIZE = 32
LEARNING_RATE = 0.0001
SYNC_TARGET_OBS = 1_000       # target network is refreshed every this many agent steps

In [14]:
def cal_loss(batch, net, tgt_net, device='cpu', gamma=None):
    """Compute the one-step TD mean-squared-error loss for a batch of transitions.

    Args:
        batch: Tuple ``(states, actions, rewards, dones, next_states)`` of
            array-likes with a shared leading batch dimension.
        net: Online network; gradients flow through its output.
        tgt_net: Target network; evaluated without gradient tracking.
        device: Device the batch tensors are moved to.
        gamma: Discount factor. When ``None`` the module-level ``GAMMA`` is used.

    Returns:
        Scalar tensor: MSE between ``net``'s value for the taken action and
        ``reward + gamma * max_a tgt_net(next_state)[a]``, where the second
        term is zeroed for transitions flagged as done.
    """
    if gamma is None:
        gamma = GAMMA

    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    # gather() requires int64 indices; numpy's default int can be int32 on some platforms.
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    # Boolean mask: indexing/masking with uint8 tensors is deprecated in PyTorch.
    dones_v = torch.tensor(dones, dtype=torch.bool).to(device)

    # Value of the action that was actually taken in each transition.
    Q_val = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The bootstrap target is a constant w.r.t. the optimised parameters.
    with torch.no_grad():
        Q_val_next = tgt_net(next_states_v).max(1)[0]
        # No future value after a terminal transition.
        Q_val_next = Q_val_next.masked_fill(dones_v, 0.0)

    expected_return = rewards_v + gamma * Q_val_next
    return nn.MSELoss()(Q_val, expected_return)

In [7]:
# ---- Training script -------------------------------------------------------
# Builds the online/target networks, replay buffer, agent and optimiser, then
# alternates one agent step with one optimisation step until the mean reward of
# the last 100 episodes exceeds MEAN_GOAL_REWARD.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

n_obs_features = env.observation_space.shape[0]
net = DQN(n_obs_features, env.action_space.n).to(device)
tgt_net = DQN(n_obs_features, env.action_space.n).to(device)
# Start both networks from identical weights instead of relying on the first
# periodic sync happening to coincide with the first optimisation step.
tgt_net.load_state_dict(net.state_dict())

buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE)

agent = Agent(env, buffer)
epsilon = EPSILON_START
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

# torch.save does not create missing directories, so make sure it exists.
CHECKPOINT_PATH = 'checkpoints/lunar_lander-best.dat'
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

total_rewards = []

ts = time.time()
best_mean_reward = None
last = None  # best mean reward at the time progress was last printed
obs_id = 0

while True:
    obs_id += 1
    # Linear decay from EPSILON_START, floored at EPSILON_FINAL.
    epsilon = max(EPSILON_FINAL, EPSILON_START - obs_id / EPSILON_DECAY_OBS)

    reward = agent.step(net, epsilon, device=device)
    if reward is not None:
        # An episode just finished: update the running statistics.
        total_rewards.append(reward)
        game_time = time.time() - ts
        ts = time.time()
        mean_reward = np.mean(total_rewards[-100:])

        if best_mean_reward is None or best_mean_reward < mean_reward:
            torch.save(net.state_dict(), CHECKPOINT_PATH)

            if best_mean_reward is None:
                last = mean_reward
            elif best_mean_reward - last > 10:
                # NOTE: the threshold compares the *previous* best against the
                # last printed value, so progress lines lag one improvement behind.
                last = best_mean_reward
                # The first field is the observation counter (obs_id), not an episode count.
                print("GAME : {}, TIME ECLAPSED : {}, EPSILON : {}, MEAN_REWARD : {}".format(obs_id, game_time, epsilon, mean_reward))
                print("Reward {} -> {} Model Saved".format(best_mean_reward, mean_reward))

            best_mean_reward = mean_reward

        if mean_reward > MEAN_GOAL_REWARD:
            print("SOLVED in {} obs".format(obs_id))
            break

    # Warm-up: collect experience only until the buffer is large enough.
    if len(buffer) < REPLAY_MIN_SIZE:
        continue

    if obs_id % SYNC_TARGET_OBS == 0:
        tgt_net.load_state_dict(net.state_dict())

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = cal_loss(batch, net, tgt_net, device=device)
    loss_t.backward()
    optimizer.step()

GAME : 91, TIME ECLAPSED : 0.0198667049407959, EPSILON : 0.99909, MEAN_REWARD : -236.57854038856965
Reward -272.3648135666385 -> -236.57854038856965 Model Saved
GAME : 113, TIME ECLAPSED : 0.016995668411254883, EPSILON : 0.99887, MEAN_REWARD : -227.97405359419668
Reward -236.57854038856965 -> -227.97405359419668 Model Saved
GAME : 143, TIME ECLAPSED : 0.01102590560913086, EPSILON : 0.99857, MEAN_REWARD : -186.95619094043488
Reward -207.1112979943246 -> -186.95619094043488 Model Saved
GAME : 159, TIME ECLAPSED : 0.012368440628051758, EPSILON : 0.99841, MEAN_REWARD : -172.4161077995954
Reward -186.95619094043488 -> -172.4161077995954 Model Saved
GAME : 25135, TIME ECLAPSED : 0.34514451026916504, EPSILON : 0.74865, MEAN_REWARD : -170.9960655080618
Reward -172.4161077995954 -> -170.9960655080618 Model Saved
GAME : 25751, TIME ECLAPSED : 0.1059412956237793, EPSILON : 0.74249, MEAN_REWARD : -161.6702902492795
Reward -161.99398269200546 -> -161.6702902492795 Model Saved
GAME : 26287, TIME ECL