In [1]:
import gym
from ReplayBuffer import ReplayBuffer as ReplayBuffer
from AgentDQN import AgentDQN as AgentDQN

import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import os
from IPython.display import clear_output, display

from collections import deque

In [2]:
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count


# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display



In [3]:
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MODE_E = 'EXPONENTIAL'
MODE_L = 'LINEAR'
MODE_R = 'RBED_b128_oc8_'


MODE = MODE_R

writer = SummaryWriter(comment=MODE)
tag_reward = "reward"
tag_loss = "loss"
tag_ep = "epsilon"

discrete_control = ['CartPole-v0', 'MountainCar-v0']
continuous_control = ['','']
print('hello world')
env = gym.make(discrete_control[0])


scores = deque(maxlen=100)  # because solution depends on last 100 solution.
scores.append(int(0))


hello world


In [4]:
BATCH_SIZE = 128
GAMMA      = 0.99 # discount factor
EPSILON    = 1
EPSILON_DECAY    = 0.999
LEARN_RATE = 0.01

UPDATE_TARGET_COUNT = 1

STATE_N  = 4
ACTION_N = env.action_space.n

OPTIMIZE_COUNT = 8

NUM_EPISODES = 4096

In [5]:

qvfa = AgentDQN(STATE_N, ACTION_N).double().to(device)
q_target = AgentDQN(STATE_N, ACTION_N).double().to(device)
optimizer = optim.Adam(qvfa.parameters(), lr = LEARN_RATE)

criterion = nn.MSELoss()
buffer = ReplayBuffer(10000000)

In [6]:
def select_action(state, ep=0):
    sample = random.random()
    state = torch.from_numpy(state).to(device)
    if sample < ep:
        return env.action_space.sample()
    else:
        with torch.no_grad():
            qvfa.eval()
            op = qvfa(state)
            values, indices = op.max(0)
            return indices.item()


In [7]:
def optimize_model(i_episode=0):
    if buffer.__len__() < BATCH_SIZE:
        print("optimizing model Not enough samples in buffer : ", buffer.__len__())
        return

    transitions = buffer.sample(min(BATCH_SIZE, buffer.__len__()))

    state_batch = transitions[buffer.header[0]].values
    state_batch = torch.from_numpy(np.stack(state_batch, axis=0)).to(device)

    action_batch = torch.tensor(transitions[buffer.header[1]].values.tolist()).view(-1, 1).to(device)

    next_state_batch = transitions[buffer.header[2]].values
    next_state_batch = torch.from_numpy(np.stack(next_state_batch, axis=0)).to(device)

    reward_batch = torch.tensor(transitions[buffer.header[3]].values.tolist()).view(-1, 1).to(device)

    done_batch = torch.tensor(transitions[buffer.header[4]].values.tolist()).view(-1, 1).to(device)
    qvfa.train()
    qsa = qvfa(state_batch).gather(1, action_batch)

    with torch.no_grad():
        qvfa.eval()
        q_target.eval()
        next_state_action_values = q_target(next_state_batch)
        max_next_state_values, _indices = next_state_action_values.max(dim=1)
        max_next_state_values = max_next_state_values.view(-1, 1)
        next_state_values = ((max_next_state_values * GAMMA).float() + reward_batch).float() * (1 - done_batch).float()
        target = next_state_values.double()
        qvfa.train()

    # 𝛿=𝑄(𝑠,𝑎)−(𝑟+𝛾max𝑎𝑄(𝑠′,𝑎))
    optimizer.zero_grad()
    loss = criterion(qsa, target)
    loss.backward()
    # for param in qvfa.parameters():param.grad.data.clamp_(-1, 1)
    optimizer.step()
    writer.add_scalar(tag_loss, loss.item(), i_episode)


In [8]:
def is_solved(value, threshold=195):    
    
    scores.append(int(value))
    score = sum(scores)/100
    
    if score >= threshold:
        print("SOLVED")
        return True
        
    return False
    
    

In [9]:
def exponential_decay (eps=EPSILON, decay_rate=EPSILON_DECAY):
    # need to decide decay_rate.
    eps *= decay_rate
    return  eps


In [10]:
def linear_decay( episode, decay_till=1024 , start_val = 1.0, end_val = 0.01):
    e = (((end_val - start_val)/decay_till)*episode) + start_val
    return max(e, end_val)

In [11]:
REWARD_THRESHOLD = 0
def rbed(current_reward, eps=EPSILON, highest_ep=1.0, stop_red=0.1, lowest_ep=0.01, target_reward=200, target_increment=1):
        
    global REWARD_THRESHOLD
    
    
#     if eps < lowest_ep:
#         return eps
    
#     if eps < stop_red :
#         eps *= decay_rate
#         return rps
    
    steps_to_move_in = target_reward
    #steps_to_move_in = steps_to_move
    quanta = (highest_ep - lowest_ep)/steps_to_move_in   
    
    
    #REWARD_THRESHOLD = min(REWARD_THRESHOLD, 200)
    
    if current_reward > REWARD_THRESHOLD :
        REWARD_THRESHOLD += target_increment
        eps -= quanta
        
    return max(eps, lowest_ep)


In [12]:
REWARD_THRESHOLD = 0
def red1(current_reward, eps=EPSILON, highest_ep=1.0, lowest_ep=0.01, target_reward=200, target_increment=1, steps_to_move=256):
    
    steps_to_move_in = target_increment
    # steps_to_move_in = steps_to_move
    
    quanta = (highest_ep - lowest_ep)/steps_to_move_in
        
    return max(eps, lowest_ep)


In [13]:
def decay_epsilon(episode, current_reward=0, eps=EPSILON):
    
    if MODE == MODE_L:
        eps = linear_decay(episode)
    
    elif MODE == MODE_E:
        eps = exponential_decay(eps, EPSILON_DECAY)
    
    elif MODE == MODE_R:
        eps = rbed(current_reward=current_reward, eps=eps)
    
    return eps

In [14]:
for i_episode in range(NUM_EPISODES):
    state = env.reset()
    done = False
    total_reward = 0
    
    #1. Avoid moving targets
    if i_episode % UPDATE_TARGET_COUNT == 0:
        q_target.load_state_dict(qvfa.state_dict())
        
        
    while not done:        
        env.render(mode='rgb_array')
        action = select_action(state, ep=EPSILON)
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        doneV = 0
        if done:
            reward = -reward
            doneV = 1 
        
        buffer.insert(state, action, next_state, reward, doneV)
        state = next_state

    writer.add_scalar(tag_reward, total_reward, i_episode)
    writer.add_scalar(tag_ep, EPSILON, i_episode)
    clear_output()
    print('episode number : ',i_episode, '\te : ', EPSILON, '\treward : ', total_reward)
    
    if is_solved(total_reward):
        print('solved at episode ', i_episode)
        break
    
    for _ in range(OPTIMIZE_COUNT):
        optimize_model(i_episode)

    EPSILON = decay_epsilon(i_episode, total_reward, EPSILON)

print('Complete')
env.render()
env.close()
plt.ioff()
plt.show()

episode number :  1101 	e :  0.01 	reward :  128.0
SOLVED
solved at episode  1101
Complete


In [15]:
env.render()
env.close()
plt.ioff()
plt.show()