In [1]:
import numpy as np
import torch
from torch import nn

import copy
import time
import matplotlib.pyplot as plt

import seaborn as sns

%matplotlib inline

In [20]:
N_FRAMES = 4

def filter(obs):
    m = np.mean(obs)
    v = np.var(obs)
    
    obs = (obs - m)/v   
    return obs

def get_stacked_obs(obs, prev_frames):
    if not prev_frames:
        prev_frames = [obs] * (N_FRAMES - 1)
        
    prev_frames.append(obs)

    stacked_frames = np.stack(prev_frames)
    prev_frames = prev_frames[-(N_FRAMES-1):]
    
    return stacked_frames, prev_frames

def preprocess_obs(obs, prev_frames):
    filtered_obs = filter(obs)
    stacked_obs, prev_frames = get_stacked_obs(filtered_obs, prev_frames)
    return stacked_obs, prev_frames

In [39]:
class ExperienceReplay():
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []
        
    def add_step(self, step_data):
        self.data.append(step_data)
        if len(self.data) > self.capacity:
            self.data = self.data[-self.capacity:]
            
    def sample(self, n):
        n = min(n, len(self.data))
        indices = np.random.choice(range(len(self.data)), n, replace=False)
        samples = np.asarray(self.data, dtype="object")[indices]

        #input = torch.tensor(np.concatenate((samples[:, 0][0],list(samples[:, 1])*4),axis=1)).float() 
      
        state_data = torch.tensor(np.stack(samples[:, 0])).float()
        act_data = torch.tensor(np.stack(samples[:, 1])).float()
        reward_data = torch.tensor(np.stack(samples[:, 2])).float()
        next_state_data = torch.tensor(np.stack(samples[:, 3])).float()
        terminal_data = torch.tensor(np.stack(samples[:, 4])).int()
        
        return state_data, act_data, reward_data, next_state_data, terminal_data

In [24]:
class DQN(nn.Module):
    def __init__(self, len_state, len_action):
        
        super(DQN, self).__init__()
        
        self.layer1 = nn.Sequential(
            nn.Linear(len_state+len_action,64),
            nn.ReLU())
        self.layer2 = nn.Sequential(
            nn.Linear(64,64),
            nn.ReLU())
        self.layer3 = nn.Sequential(
            nn.Linear(64,1),
            nn.ReLU())
        
    def forward(self, obs):
        q_values = self.layer1(obs)
        q_values = self.layer2(q_values)
        q_values = self.layer3(q_values)
        
        return q_values
    
    def train_on_batch(self, target_model, optimizer,input, obs, acts, rewards, next_obs, terminals, gamma=0.99):
        pass

# DQN 

In [27]:
from environment import Environment
from scipy.stats import qmc

In [29]:
X = np.linspace(-3,3,90).reshape(-1,3)
y = np.sum(np.sin(X),axis=1)

def f(x):
    s = np.sin(x).shape
    if len(s)>1:
        return np.sum(np.sin(x),axis=1)
    else:
        return [np.sum(np.sin(x))]

l_bounds = [-3,-3,-3]
u_bounds = [3,3,3]

env = Environment(X = X,y = y,l_bounds = l_bounds,u_bounds = u_bounds, func = f, model = 'NN',model_param = {'d':3,'nb_nodes':40,'nb_layers':4})

state = env.Reset()

<model.NN.NN_model object at 0x28d42b0e0>


In [43]:
n_episodes = 100
max_steps = 100
er_capacity = 15 # 1m in paper

train_batch_size = 32
learning_rate = 2.5e-4
update_freq = 4
print_freq = 5
frame_skip = 3
n_anneal_steps = 1e2 
target_update_delay = 10 
epsilon = lambda step: np.clip(1 - 0.9 * (step/n_anneal_steps), 0.1, 1)

In [47]:
er = ExperienceReplay(er_capacity)
model = DQN(len(state),X.shape[1])

target_model = copy.deepcopy(model)
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate, eps=1e-6)
all_rewards = []
global_step = 0

for episode in range(n_episodes):
    prev_frames = []
    obs, prev_frames = preprocess_obs(env.Reset(), prev_frames)
    
    episode_reward = 0
    step = 0
    while step < max_steps:

        ### Epsilon - greedy ###
        if np.random.rand() < epsilon(global_step):
            sampler = qmc.LatinHypercube(d=X.shape[1])
            sample = sampler.random(n=1)
            sample_scaled = qmc.scale(sample, l_bounds, u_bounds)
            act = np.array(sample_scaled)[0]
            print('random')
           
        else:   
            sampler = qmc.LatinHypercube(d=X.shape[1])
            sample = sampler.random(n=5)
            sample_scaled = qmc.scale(sample, l_bounds, u_bounds)
            acts = np.array(sample_scaled)
            q_values = []
            for a in acts:
                input = torch.tensor(np.concatenate((obs,[a]*4),axis=1)).float() 
                q_value = model(input)[0]
                q_value = q_value.detach().numpy()
                q_values.append(q_value)
                
            act = acts[np.argmax(q_values)]
            print('not random')
        
        cumulative_reward = 0
        for _ in range(frame_skip):
            next_obs, reward, done, _ = env.Step(act)
            cumulative_reward += reward
            current_reward = reward
            if done or step >= max_steps:
                break
        #episode_reward += cumulative_reward
        #reward = format_reward(cumulative_reward)
        reward = current_reward

        next_obs, prev_frames = preprocess_obs(next_obs, prev_frames)
        er.add_step([obs, act, reward, next_obs, int(done)])
        obs = next_obs
        
        ### Train on a minibatch ###
        
        if global_step % update_freq == 0:
            obs_data, act_data, reward_data, next_obs_data, terminal_data = er.sample(train_batch_size)
            #print(obs_data, act_data)
            model.train_on_batch(target_model, optimizer, input, obs_data, act_data,
                                 reward_data, next_obs_data, terminal_data)
        
        ### Update target network ###
        
        if global_step and global_step % target_update_delay == 0:
            target_model = copy.deepcopy(model)
        
        ### Finish the step ###
        
        step += 1
        global_step += 1
        
        if done:
            break

    all_rewards.append(episode_reward)
    
    if episode % print_freq == 0:
        print('Episode #{} | Step #{} | Epsilon {:.2f} | Avg. Reward {:.2f}'.format(
            episode, global_step, epsilon(global_step), np.mean(all_rewards[-print_freq:])))

random
Episode #0 | Step #1 | Epsilon 0.99 | Avg. Reward 0.00
random
random
random
random
random
Episode #5 | Step #6 | Epsilon 0.95 | Avg. Reward 0.00
random
random
random
random
random
Episode #10 | Step #11 | Epsilon 0.90 | Avg. Reward 0.00
random
random
random
random
random
Episode #15 | Step #16 | Epsilon 0.86 | Avg. Reward 0.00
random
random
random
random
random
Episode #20 | Step #21 | Epsilon 0.81 | Avg. Reward 0.00
random
not random
random
random
random
Episode #25 | Step #26 | Epsilon 0.77 | Avg. Reward 0.00
random
random
random
random
random
Episode #30 | Step #31 | Epsilon 0.72 | Avg. Reward 0.00
random
random
random
random
random
random
Episode #35 | Step #37 | Epsilon 0.67 | Avg. Reward 0.00
random
random
random
not random
random
Episode #40 | Step #42 | Epsilon 0.62 | Avg. Reward 0.00
not random
random
random
not random
random
not random
Episode #45 | Step #48 | Epsilon 0.57 | Avg. Reward 0.00
random
not random
not random
random
random
Episode #50 | Step #53 | Epsilon 0.

KeyboardInterrupt: 