In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [3]:
import copy
import math
import os
from collections import namedtuple

import gym
import ipywidgets as widgets
import matplotlib.pyplot as plt
import more_itertools as mitt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [12, 4]

In [4]:
env = gym.make('FetchReach-v1')
torch.cuda.current_device()

if torch.cuda.is_available():  
  device = "cuda:0" 
else:  
  device = "cpu"



In [5]:
Batch = namedtuple(
    'Batch', ('states', 'actions', 'rewards', 'next_states', 'dones')
)


class ReplayMemory:
    def __init__(self, max_size, state_size, action_size):
        self.max_size = max_size
        self.state_size = state_size
        self.states = torch.empty((max_size, state_size), device=device)
        self.actions = torch.empty((max_size, action_size), device=device)
        self.rewards = torch.empty((max_size, 1), device=device)
        self.next_states = torch.empty((max_size, state_size), device=device)
        self.dones = torch.empty((max_size, 1), dtype=torch.bool, device=device)
        self.idx = 0
        self.size = 0

    def add(self, state, action, reward, next_state, done):
        self.states[self.idx] = torch.tensor(state, device=device)
        self.actions[self.idx] = torch.tensor(action, device=device)
        self.rewards[self.idx] = torch.tensor(reward, device=device)
        self.next_states[self.idx] = torch.tensor(next_state, device=device)
        self.dones[self.idx] = torch.tensor(done, device=device)
        self.idx = (self.idx + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size) -> Batch:
        if self.size <= batch_size:
            sample_indices = np.random.choice(self.size, self.size, replace=False)
        else:
            sample_indices = np.random.choice(self.size, batch_size, replace=False)
        batch = Batch(
                        states = self.states[ sample_indices ,:],
                        actions = self.actions[ sample_indices ,:],
                        rewards = self.rewards[ sample_indices ,:],
                        next_states = self.next_states[ sample_indices ,:],
                        dones = self.dones[ sample_indices ,:]
                    )
        return batch

    def populate(self, env, num_steps):
        state = env.reset()
        state = np.concatenate((state["observation"], state["achieved_goal"], state["desired_goal"]), axis=0)
        for i in range(num_steps):
            action = env.action_space.sample()
            next_state, reward, done, info = env.step(action)
            next_state = np.concatenate((next_state["observation"], next_state["achieved_goal"], next_state["desired_goal"]), axis=0)
            self.add(state, action, reward, next_state, done)
            if i != 0 and i%10000 == 0:
                print(i)
            if done:
                if not('TimeLimit.truncated' in info):
                    print(next_state, reward, done, info, i)
                state = env.reset()
            state = next_state

In [6]:
class Actor(nn.Module):
    
    def __init__(self, input_size, output_size, hidden_layers, units=256):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_layers = hidden_layers
        self.units = units
        
        self.layers = nn.ModuleList([nn.Linear(self.input_size, self.units)])
        self.layers.extend([ nn.Linear(self.units, self.units) for i in range(1, self.hidden_layers) ])
        self.layers.append(nn.Linear(self.units, self.output_size))
    
    def forward(self, states):
        vals = states
        for layer_index in range(len(self.layers) - 1):
            vals = F.relu(self.layers[layer_index](vals))
        vals = torch.tanh(self.layers[layer_index + 1](vals))
        return vals
        

In [7]:
class Critic(nn.Module):
    
    def __init__(self, input_size, output_size, hidden_layers, units=256):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_layers = hidden_layers
        self.units = units
        
        self.layers = nn.ModuleList([nn.Linear(self.input_size, self.units)])
        self.layers.extend([ nn.Linear(self.units, self.units) for i in range(1, self.hidden_layers) ])
        self.layers.append(nn.Linear(self.units, self.output_size))
    
    def forward(self, states, actions):
        vals = torch.cat([states, actions], 1)
        for layer_index in range(len(self.layers) - 1):
            vals = F.relu(self.layers[layer_index](vals))
        vals = self.layers[layer_index + 1](vals)
        return vals

In [8]:
class OUNoise(object):
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu           = mu
        self.theta        = theta
        self.sigma        = max_sigma
        self.max_sigma    = max_sigma
        self.min_sigma    = min_sigma
        self.decay_period = decay_period
        self.action_dim   = action_space.shape[0]
        self.low          = action_space.low
        self.high         = action_space.high
        self.reset()
        
    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu
        
    def evolve_state(self):
        x  = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state
    
    def get_noisy_action(self, action, t=0): 
        ou_state = self.evolve_state()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + ou_state, self.low, self.high)

In [24]:
class DDPG:
    
    def __init__(self, state_size, action_size, actor_learning_rate=0.0001, critic_learning_rate=0.0001, gamma=0.99, tau=0.005):
        self.state_size = state_size
        self.action_size = action_size
        self.actor_lr = actor_learning_rate
        self.critic_lr = critic_learning_rate
        self.gamma = gamma
        self.tau = tau
        
        self.actor = Actor(self.state_size, self.action_size, 5)
        self.target_actor = Actor(self.state_size, self.action_size, 5)
        
        self.critic = Critic(self.state_size + self.action_size, self.action_size, 5)
        self.target_critic = Critic(self.state_size + self.action_size, self.action_size, 5)
    
        self.actor.to(device)
        self.target_actor.to(device)
        self.critic.to(device)
        self.target_critic.to(device)
        self.update_weights(1)
        
        self.critic_loss_method  = nn.MSELoss()
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.critic_lr)
        # print(next(self.actor.parameters()).is_cuda)
    
    def update_weights(self, tau):
        for target_weights, weights in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_weights.data.copy_(weights.data * tau + target_weights.data * (1.0 - tau))
        
        for target_weights, weights in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_weights.data.copy_(weights.data * tau + target_weights.data * (1.0 - tau))
            
    def get_actions(self, states):
        return self.actor.forward(states).detach()

    def train_batch(self, states, actions, rewards, next_states, dones):
        #print(dones)
        
        Q_vals = self.critic(states, actions)
        next_actions = self.target_actor(next_states)
        next_actions = next_actions.detach()
        Q_dash = self.target_critic(next_states, next_actions)
        dones = torch.logical_not(dones).float()
        Q_dash = rewards + self.gamma * Q_dash * dones
        self.critic_optimizer.zero_grad()
        critic_loss = self.critic_loss_method(Q_vals, Q_dash)
        critic_loss.backward()
        self.critic_optimizer.step()
        
        self.actor_optimizer.zero_grad()
        actor_loss = - self.critic(states, self.actor.forward(states)).mean()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.update_weights(self.tau)
    
    def load_model(self, model_path):
        self.target_actor.load_state_dict(torch.load(model_path))
        self.target_actor.eval()
        
        
        

In [26]:
TIMESTEPS = 15_00_000
BATCH_SIZE = 64
state_dim = 16
action_dim = 4
memory_size = 1_00_000
replay_memory = ReplayMemory(memory_size, state_dim, action_dim)
replay_memory.populate(env, 50000)
saved_models = {}
count = 0
ou = OUNoise(env.action_space)
ddpg = DDPG(state_dim, action_dim)

state = env.reset()
state = np.concatenate((state["observation"], state["achieved_goal"], state["desired_goal"]), axis=0)
pbar = tqdm.tnrange(TIMESTEPS, ncols='100%')
for t_total in pbar:
    cuda_state = torch.tensor([state], device=device, dtype=torch.float)
    action = ddpg.get_actions(cuda_state).cpu()
    action = ou.get_noisy_action(action[0].numpy())
    next_state, reward, done, info = env.step(action)
    next_state = np.concatenate((next_state["observation"], next_state["achieved_goal"], next_state["desired_goal"]), axis=0)
    replay_memory.add(state, action, reward, next_state, done)
    if t_total != 0 and t_total % 4 == 0:
        experience_batch = replay_memory.sample(BATCH_SIZE)
        ddpg.train_batch(experience_batch.states, experience_batch.actions, experience_batch.rewards, experience_batch.next_states, experience_batch.dones)
    
    if t_total != 0 and t_total % 1_00_000 == 0:
        torch.save(ddpg.target_actor.state_dict(), "mc_" + str(count))
        count += 1
        print("saved")
            
    if done:
        state = env.reset()
        state = np.concatenate((state["observation"], state["achieved_goal"], state["desired_goal"]), axis=0)
    else:
        state = next_state

torch.save(ddpg.target_actor.state_dict(), "mc_" + str(count))
        

HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=1500000), HTML(value='')), layout=Layout(disp…

KeyboardInterrupt: 

In [44]:
import random

In [118]:
# DDPG + HER

EPISODES = 1
BATCH_SIZE = 64
state_dim = 16
action_dim = 4
memory_size = 1_00
replay_memory = ReplayMemory(memory_size, state_dim, action_dim)
replay_memory.populate(env, 50)
saved_models = {}
count = 0
ou = OUNoise(env.action_space)
ddpg = DDPG(state_dim, action_dim)


pbar = tqdm.tnrange(EPISODES, ncols='100%')
for t_total in pbar:
    
    # episode begins here
    transitions = []
    achieved_goals = []
    raw_state = env.reset()
    state = np.concatenate((raw_state["observation"], raw_state["achieved_goal"], raw_state["desired_goal"]), axis=0)
    while True:
        cuda_state = torch.tensor([state], device=device, dtype=torch.float)
        action = ddpg.get_actions(cuda_state).cpu()
        action = ou.get_noisy_action(action[0].numpy())
        raw_next_state, reward, done, info = env.step(action)
        next_state = np.concatenate((raw_next_state["observation"], raw_next_state["achieved_goal"], raw_next_state["desired_goal"]), axis=0)
        transitions.append({"state": raw_state, "action": action, "reward": reward, "next_state": raw_next_state, "done": done})
        achieved_goals.append(raw_state["achieved_goal"])
        if done:
            break
        state = next_state
        raw_state = raw_next_state
        
    for transition in transitions:
        #replay_memory.add(transition["state"], transition["action"], transition["reward"], transition["next_state"], transition["done"])
        #print(sample_goals)
        sample_goals = random.sample(achieved_goals, 10)
        for sample_goal in sample_goals:
            #print(sample_goal)
            #print(transition)
            transition["state"]["desired_goal"] = sample_goal
            transition["next_state"]["desired_goal"] = sample_goal
            if( np.array_equal(transition["state"]["desired_goal"], transition["state"]["achieved_goal"]) ):
                print(transition)
                print("*******")
            #print('-'*20)
            #print(transition)
            
            
            
        



# state = env.reset()
# state = np.concatenate((state["observation"], state["achieved_goal"], state["desired_goal"]), axis=0)
# pbar = tqdm.tnrange(TIMESTEPS, ncols='100%')
# for t_total in pbar:
#     cuda_state = torch.tensor([state], device=device, dtype=torch.float)
#     action = ddpg.get_actions(cuda_state).cpu()
#     action = ou.get_noisy_action(action[0].numpy())
#     next_state, reward, done, info = env.step(action)
#     next_state = np.concatenate((next_state["observation"], next_state["achieved_goal"], next_state["desired_goal"]), axis=0)
#     replay_memory.add(state, action, reward, next_state, done)
#     if t_total != 0 and t_total % 4 == 0:
#         experience_batch = replay_memory.sample(BATCH_SIZE)
#         ddpg.train_batch(experience_batch.states, experience_batch.actions, experience_batch.rewards, experience_batch.next_states, experience_batch.dones)
    
#     if t_total != 0 and t_total % 1_00_000 == 0:
#         torch.save(ddpg.target_actor.state_dict(), "mc_" + str(count))
#         count += 1
#         print("saved")
            
#     if done:
#         state = env.reset()
#         state = np.concatenate((state["observation"], state["achieved_goal"], state["desired_goal"]), axis=0)
#     else:
#         state = next_state

# torch.save(ddpg.target_actor.state_dict(), "mc_" + str(count))


HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=1), HTML(value='')), layout=Layout(display='i…

*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
*******
{'state': {'observation': array([1.43554725e+00, 8.40139887e-01, 4.36115517e-01, 0.00000000e+00,
       0.00000000e+00, 1.20485296e-02, 2.11136234e-02, 2.22120075e-03,
       4.34532739e-04, 1.12561556e-04]), 'achieved_goal': array([1.43554725, 0.84013989, 0.43611552]), 'desired_goal': array([1.43554725, 0.84013989, 0.43611552])}, 'action': array([0.42742689, 0.96687077, 0.41928199, 0.80452785]), 'reward': -1.0, 'next_state': {'observation': array([1.44859844e+00, 8.72084490e-01, 4.49256573e-01, 0.00000000e+00,
       0.00

In [14]:
replay_memory = ReplayMemory(1000000, 16, 4)
replay_memory.populate(env, 1000000)

10000


KeyboardInterrupt: 

In [10]:
# ddpg = DDPG(16, 4)
# ddpg.load_model("fetchreach_9")
# print(ddpg.load_model)





In [11]:
state = env.reset()
for _ in range(1000):
    env.render()
    state = np.concatenate((state["observation"], state["achieved_goal"], state["desired_goal"]), axis=0)
    cuda_state = torch.tensor([state], device=device, dtype=torch.float)
    action = ddpg.target_actor(cuda_state)
    action = action.cpu()
    #print(action.detach().numpy()[0])
    next_state, reward, done, info = env.step(action.detach().numpy()[0])
    state = np.concatenate((next_state["observation"], next_state["achieved_goal"], next_state["desired_goal"]), axis=0)
    state = next_state
    if done:
        print(next_state, reward, done, info)
        break
        
env.close()

Creating window glfw
{'observation': array([ 4.76456721e-01,  1.03230452e+00,  1.78120232e-01,  0.00000000e+00,
        0.00000000e+00, -2.14947340e-03, -1.32529089e-03,  1.21152092e-04,
        7.30808668e-08,  5.92609964e-05]), 'achieved_goal': array([0.47645672, 1.03230452, 0.17812023]), 'desired_goal': array([1.43235837, 0.65064789, 0.46270819])} -1.0 True {'is_success': 0.0, 'TimeLimit.truncated': True}


In [13]:
env.close()

In [15]:
torch.cuda.get_device_properties(0).total_memory

8589934592

In [20]:
x = torch.randn(25000, 25000).cuda()

In [21]:
x.device

device(type='cuda', index=0)

In [35]:
r = ReplayMemory(100, 16, 4)
r.populate(env, 10)

sam_state = r.sample(4).states[1].cpu().numpy()

In [37]:
print(sam_state)
print(sam_state[-3:])
sam_state[-3:] = [1, 2, 3]

[1.3490719e+00 7.5555569e-01 5.3789562e-01 0.0000000e+00 0.0000000e+00
 8.2962792e-03 3.7673535e-03 2.8055542e-06 7.5175289e-05 7.1640839e-05
 1.3490719e+00 7.5555569e-01 5.3789562e-01 1.3344340e+00 7.6176006e-01
 4.4385520e-01]
[1.334434   0.76176006 0.4438552 ]


In [38]:
sam_state

array([1.3490719e+00, 7.5555569e-01, 5.3789562e-01, 0.0000000e+00,
       0.0000000e+00, 8.2962792e-03, 3.7673535e-03, 2.8055542e-06,
       7.5175289e-05, 7.1640839e-05, 1.3490719e+00, 7.5555569e-01,
       5.3789562e-01, 1.0000000e+00, 2.0000000e+00, 3.0000000e+00],
      dtype=float32)