In [1]:
#%%
#IPython
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm

# numpy
import numpy as np
from PIL import Image

# gym
import gym

# torch stuff
import torch
import torch.nn as nn
from torchvision.transforms import Compose, ToTensor, ToPILImage
from torch.utils.data import DataLoader, WeightedRandomSampler, random_split
from torchsummary import summary
import torch.nn.functional as F

# our stuff
import importlib
from model.mdnrnn import MDNRNN
from model.vaelin import VAELin

from constants import *

# cma
from cma import CMAEvolutionStrategy as CMAES

In [2]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
# Instantiate the MountainCarContinuous-v0 Gym environment; get_loss reads this
# module-level `env` when it resets and steps the simulation.
env = gym.make("MountainCarContinuous-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [None]:
# Load the pretrained VAE and MDN-RNN weights from their checkpoints.
vae = VAELin(z_size=LATENT_SIZE, device=device).to(device)
vae.load_state_dict(torch.load(VAE_PATH, map_location=device)['model_state_dict'])
# The networks are only used for inference in this notebook, so put them in
# evaluation mode: any dropout / batch-norm layers would otherwise stay in
# training mode and make the fitness evaluations noisy.
vae.eval()

# NOTE(review): unlike the VAE, the RNN is not moved to `device`; confirm
# whether that is intended when running on CUDA.
rnn = MDNRNN(
    sequence_length=500,
    hidden_space_dim=32,
    action_space_dim=1,
    latent_space_dim=LATENT_SIZE,
    num_mixtures=10,
    rnn_type="lstm",
    n_layers=5
)
rnn.load_state_dict(torch.load(RNN_PATH, map_location=device)['model_state_dict'])
rnn.eval()

#%%

In [None]:
def to_latent(vae, frame):
    """Encode a single frame with the VAE and return its latent code.

    Args:
        vae: callable model; element 3 of its output is taken as the latent.
        frame: tensor holding one frame, without a batch dimension.

    Returns:
        The latent as a tensor detached from autograd, with every size-1
        dimension squeezed out.
    """
    batch = frame.unsqueeze(0)  # add the batch dimension the model is called with
    # Inference only: skip autograd bookkeeping instead of building a graph
    # that is thrown away by the detach right afterwards.
    with torch.no_grad():
        latent = vae(batch)[3]
    return latent.detach().squeeze()

def get_action(latent, solution, hidden=None):
    """Linear controller on the latent code and the flattened RNN state.

    Args:
        latent: 1-D torch tensor with the latent code of the current frame.
        solution: parameter vector; all entries but the last are weights,
            the last entry is the bias.
        hidden: optional pair of tensors (indexed as hidden[0], hidden[1])
            holding the recurrent state. When omitted, zeros are used and
            their count is derived from the length of ``solution``.

    Returns:
        tanh(weights . [latent, hidden] + bias).
    """
    weights, bias = solution[:-1], solution[-1]
    latent_np = latent.cpu().numpy()

    if hidden is not None:
        # Join both state tensors on the feature axis, drop axis 1 and flatten.
        joined = torch.cat([hidden[0], hidden[1]], -1)
        hidden_np = joined.float().detach().squeeze(1).view(-1).cpu().numpy()
    else:
        # Whatever is not latent or bias must belong to the hidden features.
        hidden_np = np.zeros(len(solution) - len(latent_np) - 1)

    features = np.hstack([latent_np, hidden_np])
    return np.tanh(weights @ features + bias)

def get_loss(solution):
    """Negated shaped return of the VAE/RNN-based linear controller.

    Runs 10 episodes of at most NUM_STEPS steps on the module-level ``env``.
    Each rendered frame is encoded by ``vae``, the action is computed by
    ``get_action`` from the latent and the previous RNN hidden state, and the
    RNN is then advanced with that latent/action pair.

    Reward shaping: every time ``state[0]`` exceeds the best value seen so
    far, 10 is added to the step reward. The running best starts at -0.4 and
    is shared by all episodes of one call.

    Args:
        solution: controller parameter vector (weights followed by a bias).

    Returns:
        Minus the summed (shaped) reward, so that a minimiser maximises it.
    """
    max_pos = -0.4
    to_tensor = ToTensor()  # loop-invariant, build the transform once

    rewards = 0
    for ep in range(10):
        state = env.reset()
        old_rnn_hidden = None
        for step in range(NUM_STEPS):
            im = env.render(mode='rgb_array')
            # NOTE(review): PIL's resize expects (width, height); the order
            # only matters if HEIGHT != WIDTH. Confirm against the
            # preprocessing used when the VAE was trained.
            im = Image.fromarray(im).resize((HEIGHT, WIDTH), Image.BILINEAR)
            frame = to_tensor(im)
            latent = to_latent(vae, frame)
            action = get_action(latent, solution, old_rnn_hidden)
            # The hidden state is fed back on every step; without no_grad the
            # autograd graph would keep growing for the whole episode even
            # though nothing is ever back-propagated here.
            with torch.no_grad():
                _, _, _, _, rnn_hidden = rnn(
                    latent.unsqueeze(0).unsqueeze(0),
                    torch.tensor([[[action]]]).float(),
                    old_rnn_hidden,
                )
            old_rnn_hidden = rnn_hidden
            state, reward, done, _ = env.step([action])
            if state[0] > max_pos:
                max_pos = state[0]
                reward += 10
            rewards += reward
            if done:
                break
    loss = -rewards
    return loss

In [4]:
def to_latent(vae, frame):
    """Encode a single frame with the VAE and return its latent code.

    Args:
        vae: callable model; element 3 of its output is taken as the latent.
        frame: tensor holding one frame, without a batch dimension.

    Returns:
        The latent as a tensor detached from autograd, with every size-1
        dimension squeezed out.
    """
    batch = frame.unsqueeze(0)  # add the batch dimension the model is called with
    # Inference only: skip autograd bookkeeping instead of building a graph
    # that is thrown away by the detach right afterwards.
    with torch.no_grad():
        latent = vae(batch)[3]
    return latent.detach().squeeze()

def get_action(state, solution):
    """Linear policy on the raw environment state, squashed with tanh.

    Args:
        state: observation vector.
        solution: parameter vector; all entries but the last are weights,
            the last entry is the bias.

    Returns:
        tanh(weights . state + bias).
    """
    weights, bias = solution[:-1], solution[-1]
    return np.tanh(weights @ state + bias)

def get_loss(solution):
    """Negated shaped return of the raw-state linear controller.

    Plays 10 episodes of at most NUM_STEPS steps on the module-level ``env``,
    choosing actions with ``get_action``. Whenever ``state[0]`` beats the
    best value seen so far (starting at -0.4 and shared by all episodes of
    this call), 10 is added to that step's reward.

    Args:
        solution: controller parameter vector (weights followed by a bias).

    Returns:
        Minus the accumulated (shaped) reward.
    """
    best_position = -0.4
    total_reward = 0

    for _episode in range(10):
        observation = env.reset()
        for _step in range(NUM_STEPS):
            chosen = get_action(observation, solution)
            observation, step_reward, finished, _ = env.step([chosen])
            # Bonus for reaching a new best position.
            if observation[0] > best_position:
                best_position = observation[0]
                step_reward += 10
            total_reward += step_reward
            if finished:
                break

    return -total_reward

In [13]:
env = gym.make("MountainCarContinuous-v0")

# Controller size.
#   * VAE/RNN controller variant: LATENT_SIZE + 32*2*5 + 1 parameters.
#   * Raw-state controller (used here): one weight per observation component
#     plus one bias, matching get_action's `solution[:-1]` / `solution[-1]`.
obs_dim = env.observation_space.shape[0]
param_size = obs_dim + 1
init_params = param_size * [0]
init_sigma = 1
popsize = 64
es = CMAES(init_params, init_sigma, {'popsize': popsize})
#%%
# Always release the environment, even when the (long-running) optimisation
# is stopped with a KeyboardInterrupt; `es` keeps its state either way.
try:
    es.optimize(get_loss)
finally:
    env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
(32_w,64)-aCMA-ES (mu_w=17.6,w_1=11%) in dimension 3 (seed=636534, Tue Dec 11 05:04:45 2018)
Iterat #Fevals   function value  axis ratio  sigma  min&max std  t[m:s]
    1     64 -1.223462597269289e+03 1.0e+00 1.50e+00  1e+00  2e+00 0:06.2
    2    128 -1.591305553888262e+03 1.9e+00 2.11e+00  1e+00  3e+00 0:12.4
    3    192 -2.297723572474240e+03 2.7e+00 2.65e+00  1e+00  3e+00 0:19.0
    4    256 -2.142965347004530e+03 2.9e+00 3.22e+00  1e+00  3e+00 0:25.0
    5    320 -2.115781790660407e+03 3.6e+00 3.85e+00  1e+00  3e+00 0:30.9
    6    384 -2.144215879567569e+03 5.6e+00 3.82e+00  8e-01  3e+00 0:36.1
    8    512 -2.533046148086903e+03 1.0e+01 3.63e+00  5e-01  2e+00 0:45.5
   10    640 -2.560047129762479e+03 1.4e+01 3.44e+00  4e-01  2e+00 0:54.0
   13    832 -2.45658916755

KeyboardInterrupt: 

In [14]:
# Weights of the best solution evaluated so far. es.ask() would instead draw
# a fresh random candidate from the current search distribution.
w = es.result.xbest[:-1]

In [15]:
# Bias of the best solution evaluated so far. A separate es.ask() call would
# sample a new random candidate, unrelated to any other sampled parameters.
b = es.result.xbest[-1]

In [21]:
# Roll out one episode with the linear policy tanh(w . state + b).
env = gym.make("MountainCarContinuous-v0")
try:
    done = False
    state = env.reset()
    while not done:
        action = np.tanh(w @ state + b)
        # The returned frame is not used; the call is kept for its rendering side effect.
        env.render(mode='rgb_array')
        state, reward, done, _ = env.step([action])
finally:
    # Close the environment even if the episode is interrupted or a step fails.
    env.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [19]:
# Manual cleanup: close the module-level env, e.g. when an earlier cell was
# interrupted before reaching its own env.close().
env.close()