In [2]:
import numpy as np

In [3]:
import numpy as np
import gym
import random
import os
from constants import ROLLOUT_DIR, NUM_EPISODES, NUM_STEPS, HEIGHT, WIDTH
from PIL import Image


In [4]:
import cma

In [55]:
import random, math
import numpy as np

class PendulumEnv:
    def __init__(self, maxabs_tau=20):
        self.observation_dim = 3    # observation is [x, y, omega]
        self.action_dim = 1
        self.params = {
            'dt': 0.1,
            'm': 1.0,
            'g': 9.8,
            'l': 1.0,
            'b': 1.0,
            'maxabs_tau': maxabs_tau,
            'maxabs_omega': 15,
            'maxabs_theta_init': math.pi,
            'maxabs_omega_init': 5,
            'horizon': 200,
        }
        self.reset()

    def s2z(self, s):
        """
        Converts observation (x,y,omega) to state z (theta,omega)
        """
        theta = math.atan2(s[0], -s[1])
        omega = s[2]
        return np.array([theta, omega])

    def z2s(self, z):
        """
        Converts state z (theta,omega) to observation (cos,sin,omega)
        """
        return np.array([math.sin(z[0]), -math.cos(z[0]), z[1]])

    def _dzdt(self, z, tau):
        theta, omega = z[0], z[1]
        p = self.params
        theta_ddot =  (tau - self.params['b'] * omega
                       - self.params['m'] * self.params['g'] * self.params['l'] * np.sin(theta)) \
                       / (self.params['m'] * self.params['l']**2)
        return np.array([omega, theta_ddot])

    def step(self, a):
        done = False
        self.n_steps += 1

        # Check if the horizon has been reached (if so, reset and return with zero reward)
        if self.n_steps >= self.params['horizon']:
            self.reset()
            return (self.s, 0, done)

        # Compute new state
        z = self.s2z(self.s)
        tau = np.clip(a * 10, -self.params['maxabs_tau'], self.params['maxabs_tau'])
        z_new = z + self.params['dt'] * self._dzdt(z, tau)
        self.s = self.z2s(z_new)

        # Check if constraints have been violated (if so, reset and return with large negative reward)
        if abs(self.s[2]) > self.params['maxabs_omega']:
            r = -100
            self.reset()
            return (self.s, r, done)

        # Compute reward
        r = - self.s[0]**2 - (self.s[1] - 1)**2 - 0.01 * self.s[2]**2 - 1 * a**2
        r = r.item()

        return (self.s, r, done)

    def reset(self):
        z = np.random.uniform([-self.params['maxabs_theta_init'], -self.params['maxabs_omega_init']],
                              [self.params['maxabs_theta_init'], self.params['maxabs_omega_init']])
        self.s = self.z2s(z)
        self.n_steps = 0
        return self.s

    def copy(self):
        c = PendulumEnv()
        c.s = self.s.copy()
        c.params = self.params.copy()
        c.n_steps = self.n_steps
        return c

    def render(self):
        pass

In [59]:
env = PendulumEnv()
env.reset()

array([0.04704546, 0.99889275, 1.14596405])

In [60]:
NUM_STEPS = 500
NUM_EPOCHS = 16

In [61]:
es = cma.CMAEvolutionStrategy(4*[0], 1, {'popsize':64})

(32_w,64)-aCMA-ES (mu_w=17.6,w_1=11%) in dimension 4 (seed=535913, Sun Dec  9 23:09:37 2018)


In [62]:
# def get_action(state, solution):
#     w1 = solution[:20].reshape((10,2))
#     b1 = solution[20:30]
#     w2 = solution[30:130].reshape((10,10))
#     b2 = solution[130:140]
#     w3 = solution[140:150]
#     b3 = solution[150:151]
#     x1 = w1 @ state + b1
#     x2 = w2 @ x1 + b2
#     x3 = w3 @ x2 + b3
#     return (np.tanh(x3[0]))
    
def get_action(state, w, b):
    return 2*np.tanh(w @ state + b)



In [67]:
def get_reward(solution):

    rewards = 0
    for ep in range(NUM_EPOCHS):
        state = env.reset()
        for step in range(NUM_STEPS):
            action = get_action(state, solution[:3], solution[3])
            state, reward, done = env.step(action)
            rewards += reward
            if done:
                break
    return -rewards

In [70]:
es.optimize(get_reward)

  182  11648 1.060263430865247e+03 2.3e+03 1.43e+00  3e-03  1e-02 32:34.3
  188  12032 1.019454479495275e+03 2.9e+03 1.29e+00  2e-03  7e-03 33:38.1


KeyboardInterrupt: 

In [69]:
pol = es.ask()[0]
pol

array([ 3.27013982,  0.48874501, -0.76949266, -0.49071158])

In [643]:
w = pol[:3]
b = pol[3]

In [645]:
state = env.reset()
rewards = np.zeros(NUM_STEPS)
for step in range(NUM_STEPS):
    action = get_action(state, w, b)
    state, reward, done, _ = env.step((action,))
    rewards[step] = reward
    env.render()v
    if done:
        break


NoSuchDisplayException: Cannot connect to "None"