In [1]:
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
from IPython import display

%matplotlib inline

In [2]:
# Create the environment once; the training and rendering cells below use
# this module-level `env`.
env = gym.make('CartPole-v0')
print('observation space:', env.observation_space)
print('action space:', env.action_space)

# Read the policy dimensions from the environment instead of hard-coding
# them, so they always agree with the spaces printed above.
s_sz = env.observation_space.shape[0]
a_sz = env.action_space.n

observation space: Box(4,)
action space: Discrete(2)




In [None]:
class PolicyNet():
    """Linear softmax policy with helpers for noise-based weight search.

    Action probabilities are computed as ``softmax(state @ w)``.

    Args:
        s_size: Number of state features (rows of ``w``).
        a_size: Number of discrete actions (columns of ``w``).
        pop_size: Number of candidate weight matrices ``populate`` returns.
        noise: Initial scale of the Gaussian perturbation used by ``populate``.
        noise_min: Stored as ``nmin``; lower bound for ``noise`` (enforced by
            the search loops, not by this class).
        noise_max: Stored as ``nmax``; upper bound for ``noise`` (enforced by
            the search loops, not by this class).
    """

    def __init__(self, s_size, a_size, pop_size, noise=1e-2, noise_min=1e-4, noise_max=2):
        # Small positive random weights -> near-uniform initial policy.
        self.w = 1e-4*np.random.rand(s_size, a_size)
        self.noise = noise
        self.nmin = noise_min
        self.nmax = noise_max
        self.popsize = pop_size

    def forward(self, state):
        """Return the action probabilities for ``state``.

        The softmax is taken over the last axis of ``state @ w``.
        """
        x = np.dot(state, self.w)
        # Shifting by the max logit leaves the softmax unchanged but keeps
        # np.exp from overflowing to inf (which would yield NaN probabilities).
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exp_x = np.exp(shifted)
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def act(self, state):
        """Sample an action index from the policy's distribution for ``state``."""
        probs = self.forward(state)
        # Use the actual number of actions rather than a hard-coded 2.
        action = np.random.choice(probs.shape[-1], p=probs)
        return action

    def populate(self):
        """Return ``popsize`` Gaussian perturbations of the current weights.

        Returns:
            Array of shape ``(popsize,) + w.shape``; each entry is
            ``w + noise * N(0, 1)``.
        """
        population = np.array([
            self.w + self.noise*np.random.randn(*self.w.shape)
            for _ in range(self.popsize)
        ])
        return population

In [None]:
def hill_climbing(n_episodes=1000, max_t=1000, gamma=1.0, print_every=100, policy=None):
    """Hill climbing with adaptive noise scaling on a linear policy.

    Each episode evaluates the current weights of ``policy`` on the
    module-level ``env``. If the discounted return is at least as good as the
    best seen so far, the weights are kept and the search noise is halved;
    otherwise the noise is doubled. The next candidate is always the best
    weights plus Gaussian noise at the current scale.

    Args:
        n_episodes: Maximum number of episodes to run.
        max_t: Maximum number of steps per episode.
        gamma: Discount factor applied to the per-step rewards.
        print_every: Print the running average score every this many
            episodes; also the length of the running-average window.
        policy: Object exposing ``w``, ``noise``, ``nmin``, ``nmax`` and
            ``act(state)``. When omitted, the module-level name ``policy``
            is used, as the original implementation did.

    Returns:
        List with the undiscounted total reward of every episode.

    Raises:
        NameError: If ``policy`` is not passed and no global ``policy`` exists.
    """
    if policy is None:
        try:
            policy = globals()['policy']
        except KeyError:
            raise NameError("name 'policy' is not defined; pass policy=...") from None

    scores_deque = deque(maxlen=print_every)
    scores = []
    best_R = -np.inf
    best_w = policy.w

    for i_episode in range(1, n_episodes+1):
        rewards = []
        state = env.reset()

        for t in range(max_t):
            action = policy.act(state)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break

        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        R = sum(gamma**i * r for i, r in enumerate(rewards))

        if R >= best_R:
            # Improvement (or tie): keep these weights and search more locally.
            best_R = R
            best_w = policy.w
            policy.noise = max(policy.nmin, policy.noise / 2)
        else:
            # No improvement: widen the search around the best weights.
            policy.noise = min(policy.nmax, policy.noise * 2)

        # Next candidate is a fresh array, so best_w is never modified in place.
        policy.w = best_w + policy.noise * np.random.randn(*best_w.shape)

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    # Leave the policy holding the best weights, not the last candidate.
    policy.w = best_w
    return scores

In [None]:
def hill_climbing_steepest_ANS(policy, n_episodes=1000, max_t=1000, gamma=1.0, print_every=100):
    """Steepest-ascent hill climbing with adaptive noise scaling.

    Every iteration draws ``policy.popsize`` perturbed weight matrices from
    ``policy.populate()``, runs one episode per candidate on the module-level
    ``env`` and moves the policy to the candidate with the highest discounted
    return. The noise scale shrinks (x0.66) when that return matches or beats
    the best return seen so far and grows (x1.05) otherwise, clamped to
    ``[policy.nmin, policy.nmax]``.

    Args:
        policy: Object exposing ``w``, ``noise``, ``nmin``, ``nmax``,
            ``popsize``, ``act(state)`` and ``populate()``.
        n_episodes: Maximum number of search iterations.
        max_t: Maximum number of steps per candidate episode.
        gamma: Discount factor applied to the per-step rewards.
        print_every: Print the running average every this many iterations.

    Returns:
        Tuple ``(scores, noise)``: the best candidate return and the noise
        scale recorded after each iteration.
    """
    scores_deque = deque(maxlen=100)
    scores = []
    best_Return = -np.inf
    noise = []
    # Column vector so it broadcasts across the population axis of `rewards`.
    discounts = np.power(float(gamma), np.arange(max_t))[:, np.newaxis]

    for i_episode in range(1, n_episodes+1):
        policies = policy.populate()
        # Zeros (not np.empty) so steps after early termination contribute nothing.
        rewards = np.zeros((max_t, policy.popsize))

        for p in range(policy.popsize):
            policy.w = policies[p]
            # Each candidate needs its own fresh episode; stepping an
            # environment that already reported `done` is invalid.
            state = env.reset()
            for t in range(max_t):
                action = policy.act(state)
                state, reward, done, _ = env.step(action)
                rewards[t, p] = reward
                if done:
                    break

        returns = np.sum(rewards*discounts, axis=0)

        best_p = np.argmax(returns)
        if returns[best_p] >= best_Return:
            best_Return = returns[best_p]
            policy.noise = max(policy.nmin, policy.noise*0.66)
        else:
            policy.noise = min(policy.nmax, policy.noise*1.05)
        # NOTE(review): as in the original, the policy moves to this
        # iteration's best candidate even when it did not beat best_Return.
        policy.w = policies[best_p]

        scores_deque.append(returns[best_p])
        scores.append(returns[best_p])
        noise.append(policy.noise)

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque)>=195.0:
            # Report the iterations that preceded the averaging window.
            print('Environment solved in {} episodes!\tAverage Score: {}'.format(i_episode-len(scores_deque), np.mean(scores_deque)))
            break

    return scores, noise
            

In [None]:
# Build the policy and run the steepest-ascent search; `search_r` receives the
# per-iteration noise scale returned alongside the scores.
policy_net = PolicyNet(s_size=s_sz, a_size=a_sz, pop_size=1)
scores, search_r = hill_climbing_steepest_ANS(
    policy=policy_net,
    n_episodes=1000,
    print_every=10,
)

In [None]:
# Best candidate return recorded at each search iteration.
fig_scores, ax_scores = plt.subplots()
ax_scores.plot(scores)
ax_scores.set(xlabel='Iteration', ylabel='Best return', title='Search progress')

# Noise scale recorded after each search iteration.
fig_noise, ax_noise = plt.subplots()
ax_noise.plot(search_r)
ax_noise.set(xlabel='Iteration', ylabel='Noise scale', title='Adaptive noise scale')

# Last expression of the cell: display the policy weights after the search.
policy_net.w

In [None]:
# Draw one population of perturbed weight matrices around the current policy
# weights and print it for inspection.
pop = policy_net.populate()
print(pop)

In [None]:
# Roll out the current policy and show the rendered frames inline.
torender_episodes = 3
torender_frames = 300

plt.figure(figsize=(9,9))

try:
    for i in range(torender_episodes):
        state = env.reset()
        img = plt.imshow(env.render(mode='rgb_array'))
        # Loop-invariant: hide the axes once per episode, not once per frame.
        plt.axis('off')
        for j in range(torender_frames):
            action = policy_net.act(state)
            img.set_data(env.render(mode='rgb_array'))
            display.display(plt.gcf())
            display.clear_output(wait=True)
            state, reward, done, _ = env.step(action)
            if done:
                break
finally:
    # Always close the environment, even if rendering raises or is interrupted.
    env.close()

In [None]:
# Display the current policy weights as the cell output.
policy_net.w