In [1]:
# Core dependencies: gym provides the environment, numpy the linear algebra.
import gym
import numpy as np
# Create the MountainCar-v0 environment used throughout the notebook.
env = gym.make('MountainCar-v0')
from gym import wrappers
from time import time
import matplotlib.pyplot as plt
# Wrap the environment in a Monitor that writes to a directory under ./videos/
# named after the current timestamp, so each run gets its own directory.
env = wrappers.Monitor(env, './videos/' + str(time()) + '/')

Let's take a look at this environment

In [2]:
# Display the environment's action space.
env.action_space

Discrete(3)

In [3]:
# Display the environment's observation space.
env.observation_space

Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)

In [4]:
# Display the upper bound of each observation component.
env.observation_space.high

array([0.6 , 0.07], dtype=float32)

In [5]:
# Display the lower bound of each observation component.
env.observation_space.low

array([-1.2 , -0.07], dtype=float32)

Why don't we test a simple agent?

Say our agent always accelerates forward. What do you think would happen?

In [6]:
def test_agent(env,agent,number_episodes,steps_per_episode,render=False):
    rewards = [0 for i in range(number_episodes)]
    for i_episode in range(number_episodes):
        observation = env.reset()
        for t in range(steps_per_episode):
            if render:
                env.render()
            #print(observation)
            observation, reward, done, info = env.step(agent(observation)) #iorio
            rewards[i_episode] += reward
            if done:
                #print("Episode finished after {} timesteps".format(t+1))
                break
        if render:
            env.close()
    return rewards
    

In [7]:
def go_forward(state):
    """Baseline policy: ignore `state` and always pick action 2.

    The notebook uses this as the "always accelerate forward" agent.
    """
    forward_action = 2
    return forward_action

In [8]:
# Play 2 rendered episodes of at most 200 steps with the always-forward agent
# and display the per-episode total rewards.
test_agent(env,go_forward,2,200,render=True)

[-200.0, -200.0]

We will produce "linear agents", i.e., for each possible action "a" in the action space,
we have a vector w_a such that  

$q(s,a) = w_a^\top x(s)$  

where x is a feature function. The w_a's assemble in a matrix W.

In [9]:
def linear_agent(W,feature_func):
    """Build a greedy policy from a weight matrix and a feature function.

    The returned callable takes a state, computes ``feature_func(state)``,
    multiplies it by ``W`` (one row per action) and returns the index of the
    largest resulting value.

    ``W`` is captured by reference, not copied: in-place changes made to it
    later are seen by the returned agent.
    """
    def agent(state):
        action_values = np.matmul(W, feature_func(state))
        return action_values.argmax()

    return agent

How do we train our agent?  

We follow semi-gradient SARSA

In [10]:
def semi_gradient_linear_SARSA(env,set_random,feature_func,d,alpha,epsilon,episodes,gamma,max_steps,
                               eval_every=None,eval_episodes=200,eval_steps=200):
    """Train a linear action-value function with one-step semi-gradient SARSA.

    Parameters
    ----------
    env : environment
        Must expose ``action_space.n``, ``state`` and ``step(action)``
        returning ``(observation, reward, done, info)``.
    set_random : callable
        Called as ``set_random(env)`` at the start of every episode to reset
        the environment; ``env.state`` is read right afterwards.
    feature_func : callable
        Maps a raw state to a feature vector of length ``d``.
    d : int
        Length of the feature vectors.
    alpha : float
        Step size.
    epsilon : float
        Exploration probability passed to ``greedy_pick``.
    episodes : int
        Number of training episodes.
    gamma : float
        Discount factor.
    max_steps : int
        Maximum number of environment steps per episode.
    eval_every : int or None, optional
        When set, the current greedy agent is evaluated with ``test_agent``
        every ``eval_every`` episodes (starting at episode 0). With the
        default ``None`` no evaluation happens, so ``saved`` stays ``None``,
        ``scores`` stays empty and the reported best score stays at 500.
    eval_episodes, eval_steps : int, optional
        Number of episodes and step cap used for each evaluation.

    Returns
    -------
    (W, saved, scores)
        ``W`` is the trained weight matrix (one row per action), ``saved`` the
        best evaluated agent (or ``None``) and ``scores`` the list of
        ``(episode, score)`` pairs recorded by the evaluations.
    """
    # One weight row per action, all starting at zero.
    W = np.zeros((env.action_space.n,d))

    best = 500
    saved = None
    scores = []

    for episode in range(episodes):
        if eval_every and episode % eval_every == 0:
            eval_rewards = test_agent(env,linear_agent(W,feature_func),eval_episodes,eval_steps,render=False)
            # Score is the average negated episode reward, so lower is better.
            score = -sum(eval_rewards)/len(eval_rewards)
            scores.append((episode,score))
            if score < best:
                best = score
                # W is updated in place below, so the best agent needs its own copy.
                saved = linear_agent(W.copy(),feature_func)

        # Reset the environment and featurize the starting state.
        set_random(env)
        state1 = feature_func(env.state)

        # Initial action, chosen epsilon-greedily.
        action = greedy_pick(env,W,state1,epsilon)
        done = False
        step = 0

        while not done and step < max_steps:
            # Perform the action, get the new state and reward.
            state2, reward, done, _ = env.step(action)
            step += 1
            state2 = feature_func(state2)

            if done:
                # Terminal transition: the target is the reward alone.
                W[action,:] = W[action,:] + alpha*(reward-np.dot(W[action,:],state1))*state1
                break

            # Non-terminal transition: bootstrap from the next state-action value.
            action2 = greedy_pick(env,W,state2,epsilon)
            td_error = reward+gamma*np.dot(W[action2,:],state2)-np.dot(W[action,:],state1)
            W[action,:] = W[action,:] + alpha*td_error*state1
            state1 = state2
            action = action2
    print("best score is",best)
    print("\n")
    return W,saved,scores

In [11]:
def greedy_pick(env,W,features,epsilon):
    """Epsilon-greedy action selection for a linear action-value function.

    One uniform random number is drawn on every call. If it is below
    `epsilon`, an action is sampled from ``env.action_space``; otherwise the
    action whose row of `W` has the largest product with `features` is
    returned.
    """
    explore = np.random.rand(1)[0] < epsilon
    if explore:
        return env.action_space.sample()
    return np.matmul(W,features).argmax()

Now, for our particular env

In [12]:
def set_random_Mountain_Car(env):
    """Reset `env`, then overwrite its state with a uniformly random one.

    After ``env.reset()``, the first state component is drawn uniformly from
    [-1.2, 0.6) and the second from [-0.07, 0.07); both are written in place
    into ``env.state``. Nothing is returned.
    """
    first_range = (-1.2, 0.6)
    second_range = (-0.07, 0.07)

    env.reset()
    current = env.state
    current[0] = np.random.uniform(*first_range)
    current[1] = np.random.uniform(*second_range)
    

We now want to implement state aggregation.

In a future version we will do tile coding.

In [13]:
from bisect import bisect_left

def find_interval(partition,observation):
    """Return the index of the box of `partition` that contains `observation`.

    `partition` is a sorted sequence of P points delimiting P-1 boxes; box
    ``j`` covers ``(partition[j], partition[j+1]]``.

    The result is clamped to ``[0, P-2]``. Without the clamp, a value equal to
    or below ``partition[0]`` would yield -1, which silently selects the
    *last* box when used as an array index, and a value above
    ``partition[-1]`` would yield an index past the final box.
    """
    last_box = max(len(partition) - 2, 0)
    box = bisect_left(partition,observation) - 1
    return min(max(box, 0), last_box)

def state_aggregation(env,N):
    """Build a one-hot state-aggregation feature function for `env`.

    Each observation dimension is cut by `N` evenly spaced points between
    ``env.observation_space.low`` and ``env.observation_space.high`` (so
    there are N-1 boxes per dimension). The returned function maps a state to
    a flat vector of length ``(N-1)**D`` (D = number of observation
    dimensions) that is 1 in the cell containing the state and 0 elsewhere.
    """
    high = env.observation_space.high
    low = env.observation_space.low
    mesh = np.linspace(low,high,N)
    D = high.shape[0]

    def feature(state):
        # One box index per state component.
        coordinates = tuple(find_interval(mesh[:,i],state[i]) for i in range(len(state)))
        answer = np.zeros((N-1,)*D)
        # Tuple indexing works for any number of dimensions, not just two.
        answer[coordinates] = 1
        return answer.reshape(-1)

    return feature
    

Picking hyperparameters is hard, so we'll do a small grid search.

We haven't really solved this environment. We need to get the average score (steps per episode, i.e., the negated average reward) down to at most 110. Maybe with tile coding and n-step SARSA?

In [15]:
def create_meshes(high,low,N,k):
    """Return the list of `k` grids used for tile coding.

    The first k-1 grids have N+1 points per dimension and are shifted towards
    lower values by i/k of a cell width (i = 1..k-1); the last grid is the
    unshifted one with N points between `low` and `high`.
    """
    cell = (high-low)/(N-1)
    meshes = []
    for shift in range(1,k):
        start = low-(shift/k)*cell
        stop = high+((k-shift)/k)*cell
        meshes.append(np.linspace(start,stop,N+1))
    meshes.append(np.linspace(low,high,N))
    return meshes


def tile_coding(env,N,k):
    """Build a tile-coding feature function for `env`.

    `k` grids are created with ``create_meshes`` from the bounds of
    ``env.observation_space``. The returned function maps a state to a flat
    vector of length ``k * N**D`` (D = number of observation dimensions): one
    block of ``N**D`` entries per grid, each block holding a single 1 in the
    cell of that grid containing the state.
    """
    high = env.observation_space.high
    low = env.observation_space.low

    meshes = create_meshes(high,low,N,k)
    D = high.shape[0]

    def feature(state):
        # One N**D block per grid, allocated once instead of k temporaries.
        tiles = np.zeros((len(meshes),) + (N,)*D)
        for j, mesh in enumerate(meshes):
            coordinates = tuple(find_interval(mesh[:,i],state[i]) for i in range(len(state)))
            # Tuple indexing works for any number of dimensions, not just two.
            tiles[(j,) + coordinates] = 1
        # Row-major flattening lays the grids out one after another.
        return tiles.reshape(-1)

    return feature

In [16]:
def semi_gradient_linear_n_step_SARSA(env,set_random,feature_func,n,d,alpha,epsilon,episodes,gamma,max_steps):
    """Train a linear action-value function with n-step semi-gradient SARSA.

    Parameters
    ----------
    env : environment
        Must expose ``action_space.n``, ``state`` and ``step(action)``
        returning ``(observation, reward, done, info)``.
    set_random : callable
        Called as ``set_random(env)`` at the start of every episode to reset
        the environment; ``env.state`` is read right afterwards.
    feature_func : callable
        Maps a raw state to a feature vector of length ``d``.
    n : int
        Number of steps in the n-step return.
    d : int
        Length of the feature vectors.
    alpha : float
        Step size.
    epsilon : float
        Exploration probability passed to ``greedy_pick``.
    episodes : int
        Number of training episodes.
    gamma : float
        Discount factor.
    max_steps : int
        Maximum number of environment steps per episode; an episode that
        reaches this many steps is ended as if the environment had
        reported ``done``.

    Returns
    -------
    (W, saved, saved_W)
        ``W`` is the trained weight matrix (one row per action), ``saved`` the
        best agent found by the periodic evaluations, and ``saved_W`` a list
        with a copy of ``W`` taken at the start of every episode.
    """
    # Local infinity, so the function does not rely on an `inf` name being
    # imported somewhere else in the notebook before it is called.
    infinity = float("inf")

    # One weight row per action, all starting at zero.
    W = np.zeros((env.action_space.n,d))

    best = infinity
    saved = None
    saved_W = []
    scores = []

    for episode in range(episodes):
        if episode%500 == 0:
            eval_rewards = test_agent(env,linear_agent(W,feature_func),200,200,render=False)
            # Score is the average negated episode reward, so lower is better.
            score = -sum(eval_rewards)/len(eval_rewards)
            print(score,"at",episode,"episode")
            scores.append((episode,score))
            if score < best:
                best = score
                # W keeps being updated in place, so the best agent must hold
                # its own copy; otherwise it would always equal the final agent.
                saved = linear_agent(W.copy(),feature_func)
                print("best agent so far")
        saved_W.append(W.copy())

        states = []
        actions = []
        rewards = [0]  # placeholder so that rewards[t] is the reward received at time t

        # Reset the environment and featurize the starting state.
        set_random(env)
        states.append(feature_func(env.state))

        # Initial action, chosen epsilon-greedily.
        actions.append(greedy_pick(env,W,states[0],epsilon))
        T = infinity  # episode length, unknown until the episode ends
        tau = None

        t = 0

        while tau != T-1:
            if t < T:
                _, reward, done, _ = env.step(actions[t])
                states.append(feature_func(env.state))
                rewards.append(reward)

                if done or t+1 >= max_steps:
                    T = t+1
                else:
                    actions.append(greedy_pick(env,W,states[-1],epsilon))

            tau = t - n + 1  # time whose estimate is being updated

            if tau >= 0:
                # n-step return: discounted rewards up to the horizon ...
                horizon = min(tau+n,T)
                G = sum((gamma**i)*rewards[tau+1+i] for i in range(horizon-tau))
                # ... plus a bootstrapped value if the episode has not ended by then.
                if tau + n < T:
                    G += (gamma**n)*np.dot(W[actions[tau+n],:],states[tau+n])

                W[actions[tau],:] = (W[actions[tau],:] +
                               (alpha)*(G-np.dot(W[actions[tau],:],states[tau]))*states[tau])
            t+=1
    return W,saved,saved_W

In [30]:
# Tile-coding settings passed to tile_coding below: N points per dimension, k grids.
N = 9
k = 6
# Number of steps used by the n-step SARSA return.
n = 9
# Base step size; it is divided by k when passed to the trainer below.
alpha = 0.0325
# Feature-vector length: k grids of N**2 cells each.
d = k*(N**2)

# Makes the name `inf` available at notebook scope before the training call below.
from math import inf

feature_func = tile_coding(env,N,k)

# epsilon = 0: actions are always the greedy ones; gamma = 1: no discounting.
result = semi_gradient_linear_n_step_SARSA(env,set_random_Mountain_Car,feature_func,
                                           n,d,
                                           alpha=alpha/k,
                                           epsilon = 0,
                                           episodes = 5001,
                                           gamma = 1,
                                           max_steps=200)

200.0 at 0 episode
best agent so far
187.24 at 500 episode
best agent so far
162.12 at 1000 episode
best agent so far
149.485 at 1500 episode
best agent so far
99.625 at 2000 episode
best agent so far
110.12 at 2500 episode
121.05 at 3000 episode
111.145 at 3500 episode
105.505 at 4000 episode
110.595 at 4500 episode
105.555 at 5000 episode
