In [27]:
#XVFB will be launched if you run on a server
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1 

In [28]:
import gym
import numpy as np, pandas as pd

env = gym.make("LunarLander-v2").env
env.reset()
env.render()

[33mWARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.[0m


True

In [17]:
#actions = {'left': 0, 'stop': 1, 'right': 2}
n_actions = 4


In [29]:
#create agent
from sklearn.neural_network import MLPClassifier
agent = MLPClassifier(hidden_layer_sizes=(40,40),
                      activation='tanh',
                      warm_start=True, #keep progress between .fit(...) calls
                      max_iter=1 #make only 1 iteration on each .fit(...)
                     )
#initialize agent to the dimension of state an amount of actions
agent.fit([env.reset()]*n_actions,range(n_actions));

In [30]:
from joblib import Parallel

def generate_session(t_max=1000):
    
    states,actions = [],[]
    total_reward = 0
    
    s = env.reset()
    
    for t in range(t_max):
        
        #predict array of action probabilities
        probs = agent.predict_proba([s])[0] 
        
        a = np.random.choice(np.arange(n_actions) , p=probs)
        
        new_s,r,done,info = env.step(a)
        
        #record sessions like you did before
        states.append(s)
        actions.append(a)
        total_reward+=r
        
        s = new_s
        if done: break
    return states,actions,total_reward

In [31]:
def select_elites(states_batch,actions_batch,rewards_batch,percentile=50):
    """
    Select states and actions from games that have rewards >= percentile
    :param states_batch: list of lists of states, states_batch[session_i][t]
    :param actions_batch: list of lists of actions, actions_batch[session_i][t]
    :param rewards_batch: list of rewards, rewards_batch[session_i][t]
    
    :returns: elite_states,elite_actions, both 1D lists of states and respective actions from elite sessions
    
    Please return elite states and actions in their original order 
    [i.e. sorted by session number and timestep within session]
    
    If you're confused, see examples below. Please don't assume that states are integers (they'll get different later).
    """
    states_batch = np.array([np.array(xi) for xi in states_batch])
    rewards_batch = np.array(rewards_batch)
    actions_batch = np.array([np.array(xi) for xi in actions_batch])
    reward_threshold = np.percentile(rewards_batch, percentile) 
    elite_states  = states_batch[rewards_batch > reward_threshold]
    elite_actions = actions_batch[rewards_batch > reward_threshold]
    return np.concatenate(elite_states, axis=0),np.hstack(elite_actions)

In [32]:
def update_policy(elite_states,elite_actions):
    """
    Given old policy and a list of elite states/actions from select_elites,
    return new updated policy where each action probability is proportional to
    
    policy[s_i,a_i] ~ #[occurences of si and ai in elite states/actions]
    
    Don't forget to normalize policy to get valid probabilities and handle 0/0 case.
    In case you never visited a state, set probabilities for all actions to 1./n_actions
    
    :param elite_states: 1D list of states from elite sessions
    :param elite_actions: 1D list of actions from elite sessions
    
    """
    new_policy = np.zeros([n_states,n_actions])
    for state, action in zip(elite_states,elite_actions): 
           new_policy[state, action] += 1
    
    for state in range(n_states): 
        tot = new_policy[state].sum()
        if tot != 0:
            new_policy[state, :] /= float(tot)
        else:
            new_policy[state, :] = 1/float(n_actions)
    
    #<Your code here: update probabilities for actions given elite states & actions>
    #Don't forget to set 1/n_actions for all actions in unvisited states.
    
    
    return new_policy


In [33]:
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import clear_output

def show_progress(rewards_batch,log, reward_range=[-990,+10]):
    """
    A convenience function that displays training progress. 
    No cool math here, just charts.
    """
    
    mean_reward = np.mean(rewards_batch)
    threshold = np.percentile(rewards_batch,percentile)
    log.append([mean_reward,threshold])

    clear_output(True)
    print("mean reward = %.3f, threshold=%.3f"%(mean_reward,threshold))
    plt.figure(figsize=[8,4])
    plt.subplot(1,2,1)
    plt.plot(list(zip(*log))[0],label='Mean rewards')
    plt.plot(list(zip(*log))[1],label='Reward thresholds')
    plt.legend()
    plt.grid()
    
    plt.subplot(1,2,2)
    plt.hist(rewards_batch,range=reward_range);
    plt.vlines([np.percentile(rewards_batch,percentile)],[0],[100],label="percentile",color='red')
    plt.legend()
    plt.grid()

    plt.show()



In [34]:
n_sessions = 100
percentile = 70
log = []

for i in range(100):
    #generate new sessions
    print i
    sessions = [generate_session() for _ in range(n_sessions)]
    
    states_batch,actions_batch,rewards_batch = map(np.array,zip(*sessions))

    elite_states, elite_actions = select_elites(states_batch,actions_batch,rewards_batch,percentile=percentile)
    
    agent.fit(elite_states, elite_actions, n_jobs=2)

    show_progress(rewards_batch,log,reward_range=[-10000,np.max(rewards_batch)])
    
    if np.mean(rewards_batch)>= 50:
        print("You Win! You may stop training now via KeyboardInterrupt.")

0
[array([[ 4.49657440e-04,  9.33721835e-01,  4.55252945e-02,
        -4.59442266e-01, -5.14218351e-04, -1.03121951e-02,
         0.00000000e+00,  0.00000000e+00],
       [ 9.84001160e-04,  9.26446253e-01,  5.60978115e-02,
        -4.85041237e-01, -3.15659889e-03, -5.28523088e-02,
         0.00000000e+00,  0.00000000e+00],
       [ 1.58805847e-03,  9.19379241e-01,  6.27530932e-02,
        -4.71142769e-01, -5.47134271e-03, -4.62989330e-02,
         0.00000000e+00,  0.00000000e+00],
       [ 2.19221115e-03,  9.11912066e-01,  6.27596676e-02,
        -4.97822634e-01, -7.78640760e-03, -4.63054776e-02,
         0.00000000e+00,  0.00000000e+00],
       [ 2.89039612e-03,  9.04045588e-01,  7.45508313e-02,
        -5.24466578e-01, -1.24638015e-02, -9.35570776e-02,
         0.00000000e+00,  0.00000000e+00],
       [ 3.49225998e-03,  8.95778758e-01,  6.24692619e-02,
        -5.51143583e-01, -1.47143519e-02, -4.50152427e-02,
         0.00000000e+00,  0.00000000e+00],
       [ 4.05244827e-03,  8.876

TypeError: fit() got an unexpected keyword argument 'n_jobs'