In [1]:
import keras
import gym
import numpy as np
import requests

def send_score(reward):
    """Report a game score to the workshop scoreboard (best effort).

    Posts the player id, display name, score and image URL as form data.
    Network failures are deliberately ignored so that an unreachable
    scoreboard never interrupts the caller.

    Args:
        reward: the score to publish; sent as the ``score`` form field.
    """
    my_id = 'rl_user_1'
    name = 'My Name goes Here'
    image = 'image_url'
    payload = {'id': my_id, 'name': name, 'score': reward, 'image': image}
    try:
        # A timeout keeps a dead or slow server from blocking the caller forever.
        requests.post("http://workshop.sauray.com/score", data=payload, timeout=5)
    except requests.RequestException:
        # Only swallow network/HTTP-level errors; unlike a bare ``except`` this
        # lets KeyboardInterrupt and programming errors propagate.
        pass

# Sizes of the observation vector and of the discrete action set.
NUMBER_OF_OBSERVATIONS = 8
NUMBER_OF_ACTIONS = 4

# Outer-loop bound (games) and inner-loop bound (steps per game).
NUMBER_OF_GAMES_TO_PLAY = 400
NUMBER_OF_STEPS = 40000

# Row count at which the replay memory starts discarding its oldest rows.
MAX_MEMORY_LENGTH = 100000

# Training gate: enabled flag, minimum game index, and period in games.
TRAIN = True
NUMBER_INITIAL_OBSERVATIONS = 0
TRAIN_EVERY_N_GAMES = 50

# One-hot encoding table (https://fr.wikipedia.org/wiki/Encodage_one-hot):
# row i is the encoding of action i, i.e. the identity matrix.
possible_actions = np.arange(NUMBER_OF_ACTIONS)
actions_one_hot_encoding = np.eye(NUMBER_OF_ACTIONS)

# Create the environment.
# The trailing env.reset() is the cell's last expression, so the array it
# returns (the initial observation) is what the notebook displays as output.
env = gym.make('LunarLander-v2')
env.reset()

Using TensorFlow backend.
[2018-01-18 15:21:15,471] Making new env: LunarLander-v2


array([ 0.00637188,  0.94257537,  0.64538345,  0.1307813 , -0.00737658,
       -0.146189  ,  0.        ,  0.        ])

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.externals import joblib
from sklearn.linear_model import LinearRegression
from sklearn.externals import joblib
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras import optimizers

# TODO
# Define the function which returns the model, i.e the function approximator
# You can find information here https://keras.io/models/sequential/
def model():
    """Build and compile the function approximator.

    The network takes a vector of NUMBER_OF_OBSERVATIONS + NUMBER_OF_ACTIONS
    values (an observation concatenated with a one-hot action) and outputs a
    single scalar through one hidden ReLU layer of 20 units.

    Returns:
        A compiled keras ``Sequential`` model, ready for ``fit``/``predict``.
    """
    net = Sequential()
    net.add(Dense(20, activation='relu', input_dim=NUMBER_OF_OBSERVATIONS + NUMBER_OF_ACTIONS))
    net.add(Dense(1))
    # Keras refuses to fit() a model that has not been compiled; the target is
    # a scalar regression value, hence mean squared error.
    net.compile(loss='mse', optimizer='adam')
    return net

def load_model(filename="solution-weights.h5"):
    """Return a fresh network from ``model()`` with weights read from ``filename``.

    Any error raised by ``load_weights`` propagates to the caller.
    """
    network = model()
    network.load_weights(filename)
    return network

def save_model(m, filename="solution-weights.h5"):
    """Write the weights of ``m`` to ``filename`` by calling ``m.save_weights``."""
    m.save_weights(filename)

# TODO
# The probability under which a random action is performed
# Try to find the balance between exploration and exploitation
def get_epsilon(game_no, game_max_no):
    """Return the probability of taking a random action.

    Currently a constant: both ``game_no`` and ``game_max_no`` are accepted
    but not used.
    """
    exploration_rate = 0.5
    return exploration_rate

# The epsilon greedy policy
# This function should return an action (0, 1, 2 or 3)
# It should call the get_epsilon function
# Under an epsilon probability, pick a random action
# Otherwise, find the best action according to your model
# You should predict the value of the vectors
# [observation1, observation2, ... , action0] [observation1, observation2, ... , action1] etc up to the number of actions (4)
def epsilon_greedy(env, observation, approximator, game):
    """Pick an action with an epsilon-greedy policy.

    With probability ``get_epsilon(game, NUMBER_OF_GAMES_TO_PLAY)`` a random
    action is sampled from ``env.action_space``; otherwise the approximator
    scores every [observation, one-hot action] vector and the index of the
    highest prediction is returned.

    Args:
        env: environment exposing ``action_space.sample()``.
        observation: 1-D observation vector for the current state.
        approximator: model exposing ``predict`` on a 2-D batch.
        game: index of the current game, forwarded to ``get_epsilon``.

    Returns:
        The chosen action.
    """
    # The previous in-function schedule started at 0.0, so the exploration
    # branch could never run; the rate now comes from get_epsilon.
    explore_prob = get_epsilon(game, NUMBER_OF_GAMES_TO_PLAY)
    if np.random.rand() < explore_prob:
        return env.action_space.sample()

    # One row per candidate action: the observation followed by that action's
    # one-hot encoding, so a single predict() call scores all of them.
    candidates = np.array([
        np.concatenate((observation, action_vector))
        for action_vector in actions_one_hot_encoding
    ])
    predictions = approximator.predict(candidates)
    # Row i belongs to action i, so the flat argmax is the action itself.
    return int(np.argmax(predictions))

In [3]:
# Initialize the replay memory arrays.
# Both start as a single all-zero row; update_memory treats a one-row memory
# as "empty" and replaces it with the first batch it receives.
memoryX = np.zeros(shape=(1,NUMBER_OF_OBSERVATIONS+NUMBER_OF_ACTIONS))
memoryY = np.zeros(shape=(1,1))

# Resume from saved weights when they can be loaded, otherwise start fresh.
try:
    approximator = load_model()
except Exception:
    # Not a bare ``except``: a KeyboardInterrupt during loading must still
    # stop the cell instead of silently falling back to a new model.
    print("load failed")
    approximator = model()

def calculate_q_values(rewards, b_discount):
    """Turn per-step rewards into discounted returns, in place.

    Walking from the end towards the start, each entry becomes
    ``rewards[i] + b_discount * rewards[i + 1]``; the last entry is left
    unchanged.

    Args:
        rewards: mutable sequence (list or array) of per-step rewards.
        b_discount: discount factor applied to the following entry.

    Returns:
        The same ``rewards`` object, after modification.
    """
    # Start at the second-to-last position: the final one has no successor.
    for position in reversed(range(len(rewards) - 1)):
        rewards[position] = rewards[position] + b_discount * rewards[position + 1]
    return rewards

def update_memory(memoryX, memoryY, X, y, max_length=None):
    """Append a batch to the replay memory and return the new arrays.

    A memory holding exactly one row is treated as empty and is replaced by
    the batch. Otherwise the batch is appended, and if the memory has then
    reached ``max_length`` rows, as many of the oldest rows as the batch
    contains are dropped.

    Args:
        memoryX: 2-D array of stored inputs.
        memoryY: 2-D array of stored targets, row-aligned with ``memoryX``.
        X: 2-D array of new inputs.
        y: 2-D array of new targets, row-aligned with ``X``.
        max_length: row count that triggers eviction; defaults to the
            module-level MAX_MEMORY_LENGTH.

    Returns:
        Tuple ``(memoryX, memoryY)`` with the updated arrays. The inputs are
        not modified.
    """
    if memoryX.shape[0] == 1:
        return X, y

    if max_length is None:
        max_length = MAX_MEMORY_LENGTH

    # Add experience to memory.
    memoryX = np.concatenate((memoryX, X), axis=0)
    memoryY = np.concatenate((memoryY, y), axis=0)

    # If memory is full, evict one old row per newly added row. Slicing does
    # this in one step (the old loop of np.delete calls copied the whole
    # array each time), and len() replaces np.alen, which newer NumPy removed.
    if len(memoryX) >= max_length:
        evicted = len(X)
        memoryX = memoryX[evicted:]
        memoryY = memoryY[evicted:]
    return memoryX, memoryY

# Main loop: play games, store each finished game's (input, discounted return)
# pairs in the replay memory, and periodically refit the approximator on it.
game_won = 0
game_lost = 0
for game in range(NUMBER_OF_GAMES_TO_PLAY):
    # Per-game buffers: one [observation, one-hot action] vector and one
    # reward per step. Each transition is recorded exactly once; the replay
    # memory is only modified through update_memory when the game ends.
    episode_inputs = []
    episode_rewards = []
    # reset the environment to start a new game
    qs = env.reset()
    sum_rewards = 0
    for step in range(NUMBER_OF_STEPS):
        action = epsilon_greedy(env, qs, approximator, game)
        env.render()
        qs_a = np.concatenate((qs, actions_one_hot_encoding[action]), axis=0)
        observation, reward, done, info = env.step(action)
        sum_rewards = sum_rewards + reward
        episode_inputs.append(qs_a)
        episode_rewards.append([reward])

        # Update the state
        qs = observation
        if not done:
            continue

        # The game is over: build the training batch for this game.
        X = np.array(episode_inputs)
        # Float dtype so the in-place discounting below cannot truncate.
        y = np.array(episode_rewards, dtype=float)
        # Calculate Q values from end to start, using the Bellman equation.
        # b_discount is a tunable parameter.
        calculate_q_values(y, b_discount=0.98)
        (memoryX, memoryY) = update_memory(memoryX, memoryY, X, y)

        # Train every TRAIN_EVERY_N_GAMES games (never on game 0) once the
        # initial observation phase is over.
        if TRAIN and game >= NUMBER_INITIAL_OBSERVATIONS and game%TRAIN_EVERY_N_GAMES == 0 and game >= TRAIN_EVERY_N_GAMES:
            print("Training  game# ", game,"memory size", memoryX.shape[0])
            approximator.fit(memoryX,memoryY)
            # NOTE: np.save appends ".npy" when the name does not end with it,
            # so these land on disk as memoryX.csv.npy / memoryY.csv.npy.
            np.save('memoryX.csv', memoryX)
            np.save('memoryY.csv', memoryY)
            save_model(approximator)
            # No env.reset() here: the next game resets the environment itself.

        send_score(sum_rewards)
        # Outcome is judged on the reward of the final step only.
        if reward >= 0 and reward <99:
            print("Game ",game," ended with positive reward ")
        if reward > 50:
            game_won = game_won + 1
            print("Game ", game," WON *** with reward "+str(reward) )
        else:
            game_lost = game_lost + 1
            print("Game ", game," FAILED *** with reward "+str(reward))
        win_ratio = game_won/(game_won+game_lost)
        print("Success: " + str(win_ratio*100) +"%")
        break
        



load failed
Game  0  FAILED *** with reward -100
Success: 0.0%
Game  1  FAILED *** with reward -100
Success: 0.0%
Game  2  FAILED *** with reward -100
Success: 0.0%
Game  3  FAILED *** with reward -100
Success: 0.0%
Game  4  FAILED *** with reward -100
Success: 0.0%
Game  5  FAILED *** with reward -100
Success: 0.0%
Game  6  FAILED *** with reward -100
Success: 0.0%
Game  7  FAILED *** with reward -100
Success: 0.0%
Game  8  FAILED *** with reward -100
Success: 0.0%
Game  9  FAILED *** with reward -100
Success: 0.0%
Game  10  FAILED *** with reward -100
Success: 0.0%
Game  11  FAILED *** with reward -100
Success: 0.0%
Game  12  FAILED *** with reward -100
Success: 0.0%
Game  13  FAILED *** with reward -100
Success: 0.0%
Game  14  FAILED *** with reward -100
Success: 0.0%
Game  15  FAILED *** with reward -100
Success: 0.0%
Game  16  FAILED *** with reward -100
Success: 0.0%


[2018-01-18 15:21:37,480] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/antoinesauray/anaconda/envs/idp/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-783a127cbfc9>", line 47, in <module>
    action = epsilon_greedy(env, qs, approximator, game)
  File "<ipython-input-2-d6073607d0fe>", line 52, in epsilon_greedy
    predictions = approximator.predict(x)
  File "/Users/antoinesauray/anaconda/envs/idp/lib/python3.5/site-packages/keras/models.py", line 1006, in predict
    return self.model.predict(x, batch_size=batch_size, verbose=verbose)
  File "/Users/antoinesauray/anaconda/envs/idp/lib/python3.5/site-packages/keras/engine/training.py", line 1790, in predict
    verbose=verbose, steps=steps)
  File "/Users/antoinesauray/anaconda/envs/idp/lib/python3.5/site-packages/keras/engine/training.py", line 1299, in _predict_loop
    batch_outs = f(ins_batch)
  File "/Users/antoinesauray/anaconda/envs/

KeyboardInterrupt: 