In [None]:
import keras
import gym
import numpy as np
import requests

def send_score(reward):
    """Report the score of a finished game to the workshop scoreboard.

    The call is best-effort: any network-level failure is ignored so that a
    missing or slow scoreboard never interrupts training.

    Args:
        reward: The score to publish for this player.
    """
    my_id = 'rl_user_1'
    name = 'My Name goes Here'
    image = 'image_url'
    payload = {'id': my_id, 'name': name, 'score': reward, 'image': image}
    try:
        # A timeout keeps an unreachable server from blocking the training loop.
        requests.post("http://workshop.sauray.com/score", data=payload, timeout=5)
    except requests.RequestException:
        # Only network/HTTP errors are swallowed; KeyboardInterrupt and
        # programming errors still propagate.
        pass

# --- Experiment configuration -------------------------------------------
NUMBER_OF_OBSERVATIONS = 8           # size of one observation vector
NUMBER_OF_ACTIONS = 4                # number of discrete actions
NUMBER_OF_GAMES_TO_PLAY = 400        # episodes played by the main loop
NUMBER_OF_STEPS = 40000              # upper bound on steps per episode
MAX_MEMORY_LENGTH = 100000           # row count at which old experience is dropped
NUMBER_INITIAL_OBSERVATIONS = 0      # first game index at which training is allowed
TRAIN_EVERY_N_GAMES = 50             # training period, in games
TRAIN = True                         # master switch for training

# One-hot encoding table (https://fr.wikipedia.org/wiki/Encodage_one-hot):
# row i is the encoding of action i, i.e. the identity matrix.
possible_actions = np.arange(NUMBER_OF_ACTIONS)
actions_one_hot_encoding = np.eye(NUMBER_OF_ACTIONS)

# Create the environment and reset it once so it is ready before the first game
env = gym.make('LunarLander-v2')
env.reset()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.externals import joblib

# TODO
# Define the function which returns the model, i.e the function approximator
# You can find information here https://keras.io/models/sequential/
def model():
    """Build the function approximator used to estimate action values.

    Returns:
        A freshly constructed ``LinearRegression`` instance.
    """
    approximator = LinearRegression()
    return approximator

def load_model(m=None, filename='model.pkl'):
    """Load a previously saved approximator from disk.

    Args:
        m: Unused. Kept as the first positional parameter so existing calls
            of the form ``load_model(m, filename)`` keep working; it is now
            optional so that ``load_model()`` works as well.
        filename: Path of the file to read.

    Returns:
        Whatever object ``joblib.load`` reconstructs from ``filename``.

    Raises:
        Any exception raised by ``joblib.load`` (for instance when the file
        cannot be read) is propagated to the caller.
    """
    # SECURITY NOTE: joblib.load unpickles the file, which can execute
    # arbitrary code. Only load model files you created yourself.
    return joblib.load(filename)

def save_model(m, filename='model.pkl'):
    """Write the approximator to disk with joblib.

    Args:
        m: The object to persist.
        filename: Destination path.
    """
    joblib.dump(value=m, filename=filename)

# TODO
# The probability under which a random action is performed
# Try to find the balance between exploration and exploitation
def get_epsilon(game_no, game_max_no):
    """Return the probability of taking a random (exploratory) action.

    Placeholder implementation: the rate is constant and does not yet depend
    on either argument.

    Args:
        game_no: Index of the current game (currently unused).
        game_max_no: Total number of games (currently unused).

    Returns:
        The exploration probability, always 0.5.
    """
    exploration_rate = 0.5
    return exploration_rate

# The epsilon greedy policy
# This function should return an action (0, 1, 2 or 3)
# It should call the get_epsilon function
# Under an epsilon probability, pick a random action
# Otherwise, find the best action according to your model
# You should predict the value of the vectors
# [observation1, observation2, ... , action0] [observation1, observation2, ... , action1] etc up to the number of actions (4)
def epsilon_greedy(env, observation, approximator, game):
    """Choose the action to play for the current step.

    Placeholder implementation: the action is always sampled from the
    environment's action space; ``observation``, ``approximator`` and
    ``game`` are not used yet.

    Args:
        env: Environment exposing ``action_space.sample()``.
        observation: Current observation (currently unused).
        approximator: Value model (currently unused).
        game: Index of the current game (currently unused).

    Returns:
        The sampled action.
    """
    random_action = env.action_space.sample()
    return random_action

In [None]:
# Initialize the experience memory with a single all-zero placeholder row.
# update_memory treats a one-row memory as "empty" and replaces it.
memoryX = np.zeros(shape=(1,NUMBER_OF_OBSERVATIONS+NUMBER_OF_ACTIONS))
memoryY = np.zeros(shape=(1,1))

try:
    # load_model's first parameter is not used by it; pass None explicitly so
    # the call does not fail with a TypeError before even touching the file.
    approximator = load_model(None)
except Exception as err:
    # Fall back to a fresh model, but show why loading failed instead of
    # hiding the cause. KeyboardInterrupt is no longer swallowed.
    print("load failed", repr(err))
    approximator = model()

def calculate_q_values(rewards, b_discount):
    """Turn per-step rewards into discounted returns, in place.

    Walks the sequence from the last step to the first and applies
    ``rewards[t] = rewards[t] + b_discount * rewards[t + 1]``. The last entry
    is left unchanged because no step follows it.

    Args:
        rewards: Mutable, indexable sequence of per-step rewards in
            chronological order (a list of numbers or a NumPy array whose
            first axis is the step). It is modified in place. With an
            integer-typed NumPy array the results are truncated on
            assignment, so pass floats.
        b_discount: Discount factor applied to the following step's return.

    Returns:
        The same ``rewards`` object, now holding the discounted returns.
    """
    rewards_length = len(rewards)
    # Start at the second-to-last step: the final step has no successor, so
    # there is nothing to add to it (this also covers empty input).
    for index in range(rewards_length - 2, -1, -1):
        rewards[index] = rewards[index] + b_discount * rewards[index + 1]
    return rewards

def update_memory(memoryX, memoryY, X, y, max_length=None):
    """Append one episode of experience to the memory and trim it if full.

    Args:
        memoryX: 2-D array of stored inputs, one row per sample. A memory
            with exactly one row is treated as the empty placeholder and is
            replaced by ``X``/``y`` instead of being extended.
        memoryY: 2-D array of stored targets, row-aligned with ``memoryX``.
        X: 2-D array of new inputs to add.
        y: 2-D array of new targets, row-aligned with ``X``.
        max_length: Row count at which trimming starts. Defaults to the
            module-level ``MAX_MEMORY_LENGTH``.

    Returns:
        Tuple ``(memoryX, memoryY)`` with the updated memory. The input
        arrays are not modified.
    """
    if max_length is None:
        max_length = MAX_MEMORY_LENGTH

    if memoryX.shape[0] == 1:
        # Placeholder memory: start over with the new experience only.
        return X, y

    # Add experience to memory
    memoryX = np.concatenate((memoryX, X), axis=0)
    memoryY = np.concatenate((memoryY, y), axis=0)

    # Once the memory is full, drop as many of the oldest rows as were just
    # added. One slice replaces the former row-by-row np.delete calls.
    if len(memoryX) >= max_length:
        dropped = len(X)
        memoryX = memoryX[dropped:]
        memoryY = memoryY[dropped:]
    return memoryX, memoryY

# Main loop: play NUMBER_OF_GAMES_TO_PLAY games, store every finished game in
# the experience memory and periodically refit the approximator on it.
game_won = 0
game_lost = 0
for game in range(NUMBER_OF_GAMES_TO_PLAY):
    # Per-game buffers: one (observation + one-hot action) vector and one
    # reward per step. Each step is appended exactly once, and the shared
    # memory is not touched until the game is over.
    episode_inputs = []
    episode_rewards = []
    # reset the environment to start a new game
    qs = env.reset()
    sum_rewards = 0
    for step in range(NUMBER_OF_STEPS):

        action = epsilon_greedy(env, qs, approximator, game)
        env.render()
        # the vector that combines the observation and the chosen action
        qs_a = np.concatenate((qs, actions_one_hot_encoding[action]), axis=0)
        observation, reward, done, info = env.step(action)
        sum_rewards = sum_rewards + reward
        episode_inputs.append(qs_a)
        episode_rewards.append([reward])

        # Update the state
        qs = observation
        if not done:
            continue

        # The game is over: build the training arrays for this game.
        X = np.array(episode_inputs)
        y = np.array(episode_rewards, dtype=float)
        # calculate Q values from end to start, using the Bellman equation
        # You need to find a good parameter for b_discount (look for the Bellman equation)
        y = calculate_q_values(y, b_discount=0.98)
        (memoryX, memoryY) = update_memory(memoryX, memoryY, X, y)

        # Train every TRAIN_EVERY_N_GAMES games (never on game 0), once
        # NUMBER_INITIAL_OBSERVATIONS games have been played.
        if TRAIN and game >= NUMBER_INITIAL_OBSERVATIONS and game%TRAIN_EVERY_N_GAMES == 0 and game >= TRAIN_EVERY_N_GAMES:
            print("Training  game# ", game,"memory size", memoryX.shape[0])
            approximator.fit(memoryX,memoryY)
            # np.save writes NumPy's binary format and appends ".npy" to
            # these names, so the files on disk are memoryX.csv.npy / memoryY.csv.npy.
            np.save('memoryX.csv', memoryX)
            np.save('memoryY.csv', memoryY)
            save_model(approximator)
            # No env.reset() here: the next game resets the environment itself.
        send_score(sum_rewards)
        # Win/loss is judged on the reward of the final step, not on the
        # accumulated score.
        if reward >= 0 and reward <99:
            print("Game ",game," ended with positive reward ")
        if reward > 50:
            game_won = game_won + 1
            print("Game ", game," WON *** with reward "+str(reward) )
        else:
            game_lost = game_lost + 1
            print("Game ", game," FAILED *** with reward "+str(reward))
        win_ratio = game_won/(game_won+game_lost)
        print("Success: " + str(win_ratio*100) +"%")
        break
        

