In [9]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output
import tensorflow as tf

In [10]:
from tensorflow.contrib.layers import fully_connected

In [11]:
env_name = "FrozenLake-v0"
env = gym.make(env_name)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)
type(env.action_space)


Observation space: Discrete(16)
Action space: Discrete(4)


gym.spaces.discrete.Discrete

In [12]:
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery':False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
        )
except:
    pass

In [13]:
class Agent(object):
    def __init__(self,env):
        self.is_discrete = type(env.action_space) == gym.spaces.discrete.Discrete
        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high  = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)
            
            
    def get_action(self, state):
        if self.is_discrete:
            action = random.choice(range(self.action_size))
        else:
            action = np.random.uniform(self.action_low,
                                       self.action_high,
                                       self.action_shape)
        return action

In [14]:
class QNAgent(Agent):
    def __init__(self,env,discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.total_reward = 0
        self.state_size = env.observation_space.n   # Size of the state space, we need this to construct the q-table.
        print("State size:",self.state_size)
        
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.eps = 1    # Epsilon is the factor/probability we are using to choose between qtable actions and random actions.   
        
        # Construct the q(action,value) table/matrix
        self.buildQNet()   
        
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        
    def buildQNet(self):
        tf.reset_default_graph()
        
        
        # Placeholders - states,action, and target q-values. ONLY PLACEHOLDERS AND NOT THE ONE-HOT GO INTO THE FEED_DICT
        self.state_feed = tf.placeholder(tf.int32,shape = [1])  # The shape is [1],here because we'll create the depth through the one-hot-encoding
        self.action_feed = tf.placeholder(tf.int32,shape= [1])
        self.q_value_target_feed = tf.placeholder(tf.int32,shape = [1]) # To calculate loss
        
        #One-hot-encoding for the placeholders
        self.state = tf.one_hot(self.state_feed,depth = self.state_size)   # We won't be using this the next time, instead we'll use the *kwargs
        self.action = tf.one_hot(self.action_feed,depth= self.action_size)
        
        #Create the NEURAL NET with dense layers.
        self.q_state_nn = fully_connected(self.state,self.action_size,activation_fn= None,scope= 'q_state_nn')   # The default activation for contrib.layers is RelU,we need linear activation.
#         self.q_state_nn = tf.layers.dense()
        # q_action to perform from NN.
        self.q_action_nn = tf.reduce_sum(tf.multiply(self.q_state_nn,self.action),axis= 1)  # VERY IMP - 1. Multiply one_hot action vector with the q_value_output from NN, and to get 1 single q_value from the NN, do a tf.reduce_mean
        
        # LOSS FUNCTION
        self.loss = tf.losses.mean_squared_error(self.q_value_target_feed,self.q_action_nn)
        # OPTIMIZER
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # TRAIN VARIABLE 
        self.train_var = self.optimizer.minimize(self.loss)  # TRAIN METHOD ALREADY EXISTS,SO USED 'train_var' 
    
    def get_action(self,state):
#         q_state = self.Qtable[state]
        q_state = self.sess.run(self.q_state_nn, feed_dict = {self.state_feed: [state]})    # we are outputting a VECTOR OF SIZE = action_size, by feeding into the q_state_nn, NOTICE HOW WE DON'T MENTION THE ONE_HOT_ENCODED VECTORS.
        q_action_greedy = np.argmax(q_state)
        q_action_random = super().get_action(state)   # Randomised action from the parent class get_action method.
        self.random = random.random()
        return q_action_random if self.random < self.eps else q_action_greedy
    
        
    def train(self,experience):
        state, action, next_state, reward, done = ([exp] for exp in experience)  #This experience tuple will have to be created by you, after accepting values from env.step().
#         q_state_next = self.Qtable[next_state]    # Get the corresponding q_state arrays containing actions from the q-table.
        
        # Get next list of actions: q_state_next
        q_state_next = self.sess.run(self.q_state_nn, feed_dict={self.state_feed: next_state})
        q_state_next[done] = np.zeros([self.action_size])   # IF Didn't understand this
        
        #Bellman equation
        q_value_target  = reward + self.discount_rate* np.max(q_state_next)   # The new estimated/target value is based on the recieved reward and the discounted value of the next state.

        #RUN OPTIMIZER,i.e TRAIN THE NET
        feed = {self.state_feed: state, self.action_feed: action, self.q_value_target_feed: q_value_target}         # The q_error and updating q_table replaced with the ACTUAL NEURAL NET
        self.sess.run(self.train_var,feed_dict=feed)
        
        
        # Decrease random actions as the number of episodes increase
        if experience[4]:
            self.eps = self.eps*.99
    
    def __del__(self):
        self.sess.close()


In [15]:
agent = QNAgent(env)

Action size: 4
State size: 16


In [16]:

# agent = Agent(env)
agent.total_reward = 0

for ep in range(100):
    done = False
    state = env.reset()
    while not done:
        # Do an action for the state you're in, either random or from q_table, so initially it will do random actions
        action = agent.get_action(state)
        
        # Recieve the 4 golden values,after doing the action. (Read- NCfOM)
        next_state,reward,done,info = env.step(action)
        
        # Train the agent
        agent.train((state, action, next_state, reward, done))  # Now that you have the results from your actions, use them to better your agent in future experiences.
        
        state = next_state 
        agent.total_reward += reward
        
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, Current Reward:{}, eps: {},random: {}".format(ep,agent.total_reward,reward,agent.eps,agent.random))
#         print(agent.total_reward,agent.eps)
        env.render()
        
        with tf.variable_scope("q_state_nn", reuse=True):
            weights = agent.sess.run(tf.get_variable("kernel"))
            tf.get_variable()
            print(weights)
        time.sleep(0.05)
        clear_output(wait=True
    

SyntaxError: unexpected EOF while parsing (<ipython-input-16-e961d51a2a67>, line 32)

In [None]:
state_one_hot = tf.one_hot()    #Default is axis = -1, i.e shape = features x depth