In [0]:
import os
import random
import sys
import operator
import numpy as np
!pip install environs



In [0]:
class QLearningModel:
    """Grid-world environment with a fixed start cell and a single goal cell.

    A state is a ``(y, x)`` tuple. ``y`` ranges over ``0 .. col - 1`` and ``x``
    over ``0 .. row - 1``; note that, despite the names, ``col`` bounds the
    first coordinate and ``row`` the second. The agent starts at ``(0, 0)`` and
    an episode finishes when it reaches ``(col - 1, row - 1)``.

    Attributes:
        stateProperties: ``(col, row)``, the shape of the state space.
        actionNum: ``(4,)``, the shape of the action space.
        action: mapping from action name to its integer code.
        coordinates: ``(dy, dx)`` translation for each action code.
        R: reward table indexed as ``R[y, x, action]``.
        state: current ``(y, x)`` position of the agent.
    """

    def __init__(self, col=5, row=5):
        """Build the reward table and place the agent on the start cell."""
        self.col = col
        self.row = row
        self.stateProperties = (col, row)
        self.actionNum = (4,)
        self.action = {"up": 0, "right": 1, "down": 2, "left": 3}
        self.coordinates = [(-1, 0), (0, 1), (1, 0), (0, -1)]  # translations
        # Define the table for rewards
        self.R = self.rewardsCollection()
        # Start on the top-left cell so move()/actionPossible() are usable
        # even before the first explicit Intializer() call.
        self.state = (0, 0)

    def Intializer(self):
        """Reset the agent to the top-left grid corner and return that state.

        The (misspelled) method name is kept as-is because callers use it.
        """
        self.state = (0, 0)
        return self.state

    def move(self, action):
        """Apply ``action`` and return ``(nextState, reward, finished)``.

        Raises:
            ValueError: if the action would move the agent off the grid. The
                current state is left unchanged in that case.
        """
        dy, dx = self.coordinates[action]
        nextState = (self.state[0] + dy, self.state[1] + dx)
        # Reject off-grid moves up front: otherwise the state leaves the grid
        # and later reward lookups wrap around (negative index) or fail.
        if not (0 <= nextState[0] < self.col and 0 <= nextState[1] < self.row):
            raise ValueError(
                "action {} is not allowed in state {}".format(action, self.state)
            )
        # The reward belongs to the (state, action) pair that was just taken.
        reward = self.R[self.state + (action,)]
        # Terminate if we reach target position
        finished = nextState == (self.col - 1, self.row - 1)

        self.state = nextState
        return nextState, reward, finished

    def actionPossible(self):
        """Return the action codes that keep the agent on the grid.

        The result is an integer NumPy array ordered up, down, left, right
        (only the applicable ones are present).
        """
        actions_allowed = []
        y, x = self.state[0], self.state[1]
        if y > 0:
            actions_allowed.append(self.action["up"])
        if y < self.col - 1:
            actions_allowed.append(self.action["down"])
        if x > 0:
            actions_allowed.append(self.action["left"])
        if x < self.row - 1:
            actions_allowed.append(self.action["right"])
        return np.array(actions_allowed, dtype=int)

    def rewardsCollection(self):
        """Build the reward table ``R[y, x, action]``.

        Every transition costs ``-5`` except the two that enter the goal cell
        (from the cell above it and from the cell to its left), which pay 600.
        """
        goalRewards = 600  # reward if reach to final state
        nonGoalRewards = -5  # penalty
        R = nonGoalRewards * np.ones(self.stateProperties + self.actionNum, dtype=float)
        # On a grid that is one cell wide in a direction there is no neighbour
        # on that side of the goal; without the guards the index -1 would wrap
        # around and reward an action that cannot be taken.
        if self.col >= 2:
            R[self.col - 2, self.row - 1, self.action["down"]] = goalRewards
        if self.row >= 2:
            R[self.col - 1, self.row - 2, self.action["right"]] = goalRewards
        return R

In [0]:
class Agent:
    """Tabular epsilon-greedy Q-learning agent.

    Attributes:
        epsilon: probability of picking a random allowed action.
        epsilon_decay: stored here for the training loop; not used by any
            method of this class.
        beta: step size applied to each Q-value update.
        gamma: discount applied to the best next-state value.
        Q: value table indexed as ``Q[y, x, action]``, initialised to zero.
    """

    def __init__(self, env):
        """Size the Q-table from ``env.stateProperties`` and ``env.actionNum``."""
        self.stateProperties = env.stateProperties
        self.actionNum = env.actionNum
        # Agent learning parameters
        self.epsilon = 0.04  # probability
        self.epsilon_decay = 0.1
        self.beta = 0.1
        self.gamma = 0.7  # Reward discount
        # Initialize Q[s,a] table
        self.Q = np.zeros(self.stateProperties + self.actionNum, dtype=float)

    def get_action(self, env):
        """Pick an action for ``env.state`` among ``env.actionPossible()``.

        With probability ``epsilon`` a uniformly random allowed action is
        returned; otherwise one of the allowed actions with the highest
        Q-value, with ties broken at random.
        """
        actions_allowed = env.actionPossible()
        if random.uniform(0, 1) < self.epsilon:
            return np.random.choice(actions_allowed)
        # exploit on allowed actions
        state = env.state
        Q_s = self.Q[state[0], state[1], actions_allowed]
        greedyActions = actions_allowed[np.flatnonzero(Q_s == np.max(Q_s))]
        return np.random.choice(greedyActions)

    def train(self, memory):
        """Apply one Q-learning update.

        Args:
            memory: ``(state, action, nextState, reward, finished)`` where the
                two states are ``(y, x)`` tuples.
        """
        (state, action, nextState, reward, finished) = memory
        sa = state + (action,)
        # No bootstrapping from a terminal state: the episode ends there, so
        # the target is just the immediate reward.
        # NOTE(review): the max runs over all four actions, including ones
        # get_action never selects in nextState; entries that are never
        # trained stay at their initial 0.
        future = 0.0 if finished else np.max(self.Q[nextState])
        self.Q[sa] += self.beta * (reward + self.gamma * future - self.Q[sa])

    def greedy_policy(self):
        """Return the highest-valued action code per state, indexed ``[y, x]``.

        Ties resolve to the lowest action code.
        """
        # argmax over the action axis handles non-square grids, unlike a
        # hand-written double loop whose ranges can get paired with the
        # wrong axes.
        return np.argmax(self.Q, axis=-1)

    def ViewGreedyPolicy(self):
        """Print the greedy policy table."""
        print("\nGreedy policy:")
        print(self.greedy_policy())
        print()


In [0]:
# Training configuration.
NUM_EPISODES = 600
MIN_EPSILON = 0.01  # exploration floor
SEED = 0

# Agent.get_action draws from both `random` and `np.random`, so seed both to
# make the learned policy reproducible across kernel restarts.
random.seed(SEED)
np.random.seed(SEED)

env = QLearningModel(col=5, row=5)
agent = Agent(env)
for item in range(NUM_EPISODES):
    # i counts the steps and reward_i sums the rewards of the current episode.
    i, reward_i = 0, 0
    state = env.Intializer()
    while True:
        action = agent.get_action(env)
        nextState, reward, finished = env.move(action)
        agent.train((state, action, nextState, reward, finished))
        i += 1
        reward_i += reward
        if finished:
            break
        state = nextState
    # Multiplicative decay, clipped at the floor. With the agent's defaults
    # (0.04 * 0.1) the floor is already reached after the first episode.
    agent.epsilon = max(agent.epsilon * agent.epsilon_decay, MIN_EPSILON)

# Report once training is done, followed by the legend for the action codes.
agent.ViewGreedyPolicy()
for (key, val) in sorted(env.action.items(), key=operator.itemgetter(1)):
    print(" Action['{}'] = {}".format(key, val))
print()


Greedy policy:
[[2 1 1 1 2]
 [2 0 1 1 2]
 [2 3 2 1 2]
 [2 1 1 2 2]
 [1 1 1 1 0]]

 Action['up'] = 0
 Action['right'] = 1
 Action['down'] = 2
 Action['left'] = 3

