In [109]:
import numpy as np
from gym import Env
from gym.spaces import Box, Discrete
import random
from IPython.display import clear_output
from time import sleep, time

In [110]:
class Point2():
    def __init__(self,x=0,y=0):
        self._info = np.array([x,y],ndmin=1)

    @property
    def x(self):
        return self._info[0]

    @property
    def y(self):
        return self._info[1]

    @x.setter
    def x(self, a):
        self._info[0] = a

    @y.setter
    def y(self, a):
        self._info[1] = a

    def __add__(self, o):
        return Point2(self._info[0] + o._info[0], self._info[1] + o._info[1])

    def __sub__(self, o):
        return Point2(self._info[0] - o._info[0], self._info[1] - o._info[1])

    def __eq__(self, other):
        if np.all(np.equal(self._info,other._info)):
            return True
        else:
            return False

In [127]:
class CustomEnv(Env):
    def __init__(self, w):
        self.WIDTH = w
        self.action_space = Discrete(4)
        self.observation_space = Discrete((self.WIDTH+2)**2)
        self.player = Point2(0,0)
        self.gift = Point2(0,0)
        self.aim = Point2(0,-1)
        self.holes = [Point2(0,0)]
        self.endGame = False
        '''
        t.onkey(lambda: self.change(1, 0), 'Right')
        t.onkey(lambda: self.change(-1, 0), 'Left')
        t.onkey(lambda: self.change(0, 1), 'Up')
        t.onkey(lambda: self.change(0, -1), 'Down')
        '''

    def inside(self):
        return -1 < self.player .x < self.WIDTH and -1 < self.player.y < self.WIDTH

    def change(self,action):
        if action == 0:
            self.aim.x = 0
            self.aim.y = -1
        elif action == 1:
            self.aim.x = -1
            self.aim.y = 0
        elif action == 2:
            self.aim.x = 0
            self.aim.y = 1
        elif action == 3:
            self.aim.x = 1
            self.aim.y = 0
        else:
            print('Error, no such key\n')

    def step(self, action):
        self.change(action)
        t = False
        oldpos = self.player
        self.player = self.player + self.aim

        if not self.inside() or self.player in self.holes:
            self.endGame = True
            reward = -20
            done = True
            t = False
        else:
            done = False
            if self.player == self.gift:
                reward = 70
                done = True
                t = True
            else:
                prev = np.array([np.abs(oldpos.x-self.gift.x),np.abs(oldpos.y-self.gift.y)])
                cur =  np.array([np.abs(self.player.x-self.gift.x),np.abs(self.player.y-self.gift.y)])

                if prev[0] < cur[0] or prev[1] < cur[1]:
                    reward = -2
                else:
                    reward = 0
        # Setting the placeholder for info
        info = {}
        # Returning the step information
        return self.playerPos(), reward, done,t , info

    def render(self,episode,t):
        clear_output(wait=True)
        s = ''
        for i in range(-1,self.WIDTH+1):
            for j in range(-1,self.WIDTH+1):
                if Point2(i,j) == self.player:
                    s = s + '@'
                elif Point2(i,j) in self.holes:
                    s = s + '█'
                elif Point2(i,j) == self.gift:
                    s = s + 'X'
                elif j == -1 or j == self.WIDTH:
                    s = s + '‖'
                elif i == -1 or i == self.WIDTH:
                    s = s + '='
                else:
                    s = s + '·'
            s = s + '\n'
        s = s + 'Episode:{} Win:{}'.format(episode,t)
        print(s)

    def playerPos(self):
        return self.player.x*self.WIDTH + self.player.y

    def rPoint(self):
        x = random.randint(0,self.WIDTH-1)
        y = random.randint(0,self.WIDTH-1)
        return Point2(x, y)

    def reset(self,seed=2,amountHoles=-1,playerSeed=False):
        if amountHoles == -1:
            amountHoles = self.WIDTH
        random.seed(1)
        self.gift = self.rPoint()
        for i in range(amountHoles):
            self.holes.append(self.rPoint())
        self.aim = Point2(0, -1)
        if playerSeed:
          t = 1000 * time() # current time in milliseconds
          random.seed(int(t) % 2**32)
        self.player = self.rPoint()
        while self.player in self.holes and self.player != self.gift:
            self.player = self.rPoint()

        self.endGame = False
        return self.playerPos()


In [128]:
env = CustomEnv(10)
Q = np.zeros([env.observation_space.n,env.action_space.n])
# env.observation.n, env.action_space.n gives number of states and action in env loaded
print(f'Observation space {env.observation_space.n}'.format(env.observation_space.n))
print(f'Action space {env.action_space.n}'.format(env.action_space.n))

Observation space 144
Action space 4


In [129]:
# 2. Parameters of Q-learning
eta = .628
gma = .9
epis = 100
rev_list = [] # rewards per episode calculate

In [114]:
# 3. Q-learning Algorithm
for i in range(epis):
    # Reset environment
    s = env.reset(seed=5,playerSeed=True)
    rAll = 0
    d = False
    j = 0
    tr = False
    #The Q-Table learning algorithm
    while not d:
        #env.render()
        j += 1
        # Choose action from Q table
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
        #Get new state & reward from environment
        s1, r, d, tr, _ = env.step(a)
        #Update Q-Table with new knowledge
        Q[s, a] = Q[s, a] + eta * (r + gma * np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1
        print('\r'+'interation'+str(j),end='')
        #print()
        #env.render(i,tr)
        #sleep(0.2)
    rev_list.append(rAll)
    print()
    env.render(i,tr)
print("Reward Sum on all episodes " + str(sum(rev_list) / epis))
print("Final Values Q-Table")
print(Q)

‖█·····@█··‖
‖····█··█··‖
‖·········X‖
‖··········‖
‖···█······‖
‖··········‖
‖···█·····█‖
‖·······█··‖
‖··········‖
‖·█········‖
Episode:99 Win:False
Reward Sum on all episodes 26.68
Final Values Q-Table
[[  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.         -12.56         0.           0.        ]
 [  0.           0.           0.           0.        ]
 [  0.           0.           0.           0.        ]
 [-12.56         0.           0.           0.        ]
 [ -1.256      -12.56       -12.56        24.846192  ]
 [  0.         -12.56         0.           0.        ]
 [ -1.256        0.           0.           0.        ]
 [ -1.256       -1.256        0.           7.80656249]
 [ -1.256        0.         -12.56         0.        ]
 [  0.           0.      

In [131]:
env = CustomEnv(10)
# Reset environment

tr = False
ep = 0
# The Q-Table learning algorithm
for i in range(10):
    s = env.reset(seed=5,playerSeed=True)
    d = False
    while not d:
        # Choose action from Q table
        a = np.argmax(Q[s,:] + np.random.randn(1,env.action_space.n)*(1./(10+1)))
        #Get new state & reward from environment
        s1,r,d,tr,_ = env.step(a)
        #Update Q-Table with new knowledge
        Q[s,a] = Q[s,a] + eta*(r + gma*np.max(Q[s1,:]) - Q[s,a])
        s = s1
        sleep(0.2)
        env.render(ep,tr)
    env.render(ep,tr)
    sleep(1)
    ep = ep+1
# Code will stop at d == True, and render one state before it

‖█·····██··‖
‖····█··█··‖
‖·········X‖
‖··········‖
‖···█······‖
‖··········‖
‖···█·····█‖
‖·······█··‖
‖··········‖
‖·@········‖
Episode:9 Win:False


towardsdatascience.com/reinforcement-learning-with-openai-d445c2c687d2
https://www.mlq.ai/guide-to-deep-reinforcement-learning/
https://www.mlq.ai/what-are-convolutional-neural-networks/
https://keras.io/guides/writing_a_training_loop_from_scratch/#lowlevel-handling-of-losses-tracked-by-the-model