# Задание  2
Реализовать поведение "поиска" добычи тигром - тигр исследует карту и выслеживает добычу (оказывается в 3 клетках от нее - добыча выслежена). Далее он следует к добыче и пытается ее поймать (как на 1 семинаре). Если кролик уворачивается от тигра, то он отбегает на 5 клеток в любом направлении. Каждый раз, после неудачной ловли, тигр усовершенствует свой уровень охотника на 10 %. Для обеспечения поиска добычи использовать DQN.

In [1]:
class Point:
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def dist(self, other):
        return abs(self.x - other.x) + abs(self.y - other.y)
    
    def __eq__(self, other):
        return self.x == other.x and self.y == other.y
    
    def __lt__(self, other):
        return self.x < other.x and self.y < other.y
    
    def __repr__(self):
        return '(' + str(self.x) + ', ' + str(self.y) + ')'

In [2]:
import gym
from gym import spaces
from IPython.display import display
from IPython.display import clear_output
import copy
import numpy as np
import pandas as pd


class HunterEnv(gym.Env):
    def __init__(self, size=10, n_enemy=4, escape=np.random.random()):
        self.size = size
        self.rabbit_escape = escape
        self.tiger_attack = 0.1
        self.enemy, self.pray = self.gen_loc(size, n_enemy)
        self.state = self.encode(0, 0, self.pray.y,  self.pray.x, 0)
        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Discrete(size ** 4 * 2)
        self.hunt = False
        self.home = False
        
    def gen_loc(self, size, n_enemy):
        loc = []
        tmp = [(0, 0)]
        
        for enemy in range(n_enemy):
            i = np.random.randint(0, size)
            j = np.random.randint(0, size)
            while (i, j) in tmp:
                i = np.random.randint(0, size)
                j = np.random.randint(0, size)
            tmp.append((i, j))
            loc.append(Point(i, j))
            
        i = np.random.randint(0, size)
        j = np.random.randint(0, size)
        while (i, j) in tmp:
            i = np.random.randint(0, size)
            j = np.random.randint(0, size)
        
        return loc, Point(i, j)
        
    def encode(self, tiger_i, tiger_j, rabbit_i, rabbit_j, purpose):
        i  = tiger_i
        i *= self.size
        i += tiger_j
        i *= self.size
        i += rabbit_i
        i *= self.size
        i += rabbit_j
        i *= 2
        i += purpose
        return i
    
    def decode(self, i):
        out = []
        out.append(i % 2)
        i = i // 2
        out.append(i % self.size)
        i = i // self.size
        out.append(i % self.size)
        i = i // self.size
        out.append(i % self.size)
        i = i // self.size
        out.append(i)
        return reversed(out)
    
    def render(self):
        clear_output(wait=True)
        def cell_color(val):
            color = 'white'
            if val == 'T':
                color = 'blue'
            if val == 'R':
                color = 'green'
            if val == 'X':
                color = 'red'
            return 'color: %s' % color
        n = self.size
        desk = np.full((n, n), '.').astype(str)
        for enemy in self.enemy:
            desk[enemy.y, enemy.x] = 'X'
        tiger_i, tiger_j, rabbit_i, rabbit_j, _ = self.decode(self.state)
        if not self.hunt:
            desk[rabbit_i, rabbit_j] = 'R'
        desk[tiger_i, tiger_j] = 'T'
        display(pd.DataFrame(desk).style.applymap(cell_color))
        
    def reset(self):
        self.state = self.encode(0, 0, self.pray.y,  self.pray.x, 0)
        self.tiger_attack = 0.1
        self.hunt = False
        return self.state
        
    def step(self, action):
        def check(location):
            border1 = Point(self.size, self.size)
            border2 = Point(-1, -1)
            return border2 < location < border1 and location not in self.enemy
                
        cur_state = self.state
        tiger_i, tiger_j, rabbit_i, rabbit_j, purpose = self.decode(cur_state)
        
        tiger = Point(tiger_j, tiger_i)
        rabbit = Point(rabbit_j, rabbit_i)
        home = Point(0, 0)
        goal = home if purpose == 1 else rabbit if tiger.dist(rabbit) < 4 else None
        
        
        if action == 4:
            if purpose == 0 and goal is not None and goal == tiger:
                if self.rabbit_escape < self.tiger_attack:
                    self.state = self.encode(tiger.y, tiger.x, rabbit.y, rabbit.x, 1)
                    self.hunt = True
                    return self.state, 5000, False
                
                self.tiger_attack += 0.1
                dx = np.random.randint(-5, 5)
                dy = 5 - abs(dx) if np.random.random() > 0.5 else abs(dx) - 5
                while not check(Point(rabbit.x + dx, rabbit.y + dy)):
                    dx = np.random.randint(-5, 5)
                    dy = 5 - abs(dx) if np.random.random() > 0.5 else abs(dx) - 5
                rabbit.x += dx
                rabbit.y += dy
                self.state = self.encode(tiger.y, tiger.x, rabbit.y, rabbit.x, purpose)
                return self.state, 1000, False
            
            return cur_state, -2000, False
        
        if action == 0:
            tiger = Point(tiger.x + 1, tiger.y)
        if action == 1:
            tiger = Point(tiger.x - 1, tiger.y)
        if action == 2:
            tiger = Point(tiger.x, tiger.y + 1)
        if action == 3:
            tiger = Point(tiger.x, tiger.y - 1)
            
        if not check(tiger):
            return cur_state, -10000, False
        
        self.state = self.encode(tiger.y, tiger.x, rabbit.y, rabbit.x, purpose)
        
        if goal is None:
            return self.state, -1, False
        
        if tiger == goal:
            if purpose == 0:
                return self.state, 1000, False
            if purpose == 1:
                return self.state, 5000, True
        
        return self.state, -goal.dist(tiger), False

In [4]:
from time import sleep
import random
random.seed = 42
np.random.seed(42)


def progress(i, n_episod):
    clear_output(wait=True)
    print('Episod: %d/%d' % (i, n_episod - 1), end='')
    

size = 10
env = HunterEnv(size=size, n_enemy=10)
env.render()
sleep(3)


q_table = np.zeros((size ** 4 * 2, 5))
alpha = 0.1
gamma = 0.6
epsilon = 0.2
episod = 2000
for i in range(0, episod):
    eps = 1 - (i + 1) / episod
    progress(i, episod)
    state = env.reset()
    done = False
    while not done:
        action = env.action_space.sample() if np.random.uniform(0, 1) < eps else np.argmax(q_table[state])
        next_state, reward, done = env.step(action) 
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value
        state = next_state

Episod: 1999/1999

In [5]:
state = env.reset()
env.render()
sleep(1)
done = False

while not done:
    action = np.argmax(q_table[state])
    state, reward, done = env.step(action)
    env.render()
    sleep(0.3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,T,.,.,.,X,.,.,.,.,.
1,.,.,.,.,.,X,.,.,.,.
2,.,.,.,.,.,.,.,X,.,.
3,.,.,.,.,.,.,X,.,.,.
4,.,.,.,.,.,X,.,X,.,.
5,.,.,.,.,.,.,.,.,.,.
6,.,.,X,.,.,.,.,.,.,.
7,.,X,.,X,.,.,.,.,.,.
8,.,.,.,.,.,.,.,.,.,.
9,.,.,.,.,.,.,X,.,.,.
