# Задание 1
Установить gym (Python 3 — https://github.com/openai/gym; для тех, кто делает на Java, — https://github.com/deeplearning4j/gym-java-client), реализовать среду из предыдущего семинара в gym, агенты — тигр и кролик. Реализовать задачу из семинара 2 в OpenAI Gym.


In [1]:
class Point:
    """A 2-D grid coordinate with Manhattan distance and component-wise order.

    Instances are mutable (``x`` and ``y`` are plain attributes) and compare
    by value, so they are deliberately left unhashable.
    """

    # Value equality on a mutable object: keep instances out of sets/dict keys.
    __hash__ = None

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def dist(self, other):
        """Return the Manhattan (L1) distance between this point and *other*."""
        return abs(self.x - other.x) + abs(self.y - other.y)

    def __eq__(self, other):
        """Return True when both coordinates match.

        Objects without ``x``/``y`` attributes (``None``, numbers, strings...)
        are simply "not equal" instead of raising ``AttributeError``.
        """
        try:
            return self.x == other.x and self.y == other.y
        except AttributeError:
            return NotImplemented

    def __lt__(self, other):
        """Return True when *both* coordinates are strictly smaller.

        This is a partial order: two distinct points can be mutually
        incomparable (neither ``a < b`` nor ``b < a``).
        """
        try:
            return self.x < other.x and self.y < other.y
        except AttributeError:
            return NotImplemented

    def __repr__(self):
        return '(%s, %s)' % (self.x, self.y)

In [2]:
import gym
from gym import spaces
from IPython.display import display
from IPython.display import clear_output
import copy
import numpy as np
import pandas as pd


class HunterEnv(gym.Env):
    """Grid-world hunt: a tiger leaves cell (0, 0), catches a prey and returns.

    The board is ``size`` x ``size``.  The observation is one integer that
    packs the tiger's row, its column and a binary ``purpose`` flag
    (0 = heading for the nearest prey, 1 = carrying a catch back to (0, 0));
    see :meth:`encode` / :meth:`decode`.

    Actions:
        0 -- column + 1        1 -- column - 1
        2 -- row + 1           3 -- row - 1
        4 -- try to catch the prey on the current cell

    ``step`` returns the 3-tuple ``(state, reward, done)``.
    NOTE(review): gym's own ``Env.step`` contract also carries an ``info``
    value; the loops in this file unpack exactly three values, so the shape
    is kept as is.
    """

    def __init__(self, size=10, n_enemy=4, n_pray=3):
        self.state = 0
        self.size = size
        # Not read anywhere in this class; initialised here as well so the
        # attribute exists even before the first reset().
        self.tiger_attack = 0.1
        self.loc = self.gen_loc(size, n_enemy, n_pray)
        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Discrete(size * size * 2)

    def gen_loc(self, size, n_enemy, n_pray):
        """Randomly place prey and enemies on distinct cells, never on (0, 0).

        Cells are drawn from NumPy's global RNG (prey first, then enemies),
        so seeding ``np.random`` beforehand makes the layout reproducible.
        A deep copy of the layout is stored in ``self.backup`` for reset().

        Returns:
            ``{'pray': [Point, ...], 'enemy': [Point, ...]}``

        Raises:
            ValueError: if more entities are requested than there are free
                cells (rejection sampling could otherwise never terminate).
        """
        wanted = max(n_pray, 0) + max(n_enemy, 0)
        available = max(size * size - 1, 0)  # (0, 0) is reserved for the tiger
        if wanted > available:
            raise ValueError(
                'cannot place %d entities on a %dx%d board with (0, 0) reserved'
                % (wanted, size, size)
            )

        occupied = {(0, 0)}

        def free_cell():
            # Rejection sampling: redraw both coordinates until the cell is free.
            while True:
                i = np.random.randint(0, size)
                j = np.random.randint(0, size)
                if (i, j) not in occupied:
                    occupied.add((i, j))
                    return Point(i, j)

        # Dict values are evaluated in order: all prey are drawn before enemies.
        loc = {
            'pray': [free_cell() for _ in range(n_pray)],
            'enemy': [free_cell() for _ in range(n_enemy)],
        }
        self.backup = copy.deepcopy(loc)
        return loc

    def encode(self, tiger_i, tiger_j, purpose):
        """Pack (row, column, purpose) into ``(row * size + column) * 2 + purpose``."""
        return (tiger_i * self.size + tiger_j) * 2 + purpose

    def decode(self, i):
        """Inverse of :meth:`encode`: return the tuple ``(row, column, purpose)``."""
        cell, purpose = divmod(i, 2)
        row, col = divmod(cell, self.size)
        return row, col, purpose

    def render(self):
        """Display the board as a colour-coded pandas table in the notebook.

        ``.`` is an empty cell, ``+`` a prey (green), ``X`` an enemy (red) and
        ``T`` the tiger (blue).  The previous cell output is cleared first.
        """
        clear_output(wait=True)
        palette = {'T': 'blue', '+': 'green', 'X': 'red'}

        def cell_color(val):
            return 'color: %s' % palette.get(val, 'white')

        n = self.size
        desk = np.full((n, n), '.').astype(str)
        # Point.x is the column and Point.y the row of the table.
        for pray in self.loc['pray']:
            desk[pray.y][pray.x] = '+'
        for enemy in self.loc['enemy']:
            desk[enemy.y][enemy.x] = 'X'
        tiger_i, tiger_j, _ = self.decode(self.state)
        desk[tiger_i][tiger_j] = 'T'
        # NOTE(review): newer pandas releases rename Styler.applymap to
        # Styler.map -- confirm against the installed version.
        display(pd.DataFrame(desk).style.applymap(cell_color))

    def reset(self):
        """Put the tiger back on (0, 0) and restore the original layout.

        Only the lists are copied from ``self.backup``; the Point objects are
        shared, which is safe because this class never mutates them.

        Returns:
            The initial state, ``0``.
        """
        self.state = 0
        self.loc['pray'] = list(self.backup['pray'])
        self.loc['enemy'] = list(self.backup['enemy'])
        self.tiger_attack = 0.1
        return 0

    def _is_free(self, location):
        """Return True if *location* is on the board and not an enemy cell."""
        on_board = (-1 < location.x < self.size) and (-1 < location.y < self.size)
        return on_board and location not in self.loc['enemy']

    def _nearest_prey(self, location):
        """Return the remaining prey with the smallest Manhattan distance.

        Distance ties fall back to tuple comparison, i.e. to Point's
        component-wise ``<``.
        """
        ranked = [(location.dist(prey), prey) for prey in self.loc['pray']]
        return min(ranked)[1]

    def step(self, action):
        """Apply *action* and return ``(state, reward, done)``.

        Rewards:
            * catch (action 4) on the targeted prey while hunting: +5000, the
              prey is removed and ``purpose`` becomes 1;
            * catch anywhere else, or a move off the board / onto an enemy:
              -500, the state is unchanged;
            * moving onto the targeted prey while hunting: +1000;
            * moving onto (0, 0) while carrying a catch: +5000 and ``done``;
            * any other move: minus the Manhattan distance to the target.

        Raises:
            ValueError: if *action* is not one of 0..4.
        """
        s = self.state
        tiger_i, tiger_j, purpose = self.decode(s)
        tiger = Point(tiger_j, tiger_i)
        # Target: home cell once a prey is caught, otherwise the closest prey.
        p = Point(0, 0) if purpose == 1 else self._nearest_prey(tiger)

        if action == 4:
            if purpose == 0 and p == tiger:
                self.state = self.encode(tiger.y, tiger.x, 1)
                self.loc['pray'].remove(p)
                return self.state, 5000, False
            return s, -500, False

        if action == 0:
            state = Point(tiger.x + 1, tiger.y)
        elif action == 1:
            state = Point(tiger.x - 1, tiger.y)
        elif action == 2:
            state = Point(tiger.x, tiger.y + 1)
        elif action == 3:
            state = Point(tiger.x, tiger.y - 1)
        else:
            raise ValueError('invalid action %r; expected an integer in 0..4' % (action,))

        if not self._is_free(state):
            return s, -500, False

        self.state = self.encode(state.y, state.x, purpose)

        if state == p:
            if purpose == 0:
                return self.state, 1000, False
            if purpose == 1:
                return self.state, 5000, True

        return self.state, -p.dist(state), False

In [4]:
# Seed NumPy's global RNG *before* building the environment: gen_loc draws the
# prey/enemy cells from np.random, so a fixed seed reproduces the same board.
np.random.seed(42)


size = 10  # board side length; reused later to size the Q-table
# 40 enemy cells and 3 prey cells on the size x size board.
env = HunterEnv(size=size, n_enemy=40, n_pray=3)
env.render()  # show the freshly generated board

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,T,X,X,.,X,.,.,X,X,.
1,.,.,.,X,.,X,.,.,X,.
2,X,.,.,.,X,X,.,X,X,X
3,.,X,.,.,.,.,+,X,.,.
4,.,X,.,.,.,X,X,+,.,X
5,.,X,.,X,.,.,.,.,.,X
6,.,.,X,.,.,.,X,.,X,.
7,.,X,X,X,.,.,X,X,X,.
8,X,.,.,.,.,.,X,.,X,X
9,.,X,.,.,X,X,+,X,.,.


In [5]:
def progress(i, n_episod):
    """Redraw a one-line text progress bar for episode *i* of *n_episod*.

    Clears the current notebook cell output, then prints
    ``Epoch i/(n_episod - 1): |`` followed by ``i`` dashes, padding spaces up
    to ``n_episod - 1`` characters and a closing ``|``.
    """
    clear_output(wait=True)
    filled = '-' * i
    # A non-positive repeat count yields '', matching an empty range().
    padding = ' ' * (n_episod - i - 1)
    print('Epoch %d/%d: |%s%s|' % (i, n_episod - 1, filled, padding))


# Tabular Q-learning: one row per encoded state (size * size * 2) and one
# column per action (5).
q_table = np.zeros((size * size * 2, 5))
alpha = 0.1  # learning rate
gamma = 0.6  # discount factor
epsilon = 0.2  # NOTE(review): not read in this cell; the loop uses the decaying `eps` below

episod = 100
for i in range(0, episod):
    progress(i, episod)
    state = env.reset()
    done = False
    # Exploration rate decays linearly with the episode index and reaches 0
    # on the last episode.
    eps = 1 - (i + 1) / episod
    # NOTE(review): there is no step cap; an episode only ends when the
    # environment reports done.
    while not done:
        # Epsilon-greedy: random action with probability eps, greedy otherwise.
        action = env.action_space.sample() if np.random.uniform(0, 1) < eps else np.argmax(q_table[state])
        next_state, reward, done = env.step(action) 
        # Move the old estimate towards reward + gamma * max over next actions.
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value
        state = next_state

Epoch 99/99: |---------------------------------------------------------------------------------------------------|


In [6]:
from time import sleep


# Replay the learned policy greedily from the start state, redrawing the
# board after every step.
state = env.reset()
env.render()
sleep(1)

done = False
# NOTE(review): purely greedy rollout with no step limit -- if the learned
# policy never reaches a terminal state this loop will not end.
while not done:
    action = np.argmax(q_table[state])
    state, reward, done = env.step(action)
    env.render()
    sleep(0.5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,T,X,X,.,X,.,.,X,X,.
1,.,.,.,X,.,X,.,.,X,.
2,X,.,.,.,X,X,.,X,X,X
3,.,X,.,.,.,.,.,X,.,.
4,.,X,.,.,.,X,X,+,.,X
5,.,X,.,X,.,.,.,.,.,X
6,.,.,X,.,.,.,X,.,X,.
7,.,X,X,X,.,.,X,X,X,.
8,X,.,.,.,.,.,X,.,X,X
9,.,X,.,.,X,X,+,X,.,.
