# Importação dos módulos

In [1]:
import random, operator

# Funções utilitárias

In [2]:
def argmax(seq, fn):
    """Return the element of ``seq`` whose ``fn`` score is highest.

    Ties are resolved in favour of the earliest element, because a later
    element only replaces the current best when its score is strictly greater.

    Args:
        seq: Any non-empty iterable (list, tuple, set, generator, ...).
        fn: One-argument callable returning a comparable score. It is called
            exactly once per element.

    Returns:
        The element with the highest score.

    Raises:
        IndexError: If ``seq`` is empty.
    """
    items = iter(seq)
    try:
        best = next(items)
    except StopIteration:
        # Keep the exception type that the former ``seq[0]`` lookup produced.
        raise IndexError('argmax() arg is an empty sequence') from None
    best_score = fn(best)
    # Continue from the second element so the first is not scored twice.
    for x in items:
        x_score = fn(x)
        if x_score > best_score:
            best, best_score = x, x_score
    return best

In [3]:
def vector_add(a, b):
    """Add two vectors component-wise and return the result as a tuple.

    Components are paired positionally; pairing stops at the end of the
    shorter input, so extra trailing components are ignored.
    """
    return tuple(left + right for left, right in zip(a, b))

In [4]:
def isnumber(x):
    """Tell whether ``x`` exposes an ``__int__`` attribute.

    Returns True when the attribute lookup succeeds and False when it raises
    AttributeError; any other exception raised by the lookup propagates.
    """
    try:
        getattr(x, '__int__')
    except AttributeError:
        return False
    return True

# Ações do agente

In [5]:
# Conjunto de orientações possíveis

In [6]:
# Funções de rotação do agente

# Definição do modelo (Markov decision process)

In [7]:
class MDP:
    """Container for a Markov decision process.

    Attributes:
        states: Collection of states; a fresh empty set when none is supplied.
        init_pos: Initial state, stored as given.
        actlist: Actions returned by ``actions`` for non-terminal states.
        terminals: Collection of terminal states.
        transitions: Nested mapping indexed as ``transitions[state][action]``.
        gamma: Discount factor, validated to lie in (0, 1].
        reward: Mapping from state to reward; starts empty and is filled by
            subclasses or callers.
    """

    def __init__(self, init_pos, actlist, terminals, transitions=None, states=None, gamma=0.99):
        """Store the model description.

        Args:
            init_pos: Initial state.
            actlist: Actions available in non-terminal states.
            terminals: Terminal states.
            transitions: Optional ``{state: {action: outcome}}`` mapping. When
                omitted, each instance gets its own empty dict.
            states: Optional collection of states. A falsy value (None or an
                empty collection) is replaced by a new empty set.
            gamma: Discount factor; must satisfy ``0 < gamma <= 1``.

        Raises:
            ValueError: If ``gamma`` is outside (0, 1].
        """
        if not (0 < gamma <= 1):
            raise ValueError('Gamma value')
        self.states = states if states else set()
        # These attributes are set unconditionally: they used to be assigned
        # only when ``states`` was not supplied, which left the instance
        # half-initialised whenever a caller passed explicit states.
        self.init_pos = init_pos
        self.actlist = actlist
        self.terminals = terminals
        # A None default avoids sharing one dict object between all instances.
        self.transitions = {} if transitions is None else transitions
        self.gamma = gamma
        self.reward = {}

    def R(self, state):
        """Return the reward stored for ``state`` (KeyError if absent)."""
        return self.reward[state]

    def T(self, state, action):
        """Return the transition entry for taking ``action`` in ``state``.

        Raises:
            ValueError: If no transition model was supplied (empty mapping).
            KeyError: If ``state`` or ``action`` is missing from the mapping.
        """
        if self.transitions == {}:
            raise ValueError('Transition model missing')
        return self.transitions[state][action]

    def actions(self, state):
        """Return the actions available in ``state``.

        Terminal states yield ``[None]``; every other state yields ``actlist``.
        """
        if state in self.terminals:
            return [None]
        return self.actlist

# Definição do ambiente (grid)

In [8]:
class GridMDP(MDP):
    # Grid-based MDP: states are (x, y) cells of a 2-D list and each cell's
    # value is stored as that state's reward.
    # NOTE(review): the bodies of T and go are not present in this view of the
    # notebook, so the class as shown is incomplete.
    def __init__(self, grid, terminals, init_pos=(0, 0), gamma=0.99):
        # grid: list of rows; a cell holding None is not added to the states.
        # terminals, init_pos, gamma: forwarded unchanged to MDP.__init__.
        # list.reverse() works in place, so the caller's list is reordered too.
        # NOTE(review): building two instances from the same list object flips
        # the rows back on the second call - confirm callers pass a fresh grid.
        grid.reverse()
        # `orientations` is a module-level name that is not defined in the
        # visible part of this notebook.
        MDP.__init__(self, init_pos, actlist=orientations, terminals=terminals, gamma=gamma)
        self.grid = grid
        self.rows = len(grid)
        # Column count is taken from the first row only.
        self.cols = len(grid[0])
        for x in range(self.cols):
            for y in range(self.rows):
                # Every cell gets a reward entry, including None cells.
                self.reward[x, y] = grid[y][x]
                if grid[y][x] is not None:
                    self.states.add((x, y))
                    
    def T(self, state, action):
        # State-transition function given the agent's action.
        # NOTE(review): body not present in this view of the notebook.
        
    def go(self, state, direction):
        # Agent movement function.
        # NOTE(review): body not present in this view of the notebook.
    
    def to_grid(self, mapping):
        """Lay out a ``{(x, y): value}`` mapping as a list of rows.

        Cells missing from ``mapping`` become None. Rows are emitted in
        reverse y order, so the row for ``y == self.rows - 1`` comes first.
        """
        return list(reversed([[mapping.get((x, y), None)
                              for x in range(self.cols)]
                             for y in range(self.rows)]))
    
    def to_arrows(self, policy):
        """Render a ``{state: action}`` policy as a grid of arrow characters.

        Each action is looked up in a fixed table; an action that is not in
        the table raises KeyError. The result is built with ``to_grid``.
        """
        # Direction vectors map to arrows; None (no action) maps to '.'.
        chars = {
            (1, 0): '>',
            (0, 1): '^',
            (-1, 0): '<',
            (0, -1): 'v',
            None: '.'
        }
        return self.to_grid({
            s: chars[a] for (s, a) in policy.items()
        })

# Value iteration

In [9]:
def value_iteration(mdp, epsilon=0.001):
    # Value iteration algorithm.
    # NOTE(review): body not present in this view of the notebook.

def best_policy(mdp, STS):
    # Selection of the best policy.
    # NOTE(review): body not present in this view of the notebook.

# Policy iteration

In [10]:
def expected_utility(a, s, STS, mdp):
    # Expected-value function.
    # NOTE(review): body not present in this view of the notebook.

def policy_iteration(mdp):
    # Policy iteration algorithm.
    # NOTE(review): body not present in this view of the notebook.

def policy_evaluation(pi, STS, mdp, k=20):
    # Policy evaluation function.
    # NOTE(review): body not present in this view of the notebook.

# Impressão do grid

In [11]:
def print_table(table, header=None, sep=' ', numfmt='{}'):
    """Print a table (a sequence of rows) as aligned text columns.

    Each column is padded to the width of its widest cell. A column is
    right-justified when the cell in the first data row satisfies
    ``isnumber`` and left-justified otherwise.

    Args:
        table: Non-empty sequence of rows. It is never modified.
        header: Optional row printed above the data. It counts towards column
            widths but not towards the choice of justification.
        sep: String placed between columns.
        numfmt: ``str.format`` template applied to cells that satisfy
            ``isnumber``; other cells are rendered with ``str``.

    Raises:
        IndexError: If ``table`` is empty.
    """
    # Justification comes from the first data row, before any header is added.
    justs = ['rjust' if isnumber(x) else 'ljust' for x in table[0]]
    # Work on a local list: inserting the header into ``table`` itself would
    # change the caller's object and add the header again on every re-run.
    rows = list(table)
    if header:
        rows = [header] + rows
    rows = [[numfmt.format(x) if isnumber(x) else x for x in row]
            for row in rows]
    sizes = [max(len(str(cell)) for cell in column) for column in zip(*rows)]
    for row in rows:
        print(sep.join(getattr(str(x), j)(size) for (j, size, x)
                       in zip(justs, sizes, row)))

# Criação do ambiente

In [12]:
# Criação do ambiente

# Solução por value iteration

In [13]:
# Solve the grid by value iteration.
# NOTE(review): the code that builds `sequential_decision_environment` and
# `value_iter` is not present in this view of the notebook.

print('Optimal Policy based on value iteration')
print_table(sequential_decision_environment.to_arrows(value_iter))

Optimal Policy based on value iteration
> >    > .
^ None ^ .
^ <    < <


# Solução por policy iteration

In [14]:
# Solve the grid by policy iteration.
# NOTE(review): the code that builds `sequential_decision_environment` and
# `policy_iter` is not present in this view of the notebook.

print('Optimal Policy based on policy iteration and evaluation')
print_table(sequential_decision_environment.to_arrows(policy_iter))

Optimal Policy based on policy iteration and evaluation
> >    > .
^ None ^ .
^ <    < <
