In [9]:
import numpy as np

In [59]:

# Parameters
gamma = 0.9                                # discount factor
rows, cols = 3, 4                          # grid dimensions
goal_state, fire_state = (0, 3), (1, 3)    # cells skipped by the update loop (treated as terminal)
obstacle = (1, 1)                          # impassable cell
reward_goal, reward_fire = 1, -1           # reward collected when entering goal / fire
reward_step = 0  # no per-step reward; with gamma < 1 the discount alone favours shorter paths

# Initial state values, blank policy grid, and per-cell rewards
V = np.zeros(shape=(rows, cols))
policy = np.full(V.shape, ' ', dtype=str)
rewards = np.full_like(V, reward_step)     # float grid, same shape as V
rewards[goal_state] = reward_goal
rewards[fire_state] = reward_fire
rewards[obstacle] = np.nan                 # the obstacle has no reward: it can never be entered

# Possible moves (up, down, left, right) and the arrow used to display each one
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
action_symbols = dict(zip(actions, '↑↓←→'))

# Value iteration algorithm
def value_iteration(V, rewards, gamma, policy, iterations=100, tol=0.0):
    """Run synchronous value iteration on the grid world.

    Every sweep recomputes each cell that is not the goal, the fire or the
    obstacle as ``max(rewards[next] + gamma * V[next])`` over the moves that
    stay on the grid and do not enter the obstacle, reading the values of the
    previous sweep. The arrow of the winning move is stored in the policy
    grid. Candidates are compared as ``(value, symbol)`` tuples, so a tie in
    value is resolved in favour of the larger arrow character.

    The grid layout comes from the module-level names ``rows``, ``cols``,
    ``goal_state``, ``fire_state``, ``obstacle``, ``actions`` and
    ``action_symbols``.

    Args:
        V: Initial state values, indexable as ``V[row, col]``. It is copied
            and converted to float, so an integer array no longer truncates
            the updates.
        rewards: Reward collected when entering each cell.
        gamma: Discount factor applied to the value of the next cell.
        policy: Initial policy grid of one-character strings; it is copied,
            never modified in place.
        iterations: Maximum number of sweeps.
        tol: Stop early once no value changed by more than ``tol`` during a
            sweep. With the default ``0.0`` this only triggers on an exact
            fixed point, where further sweeps would reproduce the same
            values and policy.

    Returns:
        A ``(V, policy)`` tuple of new arrays. Goal, fire and obstacle cells
        of the policy carry the markers ``'G'``, ``'🔥'`` and ``'█'``, even
        when ``iterations`` is 0.
    """
    # Private copies: the caller's arrays are left untouched.
    V = np.array(V, dtype=float)
    policy = np.copy(policy)
    skipped = (goal_state, fire_state, obstacle)  # cells that are never updated

    for _ in range(iterations):
        new_V = np.copy(V)
        for i in range(rows):
            for j in range(cols):
                if (i, j) in skipped:
                    continue
                candidates = []
                for action in actions:
                    ni, nj = i + action[0], j + action[1]
                    if 0 <= ni < rows and 0 <= nj < cols and (ni, nj) != obstacle:
                        candidates.append(
                            (rewards[ni, nj] + gamma * V[ni, nj], action_symbols[action])
                        )
                if candidates:
                    # The policy is never read during a sweep, so it can be
                    # updated in place on the private copy.
                    new_V[i, j], policy[i, j] = max(candidates)
        converged = np.allclose(new_V, V, rtol=0.0, atol=tol)
        V = new_V
        if converged:
            break

    # Markers do not depend on the sweep, so they are written once at the end.
    policy[goal_state] = 'G'
    policy[fire_state] = '🔥'
    policy[obstacle] = '█'
    return V, policy

# Pick the best action for every state
def policy_extraction(V, rewards):
    """Build a greedy policy grid from the state values ``V``.

    The goal, fire and obstacle cells receive their display markers. Every
    other cell receives the arrow of the move maximising
    ``rewards[next] + gamma * V[next]`` among the moves that stay on the grid
    and do not enter the obstacle. When several moves are equally good, the
    one listed first in ``actions`` is kept. A cell with no admissible
    improving move stays blank.

    Reads the module-level names ``rows``, ``cols``, ``gamma``,
    ``goal_state``, ``fire_state``, ``obstacle``, ``actions`` and
    ``action_symbols``.

    Args:
        V: State values, indexable as ``V[row, col]``.
        rewards: Reward collected when entering each cell.

    Returns:
        A ``rows x cols`` array of one-character strings.
    """
    # Checked in this order, so the goal marker wins if cells ever coincide.
    markers = ((goal_state, 'G'), (fire_state, '🔥'), (obstacle, '█'))
    grid = np.full((rows, cols), ' ', dtype=str)

    for r, c in np.ndindex(rows, cols):
        marker = next((symbol for cell, symbol in markers if (r, c) == cell), None)
        if marker is not None:
            grid[r, c] = marker
            continue

        top_score, chosen = float('-inf'), None
        for move in actions:
            nr, nc = r + move[0], c + move[1]
            on_grid = 0 <= nr < rows and 0 <= nc < cols
            if not on_grid or (nr, nc) == obstacle:
                continue
            score = rewards[(nr, nc)] + gamma * V[nr, nc]
            if score > top_score:  # strict: the first best move is kept on ties
                top_score, chosen = score, move
        if chosen:
            grid[r, c] = action_symbols[chosen]
    return grid

# Run value iteration on the grid, then print the resulting arrays
V, policy = value_iteration(V, rewards, gamma, policy)

for title, grid in (('Values', V), ('Policy', policy), ('Rewards', rewards)):
    print(title)
    print(grid)

Values
[[0.81   0.9    1.     0.    ]
 [0.729  0.     0.9    0.    ]
 [0.6561 0.729  0.81   0.729 ]]
Policy
[['→' '→' '→' 'G']
 ['↑' '█' '↑' '🔥']
 ['→' '→' '↑' '←']]
Rewards
[[ 0.  0.  0.  1.]
 [ 0. nan  0. -1.]
 [ 0.  0.  0.  0.]]
