In [32]:
import numpy as np
import gym
from IPython.display import clear_output
from time import sleep

# Build the Taxi-v3 environment with the new step API enabled, then seed the first reset.
env = gym.make('Taxi-v3', new_step_api=True)
env.reset(seed=42)

# Cache the sizes of the observation (state) and action spaces.
num_states, num_actions = env.observation_space.n, env.action_space.n
def print_frames(frames):
  """Replay recorded episode frames one at a time in the notebook output.

  Args:
    frames: iterable of dicts with the keys 'frame', 'state', 'action'
      and 'reward'; each dict describes one environment step.
  """
  for timestep, snapshot in enumerate(frames, start=1):
    # Wipe the previous frame so the replay looks like an animation.
    clear_output(wait=True)
    print(snapshot['frame'])
    print(f"Timestep: {timestep}")
    print(f"State: {snapshot['state']}")
    print(f"Action: {snapshot['action']}")
    print(f"Reward: {snapshot['reward']}")
    # Short pause between frames so each one is visible.
    sleep(0.1)

def policy_evaluation(policy, env, discount_factor=0.9, theta=1e-8):
  """Compute the state-value function of a policy by iterative sweeps.

  Args:
    policy: 2-D array-like indexed as policy[state][action], holding the
      probability of taking each action in each state.
    env: environment exposing a transition table ``env.P[state][action]``
      that yields ``(prob, next_state, reward, done)`` tuples.
    discount_factor: weight applied to the value of the next state.
    theta: sweeps stop once the largest change of any state value within
      a sweep drops below this threshold.

  Returns:
    1-D NumPy array with one value per row of ``policy``.
  """
  # Size the value table from the policy itself instead of a module-level
  # global, so the function works for any environment passed in.
  n_states = len(policy)
  V = np.zeros(n_states)
  while True:
    delta = 0
    for s in range(n_states):
      v = 0
      for a, action_prob in enumerate(policy[s]):
        # NOTE(review): the `done` flag is ignored, so transitions marked as
        # terminal still bootstrap from V[next_state] - confirm this is intended.
        for prob, next_state, reward, done in env.P[s][a]:
          v += action_prob * prob * (reward + discount_factor * V[next_state])
      delta = max(delta, np.abs(v - V[s]))
      # In-place update: later states in this sweep already see the new value.
      V[s] = v
    if delta < theta:
      break
  return V

def policy_improvement(V, env, discount_factor=0.9):
  """Build the deterministic policy that is greedy with respect to ``V``.

  Args:
    V: sequence of state values, indexed by state.
    env: environment exposing ``env.action_space.n`` and a transition table
      ``env.P[state][action]`` that yields ``(prob, next_state, reward, done)``
      tuples.
    discount_factor: weight applied to the value of the next state.

  Returns:
    NumPy array of shape (len(V), env.action_space.n) with a single 1.0 per
    row, placed at the action with the highest one-step look-ahead value
    (ties go to the lowest action index, as ``np.argmax`` does).
  """
  # Take the sizes from the arguments rather than from module-level globals.
  n_states = len(V)
  n_actions = env.action_space.n
  policy = np.zeros([n_states, n_actions])
  for s in range(n_states):
    q = np.zeros(n_actions)
    for a in range(n_actions):
      # NOTE(review): the `done` flag is ignored, so transitions marked as
      # terminal still bootstrap from V[next_state] - confirm this is intended.
      for prob, next_state, reward, done in env.P[s][a]:
        q[a] += prob * (reward + discount_factor * V[next_state])
    policy[s, np.argmax(q)] = 1.0
  return policy


def policy_iteration(env, discount_factor=0.9, max_iterations=200):
  """Alternate policy evaluation and greedy improvement until the policy is stable.

  Args:
    env: environment exposing ``env.observation_space.n``,
      ``env.action_space.n`` and whatever ``policy_evaluation`` and
      ``policy_improvement`` read from it.
    discount_factor: discount passed through to evaluation and improvement.
    max_iterations: upper bound on evaluation/improvement rounds.

  Returns:
    Tuple ``(policy, V)``: the final policy array and the value function
    from the most recent evaluation. If ``max_iterations`` is 0, the uniform
    starting policy and an all-zero value array are returned.
  """
  # Take the sizes from the environment passed in rather than from globals.
  n_states = env.observation_space.n
  n_actions = env.action_space.n

  # Start from the uniform random policy.
  policy = np.ones([n_states, n_actions]) / n_actions
  # Keep V defined even when the loop body never runs.
  V = np.zeros(n_states)
  for i in range(max_iterations):
    V = policy_evaluation(policy, env, discount_factor)
    new_policy = policy_improvement(V, env, discount_factor)
    if (new_policy == policy).all():
      # Braces are required so the iteration number is actually interpolated.
      print(f"Policy converged at iteration {i + 1}.")
      break
    policy = new_policy
  return policy, V

# Solve the environment with policy iteration.
policy, V = policy_iteration(env)


# Roll out one episode with the greedy action of the computed policy,
# recording every step so it can be replayed afterwards.
state = env.reset()
done = False
truncated = False
frames = []

# env.step returns separate termination and truncation flags here; stop on
# either one so a cut-off episode (e.g. by a step limit) cannot loop forever.
while not (done or truncated):
  action = np.argmax(policy[state])
  state, reward, done, truncated, info = env.step(action)
  frames.append({
    'frame': env.render(mode='ansi'),
    'state': state,
    'action': action,
    'reward': reward
  })

print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

Timestep: 14
State: 410
Action: 5
Reward: 20
