# Алгоритм градиента стратегии

In [None]:
import gymnasium as gym
import torch

In [None]:
# Create the CartPole environment, then read the number of available actions
# and the length of the observation vector from its spaces.
cartpole_env = gym.make('CartPole-v1')
n_action = cartpole_env.action_space.n
n_state = cartpole_env.observation_space.shape[0]

print(f'Размерность матрицы весов: {n_state}x{n_action}')

In [None]:
# Reset the environment and keep its first observation as a float32 tensor;
# the info dictionary returned alongside it is not used.
current_state, _ = cartpole_env.reset()
current_state = torch.from_numpy(current_state).to(torch.float32)
print(f'Начальное состояние системы: S={current_state}')


In [None]:
# Draw a random (n_state x n_action) weight matrix with entries sampled
# uniformly from [0, 1).
episode_weight = torch.rand((n_state, n_action))
print(f'Случайная матрица весов: {episode_weight}')

In [None]:
# Multiply the state vector by the weight matrix: the result holds one
# unnormalized score per action.
actions = current_state @ episode_weight
print(f'Умножение вектора состояния S на матрицу весов дает вектор действий: A={actions}')

In [None]:
# Turn the action scores into a probability distribution. The softmax axis is
# passed explicitly: leaving it as None relies on a deprecated implicit choice
# and makes PyTorch emit a warning.
current_probs = torch.nn.Softmax(dim=-1)(actions)
print(f'Распределение вероятностей действий: {current_probs}')

In [None]:
# Sample an action index from the whole probability vector, so that every
# action (not only actions 0 and 1) is drawn with its own probability.
current_action = int(torch.multinomial(current_probs, num_samples=1).item())
print(f'Выбранное действие согласно распределению вероятностей: {current_action}')

In [None]:
# Gradient computation.
# Softmax Jacobian with respect to the scores: diag(p) - p p^T.
current_d_softmax = torch.diag(current_probs) - current_probs.unsqueeze(1) * current_probs
print(f'Производные: d_softmax={current_d_softmax}')
# The Jacobian row of the chosen action divided by that action's probability
# is the derivative of log p[action] with respect to the scores.
current_d_log = current_d_softmax[current_action] / current_probs[current_action]
print(f'Производные логарифма стратегии: {current_d_log}')
# Broadcasting the state column against the score derivatives yields a matrix
# with the same layout as the weight matrix.
current_grad = current_state.unsqueeze(1) * current_d_log
print(f'Градиент: {current_grad}')

In [None]:
def run_episode(env: "gym.Env", weight: torch.Tensor) -> tuple[float, list[torch.Tensor]]:
    """Play one episode with a linear softmax policy and record policy gradients.

    At every step the observation is multiplied by ``weight`` to obtain one
    score per action, the scores are converted to probabilities with softmax,
    and an action is sampled from that distribution.

    Args:
        env: Environment following the Gymnasium API: ``reset()`` returns
            ``(observation, info)`` and ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
            Observations are expected to be 1-D NumPy arrays.
        weight: Policy parameters; the matrix product with the observation
            implies the shape ``(n_state, n_action)``.

    Returns:
        A pair ``(episode_reward, grads)``: the sum of the rewards collected
        during the episode and, for every step taken, the gradient of the log
        probability of the chosen action with respect to ``weight``.
    """
    state, _ = env.reset()
    grads = []
    episode_reward = 0
    terminated = False
    truncated = False
    # The episode ends either when the environment reaches a terminal state
    # or when it is cut short (e.g. by a step limit).
    while not (terminated or truncated):
        state = torch.from_numpy(state).float()
        logits = torch.matmul(state, weight)
        # Explicit axis: the implicit softmax dimension is deprecated.
        probs = torch.softmax(logits, dim=-1)
        # Sample from the full categorical distribution so the policy is not
        # limited to two actions.
        action = int(torch.multinomial(probs, num_samples=1).item())
        # Softmax Jacobian with respect to the logits: diag(p) - p p^T.
        d_softmax = torch.diag(probs) - probs.view(-1, 1) * probs
        # d log p[action] / d logits.
        d_log = d_softmax[action] / probs[action]
        # Chain rule through logits = state @ weight gives a matrix shaped
        # like ``weight``.
        grad = state.view(-1, 1) * d_log
        grads.append(grad)
        # Gymnasium returns the flags in the order (terminated, truncated).
        state, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
    return episode_reward, grads

## Обучение модели через обновление весов с помощью градиентов

In [None]:
# Settings of the training run.
n_episode = 1000
learning_rate = 0.001

total_rewards = []

# The weight matrix defines the action-selection policy.
weight = torch.rand(n_state, n_action)

for episode in range(n_episode):
    total_reward, gradients = run_episode(cartpole_env, weight)
    print('Episode {}: {}'.format(episode + 1, total_reward))
    total_rewards.append(total_reward)
    # Monte Carlo style update: the weights change only after the whole
    # episode has been played. Each step's gradient is scaled by the episode
    # total minus the step index (the reward still to come when every step
    # yields a reward of 1).
    for step, gradient in enumerate(gradients):
        remaining_reward = total_reward - step
        weight += learning_rate * gradient * remaining_reward

average_reward = sum(total_rewards) / n_episode
print('Average total reward over {} episode: {}'.format(n_episode, average_reward))

## Тестирование модели, используя готовые веса (политику)

In [None]:
n_episode_eval = 100
total_rewards_eval = []
for episode in range(n_episode_eval):
    # Run the policy with the already trained weights; the returned gradients
    # are discarded, so the weights are not updated here.
    total_reward, _ = run_episode(cartpole_env, weight)
    print('Episode {}: {}'.format(episode+1, total_reward))
    total_rewards_eval.append(total_reward)

# Report the number of evaluation episodes (not the training episode count).
print('Average total reward over {} episode: {}'.format(n_episode_eval, sum(total_rewards_eval) / n_episode_eval))