# Стратегия случайного поиска

Находит наилучшее линейное отображение из пространства состояний в пространство действий, перебирая случайные матрицы весов и запоминая ту, что дала наибольшую суммарную награду за эпизод.

[A] = [S] × [Weights]

action = argmax([A]) 

In [1]:
import gymnasium as gym
import torch

In [19]:
# Create the CartPole environment and read the sizes that define the
# weight matrix: one row per state component, one column per action.
cartpole_env = gym.make('CartPole-v1')
n_state, n_action = (
    cartpole_env.observation_space.shape[0],
    cartpole_env.action_space.n,
)

print(f'Размерность матрицы весов: {n_state}x{n_action}')

Размерность матрицы весов: 4x2


In [20]:
# Sample a random weight matrix with one row per state component and one
# column per action.
episode_weight = torch.rand((n_state, n_action))
print(f'Случайная матрица весов: {episode_weight}')

Случайная матрица весов: tensor([[0.1599, 0.3559],
        [0.7156, 0.4566],
        [0.7774, 0.4132],
        [0.3168, 0.5085]])


In [21]:
# Reset the environment and convert the initial observation to a float
# tensor so it can be multiplied by the weight matrix.
raw_state, _ = cartpole_env.reset()
current_state = torch.from_numpy(raw_state).float()
print(f'Начальное состояние системы: S={current_state}')


Начальное состояние системы: S=tensor([ 0.0442, -0.0030,  0.0027,  0.0232])


In [22]:
# Multiplying the state vector S by the weight matrix yields one score per
# action (the action vector A).
actions = current_state @ episode_weight
print(f'Умножение вектора состояния S на матрицу весов дает вектор действий: A={actions}')

Умножение вектора состояния S на матрицу весов дает вектор действий: A=tensor([0.0143, 0.0272])


In [23]:
# Greedy choice: take the index of the action with the highest score.
best_action = actions.argmax()
print(f'Выбираем действие с наибольшим весом: {best_action}')

Выбираем действие с наибольшим весом: 1


In [26]:
# Run one episode with a fixed weight matrix and return the total reward.
def run_episode(env, weight):
    """Play a single episode with a fixed linear policy.

    At every step the observation is converted to a float tensor, multiplied
    by ``weight``, and the index of the largest resulting score is taken as
    the action.

    Args:
        env: Environment following the Gymnasium API, i.e. ``reset()``
            returns ``(observation, info)`` and ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
            Observations must be NumPy arrays (``torch.from_numpy`` is
            applied to them).
        weight: Tensor that the float state vector is multiplied by; its
            columns correspond to the available actions.

    Returns:
        The sum of the rewards collected until the episode is terminated
        or truncated.
    """
    state, _ = env.reset()
    episode_reward = 0
    terminated = False
    truncated = False
    while not (terminated or truncated):
        state = torch.from_numpy(state).float()
        action = torch.argmax(torch.matmul(state, weight))
        # Gymnasium's step() yields `terminated` before `truncated`; keep the
        # names in that order so the two flags are not mistaken for each other.
        state, reward, terminated, truncated, _ = env.step(action.item())
        episode_reward += reward
    return episode_reward


In [27]:
n_episode = 1000

total_rewards = []
best_weight = None
best_total_reward = 0

for episode in range(n_episode):
    # Draw a fresh random weight matrix for every episode.
    episode_weight = torch.rand(n_state, n_action)
    total_reward = run_episode(cartpole_env, episode_weight)
    total_rewards.append(total_reward)
    print(f'Episode {episode+1}: {total_reward}')
    # Remember the weight matrix that has produced the highest total reward
    # across all episodes so far.
    if total_reward > best_total_reward:
        best_total_reward, best_weight = total_reward, episode_weight

average_total_reward = sum(total_rewards) / n_episode
print(f'Average total reward over {n_episode} episode: {average_total_reward}')
print(f'Best weight: {best_weight}')

Episode 1: 10.0
Episode 2: 191.0
Episode 3: 9.0
Episode 4: 36.0
Episode 5: 9.0
Episode 6: 9.0
Episode 7: 40.0
Episode 8: 10.0
Episode 9: 65.0
Episode 10: 9.0
Episode 11: 9.0
Episode 12: 10.0
Episode 13: 19.0
Episode 14: 20.0
Episode 15: 10.0
Episode 16: 10.0
Episode 17: 10.0
Episode 18: 77.0
Episode 19: 10.0
Episode 20: 10.0
Episode 21: 468.0
Episode 22: 9.0
Episode 23: 10.0
Episode 24: 9.0
Episode 25: 37.0
Episode 26: 9.0
Episode 27: 9.0
Episode 28: 9.0
Episode 29: 30.0
Episode 30: 8.0
Episode 31: 18.0
Episode 32: 34.0
Episode 33: 289.0
Episode 34: 9.0
Episode 35: 227.0
Episode 36: 36.0
Episode 37: 18.0
Episode 38: 28.0
Episode 39: 34.0
Episode 40: 8.0
Episode 41: 9.0
Episode 42: 8.0
Episode 43: 9.0
Episode 44: 55.0
Episode 45: 10.0
Episode 46: 9.0
Episode 47: 9.0
Episode 48: 8.0
Episode 49: 86.0
Episode 50: 49.0
Episode 51: 8.0
Episode 52: 204.0
Episode 53: 188.0
Episode 54: 101.0
Episode 55: 58.0
Episode 56: 9.0
Episode 57: 500.0
Episode 58: 11.0
Episode 59: 9.0
Episode 60: 12.0
Epi