In [6]:
# Notebook dependencies.
# `%pip` (instead of `!pip`) installs into the environment of the running
# kernel, and the extras spec is quoted so the shell does not try to expand
# the square brackets as a glob pattern.
#
# Optional installs, kept for reference -- uncomment whichever is missing:
# %pip install gym --no-cache-dir
# %pip install pywin32
# %pip install torch
# %pip install pygame

# %pip install pygame==2.1.0
# %pip install "gym[classic_control]" pygame
%pip install "gym[box2d]"

Defaulting to user installation because normal site-packages is not writeable
Collecting box2d-py==2.3.5 (from gym[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting swig==4.* (from gym[box2d])
  Using cached swig-4.2.0.post0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.6 kB)
Using cached swig-4.2.0.post0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25ldone
[?25h  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp310-cp310-linux_x86_64.whl size=2349113 sha256=683bf954bb7610044b6af38a5ee5cda56a4dd7f7653a20582441084e73bf5e44
  Stored in directory: /home/zea/.cache/pip/wheels/db/8f/6a/eaaadf056fba10a98d986f6dce954e6201ba3126926fc5ad9e
Successfully built box2d-py
Installing collected packages: swig, box2d-py
  Attempting uninstall: box2d-py
    Found existing installation: box2d-py 2.3.8

In [7]:
import gym
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
from gym.utils.play import play

In [None]:
# Run a single episode with the given policy
def run_episode(policy, env):
    """Play one rendered episode and return the sum of the rewards received.

    Both Gym calling conventions are accepted:

    * classic API: ``reset()`` returns the observation and ``step()`` returns
      ``(obs, reward, done, info)``;
    * Gym 0.26+ API: ``reset()`` returns ``(obs, info)`` and ``step()`` returns
      ``(obs, reward, terminated, truncated, info)``.

    Args:
        policy: object exposing ``make_action(state)``.
        env: environment exposing ``reset``, ``step``, ``render`` and ``close``.

    Returns:
        The total (undiscounted) reward collected during the episode.
    """
    total_reward = 0
    is_done = False
    try:
        reset_result = env.reset()
        # A 2-tuple whose second item is a dict is the newer (obs, info) form;
        # anything else is treated as the bare observation.
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            state = reset_result[0]
        else:
            state = reset_result

        while not is_done:
            env.render()
            action = policy.make_action(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API: the episode ends on termination OR truncation.
                state, reward, terminated, truncated, _ = step_result
                is_done = bool(terminated or truncated)
            else:
                state, reward, is_done, _ = step_result
            total_reward += reward
    finally:
        # Release the environment's resources even if the episode raised.
        env.close()
    return total_reward

In [None]:
# Skeleton showing the interface the policies in this notebook expose
class Policy:
    """Placeholder policy: every method is a no-op that returns ``None``."""

    def __init__(self, n_actions):
        """Accept the number of actions; nothing is stored on the instance."""

    def make_action(self, state):
        """Slot for choosing an action for ``state``; the stub returns ``None``."""
        return None

    def update(self, state, next_state, action, reward, gamma=1):
        """Slot for learning from one transition; the stub does nothing."""
        return None

In [None]:
# Policy that picks a random action
class RandomPolicy:
    """Baseline policy that ignores the state and samples an action at random.

    The method signatures mirror ``DQNPolicy`` (``make_action`` accepts ``eps``
    and there is an ``update`` method) so that an instance can be used anywhere
    a trainable policy is expected, e.g. inside a training loop.
    """

    def __init__(self, n_actions):
        """Remember how many discrete actions are available.

        Args:
            n_actions: number of actions; sampled actions lie in
                ``[0, n_actions)``.
        """
        self.n_actions = n_actions

    def make_action(self, state, eps=0):
        """Return a random action index as a Python ``int``.

        Args:
            state: ignored.
            eps: ignored; accepted only for signature compatibility with
                epsilon-greedy policies.
        """
        return torch.randint(self.n_actions, (1,)).item()

    def update(self, state, next_state, action, reward, gamma=1):
        """No-op: a random policy has nothing to learn from a transition."""
        return None

In [None]:
# `env` is not created by any earlier cell, so build it here; otherwise the
# notebook fails with a NameError on a fresh kernel.
# NOTE(review): "LunarLander-v2" is an assumption based on the gym[box2d]
# install and the 4 discrete actions hard-coded originally -- confirm the
# intended environment id. On Gym 0.26+ a window is presumably only shown when
# render_mode="human" is passed to gym.make -- TODO confirm for your version.
env = gym.make("LunarLander-v2")

# Baseline: play a few episodes with random actions and print each total reward.
# The action count is read from the environment instead of being hard-coded.
for _ in range(4):
    print(run_episode(RandomPolicy(env.action_space.n), env))

In [None]:
# Agent with a DQN policy
class DQNPolicy:
    """Epsilon-greedy agent whose Q-function is a one-hidden-layer MLP.

    Args:
        n_actions: number of discrete actions (size of the network output).
        station_space: length of the state vector fed to the network.
        n_hidden: width of the hidden layer.
        lr: learning rate handed to the Adam optimizer. Previously this
            argument was ignored and 0.001 was always used; the default is
            therefore 0.001 so that default-constructed agents train exactly
            as before, while an explicitly passed value is now honoured.
    """

    def __init__(self, n_actions, station_space, n_hidden=128, lr=0.001):
        self.n_actions = n_actions

        self.dqn = torch.nn.Sequential(
            torch.nn.Linear(station_space, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_actions),
        )
        self.loss = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.dqn.parameters(), lr=lr)

    @staticmethod
    def _as_tensor(state):
        """Convert a state (list / array / tensor) to a float32 tensor."""
        return torch.as_tensor(state, dtype=torch.float32)

    def make_action(self, state, eps=0):
        """Choose an action epsilon-greedily.

        A uniform random number is drawn; if it exceeds ``eps`` the action with
        the highest predicted Q-value is returned, otherwise a random action.

        Args:
            state: current state vector.
            eps: exploration threshold (0 means greedy).

        Returns:
            The chosen action index as a Python ``int``.
        """
        if torch.rand((1,)).item() > eps:
            # The network is only evaluated when the greedy branch is taken.
            with torch.no_grad():
                return torch.argmax(self.dqn(self._as_tensor(state))).item()
        return torch.randint(self.n_actions, (1,)).item()

    def update(self, state, next_state, action, reward, gamma=1, is_done=False):
        """Take one gradient step towards the one-step TD target.

        The target equals the current prediction for every action except
        ``action``, whose entry is replaced by
        ``reward + gamma * max(Q(next_state))`` -- or just ``reward`` when
        ``is_done`` is true, so a terminal transition is not bootstrapped.

        Args:
            state: state in which ``action`` was taken.
            next_state: state observed afterwards.
            action: index of the action taken.
            reward: reward received for the transition.
            gamma: discount factor.
            is_done: whether ``next_state`` ended the episode. Defaults to
                ``False``, which reproduces the previous behaviour.
        """
        q_values = self.dqn(self._as_tensor(state))

        # The target is treated as a constant: no gradient flows through it.
        with torch.no_grad():
            target = q_values.detach().clone()
            if is_done:
                target[action] = reward
            else:
                q_values_next = self.dqn(self._as_tensor(next_state))
                target[action] = reward + gamma * torch.max(q_values_next).item()

        self.optimizer.zero_grad()
        self.loss(q_values, target).backward()
        self.optimizer.step()

In [None]:
# Train the given policy for n_episodes episodes
def learn_policy(env, n_episodes, policy, eps=0.1, gamma=1, render_every=20):
    """Train ``policy`` online on ``env`` and return the per-episode rewards.

    After every environment step the policy is updated with the observed
    transition. Both Gym calling conventions are accepted: the classic one
    (``reset()`` -> obs, ``step()`` -> 4 values) and the Gym 0.26+ one
    (``reset()`` -> ``(obs, info)``, ``step()`` -> 5 values).

    Args:
        env: environment exposing ``reset``, ``step``, ``render``, ``close``.
        n_episodes: number of episodes to play.
        policy: object exposing ``make_action(state, eps=...)`` and
            ``update(state, next_state, action, reward, gamma=...)``.
        eps: exploration parameter forwarded to ``policy.make_action``.
        gamma: discount factor forwarded to ``policy.update``.
        render_every: render every ``render_every``-th episode (starting with
            episode 0); ``None`` or ``0`` disables rendering.

    Returns:
        A list with the total reward of each episode, in order.
    """
    total_rewards = []

    try:
        for episode in tqdm(range(n_episodes)):
            reset_result = env.reset()
            # A 2-tuple whose second item is a dict is the newer (obs, info)
            # form; anything else is treated as the bare observation.
            if (
                isinstance(reset_result, tuple)
                and len(reset_result) == 2
                and isinstance(reset_result[1], dict)
            ):
                state = reset_result[0]
            else:
                state = reset_result

            show_episode = bool(render_every) and episode % render_every == 0
            is_done = False
            total_reward = 0
            while not is_done:
                if show_episode:
                    env.render()
                action = policy.make_action(state, eps=eps)
                step_result = env.step(action)
                if len(step_result) == 5:
                    # Newer API: the episode ends on termination OR truncation.
                    next_state, reward, terminated, truncated, _ = step_result
                    is_done = bool(terminated or truncated)
                else:
                    next_state, reward, is_done, _ = step_result
                total_reward += reward

                policy.update(state, next_state, action, reward, gamma=gamma)
                state = next_state
            total_rewards.append(total_reward)
    finally:
        # Release the environment's resources even if training was interrupted.
        env.close()
    return total_rewards

In [None]:
# Create the agent, sizing its network from the environment's spaces
policy = DQNPolicy(
    n_actions=env.action_space.n,
    station_space=env.observation_space.shape[0],
)

In [None]:
# The agent has not learned anything yet: play four episodes and print rewards
for _ in range(4):
    episode_reward = run_episode(policy, env)
    print(episode_reward)

In [None]:
# Train the agent for 1000 episodes, keeping the reward of every episode
total_rewards = learn_policy(env=env, n_episodes=1_000, policy=policy)

In [None]:
# Watch how the agent behaves now: play four episodes and print rewards
for _ in range(4):
    episode_reward = run_episode(policy, env)
    print(episode_reward)

In [None]:
# A library that takes care of all of this for us!

In [None]:
# `%pip` installs into the environment of the running kernel, so the import in
# the next cell finds the package.
%pip install stable_baselines3

In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# Create a Stable-Baselines3 DQN model for `env` with verbose logging enabled
model = DQN(policy='MlpPolicy', env=env, verbose=1)

In [None]:
# Run one rendered evaluation episode of the model, then close the environment
evaluate_policy(model=model, env=env, n_eval_episodes=1, render=True)
env.close()

In [None]:
# Train the model for 300,000 environment timesteps
model.learn(total_timesteps=300_000)

In [None]:
# Persist the model to disk under the name 'my_model'
model.save(path='my_model')

In [None]:
# Restore the model from the file written by the save cell
model = DQN.load(path='my_model')

In [None]:
# Run four rendered evaluation episodes of the model, then close the environment
evaluate_policy(model=model, env=env, n_eval_episodes=4, render=True)
env.close()